# Libraries

In [2]:
import pickle
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from paths import SKIMAGE_FEATURES_PATH, IMG2VEC_FEATURES_PATH

# Data Processing (IMG2VEC)
## Import

In [3]:
# Read the cached (X, y) pair stored at IMG2VEC_FEATURES_PATH.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open feature files that this project generated itself.
with open(IMG2VEC_FEATURES_PATH, mode="rb") as features_file:
    features_and_labels = pickle.load(features_file)
X, y = features_and_labels

## Split

In [4]:
# Hold out 20% of the samples for testing. stratify=y keeps the class
# proportions of y in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=28,
)

## Scale

In [5]:
# Estimate the scaling statistics on the training split only, then apply the
# same fitted transformation to both splits so the test set never influences them.
# NOTE(review): the grid searches further down cross-validate on X_train_scaled,
# so every validation fold has already contributed to these statistics; moving
# the scaler inside each Pipeline would remove that (small) leak.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM

In [10]:
# Grid-search the PCA dimensionality and the RBF-SVM regularisation strength C.
# The candidate values are a narrow window around the best settings of the
# previous tuning rounds (see the round-by-round log below this cell).
param_grid = {
    'pca__n_components': [225, 230, 235],
    'svm__C': [22, 24, 26, 28],
}

SVM_pipeline = Pipeline([
    # Seed PCA like every other stochastic step in this notebook: its default
    # svd_solver='auto' may select the randomized SVD, which would otherwise
    # make the CV scores vary from run to run.
    ('pca', PCA(random_state=28)),
    # max_iter caps the solver at 5000 iterations per fit; class_weight="balanced"
    # re-weights classes inversely to their frequency.
    ('svm', SVC(kernel='rbf', max_iter=5000, tol=0.001,
                random_state=28, class_weight="balanced", gamma="scale"))
])

# 5-fold CV scored with weighted F1. n_jobs=-1 spreads the independent
# (candidate, fold) fits over all cores without changing any result.
SVM_model = GridSearchCV(SVM_pipeline, param_grid=param_grid, cv=5,
                         scoring='f1_weighted', n_jobs=-1)
SVM_model.fit(X_train_scaled, y_train)

print(f"Best CV score (f1_weighted): {SVM_model.best_score_:.4f}")
print(f"Best Parameters: {SVM_model.best_params_}")



Best CV score (f1_weighted): 0.8353
Best Parameters: {'pca__n_components': 230, 'svm__C': 24}




Initial CV score (f1_weighted): 0.7637\
\
**First Round** \
Best CV score (f1_weighted): 0.8280 \
Best Parameters: {'pca__n_components': 150, 'svm__C': 10, 'svm__gamma': 'scale'}\
\
**Second Round** \
Best CV score (f1_weighted): 0.8345\
Best Parameters: {'pca__n_components': 225, 'svm__C': 20}\
\
**Third Round**\
Best CV score (f1_weighted): 0.8353\
Best Parameters: {'pca__n_components': 230, 'svm__C': 24}

# Random Forest

In [19]:
# Grid-search the PCA dimensionality, the number of trees and the class
# weighting scheme of a random forest. Candidate values are a narrow window
# around the best settings of the previous tuning rounds (see the log below).
param_grid = {
    'pca__n_components': [26, 28, 30, 32, 34],
    'RF__n_estimators': [230, 240, 250, 260, 270],
    'RF__class_weight': ['balanced', 'balanced_subsample']
}

Forest_pipeline = Pipeline([
    # n_components=100 is only a placeholder: the grid above always overrides
    # it via 'pca__n_components'. PCA is seeded so that a randomized SVD solver
    # (possible under svd_solver='auto') gives the same projection on every run.
    ('pca', PCA(n_components=100, random_state=28)),  # or use SelectKBest instead
    ('RF', RandomForestClassifier(n_jobs=-1, random_state=28))
])

# 5-fold CV scored with weighted F1. The forest already trains on all cores
# (n_jobs=-1 above), so the search itself is left sequential.
Forest_model = GridSearchCV(Forest_pipeline, param_grid=param_grid, cv=5, scoring='f1_weighted')
Forest_model.fit(X_train_scaled, y_train)

print(f"Best CV score (f1_weighted): {Forest_model.best_score_:.4f}")
print(f"Best Parameters: {Forest_model.best_params_}")

Best CV score (f1_weighted): 0.6658
Best Parameters: {'RF__class_weight': 'balanced', 'RF__n_estimators': 230, 'pca__n_components': 28}


Initial CV score (f1_weighted): 0.5803\
\
**First Round**\
Best CV score (f1_weighted): 0.6412\
Best Parameters: {'RF__class_weight': 'balanced', 'RF__n_estimators': 200, 'pca__n_components': 50}\
\
**Second Round**\
Best CV score (f1_weighted): 0.6591\
Best Parameters: {'RF__class_weight': 'balanced', 'RF__n_estimators': 225, 'pca__n_components': 25}\
\
**Third Round**\
Best CV score (f1_weighted): 0.6601\
Best Parameters: {'RF__class_weight': 'balanced', 'RF__n_estimators': 250, 'pca__n_components': 25}\
\
**Fourth Round**\
Best CV score (f1_weighted): 0.6658\
Best Parameters: {'RF__class_weight': 'balanced', 'RF__n_estimators': 230, 'pca__n_components': 28}


# KNN

In [24]:
# Grid-search the PCA dimensionality together with the neighbour count,
# vote weighting and distance metric of a k-nearest-neighbours classifier.
# Candidate values are a narrow window around the best settings of the
# previous tuning rounds (see the log below this cell).
param_grid = {
    'pca__n_components': [203, 205, 207, 210],
    'KNN__n_neighbors': [7, 8, 9],
    'KNN__weights': ['uniform', 'distance'],
    'KNN__metric': ['euclidean', 'manhattan', 'cosine']
}

KNN_pipeline = Pipeline([
    # Seed PCA like the other stochastic steps in this notebook: its default
    # svd_solver='auto' may select the randomized SVD, which would otherwise
    # make the CV scores vary from run to run.
    ('pca', PCA(random_state=28)),  # or use SelectKBest instead
    ('KNN', KNeighborsClassifier())
])

# 5-fold CV scored with weighted F1. n_jobs=-1 spreads the independent
# (candidate, fold) fits over all cores without changing any result.
KNN_model = GridSearchCV(KNN_pipeline, param_grid=param_grid, cv=5,
                         scoring='f1_weighted', n_jobs=-1)
KNN_model.fit(X_train_scaled, y_train)

print(f"Best CV score (f1_weighted): {KNN_model.best_score_:.4f}")
print(f"Best Parameters: {KNN_model.best_params_}")

Best CV score (f1_weighted): 0.7841
Best Parameters: {'KNN__metric': 'cosine', 'KNN__n_neighbors': 8, 'KNN__weights': 'distance', 'pca__n_components': 210}


Initial CV score (f1_weighted): 0.7652\
\
**First Round**\
Best CV score (f1_weighted): 0.7807\
Best Parameters: {'KNN__metric': 'cosine', 'KNN__n_neighbors': 7, 'KNN__weights': 'distance', 'pca__n_components': 150}\
\
**Second Round**\
Best CV score (f1_weighted): 0.7821\
Best Parameters: {'KNN__metric': 'cosine', 'KNN__n_neighbors': 7, 'KNN__weights': 'distance', 'pca__n_components': 200}\
\
**Third Round**\
Best CV score (f1_weighted): 0.7841\
Best Parameters: {'KNN__metric': 'cosine', 'KNN__n_neighbors': 8, 'KNN__weights': 'distance', 'pca__n_components': 210}



# Logistic Regression

In [None]:
# Grid-search the PCA dimensionality and the inverse regularisation strength C
# of an L1-penalised logistic regression. Candidate values are a narrow window
# around the best settings of the previous tuning rounds (see the log below).
param_grid = {
    'pca__n_components': [225, 250, 275],
    'LGR__C': np.linspace(135, 145, 5),  # 135.0, 137.5, 140.0, 142.5, 145.0
    'LGR__penalty': ['l1']
}

LGR_pipeline = Pipeline([
    # Seed PCA: its default svd_solver='auto' may select the randomized SVD.
    ('pca', PCA(random_state=28)),
    # 'saga' shuffles the data while optimising, so it needs a seed for the CV
    # scores to be reproducible; 28 matches the seed used elsewhere in this notebook.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
    # it is kept here so the fitted model is unchanged - confirm before upgrading.
    ('LGR', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000,
                               class_weight='balanced', random_state=28))
])

# 5-fold CV scored with weighted F1. n_jobs=-1 spreads the independent
# (candidate, fold) fits over all cores without changing any result.
LGR_model = GridSearchCV(LGR_pipeline, param_grid=param_grid, cv=5,
                         scoring='f1_weighted', n_jobs=-1)
LGR_model.fit(X_train_scaled, y_train)

print(f"Best CV score (f1_weighted): {LGR_model.best_score_:.4f}")
print(f"Best Parameters: {LGR_model.best_params_}")



Best CV score (f1_weighted): 0.6955
Best Parameters: {'LGR__C': np.float64(135.0), 'LGR__penalty': 'l1', 'pca__n_components': 275}


: 

Initial CV score (f1_weighted): 0.6594\
\
**First Round**\
Best CV score (f1_weighted): 0.6703\
Best Parameters: {'LGR__C': 100, 'LGR__penalty': 'l1', 'pca__n_components': 150}\
\
**Second Round**\
Best CV score (f1_weighted): 0.6903\
Best Parameters: {'LGR__C': np.float64(138.88888888888889), 'LGR__penalty': 'l1', 'pca__n_components': 250}\
\
**Third Round**\
Best CV score (f1_weighted): 0.6955\
Best Parameters: {'LGR__C': np.float64(135.0), 'LGR__penalty': 'l1', 'pca__n_components': 275}