# Libraries

In [1]:
import pickle
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from paths import SKIMAGE_FEATURES_PATH, IMG2VEC_FEATURES_PATH

# Data Processing (IMG2VEC)
## Import

In [2]:
with open(IMG2VEC_FEATURES_PATH, "rb") as f:
    X, y = pickle.load(f)

## Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28, stratify=y)

## Scale

In [4]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM

In [5]:
SVM_pipeline = Pipeline([
    ('pca', PCA(n_components=100)),  # or use SelectKBest instead
    ('svm', SVC(kernel='rbf', C=1.0, max_iter=5000, tol=0.001, 
                random_state=28, class_weight="balanced"))
])

scores = cross_val_score(SVM_pipeline, X_train_scaled, y_train, cv=5, scoring='f1_weighted')
print(f"Initial CV score (f1_weighted): {scores.mean():.4f}")

Initial CV score (f1_weighted): 0.7637


# Random Forest

In [6]:
Forest_pipeline = Pipeline([
    ('pca', PCA(n_components=100)),  # or use SelectKBest instead
    ('RF', RandomForestClassifier(n_estimators=200, class_weight="balanced_subsample", 
                                   n_jobs=-1, random_state=28))
])

scores = cross_val_score(Forest_pipeline, X_train_scaled, y_train, cv=5, scoring='f1_weighted')
print(f"Initial CV score (f1_weighted): {scores.mean():.4f}")

Initial CV score (f1_weighted): 0.5803


# KNN

In [7]:
KNN_pipeline = Pipeline([
    ('pca', PCA(n_components=100)),  # or use SelectKBest instead
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric="manhattan", weights="distance"))
])

scores = cross_val_score(KNN_pipeline, X_train_scaled, y_train, cv=5, scoring='f1_weighted')
print(f"Initial CV score (f1_weighted): {scores.mean():.4f}")

Initial CV score (f1_weighted): 0.7652


# Logistic Regresiion

In [None]:
LGR_pipeline = Pipeline([
    ('pca', PCA(n_components=100)),
    ('LGR', LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000, class_weight='balanced'))
])

scores = cross_val_score(LGR_pipeline, X_train_scaled, y_train, cv=5, scoring='f1_weighted')
print(f"Initial CV score (f1_weighted): {scores.mean():.4f}")



Initial CV score (f1_weighted): 0.6594


