# Exploratory model training on minibatch
Trying out som different models to build a pipeline to be trained on the whole dataset.

In [1]:
# Imports
import numpy as np
import polars as pl
import time

from sklearn.model_selection import GridSearchCV, train_test_split 
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression




In [2]:
# Import np arrays
X = np.load('../data/processed/pln_X_small_features_raw_128x128.npy')
y = np.load('../data/processed/pln_y_small_labels.npy')

In [3]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.23, 
    random_state=42,
    stratify=y
)

In [4]:
# Scaling

scaler = StandardScaler()
X_trained_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Models
rfc = RandomForestClassifier()
knc = KNeighborsClassifier()
svc = SVC()
lr = LogisticRegression()

In [6]:
# GridSearch hyperparameters

# RandomForestClassifier
n_estimators = [50, 100, 150]
maximum_depth = [None, 10, 20, 30]
min_samples_split = [2, 5, 10]

# KNeighborsClassifier
n_neighbors = [2, 5, 10]
weights = ['uniform', 'distance']
metric = ['minkowski', 'euclidean']

# SVC
svc_c = [0.5, 1, 2]
kernel = ['rbf', 'poly', 'sigmoid']

# LogisticRegression
lr_c = [0.1, 1]
max_iter = [100, 300]
param_grid_l1 = {'penalty': ['l1'], 'solver': ['liblinear'], 'C': lr_c, 'max_iter': max_iter}
param_grid_l2 = {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': lr_c, 'max_iter': max_iter}
param_grid_elasticnet = {'penalty': ['elasticnet'], 'solver': ['saga'], 'C': lr_c, 'max_iter': max_iter, 'l1_ratio': [0.5]}


model_dict = {
    'rfc': (
        rfc, 
        {
            'n_estimators': n_estimators,
            'max_depth': maximum_depth,
            'min_samples_split': min_samples_split
        }
    ),
    'knc': (
        knc,
        {
            'n_neighbors':n_neighbors,
            'weights': weights,
            'metric': metric
        }
    ),
    'scv': (
        svc,
        {
            'C': svc_c,
            'kernel': kernel
        }
    ),
    'lr': (
        lr,
        [param_grid_l1, param_grid_l2, param_grid_elasticnet]
    ),
}


In [7]:
# Using GridSearchCV - hyperparams
all_results = {}

for name, (model, param_grid) in model_dict.items():
    start = time.time()
    grid = GridSearchCV(model, param_grid, n_jobs=-1)
    grid.fit(X_trained_scaled, y_train)
    end = time.time()
    elapsed = end - start
    all_results[name] = grid
    print(name, elapsed, grid.best_score_, grid.best_params_)

rfc 33.92333912849426 0.9324675324675324 {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
knc 1.8457927703857422 0.9272727272727271 {'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'distance'}
scv 20.9615638256073 0.922077922077922 {'C': 2, 'kernel': 'rbf'}




lr 82.70813989639282 0.9220779220779219 {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


Short conclusion
- rfc 33.92333912849426 0.9324675324675324 {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
- knc 1.8457927703857422 0.9272727272727271 {'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'distance'}
- scv 20.9615638256073 0.922077922077922 {'C': 2, 'kernel': 'rbf'}
- lr 82.70813989639282 0.9220779220779219 {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


## Comparison of the models

In [11]:
results_list = []
for name, grid in all_results.items():
    results_list.append({
        "model": name,
        "best_score": grid.best_score_,
        "best_params": str(grid.best_params_)
    })

results_df = pl.DataFrame(results_list, strict=False)
results_df

model,best_score,best_params
str,f64,str
"""rfc""",0.932468,"""{'max_depth': 20, 'min_samples…"
"""knc""",0.927273,"""{'metric': 'minkowski', 'n_nei…"
"""scv""",0.922078,"""{'C': 2, 'kernel': 'rbf'}"""
"""lr""",0.922078,"""{'C': 0.1, 'max_iter': 100, 'p…"


### Evaluation RandomForestClassifier

In [9]:
# Evaluation RandomForestClassifier


### Evaluation KNeighborClassifier

### Evaluation SVC

### Evaluation LogisticRegression