# Exploratory model training on minibatch
Trying out som different models to build a pipeline to be trained on the whole dataset.

In [26]:
# Imports
import numpy as np
import polars as pl
import time

from sklearn.model_selection import GridSearchCV, train_test_split 
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression




In [2]:
# Import np arrays
X = np.load('../data/processed/pln_X_small_features_raw_128x128.npy')
y = np.load('../data/processed/pln_y_small_labels.npy')

In [3]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.23, 
    random_state=42,
    stratify=y
)

In [4]:
# Scaling

scaler = StandardScaler()
X_trained_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Models
rfc = RandomForestClassifier()
knc = KNeighborsClassifier()
svc = SVC()
lr = LogisticRegression()

In [18]:
# GridSearch hyperparameters

# RandomForestClassifier
n_estimators = [50, 100, 150]
maximum_depth = [None, 10, 20, 30]
min_samples_split = [2, 5, 10]

# KNeighborsClassifier
n_neighbors = [2, 5, 10]
weights = ['uniform', 'distance']
metric = ['minkowski', 'euclidean']

# SVC
svc_c = [0.5, 1, 2]
kernel = ['rbf', 'poly', 'sigmoid']

# LogisticRegression
lr_c = [0.1, 1]
max_iter = [100, 300]
param_grid_l1 = {'penalty': ['l1'], 'solver': ['liblinear'], 'C': lr_c, 'max_iter': max_iter}
param_grid_l2 = {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': lr_c, 'max_iter': max_iter}
param_grid_elasticnet = {'penalty': ['elasticnet'], 'solver': ['saga'], 'C': lr_c, 'max_iter': max_iter, 'l1_ratio': [0.5]}


model_dict = {
    'rfc': (
        rfc, 
        {
            'n_estimators': n_estimators,
            'max_depth': maximum_depth,
            'min_samples_split': min_samples_split
        }
    ),
    'knc': (
        knc,
        {
            'n_neighbors':n_neighbors,
            'weights': weights,
            'metric': metric
        }
    ),
    'scv': (
        svc,
        {
            'C': svc_c,
            'kernel': kernel
        }
    ),
    'lr': (
        lr,
        [param_grid_l1, param_grid_l2, param_grid_elasticnet]
    ),
}


In [30]:
# Using GridSearchCV - hyperparams
all_results = {}

for name, (model, param_grid) in model_dict.items():
    start = time.time()
    grid = GridSearchCV(model, param_grid, n_jobs=-1)
    grid.fit(X_trained_scaled, y_train)
    end = time.time()
    elapsed = end - start
    all_results[name] = grid
    print(name, elapsed, grid.best_score_, grid.best_params_)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


rfc 30.310571908950806 0.9324675324675324 {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}
knc 1.841346263885498 0.9272727272727271 {'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'distance'}


KeyboardInterrupt: 

Short conclusion
rfc 27.853793144226074 0.9324675324675324 {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
knc 1.9162790775299072 0.9272727272727271 {'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'distance'}
scv 19.987316846847534 0.922077922077922 {'C': 2, 'kernel': 'rbf'}
lr 82.52816700935364 0.9220779220779219 {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

## Comparison of the models

In [29]:
results_df = pl.DataFrame(grid.cv_results_, strict=False)
results_df

mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_max_iter,param_penalty,param_solver,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
f64,f64,f64,f64,f64,i64,str,str,f64,struct[5],f64,f64,f64,f64,f64,f64,f64,i32
2.440073,0.084816,0.011854,0.004316,0.1,100,"""l1""","""liblinear""",2.8737e-315,"{0.1,null,100,""l1"",""liblinear""}",0.948052,0.922078,0.909091,0.844156,0.922078,0.909091,0.034848,5
2.372699,0.108598,0.01189,0.002225,0.1,300,"""l1""","""liblinear""",0.0,"{0.1,null,300,""l1"",""liblinear""}",0.948052,0.922078,0.909091,0.844156,0.922078,0.909091,0.034848,5
2.477025,0.084767,0.022691,0.012201,1.0,100,"""l1""","""liblinear""",0.0,"{1.0,null,100,""l1"",""liblinear""}",0.935065,0.935065,0.909091,0.87013,0.935065,0.916883,0.025449,3
3.191606,0.504718,0.024335,0.006219,1.0,300,"""l1""","""liblinear""",0.0,"{1.0,null,300,""l1"",""liblinear""}",0.935065,0.935065,0.909091,0.87013,0.935065,0.916883,0.025449,3
3.464178,0.767957,0.028407,0.00766,0.1,100,"""l2""","""lbfgs""",0.0,"{0.1,null,100,""l2"",""lbfgs""}",0.961039,0.909091,0.896104,0.883117,0.961039,0.922078,0.032855,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2.832219,0.315595,0.010993,0.0057,1.0,300,"""l2""","""lbfgs""",0.0,"{1.0,null,300,""l2"",""lbfgs""}",0.922078,0.896104,0.883117,0.883117,0.935065,0.903896,0.021101,7
18.976658,0.616667,0.005233,0.002299,0.1,100,"""elasticnet""","""saga""",0.5,"{0.1,0.5,100,""elasticnet"",""saga""}",0.688312,0.636364,0.61039,0.662338,0.636364,0.646753,0.026488,10
45.64525,0.905586,0.003348,0.000562,0.1,300,"""elasticnet""","""saga""",0.5,"{0.1,0.5,300,""elasticnet"",""saga""}",0.701299,0.662338,0.649351,0.675325,0.649351,0.667532,0.019437,9
19.742463,0.453819,0.003149,0.000449,1.0,100,"""elasticnet""","""saga""",0.5,"{1.0,0.5,100,""elasticnet"",""saga""}",0.571429,0.532468,0.519481,0.519481,0.545455,0.537662,0.019437,11


### Evaluation RandomForestClassifier

In [None]:
# Evaluation RandomForestClassifier


### Evaluation KNeighborClassifier

### Evaluation SVC

### Evaluation LogisticRegression