In [9]:
import os
import sys
import importlib

import pandas as pd
import numpy as np
import sklearn.model_selection as sel
import sklearn.metrics as metrics
import sklearn.neighbors as neighbors
import seaborn as sb

# add project modules to the path
path_to_module = os.path.abspath(os.path.join(os.getcwd(), "..", "src/"))
sys.path.append(path_to_module)

import src.models.train_model as train

In [11]:
path_to_data = os.path.abspath(os.path.join(os.getcwd(),
                                            "..",
                                            "data/processed/"
                                            ))

data_dict = train.load_processed_data(file_path=path_to_data)

loading files from /home/edd/Documents/Projects/minst-ensemble/data/processed: 100%|██████████| 4/4 [00:00<00:00, 76.78it/s]


In [12]:
X = data_dict["X_train"]
y = data_dict["y_train"]

In [13]:
X_train, X_test, y_train, y_test = sel.train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

In [14]:
X_train_smol, X_test_smol, y_train_smol, y_test_smol = sel.train_test_split(X,
                                                    y,
                                                    test_size=0.09,
                                                    train_size=0.21,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

In [22]:
kn_clf = neighbors.KNeighborsClassifier(
                                       
                                       )                                        

In [23]:
kn_params = {"n_neighbors" : [x for x in range(2, 7,1)],
              "weights" : ["distance"],
              "leaf_size" : [x for x in range(15,35,5)]
             }


In [26]:
grid_search = sel.GridSearchCV(estimator=kn_clf,
                             param_grid=kn_params,
                             scoring="f1_macro",
                             n_jobs=2,
                             cv=3,
                             verbose=10)

In [27]:
grid_search.fit(X_train_smol, y_train_smol)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   34.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  4.7min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  7.5min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  9.7min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 13.6min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 17.0min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed: 21.5min
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed: 22.4min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=2,
             param_grid={'leaf_size': [15, 20, 25, 30],
                         'n_neighbors': [2, 3, 4, 5, 6],
                         'weights': ['distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=10)

In [28]:
grid_search.best_params_

{'leaf_size': 15, 'n_neighbors': 6, 'weights': 'distance'}

In [29]:
grid_search.best_score_

0.9374186017628744

smaller leaf size, more neighbor might improve this

In [30]:
kn_params = {"n_neighbors" : [x for x in range(5, 11,1)],
              "weights" : ["distance"],
              "leaf_size" : [x for x in range(5,20,5)]
             }


In [31]:
grid_search = sel.GridSearchCV(estimator=kn_clf,
                             param_grid=kn_params,
                             scoring="f1_macro",
                             n_jobs=2,
                             cv=3,
                             verbose=10)

In [32]:
grid_search.fit(X_train_smol, y_train_smol)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   50.5s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  1.9min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  6.7min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 10.0min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 12.9min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 16.8min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 20.3min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 23.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=2,
             param_grid={'leaf_size': [5, 10, 15],
                         'n_neighbors': [5, 6, 7, 8, 9, 10],
                         'weights': ['distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=10)

In [33]:
grid_search.best_params_

{'leaf_size': 5, 'n_neighbors': 6, 'weights': 'distance'}

In [34]:
grid_search.best_score_

0.9374186017628744

train on the full dataset to check the params 

In [16]:
kn_clf = neighbors.KNeighborsClassifier(leaf_size=5,
                                        n_neighbors=6,
                                        weights="distance",
                                        n_jobs=2
                                       )         

In [17]:
sel.cross_validate(kn_clf,
                   X_train,
                   y_train,
                   cv=3,
                   n_jobs=2,
                   scoring="f1_macro",
                   return_train_score=True,
                   return_estimator=True,
                   verbose=True
                   )

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   3 out of   3 | elapsed: 17.8min finished


{'fit_time': array([1.1826489 , 1.16768074, 1.15691495]),
 'score_time': array([214.44076252, 207.36919355, 147.39549112]),
 'estimator': (KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
                       metric_params=None, n_jobs=2, n_neighbors=6, p=2,
                       weights='distance'),
  KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
                       metric_params=None, n_jobs=2, n_neighbors=6, p=2,
                       weights='distance'),
  KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
                       metric_params=None, n_jobs=2, n_neighbors=6, p=2,
                       weights='distance')),
 'test_score': array([0.95713274, 0.95645131, 0.95798829]),
 'train_score': array([1., 1., 1.])}

slightly overfitting given that we are perfect with the trainin sets

In [18]:
kn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=5, metric='minkowski',
                     metric_params=None, n_jobs=2, n_neighbors=6, p=2,
                     weights='distance')

In [21]:
y_pred = kn_clf.predict(X_test)

In [23]:
metrics.f1_score(y_pred, y_test, average="macro")

0.9626594820269178