In [126]:
import os
import sys
import importlib

import pandas as pd
import numpy as np
import seaborn as sb
import sklearn.model_selection as sel
import sklearn.metrics as metrics
import sklearn.ensemble as ensemble

from sklearn import svm


# add project modules to the path
path_to_module = os.path.abspath(os.path.join(os.getcwd(),"..","src/"))
sys.path.append(path_to_module)

import src.models.train_model as train

In [3]:
path_to_data = os.path.abspath(os.path.join(os.getcwd(),
                                            "..",
                                            "data/processed/"
                                           ))

data_dict = train.load_processed_data(file_path=path_to_data)

loading files from /home/edd/Documents/Projects/minst-ensemble/data/processed: 100%|██████████| 4/4 [00:00<00:00, 28.76it/s]


In [4]:
X = data_dict["X_train"]
y = data_dict["y_train"]

In [30]:
X_train, X_test, y_train, y_test = sel.train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

In [40]:
X_train_smol, X_test_smol, y_train_smol, y_test_smol = sel.train_test_split(X,
                                                    y,
                                                    test_size=0.09,
                                                    train_size=0.21,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify=y)

which kernel is best kernel?

In [33]:
svc_params = {
              "kernel": ["linear", "poly", "rbf", "sigmoid"]
             }

In [34]:
clf = svm.SVC(verbose=True)

In [None]:
grid_search = sel.GridSearchCV(estimator=clf, 
                                 param_grid=svc_params,
                                 scoring = "f1_macro",
                                 n_jobs = 2,
                                 cv=3,
                                 verbose = 10)

use small files as this would take forever

In [43]:
grid_search.fit(X_train_smol, y_train_smol)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed: 19.0min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 20.6min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 46.1min
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed: 56.2min finished


[LibSVM]

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=True),
             iid='warn', n_jobs=2,
             param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=10)

In [52]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1085.362222,21.757832,43.356639,2.18275,linear,{'kernel': 'linear'},0.664146,0.662834,0.664686,0.663889,0.000778,3
1,29.973888,0.384091,14.112149,0.069193,poly,{'kernel': 'poly'},0.947785,0.948393,0.947712,0.947963,0.000305,1
2,454.907223,6.51378,62.902244,0.306357,rbf,{'kernel': 'rbf'},0.863143,0.866958,0.865564,0.865221,0.001576,2
3,437.872215,2.860258,93.743195,2.242823,sigmoid,{'kernel': 'sigmoid'},0.043294,0.044228,0.041975,0.043166,0.000924,4


In [46]:
grid_search.best_params_

{'kernel': 'poly'}

In [47]:
grid_search.best_score_

0.9479634266670861

ok, that is like, waaaaay better than softmax. Probably as the polynomial and rbf are dealing with a dataset that is not linearly seperable much better.

Lets see about tweaking these with parameter tuning

In [57]:
svc_poly_params = {"C": [0.001, 0.1, 1, 10],
                   "kernel": [ "poly"],
                   "degree": [2,3,5,7],
                   "gamma": ["auto", "scale"]
                   
                  }

In [58]:
grid_search_poly = sel.GridSearchCV(estimator=clf, 
                                 param_grid=svc_poly_params,
                                 scoring = "f1_macro",
                                 n_jobs = 2,
                                 cv=3,
                                 verbose = 10)

In [60]:
grid_search_poly.fit(X_train_smol, y_train_smol)

SyntaxError: invalid syntax (<ipython-input-60-acdac09434b6>, line 56)

In [103]:
pd.DataFrame(grid_search_poly.cv_results_).sort_values("mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
27,47.845478,2.695971,16.869949,0.02552,10.0,3,scale,poly,"{'C': 10, 'degree': 3, 'gamma': 'scale', 'kern...",0.953321,0.955662,0.953943,0.954309,0.00099,1
25,35.920633,0.419507,14.934289,0.249739,10.0,2,scale,poly,"{'C': 10, 'degree': 2, 'gamma': 'scale', 'kern...",0.950615,0.951229,0.949359,0.950401,0.000778,2
8,34.807284,0.116297,14.015938,0.059963,0.1,2,auto,poly,"{'C': 0.1, 'degree': 2, 'gamma': 'auto', 'kern...",0.94953,0.95116,0.95015,0.95028,0.000672,3
10,38.177306,0.84582,14.473679,0.075151,0.1,3,auto,poly,"{'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kern...",0.950131,0.949694,0.950318,0.950048,0.000261,4
18,36.81155,0.815769,14.367458,0.070897,1.0,3,auto,poly,"{'C': 1, 'degree': 3, 'gamma': 'auto', 'kernel...",0.947785,0.948393,0.947712,0.947963,0.000305,5


In [62]:
grid_search_poly.best_params_


{'C': 10, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}

In [64]:
grid_search_poly.best_score_

0.9543086293530183

In [65]:
svc_rbf_params = {"C": [0.001, 0.1, 1, 10],
                   "kernel": [ "rbf"],
                   "degree": [2,3,5,7],
                   "gamma": ["auto", "scale"]
                   
                  }

In [72]:
grid_search_rbf = sel.GridSearchCV(estimator=clf, 
                                 param_grid=svc_rbf_params,
                                 scoring = "f1_macro",
                                 n_jobs = 2,
                                 cv=3,
                                 verbose = 10)

In [73]:
grid_search_rbf.fit(X_train_smol, y_train_smol)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed: 10.4min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 21.4min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 54.8min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 80.6min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 243.5min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 271.2min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 301.6min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 327.5min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed: 357.8min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed: 387.6min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed: 421.8min
[Parallel(n_jobs=2)]: Done  96 out of  96 | elapsed: 457.6min finished


[LibSVM]

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=True),
             iid='warn', n_jobs=2,
             param_grid={'C': [0.001, 0.1, 1, 10], 'degree': [2, 3, 5, 7],
                         'gamma': ['auto', 'scale'], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=10)

In [74]:
grid_search_rbf.best_params_

{'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

In [75]:
grid_search_rbf.best_score_


0.9594975737493285

marginally better than polynomial

In [76]:
grid_search_rbf.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=True)

we can asses if these models are complementry and therefore could be used in an ensemble by comparing if the results are different on the test data

get test results

In [112]:
Y_pred_rbf = grid_search_rbf.predict(X_test_smol)

In [113]:
Y_pred_poly = grid_search_poly.predict(X_test_smol)

we can look at f1 scores for both

In [114]:
metrics.f1_score(y_test_smol, Y_pred_poly, average="weighted")

0.9619535393525495

In [115]:
metrics.f1_score(y_test_smol, Y_pred_rbf, average="weighted")

0.9659419376594959

somehow, both beat the train set...

this smells a little funny. It is not impossible but implies there is more varience in the training cv splits that the test.



In [120]:
rbf_correct = Y_pred_rbf == y_test_smol

In [121]:
poly_correct = Y_pred_poly == y_test_smol

where they differ

In [123]:
differ = [rbf is not poly for rbf, poly in zip(rbf_correct, poly_correct)]

In [124]:
sum(differ)/len(differ)

0.019555555555555555

so they differ (meaning one is correct and the other is not) on approx %2 of the test set, this should mean we get some improvement by combining these into a ensemble

In [127]:
grid_search_poly.best_params_

{'C': 10, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}

In [130]:
grid_search_rbf.best_params_

{'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

In [138]:
clf_svc_poly = svm.SVC(C=10,
                       degree=3,
                       gamma="scale",
                       kernel="poly",
                       probability=True,
                       verbose=True
                      )

clf_svc_rbf = svm.SVC(C=10,
                       degree=2,
                       gamma="scale",
                       kernel="rbf",
                       probability=True,
                       verbose=True
                      )

In [139]:
svc_ensemble = [("poly_svc", clf_svc_poly),
                 ("rbf_svc", clf_svc_rbf)]

In [144]:
svc_hard_ensemble = ensemble.VotingClassifier(estimators=svc_ensemble,
                                              voting="hard",
                                              n_jobs=2
                                              )


In [145]:
svc_hard_ensemble.fit(X_train_smol, y_train_smol)

VotingClassifier(estimators=[('poly_svc',
                              SVC(C=10, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='scale', kernel='poly',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=True)),
                             ('rbf_svc',
                              SVC(C=10, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=2, gamma='scale', kernel='rbf',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=True))],
                 flatten_transform=True, n_jobs=2, voting='hard', weights=Non

In [146]:
Y_pred_hard= svc_hard_ensemble.predict(X_test_smol)

In [149]:
metrics.f1_score(y_test_smol, Y_pred_hard, average="weighted")

0.9637359456256835

not quite as good as rbf, probably because of tie break rules between them, try changing the order of classifiers will change the priority

In [155]:
svc_ensemble_reversed = [("rbf_svc", clf_svc_rbf),
                         ("poly_svc", clf_svc_poly),
                         ]

In [156]:
svc_hard_ensemble_rev = ensemble.VotingClassifier(estimators=svc_ensemble_reversed,
                                              voting="hard",
                                              n_jobs=2
                                              )


In [157]:
svc_hard_ensemble_rev.fit(X_train_smol, y_train_smol)

VotingClassifier(estimators=[('rbf_svc',
                              SVC(C=10, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=2, gamma='scale', kernel='rbf',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=True)),
                             ('poly_svc',
                              SVC(C=10, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='scale', kernel='poly',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=True))],
                 flatten_transform=True, n_jobs=2, voting='hard', weights=Non

In [158]:
Y_pred_hard_rev= svc_hard_ensemble_rev.predict(X_test_smol)

In [159]:
metrics.f1_score(y_test_smol, Y_pred_hard_rev, average="weighted")

0.9637359456256835

In [150]:
svc_soft_ensemble = ensemble.VotingClassifier(estimators=svc_ensemble,
                                              voting="soft",
                                              n_jobs=2
                                              )


In [151]:
svc_soft_ensemble.fit(X_train_smol, y_train_smol)

VotingClassifier(estimators=[('poly_svc',
                              SVC(C=10, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma='scale', kernel='poly',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=True)),
                             ('rbf_svc',
                              SVC(C=10, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=2, gamma='scale', kernel='rbf',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=True))],
                 flatten_transform=True, n_jobs=2, voting='soft', weights=Non

In [152]:
Y_pred_soft = svc_soft_ensemble.predict(X_test_smol)

In [153]:
metrics.f1_score(y_test_smol, Y_pred_soft, average="weighted")

0.967181891448374

soft voting gives some improvement, only 0.02 though. Likely reason better than hard as this combines the two rather than have to tie-break between them