In [2]:
# Open datasets

import pickle
import numpy as np
from matplotlib import pyplot as plt

def openPickledDataset(filename):
    """Unpickle the object stored in ``filename`` and return it as a NumPy array.

    Args:
        filename: Path of the pickle file to read (opened in binary mode).

    Returns:
        ``np.array`` built from whatever object the pickle contains.
    """
    # NOTE: pickle.load can execute arbitrary code while deserialising;
    # only point this at files from a trusted source.
    with open(filename, 'rb') as pickle_file:
        unpickled = pickle.load(pickle_file)
    return np.array(unpickled)

# Load the training and test inputs and divide every value by 255.0
# (presumably 8-bit pixel intensities being rescaled to [0, 1] -- TODO confirm)
x_total = openPickledDataset('data/x_train.pkl') / 255.0
x_test = openPickledDataset('data/x_test.pkl') / 255.0
n_examples = len(x_total)

# Class name -> integer id used as the classification target
y_dictionary = {
    'big_cats':0, 'butterfly':1, 'cat':2, 'chicken':3, 'cow':4, 'dog':5,
    'elephant':6, 'goat':7, 'horse':8, 'spider':9, 'squirrel':10,
}

# Load the training labels (class names) and encode them through y_dictionary;
# a name missing from the dictionary raises KeyError.
y_total_names = openPickledDataset('data/y_train.pkl')
y_total = np.zeros(y_total_names.shape, dtype=int)
y_total[:] = [y_dictionary[label_name] for label_name in y_total_names]


# To display an image
# plt.imshow(x_total[0], interpolation='nearest', cmap='gray')
# plt.show()


In [3]:
# Splitting and scaling
from sklearn.model_selection import train_test_split

# Collapse every example to a single feature vector (first axis is kept,
# all remaining axes are merged) so the simple classifiers can consume it
x_total_flat = x_total.reshape(x_total.shape[0], -1)

# Reserve 25% of the examples as a hold-out validation set (fixed seed)
x_train_flat, x_valid_flat, y_train, y_valid = train_test_split(
    x_total_flat,
    y_total,
    test_size=0.25,
    random_state=100,
)

In [4]:
# Standard functions for basic classifiers
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

def showGridSearchScores(grid_search, classifier_name):
    """Print the mean test score of every parameter combination of a search.

    Args:
        grid_search: Object exposing ``cv_results_`` (with the keys
            ``'mean_test_score'`` and ``'params'``) and ``best_params_``,
            e.g. a fitted ``GridSearchCV``.
        classifier_name: Label printed in the header line.

    Returns:
        ``grid_search.best_params_``.
    """
    print(f'\n{classifier_name} grid search:')

    results = grid_search.cv_results_
    # One line per candidate: its parameters and the mean score to 3 decimals
    for params, mean in zip(results['params'], results['mean_test_score']):
        print(f'{params} : {mean:.3f}')

    best_params = grid_search.best_params_
    print(f'Best parameters : {best_params}')
    return best_params

def printMicroF1Score(y_pred, y_true, title):
    """Print the micro-averaged F1 score of a set of predictions.

    Args:
        y_pred: Iterable of predicted labels.
        y_true: Iterable of ground-truth labels, aligned with ``y_pred``.
        title: Text printed in front of the score.
    """
    # f1_score expects (y_true, y_pred); pass them by keyword so the order
    # cannot be mixed up. The micro average happens to be symmetric, but any
    # other averaging mode would silently swap precision and recall.
    f1 = f1_score(y_true=list(y_true), y_pred=list(y_pred), average='micro')
    print(title + f', F1-micro: {f1:.3f}')

# Metric name passed as the `scoring` argument of the GridSearchCV calls below
scoring = 'f1_micro'

In [60]:
# Decision tree baseline
from sklearn.tree import DecisionTreeClassifier

# Baseline: one cost-complexity-pruned decision tree fitted on the flattened
# training features. random_state is pinned because scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree can
# change from one run to the next.
best_decision_tree = DecisionTreeClassifier(max_depth=100, ccp_alpha=0.0009, random_state=100)
best_decision_tree.fit(x_train_flat, y_train)

# Evaluate on the hold-out validation set
y_pred_decision_tree = best_decision_tree.predict(x_valid_flat)
printMicroF1Score(y_pred_decision_tree, y_valid, 'Decision tree')
print(classification_report(y_valid, y_pred_decision_tree))


Decision tree, F1-micro: 0.199
              precision    recall  f1-score   support

           0       0.16      0.24      0.19       310
           1       0.45      0.07      0.12       221
           2       0.29      0.01      0.02       190
           3       0.17      0.08      0.11       318
           4       0.16      0.07      0.09       180
           5       0.21      0.38      0.27       496
           6       0.33      0.01      0.03       154
           7       0.10      0.13      0.12       181
           8       0.15      0.12      0.13       254
           9       0.25      0.46      0.33       466
          10       0.04      0.01      0.02       202

    accuracy                           0.20      2972
   macro avg       0.21      0.14      0.13      2972
weighted avg       0.21      0.20      0.16      2972



In [62]:
# Gaussian Naive Bayes grid search
from sklearn.naive_bayes import GaussianNB

# Candidate values for GaussianNB's `var_smoothing` parameter
param_grid_gnb = [{'var_smoothing': [1E-11, 1E-10, 1E-9, 1E-8]}]

# 5-fold cross-validated search over the grid, scored with `scoring`
grid_search_naive_bayes = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_gnb,
    scoring=scoring,
    cv=5,
    n_jobs=4,
)
grid_search_naive_bayes.fit(x_train_flat, y_train);

# Report the cross-validation scores, then evaluate on the hold-out set
showGridSearchScores(grid_search_naive_bayes, "Naive Bayes")
y_pred_naive_bayes = grid_search_naive_bayes.predict(x_valid_flat)
printMicroF1Score(y_pred_naive_bayes, y_valid, 'Naive Bayes')
print(classification_report(y_valid, y_pred_naive_bayes))


Naive Bayes grid search:
{'var_smoothing': 1e-11} : 0.205
{'var_smoothing': 1e-10} : 0.205
{'var_smoothing': 1e-09} : 0.205
{'var_smoothing': 1e-08} : 0.205
Best parameters : {'var_smoothing': 1e-11}
Decision tree, F1-micro: 0.197
              precision    recall  f1-score   support

           0       0.17      0.54      0.26       310
           1       0.14      0.05      0.07       221
           2       0.23      0.04      0.06       190
           3       0.24      0.04      0.07       318
           4       0.18      0.15      0.16       180
           5       0.24      0.05      0.08       496
           6       0.15      0.21      0.17       154
           7       0.13      0.36      0.19       181
           8       0.34      0.14      0.20       254
           9       0.29      0.40      0.34       466
          10       0.13      0.08      0.10       202

    accuracy                           0.20      2972
   macro avg       0.20      0.19      0.16      2972
weighted a

In [5]:
# Logistic regression grid search
from sklearn.linear_model import LogisticRegression

# Only the inverse regularisation strength C is searched; the solver,
# iteration cap and tolerance are each fixed to a single value.
param_grid_log_reg = [
    {   'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [100],
        'solver' : ['saga'],
        'tol':[0.01] }
]

# The 'saga' solver shuffles the data while fitting, so the estimator is
# seeded; without random_state the CV scores and predictions vary per run.
grid_search_log_reg = GridSearchCV(LogisticRegression(random_state=100), param_grid_log_reg, scoring=scoring, cv=4, n_jobs=4)
grid_search_log_reg.fit(x_train_flat, y_train);

# Report the cross-validation scores, then evaluate on the hold-out set
showGridSearchScores(grid_search_log_reg, "Logistic Regression")
y_pred_log_reg = grid_search_log_reg.predict(x_valid_flat)
printMicroF1Score(y_pred_log_reg, y_valid, 'Logistic Regression')
print(classification_report(y_valid, y_pred_log_reg))


Naive Bayes grid search:
{'C': 1, 'max_iter': 100, 'solver': 'saga', 'tol': 0.1} : 0.238
{'C': 10, 'max_iter': 100, 'solver': 'saga', 'tol': 0.1} : 0.236
{'C': 100, 'max_iter': 100, 'solver': 'saga', 'tol': 0.1} : 0.237
{'C': 1000, 'max_iter': 100, 'solver': 'saga', 'tol': 0.1} : 0.236
Best parameters : {'C': 1, 'max_iter': 100, 'solver': 'saga', 'tol': 0.1}
Decision tree, F1-micro: 0.224
              precision    recall  f1-score   support

           0       0.20      0.22      0.21       310
           1       0.29      0.08      0.12       221
           2       0.06      0.01      0.01       190
           3       0.27      0.24      0.25       318
           4       0.17      0.08      0.11       180
           5       0.18      0.33      0.23       496
           6       0.09      0.02      0.03       154
           7       0.19      0.16      0.18       181
           8       0.28      0.26      0.27       254
           9       0.27      0.49      0.35       466
          10