In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from model_knn import KnnModel
from model_svm import SVM_Model
from model_multilog import MultilogRegression
from model_bayes import BayesianClassifier
from model_decisiontree import DecisionTreeClassifier
from metrics import (return_precision, show_metrics_matrix, return_accuracy, return_recall, 
                     return_f1, return_roc_auc_ovr)
from image_feature_detector import PCA_transform, get_features, get_plain_data, threshold_mid
from model_runner import ModelRunner

In [2]:
# Data

#lite data
training_data_lite = np.genfromtxt("datasets/light-train.csv",
                            delimiter=",", filling_values=0)
evaluation_data_lite = np.genfromtxt(
    "datasets/medium-test.csv", delimiter=",", filling_values=0)
evaluation_input_lite = evaluation_data_lite[:, 1:]
evaluation_answers_lite = evaluation_data_lite[:, 0]

#medium size
training_data_mid = np.genfromtxt("datasets/light-train.csv",
                            delimiter=",", filling_values=0)
evaluation_data_mid = np.genfromtxt(
    "datasets/medium-test.csv", delimiter=",", filling_values=0)
evaluation_input_mid = evaluation_data_mid[:, 1:]
evaluation_answers_mid = evaluation_data_mid[:, 0]

# full size
training_data = np.genfromtxt("datasets/emnist-letters-train.csv",
                            delimiter=",", filling_values=0)
evaluation_data = np.genfromtxt(
    "datasets/emnist-letters-test.csv", delimiter=",", filling_values=0)
evaluation_input = evaluation_data[:, 1:]
evaluation_answers = evaluation_data[:, 0]

In [3]:
# metrics
my_metrics = [return_accuracy, return_recall, return_precision, return_f1]

# Сравнение моделей уменьшения размерности данных

In [4]:
# get_plain_data --> np.ndarray with shape(784,)

# get_features --> np.ndarray with shape(13,)

# PCA
PCA_extreme = PCA_transform(24).fit(training_data) # x32 - выходной массив имеет в 32 раза меньше размерности

PCA_severe = PCA_transform(49).fit(training_data) # x16

PCA_rought = PCA_transform(98).fit(training_data) # x8

PCA_medium = PCA_transform(196).fit(training_data) # x4

PCA_fine = PCA_transform(392).fit(training_data) # x2

params_to_change = {
    'data_converter': [get_plain_data, get_features, PCA_extreme, PCA_severe, PCA_rought, PCA_medium, PCA_fine],
}

In [5]:
# Logistic

hp = {
    'data_converter': get_plain_data,
    'normalization': True,
    'shift_column': True,
    'learning_rate': 0.05,
    'batch_size': 300,
    'epochs': 300,
    'num_classes': 26,
    'reg': 'l1',
    'reg_w': 0.01,
}


MultilogRunner = ModelRunner(MultilogRegression, defaults=hp, metrics=my_metrics, responsive_bar=True)
MultilogRunner.run(training_data_mid, evaluation_input_mid, evaluation_answers_mid, params_to_change, one_vs_one=True)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7ff9b2de4820>                                                      
on 0: ~fit complete in                                                                                                  
on 300: 9.117s                                                                                                          
on 300: ~eval complete in 0.158s                                                                                        
on 300:     return_accuracy = 0.564                                                                                     
on 300:     return_recall = 0.686                                                                                       
on 300:     return_precision = 0.469                                                                                    
on 300:     return_f1 = 0.436   

on 2106:     return_precision = 0.314                                                                                    
on 2106:     return_f1 = 0.272                                                                                           
on 2106: -----End with-----                                                                                             
Проверка модели MultilogRegression |████████████████████████████████████████✗︎ (!) 2107/2100 [100%] in 48.2s (43.76/s)   
On iteration 0:
With hyperparameters: [<function get_plain_data at 0x7ff9b2de4820>]
Got metrics: [0.564, 0.6856026702482796, 0.46936358063790284, 0.435883919724493]


In [6]:
# KNN
hp = {
    'data_converter': get_plain_data,
    'k': 1,
}


KNNrunner = ModelRunner(KnnModel, defaults=hp, metrics=my_metrics, responsive_bar=True)
KNNrunner.run(training_data_mid, evaluation_input_mid, evaluation_answers_mid, params_to_change, one_vs_one=True)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7ff9b2de4820>                                                      
on 0: ~fit complete in 0.004s                                                                                           
on 0: ~eval complete in                                                                                                 
on 5000: 4.041s                                                                                                         
on 5000:     return_accuracy = 0.233                                                                                    
on 5000:     return_recall = 0.437                                                                                      
on 5000:     return_precision = 0.367                                                                                   
on 5000:     return_f1 = 0.185  

on 35006:     return_precision = 0.488                                                                                  
on 35006:     return_f1 = 0.454                                                                                         
on 35006: -----End with-----                                                                                            
Проверка модели KnnModel |████████████████████████████████████████| 35007/35007 [100%] in 23.3s (1500.37/s)             
On iteration 3:
With hyperparameters: [<PCA_transform(49);vector:(784, 49)>]
Got metrics: [0.6292, 0.7339819667849938, 0.49995519026928925, 0.4768180509485743]


In [7]:
# SVM
hp = {
    'data_converter': get_plain_data,
    'num_classes': 26,
    'epochs': 100,
    'batch_size': 1000,
    'learning_rate': 0.01,
    'regularization': 0.0005,
    'normalization': True,
    'shift_column': True,
}


SVMRunner = ModelRunner(SVM_Model, defaults=hp, metrics=my_metrics, responsive_bar=True)
SVMRunner.run(training_data_mid, evaluation_input_mid, evaluation_answers_mid, params_to_change, one_vs_one=True)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7ff9b2de4820>                                                      
on 0: ~fit complete in                                                                                                  
on 100: 2.324s                                                                                                          
on 100: ~eval complete in 0.072s                                                                                        
on 100:     return_accuracy = 0.435                                                                                     
on 100:     return_recall = 0.591                                                                                       
on 100:     return_precision = 0.424                                                                                    
on 100:     return_f1 = 0.330   

on 706:     return_precision = 0.318                                                                                    
on 706:     return_f1 = 0.271                                                                                           
on 706: -----End with-----                                                                                              
Проверка модели SVM_Model |████████████████████████████████████████✗︎ (!) 707/700 [101%] in 16.4s (43.17/s)              
On iteration 2:
With hyperparameters: [<PCA_transform(24);vector:(784, 24)>]
Got metrics: [0.4754, 0.6221787483547424, 0.4192542449065625, 0.3548350002631533]


In [8]:
# Bayes

hp = {
    'data_converter': get_plain_data,
    'num_classes': 26,
}


BayesRunner = ModelRunner(BayesianClassifier, defaults=hp, metrics=my_metrics)
BayesRunner.run(training_data_mid, evaluation_input_mid, evaluation_answers_mid, params_to_change, one_vs_one=True)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7ff9b2de4820>                                                      
on 0: ~fit complete in 0.014s                                                                                           
on 0: ~eval complete in 0.119s                                                                                          
on 0:     return_accuracy = 0.270                                                                                       
on 0:     return_recall = 0.396                                                                                         
on 0:     return_precision = 0.520                                                                                      
on 0:     return_f1 = 0.196                                                                                             
on 0: -----End with-----        

        prob_inverse = np.power(prob_in, -1)/10000


on 1: -----With parameters-----                                                                                         
on 1: data_converter = <function get_features at 0x7ff9b2de4550>                                                        
on 1: ~fit complete in 0.484s                                                                                           
on 1: ~eval complete in 2.572s                                                                                          
on 1:     return_accuracy = 0.056                                                                                       
on 1:     return_recall = 0.099                                                                                         
on 1:     return_precision = 0.809                                                                                      
on 1:     return_f1 = 0.006                                                                                             
on 1: -----End with-----        

        softvals = (np.exp(mod_array - np.max(x)).T /


on 3: -----With parameters-----                                                                                         
on 3: data_converter = <PCA_transform(49);vector:(784, 49)>                                                             
on 3: ~fit complete in 0.038s                                                                                           
on 3: ~eval complete in 0.175s                                                                                          
on 3:     return_accuracy = 0.036                                                                                       
on 3:     return_recall = 0.204                                                                                         
on 3:     return_precision = 0.373                                                                                      
on 3:     return_f1 = 0.012                                                                                             
on 3: -----End with-----        

        softvals = (np.exp(mod_array - np.max(x)).T /


on 4: -----With parameters-----                                                                                         
on 4: data_converter = <PCA_transform(98);vector:(784, 98)>                                                             
on 4: ~fit complete in 0.034s                                                                                           
on 4: ~eval complete in 0.188s                                                                                          
on 4:     return_accuracy = 0.040                                                                                       
on 4:     return_recall = 0.170                                                                                         
on 4:     return_precision = 0.288                                                                                      
on 4:     return_f1 = 0.009                                                                                             
on 4: -----End with-----        

        softvals = (np.exp(mod_array - np.max(x)).T /


on 5: -----With parameters-----                                                                                         
on 5: data_converter = <PCA_transform(196);vector:(784, 196)>                                                           
on 5: ~fit complete in 0.045s                                                                                           
on 5: ~eval complete in 0.206s                                                                                          
on 5:     return_accuracy = 0.053                                                                                       
on 5:     return_recall = 0.143                                                                                         
on 5:     return_precision = 0.538                                                                                      
on 5:     return_f1 = 0.007                                                                                             
on 5: -----End with-----        

        softvals = (np.exp(mod_array - np.max(x)).T /


on 6: -----With parameters-----                                                                                         
on 6: data_converter = <PCA_transform(392);vector:(784, 392)>                                                           
on 6: ~fit complete in 0.077s                                                                                           
on 6: ~eval complete in 0.322s                                                                                          
on 6:     return_accuracy = 0.053                                                                                       
on 6:     return_recall = 0.217                                                                                         
on 6:     return_precision = 0.633                                                                                      
on 6:     return_f1 = 0.005                                                                                             
on 6: -----End with-----        

        softvals = (np.exp(mod_array - np.max(x)).T /


Проверка модели BayesianClassifier |████████████████████████████████████████| 7/7 [100%] in 4.8s (1.45/s)               
On iteration 0:
With hyperparameters: [<function get_plain_data at 0x7ff9b2de4820>]
Got metrics: [0.2702, 0.3956182433071156, 0.5202723933041188, 0.19553933860467618]


In [None]:
# Decision Tree

hp = {
        'data_converter': PCA_severe,
        'sample_len': 16,
        'num_classes': 26,
        'window_size': -1,
        'min_samples': 4,
        'max_depth': 6,
        'tree_type': 'multilabel_ovr',
}

TreeRunner = ModelRunner(DecisionTreeClassifier,
                         defaults=hp, metrics=my_metrics, responsive_bar=True)
TreeRunner.run(training_data_mid, evaluation_input_mid,
               evaluation_answers_mid, params_to_change, one_vs_one=False)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7ff9b2de4820>                                                      
Проверка модели DecisionTreeClassifier |                                        | ▆█▆ 0/182 [0%] in 2:20 (0.0/s, eta: -)

# Поиск оптимальных гиперпараметров моделей

In [None]:
# Logistic

hp = {
    'data_converter': PCA_severe,
    'normalization': True,
    'shift_column': True,
    'learning_rate': 0.02,
    'batch_size': 500,
    'epochs': 300,
    'num_classes': 26,
    'reg': None,
    'reg_w': 0.01,
}


MultilogRunner = ModelRunner(MultilogRegression, defaults=hp, metrics=my_metrics, responsive_bar=True)
params_to_change = {
    'learning_rate': [0.05, 0.1, 0.3],
    'epochs': [300, 200, 100],
}
MultilogRunner.run(training_data_lite, evaluation_input_lite, evaluation_answers_lite, 
                   params_to_change, one_vs_one=True)

In [None]:
params_to_change = {
    'reg': [None, 'l1', 'l2'],
    'reg_w': [0.01],
}
MultilogRunner.run(training_data_lite, evaluation_input_lite, evaluation_answers_lite, 
                   params_to_change, one_vs_one=False)

In [None]:
# KNN

hp = {
    'data_converter': PCA_severe,
    'k': 3,
}


KNNrunner = ModelRunner(KnnModel, defaults=hp, metrics=my_metrics, responsive_bar=True)
params_to_change = {
    'k': [1,3,5,7,10]
}
KNNrunner.run(training_data_lite, evaluation_input_lite, evaluation_answers_lite, params_to_change, one_vs_one=True)

In [None]:
# Bayes

hp = {
    'data_converter': threshold_mid,
    'num_classes': 26,
}


BayesRunner = ModelRunner(BayessianClassifier, defaults=hp, metrics=my_metrics)

BayesRunner.run(training_data_lite, evaluation_input_lite, evaluation_answers_lite, hp, one_vs_one=True)

In [None]:
# SVM

hp = {
    'data_converter': PCA_severe,
    'num_classes': 26,
    'epochs': 200,
    'batch_size': 500,
    'learning_rate': 0.01,
    'regularization': 0.01,
    'normalization': True,
    'shift_column': True,
}


SVMRunner = ModelRunner(SVM_Model, defaults=hp, metrics=my_metrics, responsive_bar=True)
params_to_change = {
    'learning_rate': [0.001, 0.01, 0.05],
    'epochs': [100, 200, 300],
}
SVMRunner.run(training_data_lite, evaluation_input_lite, evaluation_answers, 
              params_to_change, one_vs_one=True)

In [None]:
# Tree
# lite data
hp = {
        'data_converter': PCA_extreme,
        'sample_len': 16,
        'num_classes': 26,
        'window_size': -1,
        'min_samples': 3,
        'max_depth': 7,
        'tree_type': 'multilabel_ovo',
}

TreeRunner = ModelRunner(DecisionTreeClassifier,
                         defaults=hp, metrics=my_metrics, responsive_bar=True)
params_to_change = {
    'tree_type': ['multilabel_ovo', 'multilabel_ovr'],
}
TreeRunner.run(training_data_lite, evaluation_input_lite,
               evaluation_answers_lite, params_to_change, one_vs_one=False)

In [None]:
# Tree
# medium sized data
hp = {
        'data_converter': PCA_extreme,
        'sample_len': 16,
        'num_classes': 26,
        'window_size': -1,
        'min_samples': 4,
        'max_depth': 7,
        'tree_type': 'multilabel_ovo',
}

TreeRunner = ModelRunner(DecisionTreeClassifier,
                         defaults=hp, metrics=my_metrics, responsive_bar=True)
params_to_change = {
    'tree_type': ['multilabel_ovo', 'multilabel_ovr'],
}
TreeRunner.run(training_data_mid, evaluation_input_mid,
               evaluation_answers_mid, params_to_change, one_vs_one=False)

In [None]:
# Tree
# full sized data
hp = {
        'data_converter': PCA_extreme,
        'sample_len': 16,
        'num_classes': 26,
        'window_size': -1,
        'min_samples': 5,
        'max_depth': 7,
        'tree_type': 'multilabel_ovo',
}

TreeRunner = ModelRunner(DecisionTreeClassifier,
                         defaults=hp, metrics=my_metrics, responsive_bar=True)
params_to_change = {
    'tree_type': ['multilabel_ovo', 'multilabel_ovr'],
}
TreeRunner.run(training_data, evaluation_input,
               evaluation_answers, params_to_change, one_vs_one=False)