In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from model_knn import KnnModel
from model_svm import SVM_Model
from model_multilog import MultilogRegression
from model_bayes import BayesianClassifier
from model_decisiontree import DecisionTreeClassifier
from metrics import (return_precision, show_metrics_matrix, return_accuracy, return_recall, 
                     return_f1, return_roc_auc_ovr)
from image_feature_detector import PCA_transform, get_features, get_plain_data, threshold_mid, convert_to_emnist
from model_runner import ModelRunner

In [2]:
# Datasets.
# Builds three (training array, evaluation inputs, evaluation answers) packs.
# Every CSV is parsed with "," as the delimiter and missing fields set to 0.
# Evaluation tables are split into column 0 (answers) and columns 1+ (inputs);
# training tables are passed on unsplit.


def _read_csv(path):
    """Parse a comma-separated file with np.genfromtxt, filling gaps with 0."""
    return np.genfromtxt(path, delimiter=",", filling_values=0)


def _split_answers(table):
    """Return (inputs, answers): columns 1+ and column 0 of `table`."""
    return table[:, 1:], table[:, 0]


# --- lite pack ---
# NOTE(review): the lite pack is evaluated on "medium-test.csv" -- confirm
# this is intended rather than a light test file.
training_data_lite = _read_csv("datasets/light-train.csv")
evaluation_data_lite = _read_csv("datasets/medium-test.csv")
evaluation_input_lite, evaluation_answers_lite = _split_answers(evaluation_data_lite)
datapack_lite = (training_data_lite, evaluation_input_lite, evaluation_answers_lite)

# --- medium pack ---
# NOTE(review): this pack reads exactly the same two files as the lite pack
# (training comes from "light-train.csv"), so datapack_mid currently holds the
# same data as datapack_lite. Looks like a copy-paste slip -- confirm the
# intended medium training file.
training_data_mid = _read_csv("datasets/light-train.csv")
evaluation_data_mid = _read_csv("datasets/medium-test.csv")
evaluation_input_mid, evaluation_answers_mid = _split_answers(evaluation_data_mid)
datapack_mid = (training_data_mid, evaluation_input_mid, evaluation_answers_mid)

# --- full pack ---
training_data = _read_csv("datasets/emnist-letters-train.csv")
evaluation_data = _read_csv("datasets/emnist-letters-test.csv")
evaluation_input, evaluation_answers = _split_answers(evaluation_data)
datapack_full = (training_data, evaluation_input, evaluation_answers)

In [3]:
# Metric callables handed to every ModelRunner below.
my_metrics = [
    return_accuracy,
    return_recall,
    return_precision,
    return_f1,
]

# Turn off all NumPy floating-point error reporting for the rest of the
# session. As the last expression of the cell, the call echoes the settings
# that were in effect before it.
np.seterr(all="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

# Сравнение моделей уменьшения размерности данных

In [4]:
# Candidate data converters compared in the cells below.
# Per the original notes:
#   get_plain_data --> np.ndarray with shape (784,)
#   get_features   --> np.ndarray with shape (13,)

# PCA converters, each fitted on the full training array. The "xN" remarks are
# the original author's reduction factors for the output size.
# NOTE(review): training_data is passed unsplit, whereas the evaluation tables
# have column 0 separated out as answers -- confirm PCA_transform.fit treats
# the first column as intended.
PCA_extreme, PCA_severe, PCA_rought, PCA_medium, PCA_fine = (
    PCA_transform(out_size).fit(training_data)
    for out_size in (
        24,   # x32 - the output array has 32 times fewer dimensions
        49,   # x16
        98,   # x8
        196,  # x4
        392,  # x2
    )
)

# Sweep definition: one run per data converter.
params_to_change = {
    'data_converter': [
        get_plain_data,
        get_features,
        PCA_extreme,
        PCA_severe,
        PCA_rought,
        PCA_medium,
        PCA_fine,
    ],
}

In [5]:
# Logistic regression (MultilogRegression): compare the data converters.

# Defaults for every run; only the swept parameter is overridden.
hp = dict(
    data_converter=get_plain_data,
    normalization=True,
    shift_column=True,
    learning_rate=0.05,
    batch_size=300,
    epochs=300,
    num_classes=26,
    reg='l1',
    reg_w=0.05,
)

MultilogRunner = ModelRunner(
    MultilogRegression,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)

# Redefined here so the cell does not depend on an earlier cell's sweep.
params_to_change = {
    'data_converter': [
        get_plain_data,
        get_features,
        PCA_extreme,
        PCA_severe,
        PCA_rought,
        PCA_medium,
        PCA_fine,
    ],
}
MultilogRunner.run(*datapack_mid, params_to_change, one_vs_one=True)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7f6469366820>                                                      
on 0: ~fit complete in                                                                                                  
on 300: 22.938s                                                                                                         
on 300: ~eval complete in                                                                                               
Проверка модели MultilogRegression |█████▆⚠︎                                 | (!) 300/2100 [14%] in 23.2s (12.92/s)     


KeyboardInterrupt: 

In [None]:
# KNN (KnnModel): compare the data converters.

# Defaults for every run; only the swept parameter is overridden.
hp = dict(
    data_converter=get_plain_data,
    k=1,
)

KNNrunner = ModelRunner(
    KnnModel,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)

params_to_change = {
    'data_converter': [
        get_plain_data,
        get_features,
        PCA_extreme,
        PCA_severe,
        PCA_rought,
        PCA_medium,
        PCA_fine,
    ],
}
KNNrunner.run(*datapack_mid, params_to_change, one_vs_one=True)

In [None]:
# SVM (SVM_Model): compare the data converters.

# Defaults for every run; only the swept parameter is overridden.
hp = dict(
    data_converter=get_plain_data,
    num_classes=26,
    epochs=100,
    batch_size=1000,
    learning_rate=0.01,
    regularization=0.0005,
    normalization=True,
    shift_column=True,
)

SVMRunner = ModelRunner(
    SVM_Model,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)

params_to_change = {
    'data_converter': [
        get_plain_data,
        get_features,
        PCA_extreme,
        PCA_severe,
        PCA_rought,
        PCA_medium,
        PCA_fine,
    ],
}
SVMRunner.run(*datapack_mid, params_to_change, one_vs_one=True)

In [None]:
# Bayes (BayesianClassifier): compare the data converters.

# Defaults for every run; only the swept parameter is overridden.
hp = dict(
    data_converter=get_plain_data,
    num_classes=26,
)

# Unlike the other runners, responsive_bar is left at ModelRunner's default.
BayesRunner = ModelRunner(BayesianClassifier, defaults=hp, metrics=my_metrics)

params_to_change = {
    'data_converter': [
        get_plain_data,
        get_features,
        PCA_extreme,
        PCA_severe,
        PCA_rought,
        PCA_medium,
        PCA_fine,
    ],
}
BayesRunner.run(*datapack_mid, params_to_change, one_vs_one=True)

In [6]:
# Decision tree (DecisionTreeClassifier): compare the data converters.
# This comparison runs on the lite pack with one_vs_one=False and with the
# responsive progress bar switched off.

# Defaults for every run; only the swept parameter is overridden.
hp = dict(
    data_converter=PCA_severe,
    sample_len=64,
    num_classes=26,
    min_samples=8,
    max_depth=5,
    tree_type='multilabel_ovr',
)

TreeRunner = ModelRunner(
    DecisionTreeClassifier,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=False,
)

params_to_change = {
    'data_converter': [
        get_plain_data,
        get_features,
        PCA_extreme,
        PCA_severe,
        PCA_rought,
        PCA_medium,
        PCA_fine,
    ],
}
TreeRunner.run(*datapack_lite, params_to_change, one_vs_one=False)

on 0: -----With parameters-----                                                                                         
on 0: data_converter = <function get_plain_data at 0x7f6469366820>                                                      
on 0: ~fit complete in                                                                                                  
Проверка модели DecisionTreeClassifier |⚠︎                                       | (!) 0/182 [0%] in 1:06:27.3 (0.00/s)  


KeyboardInterrupt: 

# Поиск оптимальных гиперпараметров моделей

In [None]:
# Logistic regression (MultilogRegression): sweep learning rate and epochs.

# Defaults for every run; swept parameters override these values.
hp = dict(
    data_converter=PCA_severe,
    normalization=True,
    shift_column=True,
    learning_rate=1,
    batch_size=15,
    epochs=400,
    num_classes=26,
    reg=None,
    reg_w=0.01,
)

params_to_change = {
    'learning_rate': [1.5, 0.3, 0.8, 0.1],
    'epochs': [400, 250, 150],
}

# The runner is kept in a module-level name because the next cell reuses it.
MultilogRunner = ModelRunner(
    MultilogRegression,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)
MultilogRunner.run(*datapack_lite, params_to_change, one_vs_one=True)

In [None]:
# Regularisation sweep for the logistic model.
# Depends on MultilogRunner created in the previous cell -- run that one first.
params_to_change = dict(
    reg=[None, 'l1', 'l2'],
    reg_w=[0.02],
)
MultilogRunner.run(*datapack_lite, params_to_change, one_vs_one=False)

In [None]:
# KNN (KnnModel): sweep the number of neighbours k.

# Defaults for every run; the swept parameter overrides k.
hp = dict(
    data_converter=PCA_severe,
    k=3,
)

params_to_change = {
    'k': [1, 3, 5, 7, 10],
}

KNNrunner = ModelRunner(
    KnnModel,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)
KNNrunner.run(*datapack_lite, params_to_change, one_vs_one=True)

In [None]:
# Bayes (BayesianClassifier): plain pixel data versus threshold_mid.

# Defaults for every run; the swept parameter overrides data_converter.
hp = dict(
    data_converter=threshold_mid,
    num_classes=26,
)

params_to_change = {
    'data_converter': [get_plain_data, threshold_mid],
}

BayesRunner = ModelRunner(BayesianClassifier, defaults=hp, metrics=my_metrics)
BayesRunner.run(*datapack_mid, params_to_change, one_vs_one=False)

In [None]:
# SVM (SVM_Model): sweep learning rate and epochs.

# Defaults for every run; swept parameters override these values.
hp = dict(
    data_converter=PCA_extreme,
    num_classes=26,
    epochs=200,
    batch_size=100,
    learning_rate=0.01,
    regularization=0.0005,
    normalization=True,
    shift_column=True,
)

params_to_change = {
    'learning_rate': [0.001, 0.01, 0.1],
    'epochs': [50, 150, 450],
}

SVMRunner = ModelRunner(
    SVM_Model,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)
SVMRunner.run(*datapack_mid, params_to_change, one_vs_one=True)

In [None]:
# Decision tree (DecisionTreeClassifier): sweep tree type and sample length.
# NOTE(review): the original comment on this cell said "lite data", but the
# run below uses datapack_full -- confirm which dataset is intended.

# Defaults for every run; swept parameters override these values.
hp = dict(
    data_converter=PCA_extreme,
    sample_len=32,
    num_classes=26,
    window_size=-1,
    min_samples=5,
    max_depth=12,
    tree_type='multilabel_ovr',
)

params_to_change = {
    'tree_type': ['multilabel_ovr', 'multilabel_ovo'],
    'sample_len': [8, 64],
}

TreeRunner = ModelRunner(
    DecisionTreeClassifier,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)
TreeRunner.run(*datapack_full, params_to_change, one_vs_one=True)

on 0: -----With parameters-----                                                                                         
on 0: tree_type = multilabel_ovr                                                                                        
on 0: sample_len = 8                                                                                                    
on 0: ~fit complete in                                                                                                  
                                                                                                                        

        self.right_mean = data[is_true][:, -1].mean()
        self.left_mean = data[is_false][:, -1].mean()
        self.right_mean = data[is_true][:, -1].mean()
        self.left_mean = data[is_false][:, -1].mean()


                                                                                                                        

        self.right_mean = data[is_true][:, -1].mean()
        self.left_mean = data[is_false][:, -1].mean()
        self.right_mean = data[is_true][:, -1].mean()
        self.left_mean = data[is_false][:, -1].mean()
        self.right_mean = data[is_true][:, -1].mean()
        self.left_mean = data[is_false][:, -1].mean()


Проверка модели DecisionTreeClassifier |                                        | ▄▆█ 0/104 [0%] in 25:33 (0.0/s, eta: -

        self.right_mean = data[is_true][:, -1].mean()
        self.left_mean = data[is_false][:, -1].mean()


Проверка модели DecisionTreeClassifier |                                        | ▇▅▃ 0/104 [0%] in 47:13 (0.0/s, eta: -

# Проверка моделей с оптимальными параметрами на полном наборе данных

In [None]:
# Always raises AssertionError, so a "Run All" stops at this cell; the cells
# below it have to be started by hand.
assert False

# Получение предсказаний из набора данных промежуточной аттестации

In [None]:
# KNN (KnnModel): final candidates on the full pack.

# One more PCA converter, sized between PCA_extreme (24) and PCA_severe (49).
PCA_more_severe = PCA_transform(36).fit(training_data)

# Defaults for every run; the swept parameter overrides data_converter.
hp = dict(
    data_converter=PCA_severe,
    k=1,
)

params_to_change = {
    'data_converter': [PCA_extreme, PCA_more_severe, PCA_severe],
}

KNNrunner = ModelRunner(
    KnnModel,
    defaults=hp,
    metrics=my_metrics,
    responsive_bar=True,
)
KNNrunner.run(*datapack_full, params_to_change, one_vs_one=True)

# Keep the models from this run; the next cell scores each of them.
best_models = KNNrunner.get_models()

In [None]:
# Score every candidate model on the attestation dataset and keep the
# predictions of the most accurate one for the submission file.
# Requires `best_models` from the KNN cell above.
submission_table = np.genfromtxt(
    "datasets/emnist-letters-test-data.csv", delimiter=",", filling_values=0)

# Row 0 is dropped (presumably a CSV header -- TODO confirm). Column 0 holds
# the per-row label/id value, the remaining columns hold the image data.
evaluation_labels = submission_table[1:, 0]
evaluation_input = np.array(
    [convert_to_emnist(row) for row in submission_table[1:, 1:]])

# NOTE(review): assumes the values in column 0 map to classes in runs of 800,
# with classes numbered from 1 -- confirm against the dataset description.
evaluation_answers = evaluation_labels // 800 + 1

accuracy_list = []
preds_list = []
for model in best_models:
    model_preds = model.predict(evaluation_input)
    preds_list.append(model_preds)
    accuracy_list.append(return_accuracy(model_preds, evaluation_answers))

if not accuracy_list:
    # Same exception type max() would raise on an empty list, clearer message.
    raise ValueError("best_models is empty: nothing to evaluate")

max_ind = accuracy_list.index(max(accuracy_list))
# Fix: take the winning model's whole prediction vector from preds_list.
# The previous code indexed the last model's vector (`preds[max_ind]`), which
# yields a single element instead of the best model's predictions.
preds = preds_list[max_ind]

print(f'Лучшая модель это: {best_models[max_ind]}')
print(f'С точностью: {accuracy_list[max_ind]:.3f}')

In [None]:
# Show the index (within best_models) of the most accurate model.
print(max_ind)

In [None]:
import csv

# Write the selected predictions as an (id, class) CSV for submission.
# Requires `preds` (one prediction per row) from the evaluation cell above.
header = ['id', 'class']

# One 0-based id per prediction. The row count comes from `preds` itself
# rather than a hard-coded 20800, so the two columns always line up.
pos = np.arange(len(preds))[:, np.newaxis].astype(np.uint32)
data = np.hstack((pos, preds[:, np.newaxis])).astype(np.uint32)

# newline="" is the csv-module convention: the writer controls line endings.
with open('Suleimanov_KNN_predictions.csv', 'w', newline="") as file:
    writer = csv.writer(
        file, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL)
    writer.writerow(header)
    writer.writerows(data)