# Эксперимент 3

Использование старой и новой выборки для **многоклассовой** классификации

In [37]:
import os

import pandas as pd

## Шаг 1. Загрузка таблиц с выявленными признаками

In [180]:
def csv_extractor(path: str, class_name: str):
    """Load one semicolon-separated feature table and label all of its rows.

    The first column of the file is discarded. Every remaining column is
    converted to numbers, accepting a decimal comma; values that cannot be
    parsed become NaN.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    class_name : str
        Label written into the ``"class"`` column for every row.

    Returns
    -------
    pandas.DataFrame
        Numeric feature columns followed by the ``"class"`` column.
    """
    def _as_number(column):
        # Rewrite decimal commas ("3,14") to dots before numeric parsing.
        normalized = column.astype(str).str.replace(',', '.')
        return pd.to_numeric(normalized, errors='coerce')

    raw = pd.read_csv(path, delimiter=';')
    features = raw.drop(columns=raw.columns[0]).apply(_as_number)
    features["class"] = class_name
    return features

In [181]:
def csv_preparetor(path: str, case: str):
    """Read every CSV table of one split and stack them into a single frame.

    The folder that is scanned is ``path`` directly concatenated with
    ``case``, so ``path`` must already end with a path separator. The class
    label of each table is taken from its file name: the part after the last
    underscore and before the first following dot
    (``"old_train_blasts.csv"`` -> ``"blasts"``).

    Parameters
    ----------
    path : str
        Root folder of a sample, ending with a separator (e.g. ``'../old_csv/'``).
    case : str
        Sub-folder name of the split (e.g. ``'train'`` or ``'test'``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all tables produced by ``csv_extractor``.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    con_path = f'{path}{case}'

    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # row order reproducible between runs and machines. Entries that are not
    # CSV files (e.g. ".ipynb_checkpoints", ".DS_Store") are skipped.
    csv_files = sorted(
        file for file in os.listdir(con_path)
        if file.lower().endswith('.csv')
        and os.path.isfile(os.path.join(con_path, file))
    )
    if not csv_files:
        raise ValueError(f'No CSV files found in {con_path!r}')

    all_cells = []
    for file in csv_files:
        # example name: "old_train_blasts.csv"
        class_name = file.split(sep='_')[-1].split(sep='.')[0]
        all_cells.append(csv_extractor(os.path.join(con_path, file), class_name))
    return pd.concat(all_cells)


In [182]:
# Root folders of the old and the new sample; the split name is appended to them.
TO_OLD = '../old_csv/'
TO_NEW = '../new_csv/'

# For each sample the test split is read first, then the train split.
test_old, train_old = (csv_preparetor(TO_OLD, case) for case in ('test', 'train'))
test_new, train_new = (csv_preparetor(TO_NEW, case) for case in ('test', 'train'))

# NOTE(review): only the new training table gets its missing values replaced
# with zeros; confirm the other three tables contain no NaN.
train_new = train_new.fillna(0)

In [183]:
# Stack the old and new tables row-wise to build the combined sample.
test_all, train_all = (
    pd.concat(pair)
    for pair in ([test_old, test_new], [train_old, train_new])
)

In [184]:
# Number of distinct values in the last column of the combined test table.
labels_test_all = test_all.iloc[:, -1:].to_numpy().flatten()
len(set(labels_test_all))

6

In [185]:
def splitter(train_cells, test_cells):
    """Split two tables into feature matrices and label vectors.

    The last column of each table is treated as the label; all other
    columns are the features.

    Parameters
    ----------
    train_cells, test_cells : pandas.DataFrame
        Tables whose last column holds the label.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays; the label
        arrays are one-dimensional.
    """
    def _features_and_labels(cells):
        label_column = cells.columns[-1]
        features = cells.drop(columns=label_column).to_numpy()
        labels = cells.iloc[:, -1:].to_numpy().flatten()
        return features, labels

    X_train, y_train = _features_and_labels(train_cells)
    X_test, y_test = _features_and_labels(test_cells)
    return (X_train, y_train, X_test, y_test)

## Шаг 2. KNN

In [186]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def knn(X_train, y_train, X_test, y_test, n_neighbors=5):
    """Fit a standardised k-nearest-neighbours classifier and score it.

    Parameters
    ----------
    X_train, y_train
        Features and labels the pipeline is fitted on.
    X_test, y_test
        Features and labels the fitted pipeline is scored on.
    n_neighbors : int, default 5
        Forwarded to ``KNeighborsClassifier``.

    Returns
    -------
    The value of the pipeline's ``score(X_test, y_test)``.
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    # The scaler is part of the pipeline, so it is fitted on the training data only.
    model = make_pipeline(StandardScaler(), classifier)
    return model.fit(X_train, y_train).score(X_test, y_test)

### Старая выборка

In [187]:
# KNN on the old sample: fit on train_old, evaluate on test_old.
X_train, y_train, X_test, y_test = splitter(train_old, test_old)
knn_score = knn(X_train, y_train, X_test, y_test)
print("Средняя точность по тестовой выборке:", knn_score)

Средняя точность по тестовой выборке: 0.58


### Новая выборка

In [188]:
# KNN on the new sample: fit on train_new, evaluate on test_new.
X_train, y_train, X_test, y_test = splitter(train_new, test_new)
knn_score = knn(X_train, y_train, X_test, y_test)
print("Средняя точность по тестовой выборке:", knn_score)

Средняя точность по тестовой выборке: 0.6153846153846154


### Старая + Новая выборки

In [189]:
# KNN on the combined sample: fit on train_all, evaluate on test_all.
X_train, y_train, X_test, y_test = splitter(train_all, test_all)
knn_score = knn(X_train, y_train, X_test, y_test)
print("Средняя точность по тестовой выборке:", knn_score)

Средняя точность по тестовой выборке: 0.5882352941176471


## Шаг 3. SVM

In [190]:
from sklearn.svm import SVC

def svc_helper(X_train, y_train, X_test, y_test, C=1.5, kernel='rbf'):
    """Fit a standardised SVM classifier and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and labels the pipeline is fitted on.
    X_test, y_test
        Features and labels the fitted pipeline is scored on.
    C : float, default 1.5
        Forwarded to ``SVC``. The default keeps the value that was
        previously hard-coded.
    kernel : str, default 'rbf'
        Forwarded to ``SVC``. The default keeps the kernel that was
        previously hard-coded.

    Returns
    -------
    The value of the pipeline's ``score(X_test, y_test)``.
    """
    svc = SVC(C=C, kernel=kernel, cache_size=1000)

    # Training: the scaler is part of the pipeline, so it is fitted on the
    # training data only.
    svc_pipe = make_pipeline(StandardScaler(), svc)
    svc_pipe.fit(X_train, y_train)

    # Evaluation on the held-out data.
    return svc_pipe.score(X_test, y_test)

### Старая выборка

In [191]:
# SVM on the old sample: fit on train_old, evaluate on test_old.
X_train, y_train, X_test, y_test = splitter(train_old, test_old)
svm_score = svc_helper(X_train, y_train, X_test, y_test)
print("Средняя точность по тестовой выборке:", svm_score)

Средняя точность по тестовой выборке: 0.6133333333333333


### Новая выборка

In [192]:
# SVM on the new sample: fit on train_new, evaluate on test_new.
X_train, y_train, X_test, y_test = splitter(train_new, test_new)
svm_score = svc_helper(X_train, y_train, X_test, y_test)
print("Средняя точность по тестовой выборке:", svm_score)

Средняя точность по тестовой выборке: 0.7692307692307693


### Старая + Новая выборки

In [193]:
# SVM on the combined sample: fit on train_all, evaluate on test_all.
X_train, y_train, X_test, y_test = splitter(train_all, test_all)
svm_score = svc_helper(X_train, y_train, X_test, y_test)
print("Средняя точность по тестовой выборке:", svm_score)

Средняя точность по тестовой выборке: 0.7058823529411765


## Шаг 4. AutoML

In [194]:
import evalml
from evalml import AutoMLSearch#

### Старая выборка

In [195]:
# AutoML on the old sample: the search sees only the training split;
# X_test / y_test are kept for scoring the best pipeline afterwards.
old_arrays = splitter(train_old, test_old)
X_train, y_train, X_test, y_test = old_arrays
automl = AutoMLSearch(problem_type='multiclass', X_train=X_train, y_train=y_train)
automl.search()

Using default limit of max_batches=1.

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Multiclass. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: extra_trees, lightgbm, decision_tree, random_forest, xgboost, catboost, linear_model



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Multiclass Classificati... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 28.840
Batch 1: (2/9) Decision Tree Classifier w/ Imputer      Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 6.284
Batch 1: (3/9) LightGBM Classifier w/ Imputer           Elapsed:00:08
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.481
Batch 1: (4/9) Extra Trees Classifier w/ Imputer        Elapsed:00:22
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.047
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + S... Elapsed:00:31
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.797
Batch 1: (6/9) CatBoost Classifier w/ Imputer           Elapsed:00:41
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.617
Batch 1: (7/9) XGBoost Classifier w/ Imputer       

In [196]:
# Display the ranking table of the pipelines evaluated by the search on the old sample.
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,8,Logistic Regression Classifier w/ Imputer + St...,0.963894,0.901006,96.657773,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,6,XGBoost Classifier w/ Imputer,1.039568,1.003247,96.395381,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,3,Extra Trees Classifier w/ Imputer,1.046773,1.038366,96.370396,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,7,Random Forest Classifier w/ Imputer,1.06148,0.978424,96.319402,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,2,LightGBM Classifier w/ Imputer,1.481486,1.374229,94.863063,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,5,CatBoost Classifier w/ Imputer,1.617449,1.615219,94.391622,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,4,Elastic Net Classifier w/ Imputer + Standard S...,1.796692,1.79678,93.770112,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,1,Decision Tree Classifier w/ Imputer,6.284,6.675106,78.210727,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mode Baseline Multiclass Classification Pipeline,28.839878,28.839878,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}


In [208]:
# Score the best pipeline of the search on the held-out test split.
pipeline = automl.best_pipeline
test_scores = pipeline.score(X_test, y_test, ['F1 Micro'])
print("Точность: ", test_scores['F1 Micro'])

Точность:  0.68


### Новая выборка

In [209]:
# AutoML on the new sample: the search sees only the training split;
# X_test / y_test are kept for scoring the best pipeline afterwards.
new_arrays = splitter(train_new, test_new)
X_train, y_train, X_test, y_test = new_arrays
automl = AutoMLSearch(problem_type='multiclass', X_train=X_train, y_train=y_train)
automl.search()

Using default limit of max_batches=1.

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Multiclass. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: extra_trees, lightgbm, decision_tree, random_forest, xgboost, catboost, linear_model



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Multiclass Classificati... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 28.890
Batch 1: (2/9) Decision Tree Classifier w/ Imputer      Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 4.967
High coefficient of variation (cv >= 0.2) within cross validation scores. Decision Tree Classifier w/ Imputer may not perform as estimated on unseen data.
Batch 1: (3/9) LightGBM Classifier w/ Imputer           Elapsed:00:08
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 0.899
High coefficient of variation (cv >= 0.2) within cross validation scores. LightGBM Classifier w/ Imputer may not perform as estimated on unseen data.
Batch 1: (4/9) Extra Trees Classifier w/ Imputer        Elapsed:00:22
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 0.817
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + S... Elapse

In [210]:
# Display the ranking table of the pipelines evaluated by the search on the new sample.
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,6,XGBoost Classifier w/ Imputer,0.650996,0.767961,97.746632,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,7,Random Forest Classifier w/ Imputer,0.731133,0.793487,97.469246,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,8,Logistic Regression Classifier w/ Imputer + St...,0.738436,0.962803,97.443966,True,{'Imputer': {'categorical_impute_strategy': 'm...
3,3,Extra Trees Classifier w/ Imputer,0.817141,0.850752,97.171534,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,2,LightGBM Classifier w/ Imputer,0.898655,1.030264,96.889382,True,{'Imputer': {'categorical_impute_strategy': 'm...
5,5,CatBoost Classifier w/ Imputer,1.535465,1.553938,94.685116,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,4,Elastic Net Classifier w/ Imputer + Standard S...,1.799216,1.799058,93.772166,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,1,Decision Tree Classifier w/ Imputer,4.966516,6.676376,82.808821,True,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mode Baseline Multiclass Classification Pipeline,28.889911,28.889911,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}


In [211]:
# Score the best pipeline of the search on the held-out test split.
pipeline = automl.best_pipeline
test_scores = pipeline.score(X_test, y_test, ['F1 Micro'])
print("Точность: ", test_scores['F1 Micro'])

Точность:  0.8012820512820514


### Старая + Новая выборки

In [212]:
# AutoML on the combined sample: the search sees only the training split;
# X_test / y_test are kept for scoring the best pipeline afterwards.
all_arrays = splitter(train_all, test_all)
X_train, y_train, X_test, y_test = all_arrays
automl = AutoMLSearch(problem_type='multiclass', X_train=X_train, y_train=y_train)
automl.search()

Using default limit of max_batches=1.

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Multiclass. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: extra_trees, lightgbm, decision_tree, random_forest, xgboost, catboost, linear_model



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Multiclass Classificati... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 28.782
Batch 1: (2/9) Decision Tree Classifier w/ Imputer      Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 4.230
Batch 1: (3/9) LightGBM Classifier w/ Imputer           Elapsed:00:08
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.059
Batch 1: (4/9) Extra Trees Classifier w/ Imputer        Elapsed:00:29
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 0.975
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + S... Elapsed:00:37
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.796
Batch 1: (6/9) CatBoost Classifier w/ Imputer           Elapsed:00:48
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.542
Batch 1: (7/9) XGBoost Classifier w/ Imputer       

In [213]:
# Display the ranking table of the pipelines evaluated by the search on the combined sample.
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,8,Logistic Regression Classifier w/ Imputer + St...,0.756995,0.810962,97.369929,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,6,XGBoost Classifier w/ Imputer,0.798732,0.840912,97.224922,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,7,Random Forest Classifier w/ Imputer,0.910061,0.934754,96.838125,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,3,Extra Trees Classifier w/ Imputer,0.975396,0.986299,96.611127,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,2,LightGBM Classifier w/ Imputer,1.058906,1.152651,96.320984,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,5,CatBoost Classifier w/ Imputer,1.541947,1.542487,94.642726,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,4,Elastic Net Classifier w/ Imputer + Standard S...,1.796046,1.796871,93.759898,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,1,Decision Tree Classifier w/ Imputer,4.229614,4.155275,85.304817,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mode Baseline Multiclass Classification Pipeline,28.782314,28.782314,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}


In [214]:
# Score the best pipeline of the search on the held-out test split.
pipeline = automl.best_pipeline
test_scores = pipeline.score(X_test, y_test, ['F1 Micro'])
print("Точность: ", test_scores['F1 Micro'])

Точность:  0.7516339869281046
