# Эксперимент 2

Использование классических методов машинного обучения для классификации

In [1]:
import pathlib
import shutil
import os
import random

import numpy as np
import pandas as pd
from PIL import Image

## Подготовка данных

In [13]:
def random_fill_dir(ds_size, dir_name, paths, other_check_set=set()):
    """Функция для заполнения папок случайными выборками."""
    check_set = set()
    while len(check_set) != ds_size:
        # Выбираем случайный номер из списка
        random_path_i = random.randrange(len(paths))
        p = paths[random_path_i]
        
        # Если не использовали путь, добавляем в выборку
        if p not in check_set and p not in other_check_set:
            shutil.copyfile(p, dir_name + p.name)
            check_set.add(p)
    return check_set

In [15]:
# Очистка папки от данных
shutil.rmtree("./2_data")
os.makedirs("./2_data")

data_dir = pathlib.Path("../First_bmp")
blasts = list(data_dir.glob("./Бласты/*.bmp"))
lymphocytes = list(data_dir.glob("./Лимфоцит/*.bmp"))

# -----------------------------------------------
# - тестовая выборка: 100 бластов и 100 лимфоцитов
os.makedirs("./2_data/test_ds/")
os.makedirs("./2_data/test_ds/blasts/")
os.makedirs("./2_data/test_ds/lymphocytes/")

# -- 100 бластов
test_blasts_paths = random_fill_dir(
    ds_size=100,
    dir_name="./2_data/test_ds/blasts/",
    paths=blasts,
)

# -- 100 лимфоцитов
test_lymphocytes_paths = random_fill_dir(
    ds_size=100,
    dir_name="./2_data/test_ds/lymphocytes/",
    paths=lymphocytes,
)

# -----------------------------------------------
# - выборка с аугментацией данных
os.makedirs("./2_data/aug")
os.makedirs("./2_data/aug/blasts/")
os.makedirs("./2_data/aug/lymphocytes/")

# -- 6528 бластов
blsts = random_fill_dir(
    ds_size=6528,
    dir_name="./2_data/aug/blasts/",
    paths=blasts,
    other_check_set=test_blasts_paths,
)

# -- 6528 лимфоцитов в .bmp (original + rotated)
for lymphocyte in lymphocytes:
    if lymphocyte not in test_lymphocytes_paths:
        shutil.copyfile(
            lymphocyte,
            "./2_data/aug/lymphocytes/" + lymphocyte.name
        )
        img = Image.open(lymphocyte)
        for i in range(15):
            img.rotate(24*(i+1))
            img.save("./2_data/aug/lymphocytes/" + str(24*(i+1)) + lymphocyte.name)

### Признаки

In [2]:
blasts = pd.read_csv("../blasts.csv", delimiter=';')
blasts = blasts.drop(columns=blasts.columns[0])
blasts["class"] = 0


lymphocytes = pd.read_csv("../lymphocytes.csv", delimiter=';')
lymphocytes = lymphocytes.drop(columns=lymphocytes.columns[0])
lymphocytes["class"] = 1

In [3]:
cells = pd.concat([blasts, lymphocytes])
cells = cells.apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',','.'), errors='coerce'))
cells

Unnamed: 0,RGB_R_������� ������ ���������_ASM,RGB_R_������� ������ ���������_CON,RGB_R_������� ������ ���������_ENT,RGB_R_������� ������ ���������_LUN,RGB_R_������� ������ ���������_MPR,RGB_R_������� ������ ���������_TR,RGB_R_������� ������ ���������_CORR,RGB_R_������� ���� �����_NVAL,RGB_R_������� ���� �����_MS,RGB_R_������� ���� �����_OMS,...,XYZLABUV_V_Wavelet_energy_re,XYZLABUV_V_Wavelet_max,XYZLABUV_V_Wavelet_max_re,XYZLABUV_V_Wavelet_range,XYZLABUV_V_Wavelet_range_re,XYZLABUV_V_Wavelet_mean,XYZLABUV_V_Wavelet_mean_re,XYZLABUV_V_Wavelet_disp,XYZLABUV_V_Wavelet_disp_re,class
0,0.077472,304.778015,5.051431,0.526868,0.275101,0.395026,0.481361,2695.457031,96.195641,0.776611,...,0.000619,23060.49023,90.433296,23136.97656,90.733238,0.068231,0.000494,17.353344,0.002662,0
1,0.121913,343.202728,4.621132,0.577551,0.346851,0.459308,0.487137,2211.828857,133.039490,0.753889,...,0.000792,20546.02734,80.572655,20636.16602,80.926140,0.164430,0.001251,44.080952,0.004519,0
2,0.110586,466.182800,4.962141,0.548461,0.330476,0.434957,0.481797,1438.899170,103.440971,0.713867,...,0.000974,17512.75781,68.677483,17613.73438,69.073463,-0.186434,-0.001427,77.950180,0.007765,0
3,0.094910,265.774200,4.527308,0.600780,0.302829,0.457337,0.485685,3760.375488,95.934242,0.791180,...,0.000699,24152.79688,94.716850,24229.85742,95.019051,-0.045219,-0.000331,23.108362,0.003175,0
4,0.095491,314.862823,4.524604,0.591442,0.304228,0.451179,0.483576,3705.871826,103.703209,0.787892,...,0.000936,24071.68750,94.398773,24175.07813,94.804230,-0.021599,-0.000161,31.568651,0.004146,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6507,0.091780,290.360718,5.086297,0.538498,0.300764,0.413959,0.485746,2982.782959,111.967873,0.772425,...,0.000536,25652.94922,100.599800,25729.67578,100.900688,0.032251,0.000227,18.325632,0.002684,1
6508,0.068344,421.591309,5.357217,0.494875,0.258739,0.368619,0.482880,2134.849365,86.494362,0.759256,...,0.000727,17908.46094,70.229256,17988.57031,70.543411,0.057966,0.000472,31.200439,0.003987,1
6509,0.077127,455.398773,4.926092,0.565518,0.273578,0.422262,0.481315,2601.734375,74.362259,0.777642,...,0.000823,17406.05469,68.259041,17490.84961,68.591568,0.038317,0.000296,35.966915,0.004814,1
6510,0.103039,245.014648,4.457886,0.599588,0.316575,0.460483,0.487168,4232.018066,116.313988,0.800776,...,0.000813,24970.65625,97.924141,25061.66211,98.281029,0.034401,0.000261,31.062300,0.003789,1


### Тестовая выборка с признаками

In [4]:
test_blasts = pd.read_csv("../test_blasts.csv", delimiter=';')
test_blasts = test_blasts.drop(columns=test_blasts.columns[0])
test_blasts["class"] = 0

test_lymphocytes = pd.read_csv("../test_lymphocytes.csv", delimiter=';')
test_lymphocytes = test_lymphocytes.drop(columns=test_lymphocytes.columns[0])
test_lymphocytes["class"] = 1

test_cells = pd.concat([test_blasts, test_lymphocytes])
test_cells = test_cells.apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',','.'), errors='coerce'))

### Формирование данных с подходящим для тренировки форматом

In [10]:
X_train = cells.drop(columns=cells.columns[-1]).to_numpy()
y_train = cells.iloc[:,-1:].to_numpy().flatten()

X_test = test_cells.drop(columns=cells.columns[-1]).to_numpy()
y_test = test_cells.iloc[:,-1:].to_numpy().flatten()

## k-nn

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

results = {}

for i in range(100):
    neigh_pipe = make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=i+2)
    )
    neigh_pipe.fit(X_train, y_train)
    results[i] = neigh_pipe.score(X_test, y_test)

In [13]:
acc = 0.001
n_neighbors = 0
for k, v in results.items():
    if v > acc:
        acc = v
        n_neighbors = k

In [15]:
print("Средняя точность по тестовой выборке:", acc)
print("Оптимальное количество соседей:", n_neighbors)

Средняя точность по тестовой выборке: 0.75
Оптимальное количество соседей: 51


## SVM

In [21]:
from sklearn.svm import SVC

results = {}
clf = make_pipeline(StandardScaler(), SVC())
for i in range(5):
    # Коэффициент регуляризации
    c = i*0.5 + 1 # 1, 1.5, 2.0, ...
    
    # Ядро (kernel) -- радиально-базисная функция (radia-basis function) 
    svc_rbf = SVC(C=c, kernel='rbf', cache_size=1000)
    
    # Обучение
    svc_rbf_pipe = make_pipeline(StandardScaler(), svc_rbf)
    svc_rbf_pipe.fit(X_train, y_train)
    
    # Тестирование
    k = f"C = {c}, kernel=rbf" # название ключа
    results[k] = svc_rbf_pipe.score(X_test, y_test)
    
    for deg in range(4):
        # Ядро (kernel) -- полином степени deg+2
        svc_poly = SVC(C=c, kernel='poly', degree=deg+2, cache_size=1000)
        
        # Обучение
        svc_poly_pipe = make_pipeline(StandardScaler(), svc_poly)
        svc_poly_pipe.fit(X_train, y_train)

        # Тестирование
        k = f"C = {c}, kernel=poly, degree={deg+2}"
        results[k] = svc_poly_pipe.score(X_test, y_test)

In [22]:
results

{'C = 1.0, kernel=rbf': 0.825,
 'C = 1.0, kernel=poly, degree=2': 0.79,
 'C = 1.0, kernel=poly, degree=3': 0.795,
 'C = 1.0, kernel=poly, degree=4': 0.755,
 'C = 1.0, kernel=poly, degree=5': 0.75,
 'C = 1.5, kernel=rbf': 0.83,
 'C = 1.5, kernel=poly, degree=2': 0.8,
 'C = 1.5, kernel=poly, degree=3': 0.78,
 'C = 1.5, kernel=poly, degree=4': 0.735,
 'C = 1.5, kernel=poly, degree=5': 0.74,
 'C = 2.0, kernel=rbf': 0.815,
 'C = 2.0, kernel=poly, degree=2': 0.815,
 'C = 2.0, kernel=poly, degree=3': 0.785,
 'C = 2.0, kernel=poly, degree=4': 0.735,
 'C = 2.0, kernel=poly, degree=5': 0.725,
 'C = 2.5, kernel=rbf': 0.815,
 'C = 2.5, kernel=poly, degree=2': 0.81,
 'C = 2.5, kernel=poly, degree=3': 0.78,
 'C = 2.5, kernel=poly, degree=4': 0.73,
 'C = 2.5, kernel=poly, degree=5': 0.72,
 'C = 3.0, kernel=rbf': 0.805,
 'C = 3.0, kernel=poly, degree=2': 0.82,
 'C = 3.0, kernel=poly, degree=3': 0.78,
 'C = 3.0, kernel=poly, degree=4': 0.705,
 'C = 3.0, kernel=poly, degree=5': 0.72}

In [23]:
acc = 0.001
params = ""
for k, v in results.items():
    if v > acc:
        acc = v
        params = k

In [24]:
print("Средняя точность:", acc)
print("Оптимальные параметры:", params)

Средняя точность: 0.83
Оптимальные параметры: C = 1.5, kernel=rbf


## AutoML

In [25]:
import evalml
from evalml import AutoMLSearch

In [26]:
automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary')
automl.search()

Using default limit of max_batches=1.

Generating pipelines to search over...
*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Binary. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 1 batches for a total of 9 pipelines. 
Allowed model families: random_forest, linear_model, extra_trees, catboost, decision_tree, lightgbm, xgboost



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 17.117
Batch 1: (2/9) Decision Tree Classifier w/ Imputer      Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.387
Batch 1: (3/9) LightGBM Classifier w/ Imputer           Elapsed:00:10
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.041
Batch 1: (4/9) Extra Trees Classifier w/ Imputer        Elapsed:00:27
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.421
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + S... Elapsed:00:37
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.693
Batch 1: (6/9) CatBoost Classifier w/ Imputer           Elapsed:00:50
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.567
Batch 1: (7/9) XGBoost Classifier w/ Imputer            Elapsed:01:00
	Star

In [27]:
automl.rankings

Unnamed: 0,id,pipeline_name,score,validation_score,percent_better_than_baseline,high_variance_cv,parameters
0,2,LightGBM Classifier w/ Imputer,0.040903,0.037615,99.761037,False,{'Imputer': {'categorical_impute_strategy': 'm...
1,6,XGBoost Classifier w/ Imputer,0.05525,0.054387,99.677222,False,{'Imputer': {'categorical_impute_strategy': 'm...
2,8,Logistic Regression Classifier w/ Imputer + St...,0.298543,0.296459,98.255864,False,{'Imputer': {'categorical_impute_strategy': 'm...
3,7,Random Forest Classifier w/ Imputer,0.305425,0.306765,98.215653,False,{'Imputer': {'categorical_impute_strategy': 'm...
4,1,Decision Tree Classifier w/ Imputer,0.38719,0.384583,97.737973,False,{'Imputer': {'categorical_impute_strategy': 'm...
5,3,Extra Trees Classifier w/ Imputer,0.421458,0.42316,97.537771,False,{'Imputer': {'categorical_impute_strategy': 'm...
6,5,CatBoost Classifier w/ Imputer,0.567419,0.571652,96.685045,False,{'Imputer': {'categorical_impute_strategy': 'm...
7,4,Elastic Net Classifier w/ Imputer + Standard S...,0.69311,0.693106,95.950738,False,{'Imputer': {'categorical_impute_strategy': 'm...
8,0,Mode Baseline Binary Classification Pipeline,17.116934,17.111677,0.0,False,{'Baseline Classifier': {'strategy': 'mode'}}


In [28]:
automl.describe_pipeline(3)

*************************************
* Extra Trees Classifier w/ Imputer *
*************************************

Problem Type: binary
Model Family: Extra Trees

Pipeline Steps
1. Imputer
	 * categorical_impute_strategy : most_frequent
	 * numeric_impute_strategy : mean
	 * categorical_fill_value : None
	 * numeric_fill_value : None
2. Extra Trees Classifier
	 * n_estimators : 100
	 * max_features : auto
	 * max_depth : 6
	 * min_samples_split : 2
	 * min_weight_fraction_leaf : 0.0
	 * n_jobs : -1

Training
Training for binary problems.
Total training time (including CV): 9.8 seconds

Cross Validation
----------------
             Log Loss Binary  MCC Binary   AUC  Precision    F1  Balanced Accuracy Binary  Accuracy Binary # Training # Validation
0                      0.423       0.644 0.904      0.796 0.826                     0.821            0.821     8760.0       4380.0
1                      0.426       0.625 0.903      0.784 0.818                     0.812            0.811     

In [29]:
pipeline = automl.best_pipeline
print("Точность: ", pipeline.score(test_X, test_y, ["f1"])['F1'])

Точность:  0.7096774193548387
