In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer, label_binarize
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, fbeta_score, f1_score, log_loss, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import column_or_1d

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/evareidman/wine_department/main/data/dataset%20after%20eda.csv')
df.head()

Unnamed: 0,Имя,Год,Страна,Сахар,Цена,Рейтинг,Количество отзывов,Наличие отзывов,Ценовая категория,Цена/Рейтинг
0,Вино Chateau Ducru-Beaucaillou,1985,Франция,Сухое,56990.0,5.0,0.0,0,Дорогое,0.282129
1,Вино Chateau Ducru-Beaucaillou,1986,Франция,Сухое,59990.0,5.0,0.0,0,Дорогое,0.297189
2,Вино Chateau Pichon Longueville Comtesse de La...,1995,Франция,Сухое,71490.0,0.0,0.0,0,Дорогое,
3,Вино Chateau Leoville Las Cases,2001,Франция,Сухое,67990.0,0.0,0.0,0,Дорогое,
4,Вино Chateau Calon Segur,2001,Франция,Сухое,39490.0,5.0,1.0,1,Дорогое,0.194277


In [3]:
df = df.drop('Наличие отзывов', axis=1)

In [4]:
df = df.set_index('Имя')
df.head()

Unnamed: 0_level_0,Год,Страна,Сахар,Цена,Рейтинг,Количество отзывов,Ценовая категория,Цена/Рейтинг
Имя,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Вино Chateau Ducru-Beaucaillou,1985,Франция,Сухое,56990.0,5.0,0.0,Дорогое,0.282129
Вино Chateau Ducru-Beaucaillou,1986,Франция,Сухое,59990.0,5.0,0.0,Дорогое,0.297189
Вино Chateau Pichon Longueville Comtesse de Lalande,1995,Франция,Сухое,71490.0,0.0,0.0,Дорогое,
Вино Chateau Leoville Las Cases,2001,Франция,Сухое,67990.0,0.0,0.0,Дорогое,
Вино Chateau Calon Segur,2001,Франция,Сухое,39490.0,5.0,1.0,Дорогое,0.194277


Проверяем данные на выбросы и пропуска еще раз.

In [7]:
df = df.dropna(subset=['Ценовая категория'])

In [8]:
df['Ценовая категория'].unique()

array(['Дорогое', 'Среднее', 'Дешевое'], dtype=object)

# Машинное обучение
* __Таргет__: Ценовая категория
* __Числовые признаки:__ Год, Цена, Рейтинг, Количество отзывов, Цена/Рейтинг
* __Категориальные признаки:__ Страна, Сахар

## а) Логистическая регрессия

__1.__ Выделяем числовые и категориальные переменные и делим данные на обучающую и тестовую выборку, чтобы дальше предсказывать таргет "Ценовая категория"

In [9]:
target = 'Ценовая категория'
numeric_features = ['Год', 'Цена', 'Рейтинг', 'Количество отзывов', 'Цена/Рейтинг']
categorical_features = ['Страна', 'Сахар']
df[categorical_features] = df[categorical_features].fillna('Отсутствует')

X = df.drop(columns=target)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

__2.__ Заполняем пропуски в числовых признаках медианой.

In [10]:
numeric_data = X_train.select_dtypes([np.number])

numeric_data_median = numeric_data.median()
numeric_features = numeric_data.columns

X_train = X_train.fillna(numeric_data_median)
X_test = X_test.fillna(numeric_data_median)  

__3.__ Создаем column_tranformer для следующих преобразований:
* Cтандартизация числовых признаков через StandartScaler для приведения их к одному масштабу.
* Кодирование категориальных признаков в виде однократных переменных через OneHotEncoder, который преобразует каждый категориальный признак в несколько бинарных, где каждый бинарный признак представляет отдельное значение категории.

___Примечания:___ _Пример column_transformer был взят из семинара по логистической регрессии и адаптирован под нашу модель._

In [11]:
column_transformer = ColumnTransformer([
    ('scaling', StandardScaler(), numeric_features),    
    ('ohe', OneHotEncoder(handle_unknown="ignore", drop="first"),
     categorical_features)
])

__4.__ Объединим преобразования и моделирование через Pipeline и подберем наиболее подходящие параметры модели, которые могут дать наивысшее из возможных качество через GridSearchCV.

In [12]:
pipeline_lr = Pipeline([
    ('transformer', column_transformer),
    ('lr', LogisticRegression(random_state=42))
])

In [13]:
params = {
    'lr__C': np.logspace(-2, 4, 20),
    'lr__penalty': ['l2', 'none'],  
    'lr__solver': ['newton-cg'],  
    'lr__multi_class': ['ovr', 'multinomial']
}


searcher = GridSearchCV(pipeline_lr, params, scoring='roc_auc_ovr', cv=4, n_jobs=-1)
searcher.fit(X_train, y_train)

best_params = searcher.best_params_
print(best_params)

pipeline_lr.set_params(**best_params)
pipeline_lr.fit(X_train, y_train)









Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input contains NaN.

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_sc

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai



Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

Traceback (most recent call last):
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 327, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 551, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/evareidman/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    rai

{'lr__C': 1128.8378916846884, 'lr__multi_class': 'multinomial', 'lr__penalty': 'l2', 'lr__solver': 'newton-cg'}


In [14]:
coefs = pipeline_lr.named_steps['lr'].coef_
intercepts = pipeline_lr.named_steps['lr'].intercept_
print(f'Коэффициенты модели: {coefs}')
print(f'Свободный члены модели: {intercepts}')

Коэффициенты модели: [[ 1.14857730e+00 -1.00550286e+02  3.03954764e-01  4.74420929e-01
  -2.82546943e+01  2.56961582e+00 -6.26386737e-05  1.01666173e-01
  -1.60117561e+00  3.90323529e-01  5.51015499e+00  5.62837019e-01
   1.13700483e+00 -2.09743707e-01  5.40414604e+00  9.32099961e-02
  -6.44002737e-04  2.09738087e-02  4.24343932e-03 -8.80143584e-01
   4.82224339e-01 -1.15510013e+00 -3.46814964e-05  6.20844074e-01]
 [-6.41311284e-01  9.13169260e+01 -6.87094219e-01  2.93993153e-01
   1.89009479e+01 -9.64418804e-05  3.79519157e+00 -2.34600826e-05
  -3.95973366e-06 -6.72092134e-04 -1.67111169e-05 -1.57044467e-05
   8.23479878e-01  1.72430191e+00 -8.11162593e-04 -4.75551907e-05
  -6.33630389e-01  1.61910510e+00  2.04020257e+00 -4.93814585e-01
  -3.99696208e-06  6.40844417e-01 -1.80236821e-03 -5.76569226e-01]
 [-5.07266021e-01  9.23336024e+00  3.83139455e-01 -7.68414082e-01
   9.35374638e+00 -2.56951938e+00 -3.79512893e+00 -1.01642713e-01
   1.60117957e+00 -3.89651437e-01 -5.51013828e+00 -5.

__5.__ Оценим качество модели.

In [15]:
y_pred = pipeline_lr.predict(X_test)

In [16]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

classes = np.unique(np.concatenate((y_test, y_pred)))
y_test_bin = label_binarize(y_test, classes=classes)
y_pred_bin = label_binarize(y_pred, classes=classes)

roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='micro', multi_class='ovr')
print(f'ROC AUC: {roc_auc}')

y_pred_proba = pipeline_lr.predict_proba(X_test)
logloss = log_loss(y_test, y_pred_proba)
print(f'Log Loss: {logloss}')

Accuracy: 0.9907834101382489
Precision: 0.9907834101382489
Recall: 0.9907834101382489
F1 Score: 0.9907834101382489
ROC AUC: 0.9930875576036866
Log Loss: 0.020819906558436806


__Интерпретация:__ Модель на всех метриках показывает допустимо высокие результате, что говорит о высоком качестве ее предсказаний в решении задачи классификации ценовой категории вина.

## б) Случайный лес
_Пробуем построить модель с помощью другого алгоритма._

__1.__ Повторяем шаги 1-3 из предыдущего пункта.

In [17]:
numeric_features = ['Год', 'Цена', 'Рейтинг', 'Количество отзывов', 'Цена/Рейтинг']
categorical_features = ['Страна', 'Сахар']
df[categorical_features] = df[categorical_features].fillna('Отсутствует')

X = df.drop(columns=target)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
numeric_data = X_train.select_dtypes([np.number])

numeric_data_median = numeric_data.median()
numeric_features = numeric_data.columns

X_train = X_train.fillna(numeric_data_median)
X_test = X_test.fillna(numeric_data_median)

column_transformer = ColumnTransformer([
    ('scaling', StandardScaler(), numeric_features),
    ('ohe', OneHotEncoder(handle_unknown="ignore", drop="first"),
     categorical_features)
])

__2.__ Объединим преобразования и моделирование на основе Случайного леса через Pipeline и подберем наиболее подходящие параметры модели, которые могут дать наивысшее из возможных качество через GridSearchCV.

In [19]:
pipeline_rf = Pipeline([
    ('transformer', column_transformer),
    ('rf', RandomForestClassifier(random_state=42))
])

# данные параметры предложены chat gpt как значимые для модели
params = {
    'rf__n_estimators': [100, 200, 500],
    'rf__max_depth': [None, 5, 10],
    'rf__max_features': ['auto', 'sqrt', 'log2'],
    'rf__criterion': ['gini', 'entropy'],
}

searcher = GridSearchCV(pipeline_rf, params, scoring='roc_auc_ovr', cv=4, n_jobs=-1)
searcher.fit(X_train, y_train)

best_params = searcher.best_params_
print(best_params)

pipeline_rf.set_params(**best_params)
pipeline_rf.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(




{'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__max_features': 'auto', 'rf__n_estimators': 100}


In [20]:
feature_importance = pipeline_rf.named_steps['rf'].feature_importances_
print(f'Коэффициенты модели: {feature_importance}')

Коэффициенты модели: [9.64780709e-02 4.86367577e-01 5.26908285e-02 5.19492597e-02
 2.58168363e-01 1.63041781e-03 3.74871923e-04 1.19885258e-03
 2.37649330e-04 2.15568359e-03 1.01031014e-04 7.96126026e-04
 2.13787600e-03 3.80611342e-03 8.39252136e-04 3.96913694e-03
 1.68036859e-03 1.74384053e-02 2.79574216e-03 2.78895622e-03
 3.03826419e-04 6.44073029e-03 9.89824514e-04 4.66103710e-03]


__3.__ Оцениваем качество модели.

In [21]:
y_pred = pipeline_rf.predict(X_test)
y_test = column_or_1d(y_test)
y_pred = column_or_1d(y_pred)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

classes = np.unique(np.concatenate((y_test, y_pred)))

y_test_bin = label_binarize(y_test, classes=classes)
y_pred_bin = label_binarize(y_pred, classes=classes)

roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='micro', multi_class='ovr')
print(f'ROC AUC: {roc_auc}')
y_pred_proba = pipeline_rf.predict_proba(X_test)
logloss = log_loss(y_test, y_pred_proba)
print(f'Log Loss: {logloss}')

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0
Log Loss: 0.07413079955007233


__Интерпретация:__ Модель, построенная на основе cлучайного леса показывает результаты, даже выше, чем у логистической регрессии, но с более высоким Log Loss, что говорит о том, что в решении поставленной задачи наиболее оптимальной из представленных моделей является логистическая регрессия.