# Загрузка библиотек

In [1]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%config InlineBackend.figure_format = 'svg'
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["image.cmap"] = "viridis"

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from catboost.utils import get_roc_curve
from catboost import CatBoostClassifier, Pool

from imblearn.over_sampling import SMOTE, ADASYN
import shap

# Загрузка данных

In [2]:
df_2013_2014 = pd.read_csv('../data_preprocessed/data_2013_2014.csv')
df_2014_2015 = pd.read_csv('../data_preprocessed/data_2014_2015.csv')
df_2015_2016 = pd.read_csv('../data_preprocessed/data_2015_2016.csv')
df_2016_2017 = pd.read_csv('../data_preprocessed/data_2016_2017.csv')
df_2017_2018 = pd.read_csv('../data_preprocessed/data_2017_2018.csv')

In [3]:
data = pd.concat([df_2013_2014,df_2014_2015,df_2015_2016,
                 df_2016_2017,df_2017_2018]).astype({'neigh_airports_pr_period':'int32'})

In [4]:
data.shape

(51894, 12)

# Пробит и Логит обычные

In [18]:
# Подготовка данных
y = data.entry_2.values
regressors = ['served_airports_pr_period', 'neigh_airports_pr_period',
              'airport_type', 'population', 'megapolis', 'tourist',
              'distance_km'] # в случае добавления квадрата расстояния значимости нет и у расстояния, и у его квадрата
X = sm.add_constant(data[regressors])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y) # не забыть написать про stratify=y

# Пробит
probit_model = Probit(y_train, X_train.astype(float))
probit_model = probit_model.fit(cov_type='HC0', maxiter=1000)
probit_predict = list(map(round,probit_model.predict(X_test)))
print(probit_model.summary())

Optimization terminated successfully.
         Current function value: 0.031942
         Iterations 9
                          Probit Regression Results                           
Dep. Variable:                      y   No. Observations:                38920
Model:                         Probit   Df Residuals:                    38912
Method:                           MLE   Df Model:                            7
Date:                Thu, 05 May 2022   Pseudo R-squ.:                  0.1605
Time:                        00:49:24   Log-Likelihood:                -1243.2
converged:                       True   LL-Null:                       -1480.8
Covariance Type:                  HC0   LLR p-value:                 1.655e-98
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                        -2.7386      0.085    -32.409      0.000      -2.

In [43]:
results_as_html = probit_model.summary().tables[1].as_html()
df_probit = pd.read_html(results_as_html, header=0, index_col=0)[0]
df_probit = df_probit[['coef','std err']]
df_probit['coef'] = df_probit['coef'].apply(lambda row: str(round(row,4)))
df_probit['std err'] = df_probit['std err'].apply(lambda row: round(row,4))
df_probit.iloc[0,0] = '-2.7386***'
df_probit.iloc[1,0] = '0.0028'
df_probit.iloc[2,0] = '0.0839***'
df_probit.iloc[3,0] = '0.0883*'
df_probit.iloc[4,0] = '0.0001***'
df_probit.iloc[5,0] = '-0.1014**'
df_probit.iloc[6,0] = '0.3124***'
df_probit.iloc[7,0] = '-0.0001***'
# print(df_probit.to_latex())

\begin{tabular}{llr}
\toprule
{} &        coef &  std err \\
\midrule
const                     &  -2.7386*** &    0.085 \\
served\_airports\_pr\_period &      0.0028 &    0.004 \\
neigh\_airports\_pr\_period  &   0.0839*** &    0.007 \\
airport\_type              &     0.0883* &    0.046 \\
population                &   0.0001*** &    0.000 \\
megapolis                 &   -0.1014** &    0.049 \\
tourist                   &   0.3124*** &    0.057 \\
distance\_km               &  -0.0001*** &    0.000 \\
\bottomrule
\end{tabular}



In [42]:
df_probit

Unnamed: 0,coef,std err
const,-2.7386***,0.085
served_airports_pr_period,0.0028,0.004
neigh_airports_pr_period,0.0839***,0.007
airport_type,0.0883*,0.046
population,0.0001***,0.0
megapolis,-0.1014**,0.049
tourist,0.3124***,0.057
distance_km,-0.0001***,0.0


In [6]:
# Проверка мультиколлинеарности
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = [variance_inflation_factor(X.values, X.columns.get_loc(i)) for i in X.columns]
pd.DataFrame({'vif': vif[1:]}, index=X.columns[1:]).T

Unnamed: 0,served_airports_pr_period,neigh_airports_pr_period,airport_type,population,megapolis,tourist,distance_km
vif,1.348069,1.404956,1.407667,2.511195,1.902185,1.895716,1.215538


In [57]:
df_metrics = pd.DataFrame()
df_metrics['Пробит'] = [round(accuracy_score(y_test, probit_predict),3),
                round(f1_score(y_test, probit_predict, average = 'macro'),3),
                round(precision_score(y_test, probit_predict, average = 'macro'),3),
                round(recall_score(y_test, probit_predict, average = 'macro'),3)]

In [7]:
# Метрики качества (все на тестовых данных)
print('Метрики для пробита')
print(classification_report(y_test, probit_predict))

Метрики для пробита
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     12892
           1       0.00      0.00      0.00        82

    accuracy                           0.99     12974
   macro avg       0.50      0.50      0.50     12974
weighted avg       0.99      0.99      0.99     12974



# Зависимость веса класса от его частотности: логит, случайный лес, градиентный бустинг

In [8]:
# Подготовка данных
y = data.entry_2.values
regressors = ['served_airports_pr_period', 'neigh_airports_pr_period',
              'airport_type', 'population', 'megapolis', 'tourist',
              'distance_km'] 
X = data[regressors]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y)
scaler = StandardScaler() # cкалируем данные
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# Логит
logit_w_model = LogisticRegression(class_weight='balanced')
logit_w_model.fit(X_train_scaled, y_train)
logit_w_importance = pd.DataFrame({'coef':logit_w_model.coef_.reshape(7),'feature':X_train.columns}).set_index('feature')
print('Коэффициенты логит модели')
print(logit_w_importance.sort_values(
    by = 'coef', key = lambda x: abs(x), ascending = False
))
logit_w_predict = logit_w_model.predict(X_test_scaled)

# Случайный лес
forest_w_model = RandomForestClassifier(class_weight='balanced')
forest_w_model.fit(X_train,y_train)
forest_w_importance = pd.DataFrame(forest_w_model.feature_importances_, index = X_train.columns, columns = ['importance'])
print('Важность переменных в лесу (cнижение impurity)')
print(forest_w_importance.sort_values('importance', ascending = False))
forest_w_predict = forest_w_model.predict(X_test)

# Градиентный бустинг
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
catboost_w_model = CatBoostClassifier(class_weights = class_weights, logging_level = 'Silent')
catboost_w_model.fit(X_train,y_train)
catboost_w_importance = pd.DataFrame(catboost_w_model.get_feature_importance(Pool(X_test, label=y_test),
                                                                             type = "LossFunctionChange"), 
                                     index = X_train.columns, columns = ['importance'])
print('Важность переменных в градиентном бустинге (cнижение значения Loss функции)')
print(catboost_w_importance.sort_values('importance', ascending = False))
catboost_w_predict = catboost_w_model.predict(X_test)

Коэффициенты логит модели
                               coef
feature                            
neigh_airports_pr_period   1.033457
tourist                    0.404815
distance_km               -0.404606
megapolis                 -0.389873
population                 0.339319
served_airports_pr_period -0.183179
airport_type               0.177997
Важность переменных в лесу (cнижение impurity)
                           importance
population                   0.262579
neigh_airports_pr_period     0.230763
distance_km                  0.191193
served_airports_pr_period    0.149107
tourist                      0.093142
airport_type                 0.039299
megapolis                    0.033917
Важность переменных в градиентном бустинге (cнижение значения Loss функции)
                           importance
neigh_airports_pr_period     0.192012
tourist                      0.005780
airport_type                -0.000684
distance_km                 -0.027775
megapolis                   -0.02

In [9]:
# %%time
# shap_values = shap.TreeExplainer(forest_w_model).shap_values(X_train) 
# shap.summary_plot(shap_values, X_train)

In [58]:
df_metrics['Логит(разные веса у классов)'] = [round(accuracy_score(y_test, logit_w_predict),3),
                round(f1_score(y_test, logit_w_predict, average = 'macro'),3),
                round(precision_score(y_test, logit_w_predict, average = 'macro'),3),
                round(recall_score(y_test, logit_w_predict, average = 'macro'),3)]

df_metrics['Случайный лес(разные веса у классов)'] = [round(accuracy_score(y_test, forest_w_predict),3),
                round(f1_score(y_test, forest_w_predict, average = 'macro'),3),
                round(precision_score(y_test, forest_w_predict, average = 'macro'),3),
                round(recall_score(y_test, forest_w_predict, average = 'macro'),3)]

df_metrics['Градиентный бустинг(разные веса у классов)'] = [round(accuracy_score(y_test, catboost_w_predict),3),
                round(f1_score(y_test, catboost_w_predict, average = 'macro'),3),
                round(precision_score(y_test, catboost_w_predict, average = 'macro'),3),
                round(recall_score(y_test, catboost_w_predict, average = 'macro'),3)]

In [10]:
# Метрики качества (все на тестовых данных)
print('Метрики для логита')
print(classification_report(y_test, logit_w_predict))
print('Метрики для леса')
print(classification_report(y_test, forest_w_predict))
print('Метрики для бустинга')
print(classification_report(y_test, catboost_w_predict))

Метрики для логита
              precision    recall  f1-score   support

           0       1.00      0.86      0.92     12892
           1       0.03      0.70      0.06        82

    accuracy                           0.86     12974
   macro avg       0.51      0.78      0.49     12974
weighted avg       0.99      0.86      0.92     12974

Метрики для леса
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     12892
           1       0.02      0.10      0.03        82

    accuracy                           0.96     12974
   macro avg       0.51      0.53      0.51     12974
weighted avg       0.99      0.96      0.97     12974

Метрики для бустинга
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     12892
           1       0.04      0.35      0.07        82

    accuracy                           0.94     12974
   macro avg       0.52      0.65      0.52     12974
weighted avg     

# Ресэмплирование данных: логит, лес и бустинг

In [11]:
# Подготовка данных
regressors = ['served_airports_pr_period', 'neigh_airports_pr_period',
              'airport_type', 'population', 'megapolis', 'tourist',
              'distance_km']
X = data[regressors]
y = data.entry_2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=27)
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)
# separate minority and majority classes
not_entry_2 = X[X.entry_2==0]
entry_2 = X[X.entry_2==1]
# upsample minority
entry_2_upsampled = resample(entry_2,
                          replace=True, # sample with replacement
                          n_samples=len(not_entry_2), # match number in majority class
                          random_state=42) # reproducible results
# combine majority and upsampled minority
upsampled = pd.concat([not_entry_2, entry_2_upsampled])
y_train = upsampled.entry_2.values
X_train = upsampled.drop('entry_2', axis=1)
scaler = StandardScaler() # cкалируем данные
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# Логит
logit_s_model = LogisticRegression()
logit_s_model.fit(X_train_scaled, y_train)
logit_s_importance = pd.DataFrame({'coef':logit_s_model.coef_.reshape(7),'feature':X_train.columns}).set_index('feature')
print('Коэффициенты логит модели')
print(logit_s_importance.sort_values(
    by = 'coef', key = lambda x: abs(x), ascending = False
))
logit_s_predict = logit_s_model.predict(X_test_scaled)

# Случайный лес
forest_s_model = RandomForestClassifier()
forest_s_model.fit(X_train,y_train)
forest_s_importance = pd.DataFrame(forest_s_model.feature_importances_, index = X_train.columns, columns = ['importance'])
print('Важность переменных в лесу (cнижение impurity)')
print(forest_s_importance.sort_values('importance', ascending = False))
forest_s_predict = forest_s_model.predict(X_test)

# Градиентный бустинг
catboost_s_model = CatBoostClassifier(logging_level = 'Silent')
catboost_s_model.fit(X_train,y_train)
catboost_s_importance = pd.DataFrame(catboost_s_model.get_feature_importance(Pool(X_test, label=y_test),
                                                                             type = "LossFunctionChange"), 
                                     index = X_train.columns, columns = ['importance'])
print('Важность переменных в градиентном бустинге (cнижение значения Loss функции)')
print(catboost_s_importance.sort_values('importance', ascending = False))
catboost_s_predict = catboost_s_model.predict(X_test)

Коэффициенты логит модели
                               coef
feature                            
neigh_airports_pr_period   2.626483
tourist                    0.502083
population                 0.397428
megapolis                 -0.356781
distance_km               -0.350701
airport_type               0.295872
served_airports_pr_period -0.083333
Важность переменных в лесу (cнижение impurity)
                           importance
population                   0.250020
neigh_airports_pr_period     0.244797
distance_km                  0.192754
served_airports_pr_period    0.155600
tourist                      0.088691
airport_type                 0.039757
megapolis                    0.028380
Важность переменных в градиентном бустинге (cнижение значения Loss функции)
                           importance
neigh_airports_pr_period     0.070452
megapolis                    0.013063
airport_type                -0.004786
distance_km                 -0.010213
tourist                     -0.01

In [59]:
df_metrics['Логит(случайное ресемплирование)'] = [round(accuracy_score(y_test, logit_s_predict),3),
                round(f1_score(y_test, logit_s_predict, average = 'macro'),3),
                round(precision_score(y_test, logit_s_predict, average = 'macro'),3),
                round(recall_score(y_test, logit_s_predict, average = 'macro'),3)]

df_metrics['Случайный лес(случайное ресемплирование)'] = [round(accuracy_score(y_test, forest_s_predict),3),
                round(f1_score(y_test, forest_s_predict, average = 'macro'),3),
                round(precision_score(y_test, forest_s_predict, average = 'macro'),3),
                round(recall_score(y_test, forest_s_predict, average = 'macro'),3)]

df_metrics['Градиентный бустинг(случайное ресемплирование)'] = [round(accuracy_score(y_test, catboost_s_predict),3),
                round(f1_score(y_test, catboost_s_predict, average = 'macro'),3),
                round(precision_score(y_test, catboost_s_predict, average = 'macro'),3),
                round(recall_score(y_test, catboost_s_predict, average = 'macro'),3)]

In [12]:
# Метрики качества (все на тестовых данных)
print('Метрики для логита')
print(classification_report(y_test, logit_s_predict))
print('Метрики для леса')
print(classification_report(y_test, forest_s_predict))
print('Метрики для бустинга')
print(classification_report(y_test, catboost_s_predict))

Метрики для логита
              precision    recall  f1-score   support

           0       1.00      0.85      0.92     12892
           1       0.03      0.76      0.06        82

    accuracy                           0.85     12974
   macro avg       0.51      0.80      0.49     12974
weighted avg       0.99      0.85      0.91     12974

Метрики для леса
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     12892
           1       0.01      0.07      0.02        82

    accuracy                           0.96     12974
   macro avg       0.50      0.52      0.50     12974
weighted avg       0.99      0.96      0.97     12974

Метрики для бустинга
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     12892
           1       0.04      0.26      0.07        82

    accuracy                           0.95     12974
   macro avg       0.52      0.61      0.52     12974
weighted avg     

# Ресэмплирование данных с помощью SMOTE

In [13]:
# Подготовка данных
y = data.entry_2.values
regressors = ['served_airports_pr_period', 'neigh_airports_pr_period',
              'airport_type', 'population', 'megapolis', 'tourist',
              'distance_km'] 
X = data[regressors]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
scaler = StandardScaler() # cкалируем данные
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# Логит
logit_smote_model = LogisticRegression()
logit_smote_model.fit(X_train_scaled, y_train)
logit_smote_importance = pd.DataFrame({'coef':logit_smote_model.coef_.reshape(7),'feature':X_train.columns}).set_index('feature')
print('Коэффициенты логит модели')
print(logit_smote_importance.sort_values(
    by = 'coef', key = lambda x: abs(x), ascending = False
))
logit_smote_predict = logit_smote_model.predict(X_test_scaled)

# Случайный лес
forest_smote_model = RandomForestClassifier()
forest_smote_model.fit(X_train,y_train)
forest_smote_importance = pd.DataFrame(forest_smote_model.feature_importances_, index = X_train.columns, columns = ['importance'])
print('Важность переменных в лесу (cнижение impurity)')
print(forest_smote_importance.sort_values('importance', ascending = False))
forest_smote_predict = forest_smote_model.predict(X_test)

# Градиентный бустинг
catboost_smote_model = CatBoostClassifier(logging_level = 'Silent')
catboost_smote_model.fit(X_train,y_train)
catboost_smote_importance = pd.DataFrame(catboost_smote_model.get_feature_importance(Pool(X_test, label=y_test),
                                                                             type = "LossFunctionChange"), 
                                     index = X_train.columns, columns = ['importance'])
print('Важность переменных в градиентном бустинге (cнижение значения Loss функции)')
print(catboost_smote_importance.sort_values('importance', ascending = False))
catboost_smote_predict = catboost_smote_model.predict(X_test)

Коэффициенты логит модели
                               coef
feature                            
neigh_airports_pr_period   2.483097
population                 0.778479
megapolis                 -0.445552
distance_km               -0.382262
tourist                    0.288666
served_airports_pr_period -0.212618
airport_type               0.022516
Важность переменных в лесу (cнижение impurity)
                           importance
population                   0.275388
neigh_airports_pr_period     0.223677
distance_km                  0.221763
served_airports_pr_period    0.132742
tourist                      0.079987
megapolis                    0.036999
airport_type                 0.029445
Важность переменных в градиентном бустинге (cнижение значения Loss функции)
                           importance
population                   0.091211
neigh_airports_pr_period     0.085841
megapolis                    0.068939
served_airports_pr_period    0.012794
airport_type                 0.00

In [60]:
df_metrics['Логит(smote)'] = [round(accuracy_score(y_test, logit_smote_predict),3),
                round(f1_score(y_test, logit_smote_predict, average = 'macro'),3),
                round(precision_score(y_test, logit_smote_predict, average = 'macro'),3),
                round(recall_score(y_test, logit_smote_predict, average = 'macro'),3)]

df_metrics['Случайный лес(smote)'] = [round(accuracy_score(y_test, forest_smote_predict),3),
                round(f1_score(y_test, forest_smote_predict, average = 'macro'),3),
                round(precision_score(y_test, forest_smote_predict, average = 'macro'),3),
                round(recall_score(y_test, forest_smote_predict, average = 'macro'),3)]

df_metrics['Градиентный бустинг(smote)'] = [round(accuracy_score(y_test, catboost_smote_predict),3),
                round(f1_score(y_test, catboost_smote_predict, average = 'macro'),3),
                round(precision_score(y_test, catboost_smote_predict, average = 'macro'),3),
                round(recall_score(y_test, catboost_smote_predict, average = 'macro'),3)]

In [14]:
# Метрики качества (все на тестовых данных)
print('Метрики для логита')
print(classification_report(y_test, logit_smote_predict))
print('Метрики для леса')
print(classification_report(y_test, forest_smote_predict))
print('Метрики для бустинга')
print(classification_report(y_test, catboost_smote_predict))

Метрики для логита
              precision    recall  f1-score   support

           0       1.00      0.86      0.93     12892
           1       0.03      0.67      0.06        82

    accuracy                           0.86     12974
   macro avg       0.51      0.77      0.49     12974
weighted avg       0.99      0.86      0.92     12974

Метрики для леса
              precision    recall  f1-score   support

           0       0.99      0.96      0.98     12892
           1       0.04      0.22      0.06        82

    accuracy                           0.96     12974
   macro avg       0.52      0.59      0.52     12974
weighted avg       0.99      0.96      0.97     12974

Метрики для бустинга
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     12892
           1       0.05      0.38      0.08        82

    accuracy                           0.95     12974
   macro avg       0.52      0.66      0.53     12974
weighted avg     

# Ресэмплирование данных с помощью ADASYN

In [15]:
# Подготовка данных
y = data.entry_2.values
regressors = ['served_airports_pr_period', 'neigh_airports_pr_period',
              'airport_type', 'population', 'megapolis', 'tourist',
              'distance_km'] 
X = data[regressors]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y)
X_train, y_train = ADASYN().fit_resample(X_train, y_train)
scaler = StandardScaler() # cкалируем данные
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# Логит
logit_adasyn_model = LogisticRegression()
logit_adasyn_model.fit(X_train_scaled, y_train)
logit_adasyn_importance = pd.DataFrame({'coef':logit_adasyn_model.coef_.reshape(7),'feature':X_train.columns}).set_index('feature')
print('Коэффициенты логит модели')
print(logit_adasyn_importance.sort_values(
    by = 'coef', key = lambda x: abs(x), ascending = False
))
logit_adasyn_predict = logit_adasyn_model.predict(X_test_scaled)

# Случайный лес
forest_adasyn_model = RandomForestClassifier()
forest_adasyn_model.fit(X_train,y_train)
forest_adasyn_importance = pd.DataFrame(forest_adasyn_model.feature_importances_, index = X_train.columns, columns = ['importance'])
print('Важность переменных в лесу (cнижение impurity)')
print(forest_adasyn_importance.sort_values('importance', ascending = False))
forest_adasyn_predict = forest_adasyn_model.predict(X_test)

# Градиентный бустинг
catboost_adasyn_model = CatBoostClassifier(logging_level = 'Silent')
catboost_adasyn_model.fit(X_train,y_train)
catboost_adasyn_importance = pd.DataFrame(catboost_adasyn_model.get_feature_importance(Pool(X_test, label=y_test),
                                                                             type = "LossFunctionChange"), 
                                     index = X_train.columns, columns = ['importance'])
print('Важность переменных в градиентном бустинге (cнижение значения Loss функции)')
print(catboost_adasyn_importance.sort_values('importance', ascending = False))
catboost_adasyn_predict = catboost_adasyn_model.predict(X_test)

Коэффициенты логит модели
                               coef
feature                            
neigh_airports_pr_period   2.281484
population                 0.825467
distance_km               -0.456510
megapolis                 -0.423129
tourist                    0.249608
served_airports_pr_period -0.144031
airport_type              -0.100236
Важность переменных в лесу (cнижение impurity)
                           importance
population                   0.278981
distance_km                  0.236586
neigh_airports_pr_period     0.203452
served_airports_pr_period    0.144254
tourist                      0.069684
megapolis                    0.034832
airport_type                 0.032212
Важность переменных в градиентном бустинге (cнижение значения Loss функции)
                           importance
population                   0.148322
neigh_airports_pr_period     0.089573
megapolis                    0.073407
distance_km                  0.018682
airport_type                 0.01

In [61]:
df_metrics['Логит(adasyn)'] = [round(accuracy_score(y_test, logit_adasyn_predict),3),
                round(f1_score(y_test, logit_adasyn_predict, average = 'macro'),3),
                round(precision_score(y_test, logit_adasyn_predict, average = 'macro'),3),
                round(recall_score(y_test, logit_adasyn_predict, average = 'macro'),3)]

df_metrics['Случайный лес(adasyn)'] = [round(accuracy_score(y_test, forest_adasyn_predict),3),
                round(f1_score(y_test, forest_adasyn_predict, average = 'macro'),3),
                round(precision_score(y_test, forest_adasyn_predict, average = 'macro'),3),
                round(recall_score(y_test, forest_adasyn_predict, average = 'macro'),3)]

df_metrics['Градиентный бустинг(adasyn)'] = [round(accuracy_score(y_test, catboost_adasyn_predict),3),
                round(f1_score(y_test, catboost_adasyn_predict, average = 'macro'),3),
                round(precision_score(y_test, catboost_adasyn_predict, average = 'macro'),3),
                round(recall_score(y_test, catboost_adasyn_predict, average = 'macro'),3)]

In [16]:
# Метрики качества (все на тестовых данных)
print('Метрики для логита')
print(classification_report(y_test, logit_adasyn_predict))
print('Метрики для леса')
print(classification_report(y_test, forest_adasyn_predict))
print('Метрики для бустинга')
print(classification_report(y_test, catboost_adasyn_predict))

Метрики для логита
              precision    recall  f1-score   support

           0       1.00      0.86      0.92     12892
           1       0.03      0.66      0.05        82

    accuracy                           0.86     12974
   macro avg       0.51      0.76      0.49     12974
weighted avg       0.99      0.86      0.92     12974

Метрики для леса
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     12892
           1       0.05      0.24      0.08        82

    accuracy                           0.97     12974
   macro avg       0.52      0.61      0.53     12974
weighted avg       0.99      0.97      0.98     12974

Метрики для бустинга
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     12892
           1       0.04      0.28      0.07        82

    accuracy                           0.96     12974
   macro avg       0.52      0.62      0.53     12974
weighted avg     

In [64]:
df_metrics['Метрика'] = ['accuracy','f1-score','precision','recall']
df_metrics = df_metrics.set_index('Метрика')

Unnamed: 0_level_0,Пробит,Логит(разные веса у классов),Случайный лес(разные веса у классов),Градиентный бустинг(разные веса у классов),Логит(случайное ресемплирование),Случайный лес(случайное ресемплирование),Градиентный бустинг(случайное ресемплирование),Логит(smote),Случайный лес(smote),Градиентный бустинг(smote),Логит(adasyn),Случайный лес(adasyn),Градиентный бустинг(adasyn)
Метрика,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
accuracy,0.993,0.85,0.96,0.938,0.839,0.961,0.952,0.855,0.956,0.943,0.849,0.963,0.952
f1-score,0.498,0.467,0.494,0.488,0.462,0.496,0.494,0.468,0.49,0.487,0.466,0.497,0.493
precision,0.497,0.501,0.499,0.499,0.5,0.5,0.5,0.5,0.498,0.497,0.5,0.501,0.5
recall,0.5,0.524,0.495,0.49,0.495,0.502,0.503,0.509,0.487,0.48,0.506,0.503,0.497


In [66]:
df_metrics_probit = df_metrics[['Пробит']]
df_metrics_weights = df_metrics[['Логит(разные веса у классов)','Случайный лес(разные веса у классов)',
                                 'Градиентный бустинг(разные веса у классов)']]
df_metrics_resample = df_metrics[['Логит(случайное ресемплирование)','Случайный лес(случайное ресемплирование)',
                                 'Градиентный бустинг(случайное ресемплирование)']]
df_metrics_smote = df_metrics[['Логит(smote)','Случайный лес(smote)',
                                 'Градиентный бустинг(smote)']]
df_metrics_adasyn = df_metrics[['Логит(adasyn)','Случайный лес(adasyn)',
                                 'Градиентный бустинг(adasyn)']]

In [67]:
df_metrics_weights.rename(columns = {'Логит(разные веса у классов)':'Логит',
                                     'Случайный лес(разные веса у классов)':'Случайный лес',
                                     'Градиентный бустинг(разные веса у классов)':'Градиентный бустинг'},inplace = True)
df_metrics_resample.rename(columns = {'Логит(случайное ресемплирование)':'Логит',
                                     'Случайный лес(случайное ресемплирование)':'Случайный лес',
                                     'Градиентный бустинг(случайное ресемплирование)':'Градиентный бустинг'},inplace = True)
df_metrics_smote.rename(columns = {'Логит(smote)':'Логит',
                                     'Случайный лес(smote)':'Случайный лес',
                                     'Градиентный бустинг(smote)':'Градиентный бустинг'},inplace = True)
df_metrics_adasyn.rename(columns = {'Логит(adasyn)':'Логит',
                                     'Случайный лес(adasyn)':'Случайный лес',
                                     'Градиентный бустинг(adasyn)':'Градиентный бустинг'},inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [73]:
print(df_metrics_probit.to_latex())

\begin{tabular}{lr}
\toprule
{} &  Пробит \\
Метрика   &         \\
\midrule
accuracy  &   0.993 \\
f1-score  &   0.498 \\
precision &   0.497 \\
recall    &   0.500 \\
\bottomrule
\end{tabular}



In [74]:
print(df_metrics_weights.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  Логит &  Случайный лес &  Градиентный бустинг \\
Метрика   &        &                &                      \\
\midrule
accuracy  &  0.850 &          0.960 &                0.938 \\
f1-score  &  0.467 &          0.494 &                0.488 \\
precision &  0.501 &          0.499 &                0.499 \\
recall    &  0.524 &          0.495 &                0.490 \\
\bottomrule
\end{tabular}



In [75]:
print(df_metrics_resample.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  Логит &  Случайный лес &  Градиентный бустинг \\
Метрика   &        &                &                      \\
\midrule
accuracy  &  0.839 &          0.961 &                0.952 \\
f1-score  &  0.462 &          0.496 &                0.494 \\
precision &  0.500 &          0.500 &                0.500 \\
recall    &  0.495 &          0.502 &                0.503 \\
\bottomrule
\end{tabular}



In [76]:
print(df_metrics_smote.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  Логит &  Случайный лес &  Градиентный бустинг \\
Метрика   &        &                &                      \\
\midrule
accuracy  &  0.855 &          0.956 &                0.943 \\
f1-score  &  0.468 &          0.490 &                0.487 \\
precision &  0.500 &          0.498 &                0.497 \\
recall    &  0.509 &          0.487 &                0.480 \\
\bottomrule
\end{tabular}



In [77]:
print(df_metrics_adasyn.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  Логит &  Случайный лес &  Градиентный бустинг \\
Метрика   &        &                &                      \\
\midrule
accuracy  &  0.849 &          0.963 &                0.952 \\
f1-score  &  0.466 &          0.497 &                0.493 \\
precision &  0.500 &          0.501 &                0.500 \\
recall    &  0.506 &          0.503 &                0.497 \\
\bottomrule
\end{tabular}



In [81]:
df_logit = pd.concat([logit_w_importance,logit_s_importance,
                      logit_smote_importance,logit_adasyn_importance], axis = 1)
df_logit.columns = ['Веса в функции потерь','Случайное ресемплирование','SMOTE','ADASYN']
df_forest = pd.concat([forest_w_importance,forest_s_importance,
                       forest_smote_importance,forest_adasyn_importance], axis = 1)
df_forest.columns = ['Веса в функции потерь','Случайное ресемплирование','SMOTE','ADASYN']
df_catboost = pd.concat([catboost_w_importance,catboost_s_importance,
                         catboost_smote_importance,catboost_adasyn_importance], axis = 1)
df_catboost.columns = ['Веса в функции потерь','Случайное ресемплирование','SMOTE','ADASYN']

In [85]:
print(df_logit.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  Веса в функции потерь &  Случайное ресемплирование &     SMOTE &    ADASYN \\
feature                   &                        &                            &           &           \\
\midrule
served\_airports\_pr\_period &              -0.183179 &                  -0.083333 & -0.212618 & -0.144031 \\
neigh\_airports\_pr\_period  &               1.033457 &                   2.626483 &  2.483097 &  2.281484 \\
airport\_type              &               0.177997 &                   0.295872 &  0.022516 & -0.100236 \\
population                &               0.339319 &                   0.397428 &  0.778479 &  0.825467 \\
megapolis                 &              -0.389873 &                  -0.356781 & -0.445552 & -0.423129 \\
tourist                   &               0.404815 &                   0.502083 &  0.288666 &  0.249608 \\
distance\_km               &              -0.404606 &                  -0.350701 & -0.382262 & -0.456510 \\
\bottomrule

In [86]:
print(df_forest.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  Веса в функции потерь &  Случайное ресемплирование &     SMOTE &    ADASYN \\
\midrule
served\_airports\_pr\_period &               0.149107 &                   0.155600 &  0.132742 &  0.144254 \\
neigh\_airports\_pr\_period  &               0.230763 &                   0.244797 &  0.223677 &  0.203452 \\
airport\_type              &               0.039299 &                   0.039757 &  0.029445 &  0.032212 \\
population                &               0.262579 &                   0.250020 &  0.275388 &  0.278981 \\
megapolis                 &               0.033917 &                   0.028380 &  0.036999 &  0.034832 \\
tourist                   &               0.093142 &                   0.088691 &  0.079987 &  0.069684 \\
distance\_km               &               0.191193 &                   0.192754 &  0.221763 &  0.236586 \\
\bottomrule
\end{tabular}



In [87]:
print(df_catboost.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  Веса в функции потерь &  Случайное ресемплирование &     SMOTE &    ADASYN \\
\midrule
served\_airports\_pr\_period &              -0.152479 &                  -0.036392 &  0.012794 &  0.007880 \\
neigh\_airports\_pr\_period  &               0.192012 &                   0.070452 &  0.085841 &  0.089573 \\
airport\_type              &              -0.000684 &                  -0.004786 &  0.008825 &  0.012001 \\
population                &              -0.050108 &                  -0.036947 &  0.091211 &  0.148322 \\
megapolis                 &              -0.028096 &                   0.013063 &  0.068939 &  0.073407 \\
tourist                   &               0.005780 &                  -0.010778 &  0.007389 &  0.003620 \\
distance\_km               &              -0.027775 &                  -0.010213 & -0.001816 &  0.018682 \\
\bottomrule
\end{tabular}

