In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

### Разделим фичи по типам, отделим таргет и преобразуем его, разделим на тренировочную и тестирующую выборку

In [44]:
data_file = pd.read_csv('data.csv')

target = 'satisfaction'
drop_features = ['Unnamed: 0', 'id', 'Arrival Delay in Minutes']
numerical_features = ['Age', 'Flight Distance', 'Departure Delay in Minutes']
categorical_features_to_onehot = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
categorical_features_other = [feat for feat in list(data_file.columns) if feat != target and feat not in numerical_features and feat not in drop_features and feat not in categorical_features_to_onehot]
 
X = data_file.drop(target,axis=1)
y = data_file[target]

enc = LabelEncoder()
y = enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

In [45]:
class Calc_Array(TransformerMixin):
    def fit(self, X, y = None, **kwargs):
        return self
    def transform(self, X, y = None, **kwargs):
        return X.toarray()

### Перпроцессинг данных

In [46]:
TransformColums = ColumnTransformer([
    ('Drop','drop',drop_features),
    ('Encoder', OneHotEncoder(), categorical_features_to_onehot),
    ('MinMax',MinMaxScaler(),numerical_features)
], remainder='passthrough')



### Функция подчета метрик

In [52]:
def Metrics(model, X, y):
    y_pred = model.predict(X)
    print('Accuracy = ', accuracy_score(y, y_pred))
    print('Precision = ', precision_score(y, y_pred))
    print('Recall = ', recall_score(y, y_pred))
    print('ROC_AUC_score = ', roc_auc_score(y, y_pred))
    print('Confudion_Matrix =')
    print(confusion_matrix(y, y_pred))

### KNN Sklearn

In [47]:
knn_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('knn', KNeighborsClassifier())
])

knn_sk = GridSearchCV(knn_sk_pipeline,{'knn__n_neighbors':[1,3,5,7]})
knn_sk.fit(X_train, y_train)

In [53]:
Metrics(knn_sk, X_test, y_test)

Accuracy =  0.932486405851499
Precision =  0.949549338639822
Recall =  0.892998678996037
ROC_AUC_score =  0.9280758120978305
Confudion_Matrix =
[[11266   431]
 [  972  8112]]


### LogisticRegression Sklearn


In [75]:
logit_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('logit', LogisticRegression())
])

logit_sk = GridSearchCV(logit_sk_pipeline,{'logit__C':[10,1,0.1,0.01],'logit__max_iter':[500,1000,5000]})
logit_sk.fit(X_train, y_train)

In [71]:
Metrics(logit_sk, X_test, y_test)

Accuracy =  0.8763774601799721
Precision =  0.8726690310033177
Recall =  0.8397181858212242
ROC_AUC_score =  0.8722827912948132
Confudion_Matrix =
[[10584  1113]
 [ 1456  7628]]


{'logit__max_iter': 500}