In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.svm import LinearSVC

### Разделим фичи по типам, отделим таргет и преобразуем его, разделим на тренировочную и тестирующую выборку

In [2]:
data_file = pd.read_csv('data.csv')

target = 'satisfaction'
drop_features = ['Unnamed: 0', 'id', 'Arrival Delay in Minutes']
numerical_features = ['Age', 'Flight Distance', 'Departure Delay in Minutes']
categorical_features_to_onehot = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
categorical_features_other = [feat for feat in list(data_file.columns) if feat != target and feat not in numerical_features and feat not in drop_features and feat not in categorical_features_to_onehot]
 
X = data_file.drop(target,axis=1)
y = data_file[target]

enc = LabelEncoder()
y = enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)

In [45]:
class Calc_Array(TransformerMixin):
    def fit(self, X, y = None, **kwargs):
        return self
    def transform(self, X, y = None, **kwargs):
        return X.toarray()

### Перпроцессинг данных

In [3]:
TransformColums = ColumnTransformer([
    ('Drop','drop',drop_features),
    ('Encoder', OneHotEncoder(), categorical_features_to_onehot),
    ('MinMax', MinMaxScaler(), [feature for feature in numerical_features if feature != 'Age']),
    ('StandartScaler', StandardScaler(), ['Age'])
], remainder='passthrough')



### Функция подчета метрик

In [9]:
def Metrics(model, X, y):
    y_pred = model.predict(X)
    print('Accuracy = ', accuracy_score(y, y_pred))
    print('Precision = ', precision_score(y, y_pred))
    print('Recall = ', recall_score(y, y_pred))
    print('ROC_AUC_score = ', roc_auc_score(y, y_pred))
    print('Confudion_Matrix =')
    print(confusion_matrix(y, y_pred))

### KNN Sklearn

In [7]:
knn_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('knn', KNeighborsClassifier())
])

knn_sk = GridSearchCV(knn_sk_pipeline,{'knn__n_neighbors':[1,3,5,7]})
knn_sk.fit(X_train, y_train)

In [10]:
Metrics(knn_sk, X_test, y_test)

Accuracy =  0.9330638564072952
Precision =  0.9480627525552651
Recall =  0.8931810547531072
ROC_AUC_score =  0.9281517088111527
Confudion_Matrix =
[[11413   437]
 [  954  7977]]


### LogisticRegression Sklearn


In [8]:
logit_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('logit', LogisticRegression())
])

logit_sk = GridSearchCV(logit_sk_pipeline,{'logit__C':[10,1,0.1,0.01],'logit__max_iter':[500,1000,5000]})
logit_sk.fit(X_train, y_train)

In [9]:
Metrics(logit_sk, X_test, y_test)

Accuracy =  0.8744526249939849
Precision =  0.8703682193053781
Recall =  0.8338526596928556
ROC_AUC_score =  0.8696181484136174
Confudion_Matrix =
[[10679  1116]
 [ 1493  7493]]


### SVM

In [4]:
svm_sk_pipeline = Pipeline([
    ('Column Transform', TransformColums),
    ('SVM', LinearSVC())
])
svm_sk = GridSearchCV(svm_sk_pipeline,{'SVM__С':[10, 1, 0.1 0.01]})
svm_sk.fit(X_train, y_train)

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/pavel/Work/ML_Lab/ML/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pavel/Work/ML_Lab/ML/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pavel/Work/ML_Lab/ML/lib/python3.8/site-packages/sklearn/svm/_classes.py", line 257, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "/home/pavel/Work/ML_Lab/ML/lib/python3.8/site-packages/sklearn/svm/_ba

In [15]:
Metrics(svm_sk, X_test, y_test)

Accuracy =  0.8777248448101631
Precision =  0.8781960227272727
Recall =  0.8307020490426604
ROC_AUC_score =  0.8719333030023428
Confudion_Matrix =
[[10821  1029]
 [ 1512  7419]]


In [16]:
svm_sk.best_params_

{'SVM__C': 0.1}