In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
data_airpass = pd.read_csv('data/AirPass.csv')

# data_airpass.drop(['Unnamed: 0', 'id'], axis=1, inplace=True)
data_airpass.drop(['Unnamed: 0'], axis=1, inplace=True)
display(data_airpass.isna().sum().sum())
data_airpass['Arrival Delay in Minutes'] = data_airpass['Arrival Delay in Minutes'].fillna(data_airpass['Arrival Delay in Minutes'].median())
data_airpass['Arrival Delay in Minutes'] = data_airpass['Arrival Delay in Minutes'].fillna(data_airpass['Arrival Delay in Minutes'].median())
display(data_airpass['Arrival Delay in Minutes'].mean())

310

15.133392362180475

In [4]:
pd.DataFrame(data_airpass.groupby(['Gender', 'satisfaction'])['satisfaction'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,satisfaction
Gender,satisfaction,Unnamed: 2_level_1
Female,neutral or dissatisfied,30193
Female,satisfied,22534
Male,neutral or dissatisfied,28686
Male,satisfied,22491


In [5]:
pd.DataFrame(data_airpass.groupby(['Type of Travel', 'satisfaction'])['satisfaction'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,satisfaction
Type of Travel,satisfaction,Unnamed: 2_level_1
Business travel,neutral or dissatisfied,29909
Business travel,satisfied,41746
Personal Travel,neutral or dissatisfied,28970
Personal Travel,satisfied,3279


In [6]:
pd.DataFrame(data_airpass.groupby(['Class', 'satisfaction'])['satisfaction'].count())

Unnamed: 0_level_0,Unnamed: 1_level_0,satisfaction
Class,satisfaction,Unnamed: 2_level_1
Business,neutral or dissatisfied,15185
Business,satisfied,34480
Eco,neutral or dissatisfied,38044
Eco,satisfied,8701
Eco Plus,neutral or dissatisfied,5650
Eco Plus,satisfied,1844


In [7]:
data_airpass['satisfaction'] = data_airpass['satisfaction'].map({'neutral or dissatisfied': 0, 'satisfied': 1})
data_airpass['Customer Type'] = data_airpass['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
data_airpass['Type of Travel'] = data_airpass['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})

In [8]:
# data_airpass - изначальный dataset с удалёнными колонками 'Unnamed 0:' и 'id'
categorical = ['Gender', 'Class']
data_dummies = pd.get_dummies(data_airpass[categorical])
data = pd.concat([data_airpass, data_dummies], axis = 1)
data.drop(categorical, axis=1, inplace=True)

In [9]:
# После get_dummies
display(data.columns)
display(data.shape)

Index(['id', 'Customer Type', 'Age', 'Type of Travel', 'Flight Distance',
       'Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction', 'Gender_Female', 'Gender_Male', 'Class_Business',
       'Class_Eco', 'Class_Eco Plus'],
      dtype='object')

(103904, 27)

In [10]:
X = data.drop(['satisfaction'], axis=1)
y = data['satisfaction']

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=26)
display(X_train.shape)
display(X_test.shape)

(83123, 26)

(20781, 26)

In [11]:
scaler = preprocessing.StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled[0, :]

array([ 0.94082514,  0.47381845,  0.37290362,  0.67016452,  2.69748792,
       -0.54752968, -0.69553636,  0.17491054, -0.76416846,  0.60146049,
        1.29843847,  1.18407603,  0.48374602,  0.47926399,  0.494379  ,
        0.31485443,  1.34221146,  0.30683708, -0.21471839, -0.38594265,
       -0.36434834, -1.0182236 ,  1.0182236 ,  1.04624209, -0.90523003,
       -0.27897268])

In [12]:
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
metrics.f1_score(y_test, y_test_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6903936682013346

In [13]:
model = ensemble.AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(random_state=26), learning_rate=0.01, random_state=26)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
metrics.f1_score(y_test, y_test_pred)

0.9401681226306248

In [14]:
params = {"n_estimators":2**np.arange(8), "learning_rate":0.1**np.arange(3)}

basic_model = ensemble.GradientBoostingClassifier()
searcher = model_selection.GridSearchCV(estimator=basic_model, param_grid=params, scoring='f1', cv=3, n_jobs=4)
searcher.fit(X_train, y_train)

display(searcher.best_params_)
display(searcher.best_estimator_)

y_test_pred = searcher.best_estimator_.predict(X_test)
display(metrics.f1_score(y_test, y_test_pred))

{'learning_rate': 1.0, 'n_estimators': 128}

GradientBoostingClassifier(learning_rate=1.0, n_estimators=128)

0.9506418769366977

In [16]:
import xgboost as xgb

model = xgb.XGBClassifier(random_state=26)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
metrics.f1_score(y_test, y_test_pred)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




0.9579785161685312

In [17]:
import catboost as ctb

model = ctb.CatBoostClassifier(random_state=26)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
metrics.f1_score(y_test, y_test_pred)

Learning rate set to 0.068023
0:	learn: 0.6008064	total: 90.2ms	remaining: 1m 30s
1:	learn: 0.5274382	total: 95.7ms	remaining: 47.8s
2:	learn: 0.4526895	total: 103ms	remaining: 34.1s
3:	learn: 0.4085055	total: 110ms	remaining: 27.3s
4:	learn: 0.3754327	total: 115ms	remaining: 22.8s
5:	learn: 0.3316907	total: 119ms	remaining: 19.7s
6:	learn: 0.3110787	total: 123ms	remaining: 17.4s
7:	learn: 0.2929224	total: 128ms	remaining: 15.9s
8:	learn: 0.2731868	total: 135ms	remaining: 14.8s
9:	learn: 0.2584989	total: 142ms	remaining: 14s
10:	learn: 0.2433966	total: 146ms	remaining: 13.1s
11:	learn: 0.2327964	total: 149ms	remaining: 12.3s
12:	learn: 0.2245690	total: 153ms	remaining: 11.6s
13:	learn: 0.2158969	total: 157ms	remaining: 11s
14:	learn: 0.2073525	total: 164ms	remaining: 10.7s
15:	learn: 0.1986438	total: 170ms	remaining: 10.5s
16:	learn: 0.1921241	total: 176ms	remaining: 10.2s
17:	learn: 0.1881225	total: 180ms	remaining: 9.79s
18:	learn: 0.1841773	total: 183ms	remaining: 9.47s
19:	learn: 0

0.9601956860128975

In [18]:
from catboost.utils import get_confusion_matrix
get_confusion_matrix(model, ctb.Pool(X_train, y_train))

array([[46683.,   523.],
       [ 1278., 34639.]])

In [19]:
sorted(np.column_stack((model.feature_importances_, model.feature_names_)), key=lambda it: it[0], reverse=True)

[array(['7.414114332133211', 'Online boarding'], dtype='<U33'),
 array(['6.723716189409476', 'Customer Type'], dtype='<U33'),
 array(['4.850559350581365', 'Class_Business'], dtype='<U33'),
 array(['3.857613530118308', 'Checkin service'], dtype='<U33'),
 array(['3.432939687371243', 'Age'], dtype='<U33'),
 array(['3.306300122837376', 'Gate location'], dtype='<U33'),
 array(['3.2777014678700893', 'Baggage handling'], dtype='<U33'),
 array(['3.0619909732379487', 'Inflight service'], dtype='<U33'),
 array(['3.045439809435761', 'Seat comfort'], dtype='<U33'),
 array(['25.67880706958734', 'Inflight wifi service'], dtype='<U33'),
 array(['2.9443475377762782', 'Inflight entertainment'], dtype='<U33'),
 array(['18.948915814707917', 'Type of Travel'], dtype='<U33'),
 array(['1.9717003141154392', 'id'], dtype='<U33'),
 array(['1.7304341578223774', 'Ease of Online booking'], dtype='<U33'),
 array(['1.7129107274369155', 'On-board service'], dtype='<U33'),
 array(['1.664047655325426', 'Cleanliness'],