### Data Cleaning

In [22]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import itertools as it
import time as time

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

train = pd.read_csv('train.csv', index_col = 0)
test = pd.read_csv('test.csv', index_col = 0)

def data_prep(df):
    df["satisfaction"] = df["satisfaction"].map({'neutral or dissatisfied': 0, 'satisfied':1})
    df = df.dropna()
    df = df.drop(columns=["id"])
    print(df.shape)
    df.columns = df.columns.str.replace(' ', '_')
    df.columns = df.columns.str.replace('/', '_')
    df.columns = df.columns.str.replace('-', '_')
    df['Type_of_Travel'] = df['Type_of_Travel'].astype('string')
    df['Class'] = df['Class'].astype('string')
    df['Gender'] = df['Gender'].astype('string')
    df['Customer_Type'] = df['Customer_Type'].astype('string')
    return df

train = data_prep(train)
test = data_prep(test)

train.head()

(103594, 23)
(25893, 23)


Unnamed: 0,Gender,Customer_Type,Age,Type_of_Travel,Class,Flight_Distance,Inflight_wifi_service,Departure_Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On_board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,0
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,0
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,1
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,0
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,1


In [23]:
categorical_columns = list(train.select_dtypes('string').columns)
train = pd.get_dummies(train, columns = categorical_columns, drop_first = False)
test = pd.get_dummies(test, columns = categorical_columns, drop_first = False)

train = train.dropna()
test = test.dropna()

print(train.shape)
print(test.shape)

X = train.drop(columns=["satisfaction"])
y = train["satisfaction"]
Xtest = test.drop(columns=["satisfaction"])
ytest = test["satisfaction"]

test.head()

(103594, 28)
(25893, 28)


Unnamed: 0,Age,Flight_Distance,Inflight_wifi_service,Departure_Arrival_time_convenient,Ease_of_Online_booking,Gate_location,Food_and_drink,Online_boarding,Seat_comfort,Inflight_entertainment,On_board_service,Leg_room_service,Baggage_handling,Checkin_service,Inflight_service,Cleanliness,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes,satisfaction,Gender_Female,Gender_Male,Customer_Type_Loyal Customer,Customer_Type_disloyal Customer,Type_of_Travel_Business travel,Type_of_Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,52,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44.0,1,1,0,1,0,1,0,0,1,0
1,36,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0.0,1,1,0,1,0,1,0,1,0,0
2,20,192,2,0,2,4,2,2,2,2,4,1,3,2,2,2,0,0.0,0,0,1,0,1,1,0,0,1,0
3,44,3377,0,0,0,2,3,4,4,1,1,1,1,3,1,4,0,6.0,1,0,1,1,0,1,0,1,0,0
4,49,1182,2,3,4,3,4,1,2,2,2,2,2,4,2,4,0,20.0,1,1,0,1,0,1,0,0,1,0


### a) Random Forest

In [3]:
def confusion_matrix_data(pred_values, actual_values, cutoff=0.5):
    bins=np.array([0,cutoff,1])
    cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
    precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
    return precision

In [26]:
def acc(pred_values, actual_values, cutoff=0.5):
    bins=np.array([0,cutoff,1])
    cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
    accuracy = 100*(cm[0,0]+cm[1,1])/cm.sum()
    return accuracy

In [24]:
oob_precision = {}
for i in [75, 100, 125]:
    for j in [1, 2, 3, 4, 5]:
        for k in [2, 3, 4, 5, 6]:
            for l in [1, 2, 3]:
                model = RandomForestClassifier(n_estimators = i, max_features = j, min_samples_split = k, min_samples_leaf = l, n_jobs=-1, random_state=1, oob_score=True).fit(X, y)
                oob_precision[i, j, k, l] = confusion_matrix_data(pd.Series(model.oob_decision_function_[:,1]), y)

print(max(oob_precision, key=oob_precision.get), oob_precision[max(oob_precision, key=oob_precision.get)])

(125, 5, 6, 1) 97.28348450665317


In [27]:
params = max(oob_precision, key=oob_precision.get)
rf_params = {'n_estimators': params[0], 'max_features': params[1], 'min_samples_split': params[2], 'min_samples_leaf': params[3]}
m1 = RandomForestClassifier(**rf_params, n_jobs=-1, random_state=1, oob_score=True)
model = m1.fit(X, y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"Random forest model -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"Random forest model -- Accuracy on test data: {acc(pred, ytest)}")

Random forest model -- Precision on test data: 97.16485507246377
Random forest model -- Accuracy on test data: 96.32719267755765


### b) AdaBoost

In [28]:
model = AdaBoostClassifier(random_state = 1)
grid = dict()
grid['n_estimators'] = [10, 50, 100,200,500]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1, 1.0]
grid['base_estimator'] = [DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2),
                          DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=4)]
# define the evaluation procedure
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, verbose=1, scoring='precision', refit='precision')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best: 0.969326 using {'base_estimator': DecisionTreeClassifier(max_depth=4), 'learning_rate': 0.1, 'n_estimators': 500}


In [29]:
params = grid_result.best_params_
m2 = AdaBoostClassifier(**params, random_state=1)
model = m2.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"AdaBoost model -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"AdaBoost model -- Accuracy on test data: {acc(pred, ytest)}")

AdaBoost model -- Precision on test data: 96.9179320318149
AdaBoost model -- Accuracy on test data: 96.20360715251226


### c) Gradient Boost 

In [31]:
model = GradientBoostingClassifier(random_state=1)
grid = dict()
grid['n_estimators'] = [50, 100,200,500]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1]
grid['max_depth'] = [2,3,4,5]
grid['subsample'] = [0.5,1.0]
# define the evaluation procedure
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, verbose=1, scoring='precision', refit='precision')
# execute the grid search
grid_result = grid_search.fit(X, y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 128 candidates, totalling 384 fits
Best: 0.971683 using {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}


In [33]:
params = grid_result.best_params_
m3 = GradientBoostingClassifier(**params,random_state=1)
model = m3.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"Gradient Boost model -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"Gradient Boost model -- Accuracy on test data: {acc(pred, ytest)}")

Gradient Boost model -- Precision on test data: 97.07401032702238
Gradient Boost model -- Accuracy on test data: 96.2460896767466


### d) XGBoost

In [34]:
param_grid = {'n_estimators':[25,100,500],
              'max_depth': [6,7,8],
              'learning_rate': [0.001,0.1,0.2],
              'gamma': [0.1,0.25,0.5],
              'reg_lambda':[0,0.01,0.001],
              'scale_pos_weight':[1.25,1.5,1.75]}
cv = StratifiedKFold(n_splits=3,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(random_state=1),
                              param_grid = param_grid,verbose = 1,n_jobs=-1,cv = cv, scoring='precision', refit='precision')
grid_result = grid_search.fit(X,y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 729 candidates, totalling 2187 fits




Best: 0.966689 using {'gamma': 0.25, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 500, 'reg_lambda': 0, 'scale_pos_weight': 1.25}


In [35]:
params = grid_result.best_params_
m4 = xgb.XGBClassifier(**params,random_state=1)
model = m4.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"XGBoost model -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"XGBoost model -- Accuracy on test data: {acc(pred, ytest)}")

XGBoost model -- Precision on test data: 96.71513193322563
XGBoost model -- Accuracy on test data: 96.31174448692697


### 1) Voting ensemble

In [36]:
en1 = VotingClassifier(estimators = [('rf',m1), ('ada',m2), ('gb',m3), ('xgb',m4)], voting = 'hard')
model = en1.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"Voting ensemble model (hard) -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"Voting ensemble model (hard) -- Accuracy on test data: {acc(pred, ytest)}")





AttributeError: predict_proba is not available when voting='hard'

In [37]:
en2 = VotingClassifier(estimators = [('rf',m1), ('ada',m2), ('gb',m3), ('xgb',m4)], voting = 'soft')
model = en2.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"Voting ensemble model (soft) -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"Voting ensemble model (soft) -- Accuracy on test data: {acc(pred, ytest)}")



Voting ensemble model (soft) -- Precision on test data: 97.31124388919066
Voting ensemble model (soft) -- Accuracy on test data: 96.47395048854902


### 2) Stacking ensemble

In [38]:
en3 = StackingClassifier(estimators = [('rf',m1), ('ada',m2), ('gb',m3), ('xgb',m4)],
                        final_estimator=LogisticRegression(random_state=1,max_iter=10000),n_jobs=-1,
                        cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1))
model = en3.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"Stacking ensemble model (LogisticRegression) -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"Stacking ensemble model (LogisticRegression) -- Accuracy on test data: {acc(pred, ytest)}")

coefs = model.final_estimator_.coef_

Stacking ensemble model (LogisticRegression) -- Precision on test data: 97.09779179810725
Stacking ensemble model (LogisticRegression) -- Accuracy on test data: 96.47008844089136


In [39]:
en4 = StackingClassifier(estimators = [('rf',m1), ('ada',m2), ('gb',m3), ('xgb',m4)],
                           final_estimator=RandomForestClassifier(n_estimators=500, max_features=1,random_state=1,oob_score=True),
                           n_jobs=-1,cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1))
model = en4.fit(X,y)
pred = model.predict_proba(Xtest)[:, 1]
print(f"Stacking ensemble model (RandomForestClassifier) -- Precision on test data: {confusion_matrix_data(pred, ytest)}")
print(f"Stacking ensemble model (RandomForestClassifier) -- Accuracy on test data: {acc(pred, ytest)}")

Stacking ensemble model (RandomForestClassifier) -- Precision on test data: 97.22549641853296
Stacking ensemble model (RandomForestClassifier) -- Accuracy on test data: 96.33877882053065


### Results

In [71]:
models = ['Random forest', 'AdaBoost', 'Gradient Boost', 'XGBoost']
importances = dict(zip(models, list(coefs[0])))
def importance(val):
    return importances[val]

ordered = sorted(importances, key = importance, reverse = True)

print("Models in order of importance:")
print("-"*50)
for i, model in enumerate(ordered):
    print(f"#{i+1}: {model} ({importances[model]})")

Models in order of importance:
--------------------------------------------------
#1: AdaBoost (22.482328547522403)
#2: Random forest (4.381044277331529)
#3: XGBoost (1.5488381852997994)
#4: Gradient Boost (1.1774755007465192)


### Cutoffs

In [72]:
def estimate_cutoff(model):   
    prec = {}
    for i in (np.array(range(25, 1000, 25)) * 0.001):
        pred = model.predict_proba(Xtest)[:, 1]
        prec[i] = confusion_matrix_data(pred, ytest, cutoff=i)
    best_prec = max(prec, key=prec.get)
    print(best_prec, prec[best_prec])
    print(f"Precision : {confusion_matrix_data(pred, ytest, cutoff = best_prec)}")
    print(f"Accuracy: {acc(pred, ytest, cutoff = best_prec)}")

In [73]:
f1 = m1.fit(X,y) #rf
f2 = m2.fit(X,y) #ada
f3 = m3.fit(X,y) #gb
f4 = m4.fit(X,y) #xgb





In [74]:
for model in [f1, f2, f3, f4]:
    estimate_cutoff(model)

0.975 99.9870214146658
Precision : 99.9870214146658
Accuracy: 85.85718147761943


  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])


0.65 100.0
Precision : 100.0
Accuracy: 62.2446220986367
0.975 99.94522348816827
Precision : 99.94522348816827
Accuracy: 91.3219789132198
0.975 99.91050119331742
Precision : 99.91050119331742
Accuracy: 94.87506275827444


In [75]:
#estimate_cutoff(en1.fit(X,y))
estimate_cutoff(en2.fit(X,y))
estimate_cutoff(en3.fit(X,y))
estimate_cutoff(en4.fit(X,y))





  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
  precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])


0.9 100.0
Precision : 100.0
Accuracy: 69.20789402541227
0.975 99.95901639344262
Precision : 99.95901639344262
Accuracy: 93.77051712818137
0.975 99.89042733339974
Precision : 99.89042733339974
Accuracy: 94.79395975746341


### Saving Models

In [78]:
import pickle
def save_model(model, filename):
    # save the model to disk
    filename = f'{filename}.sav'
    pickle.dump(model, open(filename, 'wb'))
    
save_model(f1, 'f1')
save_model(f2, 'f2')
save_model(f3, 'f3')
save_model(f4, 'f4')

In [79]:
ensemble1 = en2.fit(X,y) #soft voting
ensemble2 = en3.fit(X,y) #logistic stacking
ensemble3 = en4.fit(X,y) #RF stacking
# We do not use hard voting ensemble, as it is not possible to search cutoffs when results must be 0 or 1





In [81]:
save_model(ensemble1, 'ensemble1')
save_model(ensemble2, 'ensemble2')
save_model(ensemble3, 'ensemble3')

### Feature Importances

In [83]:
def load_model(filename):
    # load the model from disk
    loaded_model = pickle.load(open(f'{filename}.sav', 'rb'))
    return loaded_model

# 2nd to worst performance in terms of precision (even though less than 0.1% from 100%), best performance in terms of accuracy:
loaded_model = load_model('f4')
loaded_model.feature_importances_

array([0.00617663, 0.00343445, 0.14233801, 0.00545094, 0.00691719,
       0.0122112 , 0.00392513, 0.38385156, 0.01253875, 0.03345801,
       0.01188793, 0.01144078, 0.01239789, 0.01754906, 0.01168233,
       0.01364641, 0.00292074, 0.00431291, 0.00284916, 0.        ,
       0.0610305 , 0.        , 0.1696069 , 0.        , 0.06349049,
       0.00360832, 0.00327466], dtype=float32)

In [84]:
loaded_features = list(loaded_model.feature_importances_)
features = [round(feature, 6) for feature in loaded_features]
feature_zip = zip(X.columns, features)
feature_dict = {x: y for (x, y) in list(feature_zip)}
sorted_dict = {k: v for k, v in sorted(feature_dict.items(), key=lambda item: item[1], reverse = True)}
sorted_dict

{'Online_boarding': 0.383852,
 'Type_of_Travel_Business travel': 0.169607,
 'Inflight_wifi_service': 0.142338,
 'Class_Business': 0.06349,
 'Customer_Type_Loyal Customer': 0.061031,
 'Inflight_entertainment': 0.033458,
 'Checkin_service': 0.017549,
 'Cleanliness': 0.013646,
 'Seat_comfort': 0.012539,
 'Baggage_handling': 0.012398,
 'Gate_location': 0.012211,
 'On_board_service': 0.011888,
 'Inflight_service': 0.011682,
 'Leg_room_service': 0.011441,
 'Ease_of_Online_booking': 0.006917,
 'Age': 0.006177,
 'Departure_Arrival_time_convenient': 0.005451,
 'Arrival_Delay_in_Minutes': 0.004313,
 'Food_and_drink': 0.003925,
 'Class_Eco': 0.003608,
 'Flight_Distance': 0.003434,
 'Class_Eco Plus': 0.003275,
 'Departure_Delay_in_Minutes': 0.002921,
 'Gender_Female': 0.002849,
 'Gender_Male': 0.0,
 'Customer_Type_disloyal Customer': 0.0,
 'Type_of_Travel_Personal Travel': 0.0}

Some of the takeaways were foreseen. For example, gender is not that important.

### Conclusions

**In order of precision:**

Soft Voting <br>
Cutoff: 0.9 <br>
Precision : 100.0 <br>
Accuracy: 69.20789402541227 <br>

AdaBoost <br>
Cutoff: 0.65 <br>
Precision : 100.0 <br>
Accuracy: 62.2446220986367 <br>

Random Forest <br>
Cutoff: 0.975 <br>
Precision : 99.9870214146658 <br>
Accuracy: 85.85718147761943 <br>

Logistic Stacking <br>
Cutoff: 0.975 <br>
Precision : 99.95901639344262 <br>
Accuracy: 93.77051712818137 <br>

GradientBoost <br>
Cutoff: 0.975 <br>
Precision : 99.94522348816827 <br>
Accuracy: 91.3219789132198 <br>

XGBoost <br>
Cutoff: 0.975 <br>
Precision : 99.91050119331742 <br>
Accuracy: 94.87506275827444 <br>

Random Forest Stacking <br>
Cutoff: 0.975 <br>
Precision : 99.89042733339974 <br>
Accuracy: 94.79395975746341 <br>

**In order of accuracy:**

XGBoost <br>
Cutoff: 0.975 <br>
Precision : 99.91050119331742 <br>
Accuracy: 94.87506275827444 <br>

Random Forest Stacking <br>
Cutoff: 0.975 <br>
Precision : 99.89042733339974 <br>
Accuracy: 94.79395975746341 <br>

Logistic Stacking <br>
Cutoff: 0.975 <br>
Precision : 99.95901639344262 <br>
Accuracy: 93.77051712818137 <br>

GradientBoost <br>
Cutoff: 0.975 <br>
Precision : 99.94522348816827 <br>
Accuracy: 91.3219789132198 <br>

Random Forest <br>
Cutoff: 0.975 <br>
Precision : 99.9870214146658 <br>
Accuracy: 85.85718147761943 <br>

Soft Voting <br>
Cutoff: 0.9 <br>
Precision : 100.0 <br>
Accuracy: 69.20789402541227 <br>

AdaBoost <br>
Cutoff: 0.65 <br>
Precision : 100.0 <br>
Accuracy: 62.2446220986367 <br>

#### As you can see, precision is "inversely optimal" to accuracy.