# 1-Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
# for preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import category_encoders as ce
from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing

# for evaluation 
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

#models 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm 
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

# 2-Read Dataset

In [2]:
data=pd.read_csv('E:/Dataset/XGBOOST_Data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


# 3-Data Exploration

In [3]:
data.columns

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         25976 non-null  int64  
 1   id                                 25976 non-null  int64  
 2   Gender                             25976 non-null  object 
 3   Customer Type                      25976 non-null  object 
 4   Age                                25976 non-null  int64  
 5   Type of Travel                     25976 non-null  object 
 6   Class                              25976 non-null  object 
 7   Flight Distance                    25976 non-null  int64  
 8   Inflight wifi service              25976 non-null  int64  
 9   Departure/Arrival time convenient  25976 non-null  int64  
 10  Ease of Online booking             25976 non-null  int64  
 11  Gate location                      25976 non-null  int

In [5]:
data.isna().sum()

Unnamed: 0                            0
id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction                          0


# 4-Data Preprocessing

In [6]:
# drop Unnecessary columns 
data.drop(columns=['Unnamed: 0','id'], axis=1, inplace = True)
data.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [7]:
data['Gender'].value_counts()

Female    13172
Male      12804
Name: Gender, dtype: int64

In [8]:
data['Customer Type'].value_counts()

Loyal Customer       21177
disloyal Customer     4799
Name: Customer Type, dtype: int64

In [9]:
data['Type of Travel'].value_counts()

Business travel    18038
Personal Travel     7938
Name: Type of Travel, dtype: int64

In [10]:
data['Class'].value_counts()

Business    12495
Eco         11564
Eco Plus     1917
Name: Class, dtype: int64

In [11]:
data['satisfaction'].value_counts()

neutral or dissatisfied    14573
satisfied                  11403
Name: satisfaction, dtype: int64

In [12]:
data['Arrival Delay in Minutes'].value_counts()

0.0      14594
1.0        536
2.0        523
3.0        490
4.0        466
         ...  
307.0        1
285.0        1
347.0        1
795.0        1
288.0        1
Name: Arrival Delay in Minutes, Length: 320, dtype: int64

In [13]:
Gender_map = {'Male' : 1, 'Female': 0}
customer_map = {'Loyal Customer' : 1, 'disloyal Customer': 0}
travel_map = {'Business travel' : 1, 'Personal Travel': 0}
class_map = {'Business' : 1, 'Eco': 2, 'Eco Plus': 3}
satisfaction_map = {'neutral or dissatisfied' : 0, 'satisfied': 1}

In [14]:
# handling Categorical data 
data.Gender = data.Gender.map(Gender_map)
data['Customer Type'] = data['Customer Type'].map(customer_map)
data['Type of Travel'] = data['Type of Travel'].map(travel_map)
data['Class'] = data['Class'].map(class_map)
data['satisfaction'] = data['satisfaction'].map(satisfaction_map)


In [15]:
# fill missing value Delay in minutes
data['Arrival Delay in Minutes']=data['Arrival Delay in Minutes'].fillna(data['Arrival Delay in Minutes'].median())

In [16]:
data.isna().sum()

Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64

# 5-Splitting Data

In [17]:
x = data.drop(columns = ['satisfaction'], axis = 1)
x.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
0,0,1,52,1,2,160,5,4,3,4,...,3,5,5,5,5,2,5,5,50,44.0
1,0,1,36,1,1,2863,1,1,3,1,...,5,4,4,4,4,3,4,5,0,0.0
2,1,0,20,1,2,192,2,0,2,4,...,2,2,4,1,3,2,2,2,0,0.0
3,1,1,44,1,1,3377,0,0,0,2,...,4,1,1,1,1,3,1,4,0,6.0
4,0,1,49,1,2,1182,2,3,4,3,...,2,2,2,2,2,4,2,4,0,20.0


In [18]:
y = data.satisfaction
y.head()

0    1
1    1
2    0
3    1
4    1
Name: satisfaction, dtype: int64

In [19]:
# splitting data into train & test
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

In [20]:
# Scaling for x(train&test) data
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
# transform only om x_test 
x_test = scaler.transform(x_test)

In [21]:
x_train

array([[-0.9870497 , -2.10499491, -1.03620868, ..., -1.73782304,
        -0.2478487 , -0.33558015],
       [-0.9870497 ,  0.47506053,  0.48267182, ...,  1.29979641,
        -0.37777227, -0.38732828],
       [ 1.01312022,  0.47506053, -0.83809383, ...,  0.54039155,
         0.60964686,  0.0266567 ],
       ...,
       [-0.9870497 ,  0.47506053,  1.14305464, ...,  0.54039155,
        -0.37777227, -0.38732828],
       [-0.9870497 ,  0.47506053, -1.76262978, ...,  1.29979641,
        -0.03997099, -0.38732828],
       [-0.9870497 ,  0.47506053,  1.47324606, ...,  0.54039155,
         1.02540228,  1.34623381]])

In [22]:
x_test

array([[-0.9870497 , -2.10499491, -1.30036181, ..., -0.21901332,
        -0.37777227, -0.38732828],
       [ 1.01312022,  0.47506053,  0.74682495, ..., -0.97841818,
        -0.37777227, -0.38732828],
       [ 1.01312022,  0.47506053,  0.08644213, ..., -0.21901332,
        -0.35178755, -0.36145421],
       ...,
       [ 1.01312022,  0.47506053,  0.21851869, ..., -0.21901332,
        -0.32580284, -0.02509142],
       [ 1.01312022,  0.47506053,  1.6713609 , ..., -0.21901332,
        -0.32580284, -0.25795797],
       [-0.9870497 ,  0.47506053, -0.177711  , ..., -0.21901332,
        -0.2478487 , -0.33558015]])

# 6-Building Models

In [23]:
def evaluate_model(actual, predicted):
    
    accuracy_score_model = accuracy_score(actual, predicted)
    classification_report_model = classification_report(actual, predicted)
    
    return accuracy_score_model, classification_report_model

In [24]:
models={
    'Logistic Regression':LogisticRegression(),
    'Knn Classifier':KNeighborsClassifier(),
    'Decission Tree':DecisionTreeClassifier(),
    'Random Forest':RandomForestClassifier(),
    'XGB model':XGBClassifier()
}

In [25]:
models_list=[]
accuracy_train_list=[]
accuracy_test_list=[]

In [26]:
for i in range(len(list(models))):
    # models---> values
    model = list(models.values())[i]
    model.fit(x_train ,y_train)

    # model prediction 
    y_train_pred =model.predict(x_train)
    y_test_pred= model.predict(x_test)

    # model evaluation 
    train_accuracy, train_classification_report=evaluate_model(y_train ,y_train_pred)
    test_accuracy, test_classification_report=evaluate_model(y_test,y_test_pred)

    print (list(models.keys())[i])
    models_list.append(list(models.keys())[i])
    
    print('model training performance:')
    print ('train accuracy=',train_accuracy)
    print ('train classification_report =')
    print(train_classification_report)
   
    
    print("----------------------------------------------")
    print('model testing performance:')
    print ('test accuracy =',test_accuracy)
    print ('test classification_report =')
    print(test_classification_report)
    print('\n')

    accuracy_test_list.append( test_accuracy)
    accuracy_train_list.append( train_accuracy)



Logistic Regression
model training performance:
train accuracy= 0.8733432326898751
train classification_report =
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     10162
           1       0.87      0.84      0.85      8021

    accuracy                           0.87     18183
   macro avg       0.87      0.87      0.87     18183
weighted avg       0.87      0.87      0.87     18183

----------------------------------------------
model testing performance:
test accuracy = 0.8656486590529963
test classification_report =
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      4411
           1       0.86      0.83      0.84      3382

    accuracy                           0.87      7793
   macro avg       0.86      0.86      0.86      7793
weighted avg       0.87      0.87      0.87      7793



Knn Classifier
model training performance:
train accuracy= 0.9421987570807897
train classific

In [27]:
accuracy_df=pd.DataFrame(list(zip(models_list,accuracy_test_list,accuracy_train_list)),
                       columns=['Model Name ','train_accuracy_score','test_accuracy_score'])
accuracy_df

Unnamed: 0,Model Name,train_accuracy_score,test_accuracy_score
0,Logistic Regression,0.865649,0.873343
1,Knn Classifier,0.912871,0.942199
2,Decission Tree,0.928654,1.0
3,Random Forest,0.94957,1.0
4,XGB model,0.955345,0.99373


In [28]:
Model=XGBClassifier()
params={
    'booster':['gbtree','gblinear','dart'],
    'eta':[0.1,0.2,0.3,0.4],
    'max_depth':[20,30,40,50,60],
    'sampling_method':['uniform','gradient_based']  
}
gs_model=GridSearchCV(model,params)
gs_model.fit(x_train,y_train)
print(gs_model.best_params_)

{'booster': 'gbtree', 'eta': 0.4, 'max_depth': 30, 'sampling_method': 'uniform'}


In [29]:
Model=XGBClassifier(booster= 'gbtree', eta=0.4, max_depth=30,sampling_method='uniform')
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
y_pred_train=model.predict(x_train)
print('Training Classification Report :')
print(classification_report(y_train,y_pred_train))
print('Testing Classification Report :')
print(classification_report(y_test,y_pred))

Training Classification Report :
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     10162
           1       1.00      0.99      0.99      8021

    accuracy                           0.99     18183
   macro avg       0.99      0.99      0.99     18183
weighted avg       0.99      0.99      0.99     18183

Testing Classification Report :
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4411
           1       0.96      0.94      0.95      3382

    accuracy                           0.96      7793
   macro avg       0.96      0.95      0.95      7793
weighted avg       0.96      0.96      0.96      7793

