In [7]:
# Import the required modules
import sqlalchemy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import xgboost

In [2]:
# Widen pandas' display limits so wide/long frames are not truncated in cell output
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 500)

# Read the passenger-satisfaction CSV (path is relative to the notebook's working directory)
dataset_path = 'Data/Airline_Passenger_Satisfaction.csv'
original_data = pd.read_csv(dataset_path)

# Preview the first rows
original_data.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,2,2,2,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,3,4,4,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,4,1,2,2,2,2,2,4,2,4,0,20.0,satisfied


In [3]:
# Drop unnecessary columns (reassign instead of mutating in place)
original_data = original_data.drop(columns=["Unnamed: 0", "id"])

# Missing arrival delays are filled with 0
original_data["Arrival Delay in Minutes"] = original_data["Arrival Delay in Minutes"].fillna(0)

# Encode the target as 1 (satisfied) / 0 (neutral or dissatisfied).
# `.map` is used instead of `.replace` so the result does not depend on pandas'
# deprecated implicit object->int downcasting; any label outside the mapping
# becomes NaN and is rejected by the integer cast below.
original_data["satisfaction"] = original_data["satisfaction"].map(
    {"satisfied": 1, "neutral or dissatisfied": 0}
)

# One-hot encode the categorical columns, then cast every column to int
original_data = pd.get_dummies(
    original_data, columns=["Gender", "Customer Type", "Type of Travel", "Class"]
).astype(int)

# Standardise the four continuous columns.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# performed in a later cell, so test rows contribute to the fitted mean/std.
# Consider fitting on the training rows only.
s_data_scaled = StandardScaler().fit_transform(
    original_data[["Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"]]
)
s_data_scaled[0:5]

array([[ 0.81788702, -1.03517064,  0.95381034,  0.78220768],
       [-0.23923776,  1.67144329, -0.38228641, -0.39218839],
       [-1.29636255, -1.00312786, -0.38228641, -0.39218839],
       [ 0.28932463,  2.18613052, -0.38228641, -0.23204347],
       [ 0.61967612, -0.01180422, -0.38228641,  0.14162801]])

In [4]:
# Columns that were standardised, in the same order as the columns of `s_data_scaled`
scaled_columns = ["Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"]

# Wrap the scaled array in a DataFrame that shares `original_data`'s index.
# Column assignment below aligns on index labels, so reusing the index guarantees
# a row-for-row match (a fresh RangeIndex would yield NaN for any non-matching label).
df_s_scaled = pd.DataFrame(
    s_data_scaled,
    columns=scaled_columns,
    index=original_data.index,
)

# Overwrite the raw columns with their standardised values
for column_name in scaled_columns:
    original_data[column_name] = df_s_scaled[column_name]

original_data.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,0.817887,-1.035171,5,4,3,4,3,4,3,5,5,5,5,2,5,5,0.95381,0.782208,1,1,0,1,0,1,0,0,1,0
1,-0.239238,1.671443,1,1,3,1,5,4,5,4,4,4,4,3,4,5,-0.382286,-0.392188,1,1,0,1,0,1,0,1,0,0
2,-1.296363,-1.003128,2,0,2,4,2,2,2,2,4,1,3,2,2,2,-0.382286,-0.392188,0,0,1,0,1,1,0,0,1,0
3,0.289325,2.186131,0,0,0,2,3,4,4,1,1,1,1,3,1,4,-0.382286,-0.232043,1,0,1,1,0,1,0,1,0,0
4,0.619676,-0.011804,2,3,4,3,4,1,2,2,2,2,2,4,2,4,-0.382286,0.141628,1,1,0,1,0,1,0,0,1,0


In [19]:
# Target vector: the encoded satisfaction label
y = original_data['satisfaction']
# Feature matrix: every remaining column
X = original_data.drop('satisfaction', axis=1)

In [20]:
# Hold out a test set; the seed is fixed so the split is reproducible
# (test_size is not passed, so scikit-learn's default applies)
full_feature_split = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = full_feature_split

In [21]:
# Train an XGBoost classifier on all features (fit returns the fitted estimator)
classifier3 = xgboost.XGBClassifier(random_state=78).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred_xb = classifier3.predict(X_test)

In [22]:
# Accuracy of the all-features model on the test set
acc_score3 = accuracy_score(y_test, y_pred_xb)

# Confusion matrix, wrapped in a labelled DataFrame
# (rows are actual classes, columns are predicted classes)
cm3 = confusion_matrix(y_test, y_pred_xb)
cm_df3 = pd.DataFrame(
    data=cm3,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [23]:
# Build the per-class precision/recall/F1 report for the all-features model
report_text = classification_report(y_test, y_pred_xb)

# Show confusion matrix, accuracy and the report, in that order
print("Confusion Matrix")
display(cm_df3)
print(f"Accuracy Score : {acc_score3}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3530,89
Actual 1,191,2684


Accuracy Score : 0.9568832768709578
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      3619
           1       0.97      0.93      0.95      2875

    accuracy                           0.96      6494
   macro avg       0.96      0.95      0.96      6494
weighted avg       0.96      0.96      0.96      6494



In [5]:
# Reduced feature set used for the second model (column order is kept as listed)
selected_features = [
    "Online boarding",
    "Inflight wifi service",
    "Class_Business",
    "Type of Travel_Personal Travel",
    "Type of Travel_Business travel",
    "Inflight entertainment",
    "Seat comfort",
    "Ease of Online booking",
    "Class_Eco",
    "Flight Distance",
    "Age",
    "Leg room service",
    "Customer Type_Loyal Customer",
    "Checkin service",
    "On-board service",
    "Cleanliness",
    "Customer Type_disloyal Customer",
    "Inflight service",
    "Baggage handling",
    'Gate location',
    "Departure/Arrival time convenient",
]
X_new = original_data[selected_features]

# Same target as for the full-feature model
y_new = original_data['satisfaction']

In [12]:
# Split the reduced feature set with the same seed as the full-feature split
selected_feature_split = train_test_split(X_new, y_new, random_state=78)
X_train1, X_test1, y_train1, y_test1 = selected_feature_split

In [16]:
# Train a second XGBoost classifier on the reduced feature set
# (fit returns the fitted estimator)
classifier3_new = xgboost.XGBClassifier(random_state=78).fit(X_train1, y_train1)

# Predict labels for the held-out rows
y_pred_xb_new = classifier3_new.predict(X_test1)

In [17]:
# Accuracy of the reduced-feature model on the test set
acc_score_new = accuracy_score(y_test1, y_pred_xb_new)

# Confusion matrix, wrapped in a labelled DataFrame
# (rows are actual classes, columns are predicted classes)
cm_new = confusion_matrix(y_test1, y_pred_xb_new)
cm_df_new = pd.DataFrame(
    data=cm_new,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [18]:
# Build the per-class precision/recall/F1 report for the reduced-feature model
report_text_new = classification_report(y_test1, y_pred_xb_new)

# Show confusion matrix, accuracy and the report, in that order
print("Confusion Matrix")
display(cm_df_new)
print(f"Accuracy Score : {acc_score_new}")
print("Classification Report")
print(report_text_new)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3529,90
Actual 1,196,2679


Accuracy Score : 0.9559593470896212
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      3619
           1       0.97      0.93      0.95      2875

    accuracy                           0.96      6494
   macro avg       0.96      0.95      0.96      6494
weighted avg       0.96      0.96      0.96      6494

