In [120]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import metric_calculation as mc

In [121]:
# Read the training split from disk, then preview its first five rows.
df = pd.read_csv("dataset/train_split.csv")
df.head(5)


Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [122]:
# Remove the row-identifier columns. Rebinding to the returned frame (instead of
# inplace=True) avoids mutating an object that an earlier cell already
# displayed; errors='ignore' keeps the cell safe to re-run once the columns
# are gone.
df = df.drop(columns=['id', 'Unnamed: 0'], errors='ignore')



In [123]:
# Fill missing arrival delays with the column median.
# NOTE(review): the median is computed over the whole frame, i.e. before the
# train/test split performed later in this notebook.
arrival_delay = df['Arrival Delay in Minutes']
df['Arrival Delay in Minutes'] = arrival_delay.fillna(arrival_delay.median())


In [124]:
# Encode the target as integers: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
satisfaction_codes = {
    'neutral or dissatisfied': 0,
    'satisfied': 1
}

# Series.map turns every value that is not a dict key into NaN. That makes a
# plain `.map(...)` destructive when the cell is run a second time (0/1 are not
# keys) and hides dirty labels. Skip the work if the column is already encoded,
# and fail loudly on anything unexpected.
already_encoded = df['satisfaction'].isin(list(satisfaction_codes.values())).all()
if not already_encoded:
    encoded_satisfaction = df['satisfaction'].map(satisfaction_codes)
    unmapped_mask = encoded_satisfaction.isna()
    if unmapped_mask.any():
        unexpected = df.loc[unmapped_mask, 'satisfaction'].unique().tolist()
        raise ValueError(f"Unexpected 'satisfaction' values: {unexpected}")
    df['satisfaction'] = encoded_satisfaction.astype('int64')


In [125]:
# Target vector, then the feature frame (every column except the target).
y = df['satisfaction']
X = df.drop(columns='satisfaction')


In [126]:
# Columns to one-hot encode.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# drop_first=True omits the first level of each column, so a column with k
# levels contributes k-1 indicator columns.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


In [127]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [128]:
# Standardize the features. The scaler is fitted on the training rows only;
# the test rows are transformed with those same fitted statistics.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [129]:
# Sanity check: matrix shapes and the share of positive labels in each split.
for label, value in (
    ("Train shape:", X_train_scaled.shape),
    ("Test shape:", X_test_scaled.shape),
    ("Train satisfaction ratio:", y_train.mean()),
    ("Test satisfaction ratio:", y_test.mean()),
):
    print(label, value)


Train shape: (79123, 23)
Test shape: (19781, 23)
Train satisfaction ratio: 0.43295881096520605
Test satisfaction ratio: 0.43294070067236234


In [130]:
# Column labels of the training matrix (after one-hot encoding), pickled
# alongside the models.
feature_columns = X_train.columns

# The "models" directory is otherwise only created by the final save cell, so
# on a fresh checkout the open() below would raise FileNotFoundError.
os.makedirs("models", exist_ok=True)

with open("models/feature_columns.pkl", "wb") as f:
    pickle.dump(feature_columns, f)

In [131]:
# Logistic regression trained on the standardized features.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Class predictions and the second predict_proba column for the held-out rows.
y_pred_lr = lr.predict(X_test_scaled)
y_pred_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

# Score and report through the metric_calculation helpers.
lr_metrics = mc.calculate_metrics(y_test, y_pred_lr, y_pred_proba_lr)
mc.display_metrics("Logistic Regression", lr_metrics)


Model: Logistic Regression
Accuracy                  : 0.8781
AUC Score                 : 0.9296
Precision                 : 0.8730
Recall                    : 0.8406
F1 Score                  : 0.8565
MCC Score                 : 0.7510



In [132]:
# Decision tree trained on the unscaled features, seeded for repeatability.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions and the second predict_proba column for the held-out rows.
y_pred_dt = dt.predict(X_test)
y_pred_proba_dt = dt.predict_proba(X_test)[:, 1]

# Score and report through the metric_calculation helpers.
dt_metrics = mc.calculate_metrics(y_test, y_pred_dt, y_pred_proba_dt)
mc.display_metrics("Decision Tree", dt_metrics)


Model: Decision Tree
Accuracy                  : 0.9449
AUC Score                 : 0.9439
Precision                 : 0.9366
Recall                    : 0.9361
F1 Score                  : 0.9363
MCC Score                 : 0.8878



In [133]:
# 5-nearest-neighbours classifier trained on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Class predictions and the second predict_proba column for the held-out rows.
y_pred_knn = knn.predict(X_test_scaled)
y_pred_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]

# Score and report through the metric_calculation helpers.
knn_metrics = mc.calculate_metrics(y_test, y_pred_knn, y_pred_proba_knn)
mc.display_metrics("K-Nearest Neighbors", knn_metrics)


Model: K-Nearest Neighbors
Accuracy                  : 0.9272
AUC Score                 : 0.9685
Precision                 : 0.9501
Recall                    : 0.8779
F1 Score                  : 0.9125
MCC Score                 : 0.8522



In [134]:
# Gaussian naive Bayes trained on the standardized features.
nb = GaussianNB().fit(X_train_scaled, y_train)

# Class predictions and the second predict_proba column for the held-out rows.
y_pred_nb = nb.predict(X_test_scaled)
y_pred_proba_nb = nb.predict_proba(X_test_scaled)[:, 1]

# Score and report through the metric_calculation helpers.
nb_metrics = mc.calculate_metrics(y_test, y_pred_nb, y_pred_proba_nb)
mc.display_metrics("Naive Bayes", nb_metrics)


Model: Naive Bayes
Accuracy                  : 0.8625
AUC Score                 : 0.9214
Precision                 : 0.8584
Recall                    : 0.8174
F1 Score                  : 0.8374
MCC Score                 : 0.7191



In [135]:
# Random forest of 100 trees trained on the unscaled features, seeded for
# repeatability.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class predictions and the second predict_proba column for the held-out rows.
y_pred_rf = rf.predict(X_test)
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]

# Score and report through the metric_calculation helpers.
rf_metrics = mc.calculate_metrics(y_test, y_pred_rf, y_pred_proba_rf)
mc.display_metrics("Random Forest", rf_metrics)


Model: Random Forest
Accuracy                  : 0.9617
AUC Score                 : 0.9936
Precision                 : 0.9730
Recall                    : 0.9376
F1 Score                  : 0.9550
MCC Score                 : 0.9222



In [136]:
# Gradient-boosted trees trained on the unscaled features, with log-loss as the
# evaluation metric and a fixed seed.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42).fit(X_train, y_train)

# Class predictions and the second predict_proba column for the held-out rows.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Score and report through the metric_calculation helpers.
xgb_metrics = mc.calculate_metrics(y_test, y_pred_xgb, y_pred_proba_xgb)
mc.display_metrics("XGBoost", xgb_metrics)


Model: XGBoost
Accuracy                  : 0.9619
AUC Score                 : 0.9948
Precision                 : 0.9704
Recall                    : 0.9407
F1 Score                  : 0.9553
MCC Score                 : 0.9224



In [137]:
# One metrics dict per model, keyed by the row label used in the summary table.
metrics_by_model = {
    "Logistic Regression": lr_metrics,
    "Decision Tree": dt_metrics,
    "KNN": knn_metrics,
    "Naive Bayes": nb_metrics,
    "Random Forest": rf_metrics,
    "XGBoost": xgb_metrics,
}

# orient='index' makes each model a row and each metric a column.
results_df = pd.DataFrame.from_dict(metrics_by_model, orient='index')

# Show the models ranked best-first by F1 (results_df itself stays unsorted).
results_df.sort_values("F1 Score", ascending=False)

Unnamed: 0,Accuracy,AUC Score,Precision,Recall,F1 Score,MCC Score
XGBoost,0.961883,0.994799,0.970369,0.940682,0.955295,0.922425
Random Forest,0.961731,0.99364,0.97298,0.937646,0.954986,0.922201
Decision Tree,0.944897,0.94386,0.936565,0.936128,0.936347,0.887768
KNN,0.927152,0.968503,0.950082,0.877861,0.912545,0.852242
Logistic Regression,0.878065,0.929607,0.873029,0.840612,0.856514,0.750973
Naive Bayes,0.862545,0.921357,0.858369,0.817375,0.837371,0.719109


In [138]:
# Make sure the output directory exists before anything is written to it.
os.makedirs("models", exist_ok=True)

# Fitted estimators keyed by the file name each one is pickled under.
models = {
    "logistic_regression.pkl": lr,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb_model,
}

# The fitted scaler is written first, into its own file.
with open("models/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Then one pickle file per estimator.
for filename, model in models.items():
    with open(f"models/{filename}", "wb") as f:
        pickle.dump(model, f)

print("All models and scaler saved successfully.")


All models and scaler saved successfully.
