In [None]:
import pandas as pd
import metric_calculation as mc
import pickle
import os

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Load the training split from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="dataset/train_split.csv")
df.head(n=5)

In [None]:
# Remove the 'id' and 'Unnamed: 0' columns when they exist; errors='ignore'
# turns a missing column into a no-op instead of a KeyError.
non_feature_cols = ['id', 'Unnamed: 0']
df.drop(columns=non_feature_cols, errors='ignore', inplace=True)



In [None]:
# Fill missing arrival delays with the column median.
# NOTE(review): the median is computed over the whole frame, before the
# train/test split further down, so held-out rows contribute to the fill value.
delay_col = 'Arrival Delay in Minutes'
delay_median = df[delay_col].median()
df[delay_col] = df[delay_col].fillna(delay_median)


In [None]:
# Encode the target as integers: 0 = 'neutral or dissatisfied', 1 = 'satisfied'.
satisfaction_codes = {
    'neutral or dissatisfied': 0,
    'satisfied': 1
}

# Series.map turns every value missing from the dict into NaN. Re-running this
# cell on an already-encoded column would therefore wipe the target, so the
# encoding is skipped when the column already contains only the 0/1 codes.
already_encoded = df['satisfaction'].isin(list(satisfaction_codes.values())).all()
if not already_encoded:
    encoded = df['satisfaction'].map(satisfaction_codes)
    if encoded.isna().any():
        # Fail here with the offending labels instead of letting NaN targets
        # surface later as an obscure error in the split or in model fitting.
        unexpected = df.loc[encoded.isna(), 'satisfaction'].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'satisfaction' column: {unexpected}"
        )
    df['satisfaction'] = encoded.astype('int64')


In [None]:
# Separate the label column from the predictors.
target_col = 'satisfaction'
y = df[target_col]
X = df.drop(target_col, axis=1)


In [None]:
# Columns to one-hot encode.
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

# drop_first=True drops the first level of each column, leaving k-1 indicator
# columns per k-level categorical.
X = pd.get_dummies(X, drop_first=True, columns=categorical_cols)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed; stratifying on y keeps the label
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# the same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Sanity check of the split: partition shapes and the mean of the 0/1 target
# (i.e. the share of label 1) in each partition.
split_summary = (
    ("Train shape:", X_train_scaled.shape),
    ("Test shape:", X_test_scaled.shape),
    ("Train satisfaction ratio:", y_train.mean()),
    ("Test satisfaction ratio:", y_test.mean()),
)
for label, value in split_summary:
    print(label, value)


In [None]:
# Column labels of the encoded training matrix, in order
# (presumably saved so that inference code can rebuild the same layout -- confirm).
feature_columns = X_train.columns

# This is the first write into models/ when the notebook runs top to bottom;
# the makedirs call in the final save cell runs too late to help here, so
# the directory is created now. exist_ok=True makes re-runs a no-op.
os.makedirs("models", exist_ok=True)

with open("models/feature_columns.pkl", "wb") as f:
    pickle.dump(feature_columns, f)

In [None]:
# Logistic regression, fit on the standardized training features.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Held-out predictions: hard labels, plus column 1 of predict_proba
# (the probability assigned to the second entry of lr.classes_).
y_pred_lr = lr.predict(X_test_scaled)
y_pred_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

# Score with the project's metric_calculation helpers and show the result.
lr_metrics = mc.calculate_metrics(y_test, y_pred_lr, y_pred_proba_lr)
mc.display_metrics("Logistic Regression", lr_metrics)

In [None]:
# Decision tree with a fixed seed, trained on the unscaled features.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Held-out predictions: class labels and the column-1 probability.
y_pred_dt = dt.predict(X_test)
y_pred_proba_dt = dt.predict_proba(X_test)[:, 1]

# Score with the project's metric_calculation helpers and show the result.
dt_metrics = mc.calculate_metrics(y_test, y_pred_dt, y_pred_proba_dt)
mc.display_metrics("Decision Tree", dt_metrics)

In [None]:
# 5-nearest-neighbours classifier on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Held-out predictions: class labels and the column-1 probability.
y_pred_knn = knn.predict(X_test_scaled)
y_pred_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]

# Score with the project's metric_calculation helpers and show the result.
knn_metrics = mc.calculate_metrics(y_test, y_pred_knn, y_pred_proba_knn)
mc.display_metrics("K-Nearest Neighbors", knn_metrics)

In [None]:
# Gaussian naive Bayes on the standardized features.
nb = GaussianNB().fit(X_train_scaled, y_train)

# Held-out predictions: class labels and the column-1 probability.
y_pred_nb = nb.predict(X_test_scaled)
y_pred_proba_nb = nb.predict_proba(X_test_scaled)[:, 1]

# Score with the project's metric_calculation helpers and show the result.
nb_metrics = mc.calculate_metrics(y_test, y_pred_nb, y_pred_proba_nb)
mc.display_metrics("Naive Bayes", nb_metrics)

In [None]:
# Random forest: 100 trees, fixed seed, trained on the unscaled features.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Held-out predictions: class labels and the column-1 probability.
y_pred_rf = rf.predict(X_test)
y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]

# Score with the project's metric_calculation helpers and show the result.
rf_metrics = mc.calculate_metrics(y_test, y_pred_rf, y_pred_proba_rf)
mc.display_metrics("Random Forest", rf_metrics)

In [None]:
# Gradient-boosted trees with a fixed seed and log-loss as the evaluation
# metric, trained on the unscaled features.
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Held-out predictions: class labels and the column-1 probability.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Score with the project's metric_calculation helpers and show the result.
xgb_metrics = mc.calculate_metrics(y_test, y_pred_xgb, y_pred_proba_xgb)
mc.display_metrics("XGBoost", xgb_metrics)

In [None]:
# Collect the per-model metric results under display names; orient='index'
# makes each model a row of the comparison table.
metrics_by_model = {
    "Logistic Regression": lr_metrics,
    "Decision Tree": dt_metrics,
    "KNN": knn_metrics,
    "Naive Bayes": nb_metrics,
    "Random Forest": rf_metrics,
    "XGBoost": xgb_metrics,
}
results_df = pd.DataFrame.from_dict(metrics_by_model, orient='index')

# Display the table ranked best-first by F1 (results_df itself stays unsorted).
results_df.sort_values(by="F1 Score", ascending=False)

In [None]:
# Make sure the output directory exists before writing any artifact.
os.makedirs("models", exist_ok=True)

# Output file name -> fitted estimator.
models = {
    "logistic_regression.pkl": lr,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb_model,
}

# Pickle the fitted scaler first, then every estimator, all under models/.
artifacts = [("scaler.pkl", scaler), *models.items()]
for artifact_name, artifact in artifacts:
    with open(f"models/{artifact_name}", "wb") as out_file:
        pickle.dump(artifact, out_file)

print("All models and scaler saved successfully.")
