In [1]:
import os
import pandas as pd
import kagglehub

path = kagglehub.dataset_download("teejmahal20/airline-passenger-satisfaction")
print("Path to dataset files:", path)

csv_files = []
for root, dirs, files in os.walk(path):
    for f in files:
        if f.lower().endswith(".csv"):
            csv_files.append(os.path.join(root, f))

csv_path = max(csv_files, key=os.path.getsize)
print("Using CSV:", csv_path)

df = pd.read_csv(csv_path)
df.head()

Path to dataset files: /home/sagemaker-user/.cache/kagglehub/datasets/teejmahal20/airline-passenger-satisfaction/versions/1
Using CSV: /home/sagemaker-user/.cache/kagglehub/datasets/teejmahal20/airline-passenger-satisfaction/versions/1/train.csv


Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [2]:
df.shape, df.columns
df.isna().sum().sort_values(ascending=False).head(20)
df["satisfaction"].value_counts()

satisfaction
neutral or dissatisfied    58879
satisfied                  45025
Name: count, dtype: int64

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
import joblib

# Drop obvious ID columns if present
drop_cols = [c for c in ["Unnamed: 0", "id", "ID"] if c in df.columns]
df2 = df.drop(columns=drop_cols, errors="ignore").copy()

# Target
y = df2["satisfaction"]
X = df2.drop(columns=["satisfaction"])

# Train/val/test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Column types
cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X_train.columns if c not in cat_cols]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

model = LogisticRegression(max_iter=2000)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])

clf.fit(X_train, y_train)

# Evaluate
val_pred = clf.predict(X_val)
test_pred = clf.predict(X_test)

print("VAL Accuracy:", accuracy_score(y_val, val_pred))
print("VAL F1:", f1_score(y_val, val_pred, average="weighted"))

print("TEST Accuracy:", accuracy_score(y_test, test_pred))
print("TEST F1:", f1_score(y_test, test_pred, average="weighted"))
print("\nClassification report (TEST):\n", classification_report(y_test, test_pred))


# Ensure models folder exists
os.makedirs("models", exist_ok=True)

# Save the trained pipeline/model
joblib.dump(clf, "models/model.pkl")

print("Model saved at: models/model.pkl")

VAL Accuracy: 0.8781598870781471
VAL F1: 0.8777444939066971
TEST Accuracy: 0.873091235724368
TEST F1: 0.8728266411509319

Classification report (TEST):
                          precision    recall  f1-score   support

neutral or dissatisfied       0.88      0.90      0.89      8832
              satisfied       0.87      0.84      0.85      6754

               accuracy                           0.87     15586
              macro avg       0.87      0.87      0.87     15586
           weighted avg       0.87      0.87      0.87     15586

Model saved at: models/model.pkl
