## Load and Preprocessing Portion ##

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
def load_data(train_path="data/train.csv", test_path="data/test.csv"):
    """Read the raw training and test sets from CSV.

    Parameters
    ----------
    train_path : str or path-like, default "data/train.csv"
        Location of the raw training CSV.
    test_path : str or path-like, default "data/test.csv"
        Location of the raw test CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, in that order.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

In [3]:
def clean_data(df):
    """Collapse the five spending columns into one flag and drop ``Name``.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``RoomService``, ``FoodCourt``, ``ShoppingMall``,
        ``Spa``, ``VRDeck`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the spending columns and ``Name``, plus a trailing
        boolean ``Spend`` column that is True when the row's total spend
        (missing amounts counted as 0) is greater than zero.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    # Missing amounts are treated as "no spend" rather than unknown.
    has_spend = df[spend_cols].fillna(0).sum(axis=1) > 0
    # drop() and assign() both return new frames, so the caller's df is never
    # modified (the previous version added "Spend" to it as a side effect).
    return df.drop(columns=spend_cols + ["Name"]).assign(Spend=has_spend)

In [4]:
def fill_missing_values(df):
    """Impute missing values and split ``Cabin`` into its three parts.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * ``HomePlanet``, ``Destination``, ``VIP`` and ``CryoSleep`` are filled
        with each column's most frequent value.
      * ``Age`` is filled with its median.
      * Missing ``Cabin`` values become the placeholder ``"X/0000/X"``; the
        column is then split on ``"/"`` into ``Deck``, ``CabinNum`` (converted
        to numeric) and ``Side``, and ``Cabin`` itself is dropped.

    NOTE(review): both imputers are fitted on whatever frame is passed in, so
    calling this separately on train and test fills each with its own
    statistics rather than the training set's.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``HomePlanet``, ``Destination``, ``VIP``, ``CryoSleep``,
        ``Age`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed columns, the three cabin-derived columns,
        and no ``Cabin`` column.
    """
    categorical_cols = ["HomePlanet", "Destination", "VIP", "CryoSleep"]
    freq_imputer = SimpleImputer(strategy="most_frequent")
    num_imputer = SimpleImputer(strategy="median")

    # Work on a private copy so the caller's frame is never modified.
    df = df.copy()

    df[categorical_cols] = freq_imputer.fit_transform(df[categorical_cols])
    df[["Age"]] = num_imputer.fit_transform(df[["Age"]])

    # The placeholder keeps the three-part "deck/num/side" shape so the split
    # below always yields exactly three columns.
    df["Cabin"] = df["Cabin"].fillna("X/0000/X")
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    # Anything that is not a number becomes NaN instead of raising.
    df["CabinNum"] = pd.to_numeric(df["CabinNum"], errors="coerce")

    return df.drop(columns="Cabin")

In [5]:
def clean_features(df):
    """Derive age/side features, one-hot encode categoricals and scale numerics.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * ``age_group`` buckets ``Age`` into right-closed intervals:
        (-0.1, 12] child, (12, 17] teen, (17, 30] young_adult,
        (30, 50] adult, (50, inf) senior.
      * ``Side_AgeGroup`` is the string ``"<Side>_<age_group>"``.
      * ``Is_Starboard_YoungAdult`` is True when ``Side == "S"`` and the age
        group is ``young_adult``.
      * ``HomePlanet``, ``Destination``, ``Deck``, ``Side``, ``age_group`` and
        ``Side_AgeGroup`` are one-hot encoded with the first level dropped.
      * ``Age`` and ``CabinNum`` are standardised.

    NOTE(review): the scaler and the dummy columns are derived from the frame
    passed in, so separate calls on train and test use different scaling
    statistics and may produce different column sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age``, ``Side``, ``CabinNum``, ``HomePlanet``,
        ``Destination`` and ``Deck``.

    Returns
    -------
    pandas.DataFrame
        A new, encoded and scaled feature frame.
    """
    # Work on a private copy so the caller's frame is never modified.
    df = df.copy()

    # The last bin is open-ended: with a hard upper edge of 80, older
    # passengers fell outside every bin and ended up with a NaN age_group
    # (and a "<side>_nan" label below).
    bins = [-0.1, 12, 17, 30, 50, float("inf")]
    labels = ["child", "teen", "young_adult", "adult", "senior"]
    df["age_group"] = pd.cut(df["Age"], bins=bins, labels=labels)
    df["Side_AgeGroup"] = df["Side"] + "_" + df["age_group"].astype(str)
    df["Is_Starboard_YoungAdult"] = (df["Side"] == "S") & (df["age_group"] == "young_adult")

    df = pd.get_dummies(
        df,
        columns=["HomePlanet", "Destination", "Deck", "Side", "age_group", "Side_AgeGroup"],
        drop_first=True,
    )

    scaler = StandardScaler()
    df[["Age", "CabinNum"]] = scaler.fit_transform(df[["Age", "CabinNum"]])
    return df

In [6]:
def save_files(df1, df2, train_path="data/train_clean.csv", test_path="data/test_clean.csv"):
    """Write the cleaned training and test frames to CSV (no index column).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Cleaned training data, written to ``train_path``.
    df2 : pandas.DataFrame
        Cleaned test data, written to ``test_path``.
    train_path : str or path-like, default "data/train_clean.csv"
        Destination for ``df1``.
    test_path : str or path-like, default "data/test_clean.csv"
        Destination for ``df2``.

    Returns
    -------
    None
        Prints a confirmation message once both files are written.
    """
    # Imported locally so this cell does not depend on a later import cell.
    from pathlib import Path

    for frame, destination in ((df1, train_path), (df2, test_path)):
        destination = Path(destination)
        # Create missing parent folders instead of letting to_csv fail.
        destination.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(destination, index=False)

    print("Preprocessing complete. Cleaned data saved.")

In [7]:
# Will be wrapped up into main()
#
# Each dataset goes through the same three stages in order:
# clean_data -> fill_missing_values -> clean_features.
# NOTE(review): the imputers, the scaler and the dummy encoding are all fitted
# inside those functions, so train and test are each processed with their own
# statistics and may end up with different column sets.

train, test = load_data()
train = clean_features(fill_missing_values(clean_data(train)))
test = clean_features(fill_missing_values(clean_data(test)))
save_files(train, test)


Preprocessing complete. Cleaned data saved.


## Training Portion ##

In [8]:
# import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
import pickle
import os

In [9]:
def load_data_train():
    """Return the contents of ``data/train_clean.csv`` as a DataFrame."""
    return pd.read_csv("data/train_clean.csv")

In [10]:
def tree_classifier(train_df):
    """Fit a shallow decision tree and print its ten most important features.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Cleaned training data containing the feature columns plus
        ``Transported`` (target) and ``PassengerId`` (dropped, not a feature).

    Returns
    -------
    None
        The ranking is printed, not returned.
    """
    # Use the frame the caller passed in; the previous version ignored the
    # argument and re-read the CSV from disk.
    X = train_df.drop(columns=["Transported", "PassengerId"])
    y = train_df["Transported"].astype(int)

    # Only the training part of the 80/20 split is used here; the held-out
    # part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    tree.fit(X_train, y_train)

    importances = pd.Series(tree.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    print(importances.head(10))

In [11]:
def train_model(df):
    """Train a logistic regression on an 80/20 split and print validation metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned training data containing the feature columns plus
        ``Transported`` (target) and ``PassengerId`` (dropped, not a feature).

    Returns
    -------
    LogisticRegression
        The fitted model. A classification report, confusion matrix and
        ROC AUC for the held-out 20% are printed along the way.
    """
    features = df.drop(columns=["Transported", "PassengerId"])
    target = df["Transported"].astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    classifier = LogisticRegression(penalty="l2", max_iter=1000, solver="liblinear")
    classifier.fit(X_train, y_train)

    val_labels = classifier.predict(X_val)
    # Column 1 of predict_proba is the probability of the positive class.
    val_scores = classifier.predict_proba(X_val)[:, 1]

    print("=== Classification Report ===")
    print(classification_report(y_val, val_labels))
    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_val, val_labels))
    auc = roc_auc_score(y_val, val_scores)
    print(f"=== ROC AUC Score ===\n{auc:.4f}")

    return classifier


In [12]:
def save_model(model, filename="logistic_regression.pkl"):
    """Pickle ``model`` to ``models/<filename>``, creating the folder if needed.

    Parameters
    ----------
    model : object
        Any picklable object.
    filename : str, default "logistic_regression.pkl"
        File name inside the ``models`` directory.

    Returns
    -------
    None
        Prints the destination path once the file is written.
    """
    os.makedirs("models", exist_ok=True)
    model_path = os.path.join("models", filename)

    with open(model_path, "wb") as handle:
        handle.write(pickle.dumps(model))

    print(f"Model saved successfully to {model_path}")

In [13]:
# Training driver: read the cleaned training CSV, print the decision tree's
# top feature importances, then fit, evaluate and persist the logistic
# regression model.
df = load_data_train()
tree_classifier(df)  # prints only; returns nothing useful
model = train_model(df)  # prints validation metrics and returns the fitted model
save_model(model)  # written under models/ with the default file name

Spend                      0.669133
Deck_G                     0.094745
CabinNum                   0.073389
Side_S                     0.041173
Deck_E                     0.039429
Deck_C                     0.022935
HomePlanet_Mars            0.018403
Age                        0.017572
Destination_TRAPPIST-1e    0.010468
CryoSleep                  0.006575
dtype: float64
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.71      0.79      0.75       861
           1       0.77      0.69      0.73       878

    accuracy                           0.74      1739
   macro avg       0.74      0.74      0.74      1739
weighted avg       0.74      0.74      0.74      1739

=== Confusion Matrix ===
[[678 183]
 [271 607]]
=== ROC AUC Score ===
0.7998
Model saved successfully to models/logistic_regression.pkl


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
