In [1]:
from pathlib import Path

import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
def load_data(data_dir="data"):
    """Read the train and test CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, default "data"
        Directory that contains ``train.csv`` and ``test.csv``. The default
        keeps the original relative ``data/`` location.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, in that order.
    """
    data_dir = Path(data_dir)
    train_df = pd.read_csv(data_dir / "train.csv")
    test_df = pd.read_csv(data_dir / "test.csv")
    return train_df, test_df

In [3]:
def clean_data(df):
    """Collapse the spending columns into one boolean flag and drop ``Name``.

    The input frame is left untouched; a new frame is returned, so re-running
    the cell on the same raw frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``,
        ``VRDeck`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the five spending columns and ``Name``, plus a
        boolean ``Spend`` column that is True when the row's total spend is
        greater than zero.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
    # Missing amounts count as zero, so a row with only NaNs is a non-spender.
    has_spend = df[spend_cols].fillna(0).sum(axis=1) > 0
    # assign/drop both return new frames, so the caller's frame is not mutated.
    return df.assign(Spend=has_spend).drop(columns=spend_cols + ["Name"])

In [4]:
def fill_missing_values(df):
    """Impute missing values and split ``Cabin`` into its three parts.

    Works on a copy, so the frame passed in is not modified.

    Steps:
    * ``HomePlanet``, ``Destination``, ``VIP`` and ``CryoSleep`` are filled
      with their most frequent value.
    * ``Age`` is filled with its median.
    * Missing ``Cabin`` values become the placeholder ``"X/0000/X"``; the
      column is then split on ``"/"`` into ``Deck``, ``CabinNum`` and ``Side``
      and dropped. The placeholder therefore yields Deck ``"X"``, CabinNum
      ``0`` and Side ``"X"``.

    NOTE(review): both imputers are fit on the frame passed in, so each call
    uses that frame's own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``HomePlanet``, ``Destination``, ``VIP``, ``CryoSleep``,
        ``Age`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        New frame with the imputed columns, the added ``Deck``, ``CabinNum``
        (numeric) and ``Side`` columns, and without ``Cabin``.
    """
    # Copy first: every assignment below would otherwise write into the
    # caller's frame.
    df = df.copy()

    freq_cols = ["HomePlanet", "Destination", "VIP", "CryoSleep"]
    freq_imputer = SimpleImputer(strategy="most_frequent")
    num_imputer = SimpleImputer(strategy="median")

    df[freq_cols] = freq_imputer.fit_transform(df[freq_cols])
    df[["Age"]] = num_imputer.fit_transform(df[["Age"]])

    df["Cabin"] = df["Cabin"].fillna("X/0000/X")
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    # Non-numeric cabin numbers become NaN instead of raising.
    df["CabinNum"] = pd.to_numeric(df["CabinNum"], errors="coerce")
    return df.drop(columns="Cabin")

In [5]:
def clean_features(df):
    """Add age-bucket features and one-hot encode the categorical columns.

    The input frame is not modified; a new frame is returned.

    Added before encoding:
    * ``age_group``: ``Age`` bucketed into child / teen / young_adult / adult /
      senior with right-closed bins (-0.1, 12], (12, 17], (17, 30], (30, 50],
      (50, inf).
    * ``Side_AgeGroup``: ``Side`` and the age group joined with ``"_"``.
    * ``Is_Starboard_YoungAdult``: True when ``Side == "S"`` and the age group
      is ``young_adult``.

    ``HomePlanet``, ``Destination``, ``Deck``, ``Side``, ``age_group`` and
    ``Side_AgeGroup`` are then replaced by dummy columns, with the first level
    of each dropped (``drop_first=True``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age``, ``Side``, ``HomePlanet``, ``Destination`` and
        ``Deck``. A missing ``Age`` still produces a missing ``age_group`` and
        a ``"<Side>_nan"`` value in ``Side_AgeGroup``.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered and one-hot encoded columns.
    """
    # The top bucket is open-ended: with a hard upper edge of 80, any older
    # passenger fell outside every bin, giving a NaN age_group and the literal
    # string "nan" inside Side_AgeGroup.
    bins = [-0.1, 12, 17, 30, 50, float("inf")]
    labels = ["child", "teen", "young_adult", "adult", "senior"]
    age_group = pd.cut(df["Age"], bins=bins, labels=labels)

    # assign() returns a new frame, so the caller's frame keeps its columns.
    engineered = df.assign(
        age_group=age_group,
        Side_AgeGroup=df["Side"] + "_" + age_group.astype(str),
        Is_Starboard_YoungAdult=(df["Side"] == "S") & (age_group == "young_adult"),
    )

    return pd.get_dummies(
        engineered,
        columns=["HomePlanet", "Destination", "Deck", "Side", "age_group", "Side_AgeGroup"],
        drop_first=True,
    )

In [None]:
def save_files(df):
    """Persist a processed frame to disk (not implemented yet).

    TODO: decide on the output location and format. Until then the function
    raises instead of silently doing nothing, and the cell is valid Python so
    the notebook can be run top to bottom or exported as a script.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save.

    Raises
    ------
    NotImplementedError
        Always, until the saving logic is written.
    """
    raise NotImplementedError("save_files is not implemented yet")

In [6]:
# Run the preprocessing pipeline: load, then apply each stage to the train
# frame and the test frame in turn.
train, test = load_data()

# Stage 1: spending flag, drop unused columns.
train = clean_data(train)
test = clean_data(test)

# Stage 2: impute missing values and split Cabin.
train = fill_missing_values(train)
test = fill_missing_values(test)

# Stage 3: engineered features and one-hot encoding.
train = clean_features(train)
test = clean_features(test)