In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [3]:
def load_data(train_path="data/train.csv", test_path="data/test.csv"):
    """Read the train and test splits from CSV files.

    Parameters
    ----------
    train_path : str or path-like, default "data/train.csv"
        Location of the training CSV.
    test_path : str or path-like, default "data/test.csv"
        Location of the test CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` exactly as parsed by ``pd.read_csv``.

    Notes
    -----
    The defaults are the paths that used to be hard-coded, so a bare
    ``load_data()`` call behaves as before.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

In [4]:
def clean_data(df):
    """Derive a spend flag, drop unnamed rows and split ``Name`` in two.

    Steps, in order:

    1. ``Spend`` is True when the row's total over the five amenity columns
       is greater than zero (missing amounts count as 0).
    2. Rows whose ``Name`` is missing are dropped.
    3. The five amenity columns are dropped.
    4. ``Name`` is split at the first space into ``first_name`` and
       ``last_name``, then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name`` and the five amenity columns listed in
        ``spend_cols``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Spend``, ``first_name`` and ``last_name`` added and
        ``Name`` plus the amenity columns removed.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    # Copy first: adding "Spend" directly would leak a new column into the
    # caller's frame.
    df = df.copy()
    df["Spend"] = df[spend_cols].fillna(0).sum(axis=1) > 0

    # NOTE(review): this also removes rows when applied to the test split --
    # confirm that is acceptable for whatever consumes the test frame.
    df = df.dropna(subset=["Name"]).drop(columns=spend_cols)

    # n=1 caps the split at two pieces, so a name with extra spaces keeps the
    # remainder in last_name instead of producing a third column (which made
    # the two-column assignment raise). reindex guarantees both columns exist
    # even when no name contains a space; a single-token name gets a missing
    # last_name.
    name_parts = (
        df["Name"].str.split(" ", n=1, expand=True).reindex(columns=[0, 1])
    )
    df["first_name"] = name_parts[0]
    df["last_name"] = name_parts[1]

    return df.drop(columns=["Name"])

In [5]:
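# NOTE: exploratory draft of the imputation steps, kept for reference only;
# the same steps are wrapped in fill_missing_values() in the next cell.
# freq_imputer = SimpleImputer(strategy="most_frequent")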
# num_imputer = SimpleImputer(strategy="median")
# train_df[["HomePlanet", "Destination", "VIP", "CryoSleep"]] = freq_imputer.fit_transform(train_df[["HomePlanet", "Destination", "VIP", "CryoSleep"]]) # freq
# train_df[["Age"]] = num_imputer.fit_transform(train_df[["Age"]]) # Numerical
# train_df["Cabin"] = train_df["Cabin"].fillna("X/0000/X")
# train_df.isnull().mean().sort_values(ascending=False)
# train_df

In [6]:
def fill_missing_values(df, fit_on=None):
    """Impute missing values and split ``Cabin`` into three columns.

    * ``HomePlanet``, ``Destination``, ``VIP`` and ``CryoSleep`` are filled
      with the most frequent value of each column.
    * ``Age`` is filled with the median.
    * Missing ``Cabin`` values become the placeholder ``"X/0000/X"``; the
      column is then split on ``"/"`` into ``Deck``, ``CabinNum`` and ``Side``
      (all left as strings) and ``Cabin`` itself is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    fit_on : pandas.DataFrame, optional
        Frame the imputers learn their fill values from. Defaults to ``df``
        itself, which reproduces the original behaviour. Pass the training
        frame when filling the test split so both splits are imputed with the
        same statistics.

    Returns
    -------
    pandas.DataFrame
        A new frame with the gaps filled and ``Cabin`` replaced by
        ``Deck`` / ``CabinNum`` / ``Side``.
    """
    freq_cols = ["HomePlanet", "Destination", "VIP", "CryoSleep"]
    num_cols = ["Age"]
    reference = df if fit_on is None else fit_on

    # Work on a copy so the caller's frame keeps its NaNs and its columns.
    df = df.copy()

    freq_imputer = SimpleImputer(strategy="most_frequent").fit(reference[freq_cols])
    num_imputer = SimpleImputer(strategy="median").fit(reference[num_cols])
    df[freq_cols] = freq_imputer.transform(df[freq_cols])  # freq
    df[num_cols] = num_imputer.transform(df[num_cols])  # Numerical

    # The placeholder has the same three-part shape as a real cabin, so the
    # split below always has three pieces for filled rows.
    # NOTE(review): assumes every non-missing Cabin contains exactly two "/"
    # -- otherwise the three-column assignment raises.
    df["Cabin"] = df["Cabin"].fillna("X/0000/X")
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)
    return df.drop(columns="Cabin")

In [7]:
# Run both splits through the same pipeline: load -> clean -> impute.
train, test = load_data()
train = fill_missing_values(clean_data(train))
test = fill_missing_values(clean_data(test))

In [None]:
# Bucket passengers into coarse age bands. Bins are closed on the right:
# [0, 12] child, (12, 17] teen, (17, 30] young_adult, (30, 50] adult,
# (50, 80] senior.
bins = [0, 12, 17, 30, 50, 80]
labels = ["child", "teen", "young_adult", "adult", "senior"]
# include_lowest=True closes the first bin on the left. Without it pd.cut
# treats the first interval as (0, 12], so any Age of exactly 0 gets NaN
# instead of "child". Ages above 80 still fall outside every bin and stay NaN.
train["age_group"] = pd.cut(
    train["Age"], bins=bins, labels=labels, include_lowest=True
)

{'Darci', 'Agnesse', 'Consuelo', 'Nerissa', 'Ania', 'Clarissa', 'Danyelle', 'Rosa', 'Penelope', 'Courtenay', 'Nanni', 'Yolanda', 'Mirilla', 'Gael', 'Brena', 'Cal', 'Clio', 'Hedda', 'Madlen', 'Cybel', 'Christy', 'Alyssa', 'Fan', 'Lolande', 'Pearline', 'Harriett', 'Ardine', 'Shoshana', 'Celie', 'Annamari', 'Jerrie', 'Tiertza', 'Andria', 'Natalee', 'Maddalena', 'Rosabel', 'LeeAnn', 'Minetta', 'Myranda', 'Emilie', 'Orsola', 'Brietta', 'Mignon', 'Dorella', 'Jessika', 'Conchita', 'Judith', 'Nicolle', 'Berty', 'Marlyn', 'Merola', 'Minna', 'Maxine', 'Beverly', 'Kristan', 'Halli', 'Jaquenette', 'Ernaline', 'Melonie', 'Geneva', 'Ianthe', 'Cybill', 'Peg', 'Callie', 'Clair', 'Lelah', 'Lib', 'Minny', 'Jamie', 'Miquela', 'Kalila', 'Rosaline', 'Jenna', 'Madeline', 'Emiline', 'Dulcinea', 'Gale ', 'Natalia', 'Erinna', 'Katine', 'Cicely', 'Carolan', 'Hattie', 'Darby', 'Charita', 'Jessie', 'Addie', 'Deloris', 'Judye', 'Murial', 'Pam', 'Atlanta', 'Diahann', 'Teresita', 'Novelia', 'Kelsi', 'Cassandre', 'No