In [150]:
import pandas as pd
from ydata_profiling import ProfileReport
import re
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [151]:
path = "../input/train.csv"
df = pd.read_csv(path)
df.head()
x = df.drop(columns=["Transported"])
y = df.loc[:, "Transported"].astype(int).values

In [152]:
profile = ProfileReport(df, title="Profiling Report")
# profile.to_notebook_iframe()

In [153]:
def camel_to_snake(text):

    return re.sub(r"(?<!^)(?=[A-Z])", "_", text).lower()


def clean_data(df, fillna=False, number_imputer=None):

    dtypes = {
        "str": [
            "passenger_id",
            "home_planet",
            "cryo_sleep",
            "cabin",
            "destination",
            "name",
        ],
        "float": [
            "age",
            "room_service",
            "food_court",
            "shopping_mall",
            "spa",
            "vr_deck",
        ],
        "bool": [
            "vip",
        ],
    }

    # rename columns
    df = df.rename(columns=lambda c: camel_to_snake(c))
    df = df.rename(columns={"v_i_p": "vip", "v_r_deck": "vr_deck"})

    # fillna
    if fillna:
        if not number_imputer:
            raise ValueError("number_imputer must be specified")
        for col in dtypes["str"]:  # fill string with blank
            df[col] = df[col].fillna("BLANK")
        for col in dtypes["float"]:  # fill number with imputer
            values = df[col].values.reshape(-1, 1)
            df[col] = number_imputer.fit_transform(values)
        for col in dtypes["bool"]:  # fill bool with imputer
            values = df[col].astype(float).values.reshape(-1, 1)
            df[col] = number_imputer.fit_transform(values).round().astype(bool)

    return df


imputer = IterativeImputer()
x_clean = clean_data(x, fillna=True, number_imputer=imputer)

In [154]:
def feature_engineering(df):
    return (df
        .assign(
            group_id=lambda df_: (df_["passenger_id"].str.split("_").str[0].astype(int)),
            travel_in_group=lambda df_: (
                df_["group_id"].duplicated(keep=False).astype(bool)
            ),
            deck=lambda df_: (df_["cabin"].str.split("/").str[0]),
            room_no=lambda df_: (df_["cabin"].str.split("/").str[1].astype(float)),
            side=lambda df_: (df_["cabin"].str.split("/").str[2].astype(str)),
        )
        .drop(columns=["passenger_id", "cabin", "name"])
    )


x_engineered = feature_engineering(x_clean)

In [155]:
def encode(df):
    return pd.get_dummies(df).astype(float)

x_encoded = encode(x_engineered)

In [156]:
def scale(df, scaler):
    cols = df.columns
    return pd.DataFrame(scaler.fit_transform(df), columns=cols)


scaler = StandardScaler()
x_scaled = scale(x_encoded, scaler)

In [157]:
def pipeline(df, imputer, scaler):
    df = df.copy()
    df = clean_data(df, fillna=True, number_imputer=imputer)
    df = feature_engineering(df)
    df = encode(df)
    df = scale(df, scaler=scaler)
    return df

pipeline(x, imputer, scaler)

Unnamed: 0,age,vip,room_service,food_court,shopping_mall,spa,vr_deck,group_id,travel_in_group,room_no,...,deck_BLANK,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S,side_nan
0,0.709437,-0.153063,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,-1.734409,-0.899532,-1.172966,...,-0.153063,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399,1.032865,-0.986630,-0.153063
1,-0.336717,-0.153063,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,-1.734034,-0.899532,-1.172966,...,-0.153063,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.02399,-0.968181,1.013551,-0.153063
2,2.034566,6.533255,-0.275409,1.955616,-0.290817,5.694289,-0.225782,-1.733660,1.111690,-1.172966,...,-0.153063,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399,-0.968181,1.013551,-0.153063
3,0.290975,-0.153063,-0.340590,0.517406,0.330225,2.683471,-0.098708,-1.733660,1.111690,-1.172966,...,-0.153063,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399,-0.968181,1.013551,-0.153063
4,-0.894666,-0.153063,0.118709,-0.243409,-0.038048,0.225732,-0.267258,-1.733286,-0.899532,-1.171013,...,-0.153063,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.02399,-0.968181,1.013551,-0.153063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.848924,6.533255,-0.340590,3.989682,-0.290817,1.184286,-0.203720,1.738236,-0.899532,-0.981499,...,-0.153063,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.02399,1.032865,-0.986630,-0.153063
8689,-0.755179,-0.153063,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,1.738984,-0.899532,1.755700,...,-0.153063,-0.30661,-0.241218,-0.334759,-0.688215,1.548235,-0.02399,-0.968181,1.013551,-0.153063
8690,-0.197230,-0.153063,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,1.739359,-0.899532,1.757654,...,-0.153063,-0.30661,-0.241218,-0.334759,-0.688215,1.548235,-0.02399,-0.968181,1.013551,-0.153063
8691,0.221232,-0.153063,-0.340590,0.370637,-0.290817,0.037223,2.585740,1.739733,1.111690,0.014912,...,-0.153063,-0.30661,-0.241218,2.987225,-0.688215,-0.645897,-0.02399,-0.968181,1.013551,-0.153063
