In [8]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

## Feature Engineering

In [9]:
def extract_cabin(data):
    """Split the ``Cabin`` column into ``Deck``, ``Num`` and ``Side`` columns.

    A cabin is considered well-formed when it is a string containing exactly
    two ``/`` separators (``deck/num/side``). Missing cabins are filled with a
    synthetic cabin built from the most common deck, the most common num and
    the most common side, each computed independently over the well-formed
    cabins (the first value encountered wins ties).

    Rows whose cabin is present but malformed get missing values in all three
    new columns. If no well-formed cabin exists at all, nothing is imputed and
    the new columns are entirely missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Cabin`` column. It is not modified; rows are processed
        by position, so any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Cabin`` and with ``Deck`` (object), ``Num``
        (float64) and ``Side`` (object) appended as the last columns.

    Raises
    ------
    ValueError
        If the num component of a well-formed cabin is not an integer literal.
    """
    def split_cabin(value):
        """Return ``[deck, num, side]`` for a well-formed cabin, else ``None``."""
        if isinstance(value, str):
            parts = value.split('/')
            if len(parts) == 3:
                return parts
        return None

    def most_common(values):
        """Return the most frequent value; the first one seen wins ties."""
        counts = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        return max(counts, key=counts.get)

    # Iterate by position so the result does not depend on the index labels.
    cabins = data['Cabin'].tolist()
    parsed = [split_cabin(value) for value in cabins]
    well_formed = [parts for parts in parsed if parts is not None]

    if well_formed:
        # zip(*...) transposes the triples into (decks, nums, sides).
        fill_parts = [most_common(component) for component in zip(*well_formed)]
        # Only genuinely missing cabins are imputed; malformed strings stay unparsed.
        parsed = [
            fill_parts if parts is None and pd.isna(value) else parts
            for value, parts in zip(cabins, parsed)
        ]

    decks = [parts[0] if parts is not None else np.nan for parts in parsed]
    nums = [int(parts[1]) if parts is not None else np.nan for parts in parsed]
    sides = [parts[2] if parts is not None else np.nan for parts in parsed]

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    result = data.drop(columns=['Cabin'])
    # Plain arrays are assigned positionally (no index alignment involved).
    result['Deck'] = np.array(decks, dtype=object)
    result['Num'] = np.array(nums, dtype='float64')
    result['Side'] = np.array(sides, dtype=object)

    return result

In [10]:
def impute_features(data):
    """Fill missing values with each column's most frequent value and drop ``Name``.

    Every listed feature is imputed on its own with
    ``SimpleImputer(strategy='most_frequent')``, so numeric columns are not
    coerced to object by being imputed together with the categorical ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all feature columns listed below (``Deck``, ``Num``
        and ``Side`` are expected to have been derived from ``Cabin`` already).
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the listed features imputed and the ``Name``
        column removed when present.

    Raises
    ------
    KeyError
        If any of the listed feature columns is absent.
    """
    features = [
        'HomePlanet', 'CryoSleep', 'Deck', 'Num', 'Side', 'Destination', 'Age',
        'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    ]

    # Work on a copy so the caller's frame keeps its original missing values.
    imputed = data.copy()
    imputer_mode = SimpleImputer(strategy='most_frequent')

    for feature in features:
        column = imputed[[feature]]
        # SimpleImputer discards columns that are entirely missing, which would
        # yield an empty array here; such a column has no mode, so leave it as is.
        if not column[feature].notna().any():
            continue
        imputed[feature] = imputer_mode.fit_transform(column).ravel()

    # 'Name' is not used as a feature; tolerate frames where it is already gone.
    return imputed.drop(columns=['Name'], errors='ignore')

## Encoding & Scaling
- One-Hot Encoding: HomePlanet, Destination
- Ordinal Encoding: Deck
- Binary Encoding: CryoSleep, VIP, Side
- Scaling: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Num

In [11]:
def encode(data):
    """Encode the categorical columns of an imputed frame.

    - ``HomePlanet`` and ``Destination`` are one-hot encoded (first level dropped).
    - ``Deck`` is ordinal encoded using the fixed order A, B, C, D, E, F, G, T.
    - ``CryoSleep`` and ``VIP`` are cast to ``int``; ``Side`` is mapped with
      ``{"P": 0, "S": 1}`` (any other value becomes missing).

    The encoders are fitted on ``data`` itself on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame without missing values in the encoded columns.

    Returns
    -------
    pandas.DataFrame
        Encoded frame carrying the same index as ``data``. Column names come
        from ``ColumnTransformer.get_feature_names_out()``; only the
        ``remainder__`` prefix is stripped, other prefixes are kept.
        ``CryoSleep``, ``VIP`` and ``Side`` are the last three columns.
    """
    one_hot = OneHotEncoder(drop='first', sparse_output=False)
    ordinal = OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']])

    preprocessor = ColumnTransformer(
        [
            ("onehot_encoding", one_hot, ["HomePlanet", "Destination"]),
            ("ordinal_encoding", ordinal, ["Deck"]),
        ],
        remainder="passthrough",
    )

    # Reuse the input index: the binary columns below are assigned by index
    # alignment, so a fresh RangeIndex would misalign them (or produce NaN)
    # whenever `data` does not carry the default 0..n-1 index.
    combined_encoded = pd.DataFrame(
        preprocessor.fit_transform(data),
        columns=preprocessor.get_feature_names_out(),
        index=data.index,
    )

    # The passed-through copies of the binary columns are replaced by the
    # explicitly encoded versions appended below.
    combined_encoded = combined_encoded.drop(
        columns=["remainder__CryoSleep", "remainder__VIP", "remainder__Side"]
    )
    combined_encoded.columns = combined_encoded.columns.str.replace('remainder__', '', regex=False)

    combined_encoded['CryoSleep'] = data['CryoSleep'].astype(int)
    combined_encoded['VIP'] = data['VIP'].astype(int)
    combined_encoded['Side'] = data['Side'].map({"P": 0, "S": 1})

    return combined_encoded

In [12]:
def scale(data):
    """Standardize the numeric feature columns and pass the rest through.

    ``Age``, ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``,
    ``VRDeck`` and ``Num`` are transformed with a ``StandardScaler`` fitted on
    ``data`` itself; every other column is passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the seven numeric columns listed above.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``data``; the scaled columns come first,
        followed by the passed-through ones. Column names come from
        ``ColumnTransformer.get_feature_names_out()``; only the ``remainder__``
        prefix is stripped, other prefixes are kept.
    """
    numeric_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num']

    # Named `standardizer` so the local does not shadow this function's own name.
    standardizer = StandardScaler()
    scaler = ColumnTransformer(
        [("scaling", standardizer, numeric_columns)],
        remainder="passthrough",
    )

    # Keep the caller's index so rows can still be matched to the input frame.
    combined_scaled = pd.DataFrame(
        scaler.fit_transform(data),
        columns=scaler.get_feature_names_out(),
        index=data.index,
    )
    combined_scaled.columns = combined_scaled.columns.str.replace('remainder__', '', regex=False)

    return combined_scaled

In [15]:
def preprocessing(data):
    """Run the full preprocessing pipeline on a raw passenger frame.

    The stages are applied in order: cabin extraction, imputation, encoding
    and scaling. Afterwards every column except ``PassengerId`` and
    ``Transported`` is converted with ``pd.to_numeric``, and ``Transported``
    (when present) is cast to ``bool``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame accepted by ``extract_cabin``.

    Returns
    -------
    pandas.DataFrame
        The fully preprocessed frame returned by the scaling stage, with the
        dtype conversions described above applied.
    """
    result = scale(encode(impute_features(extract_cabin(data))))

    # Identifier and target are excluded from the numeric conversion.
    non_numeric = ('PassengerId', 'Transported')
    to_convert = [name for name in result.columns if name not in non_numeric]
    for name in to_convert:
        result[name] = pd.to_numeric(result[name])

    # The target only exists in labelled data.
    if 'Transported' in result.columns:
        result['Transported'] = result['Transported'].astype(bool)

    return result