# 2. Preprocessing

In this notebook I build the preprocessing pipelines (feature engineering, column dropping and KNN imputation) that I will use when training the models.

In [38]:
import pandas as pd
import numpy as np

In [39]:
# Path of the training CSV, relative to the notebook's working directory.
train_data_path = './data/train.csv'
# Read the raw training table and preview its first rows.
train_dataframe = pd.read_csv(filepath_or_buffer=train_data_path)
train_dataframe.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [40]:
def preprocess_cabin(dataframe):
    """Split ``Cabin`` ("deck/number/side") into ``Deck``, ``CabinNumber`` and ``Side_*`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``Cabin`` column of ``"<deck>/<number>/<side>"`` strings
        (missing values allowed). The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input without ``Cabin`` and with, appended in this order:

        * ``Deck`` - ordinal code (A=1 ... G=7, T=0), NaN when unknown;
        * ``CabinNumber`` - numeric cabin number, NaN when the cabin is missing;
        * one float indicator column ``Side_<value>`` per side present in the
          data (1.0 / 0.0), NaN in every indicator when the side is unknown.
    """
    df = dataframe.copy()
    # Missing cabins get a sentinel so the split always yields three parts.
    df['Cabin'] = df['Cabin'].fillna('Unknown/-1/Unknown')
    df[['Deck', 'CabinNumber', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df = df.drop(columns='Cabin')

    deck_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 0, 'Unknown' : np.nan}
    df['Deck'] = df['Deck'].map(deck_mapping)

    # Turn the sentinels back into NaN *after* the numeric cast: comparing the
    # integer column against the string '-1' would never match, which left -1
    # in place of the missing cabin numbers.
    cabin_number = df['CabinNumber'].astype(int)
    df['CabinNumber'] = cabin_number.where(cabin_number != -1)
    df['Side'] = df['Side'].where(df['Side'] != 'Unknown')

    columns_before = set(df.columns)
    # dtype=float keeps the indicators numeric on every pandas version, so NaN
    # can be written into them without an object upcast or a dtype error.
    df = pd.get_dummies(df, columns = ['Side'], dummy_na = True, dtype = float)
    side_missing = df.pop('Side_nan') == 1
    # Blank out exactly the indicator columns that get_dummies generated.
    side_columns = [column for column in df.columns if column not in columns_before]
    df.loc[side_missing, side_columns] = np.nan
    return df

def preprocess_home_planet(dataframe):
    """One-hot encode ``HomePlanet`` while keeping missing values as NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``HomePlanet`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``HomePlanet`` is replaced by one float indicator
        column ``HomePlanet_<value>`` per category present in the data. Rows
        whose planet was missing hold NaN in every indicator instead of 0, so
        a later imputation step can fill them in.
    """
    columns_before = set(dataframe.columns)
    # get_dummies returns a new frame; dtype=float keeps the indicators numeric
    # on every pandas version so NaN can be stored in them.
    df = pd.get_dummies(dataframe, columns = ['HomePlanet'], dummy_na = True, dtype = float)
    planet_missing = df.pop('HomePlanet_nan') == 1
    # Use the columns get_dummies actually generated rather than a fixed list,
    # so data lacking (or adding) a category is handled the same way.
    indicator_columns = [column for column in df.columns if column not in columns_before]
    df.loc[planet_missing, indicator_columns] = np.nan
    return df

def preprocess_destination(dataframe):
    """One-hot encode ``Destination`` while keeping missing values as NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``Destination`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``Destination`` is replaced by one float indicator
        column ``Destination_<value>`` per category present in the data. Rows
        whose destination was missing hold NaN in every indicator.
    """
    columns_before = set(dataframe.columns)
    # dtype=float keeps the indicators numeric on every pandas version so NaN
    # can be stored in them without an object upcast or a dtype error.
    df = pd.get_dummies(dataframe, columns = ['Destination'], dummy_na = True, dtype = float)
    destination_missing = df.pop('Destination_nan') == 1
    # Blank out exactly the indicator columns that get_dummies generated.
    indicator_columns = [column for column in df.columns if column not in columns_before]
    df.loc[destination_missing, indicator_columns] = np.nan
    return df

def preprocess_cryo_sleep(dataframe):
    """Partially infer missing ``CryoSleep`` values, then one-hot encode the column.

    A passenger whose ``CryoSleep`` is missing but who has a non-zero sum over
    the five expense columns is set to ``False`` before encoding.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``CryoSleep``, ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input in which ``CryoSleep`` is replaced by float indicator
        columns (``CryoSleep_False`` / ``CryoSleep_True`` for boolean data).
        Rows that are still missing hold NaN in every indicator.
    """
    df = dataframe.copy()
    expense_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # Row-wise sum skips NaN, so a missing expense counts as 0 here. Kept as a
    # local Series so no temporary column is added to (or dropped from) the frame.
    total_expenses = df[expense_columns].sum(axis=1)
    df.loc[df['CryoSleep'].isna() & (total_expenses != 0), 'CryoSleep'] = False

    columns_before = set(df.columns)
    # dtype=float keeps the indicators numeric on every pandas version so NaN
    # can be stored in them.
    df = pd.get_dummies(df, columns = ['CryoSleep'], dummy_na = True, dtype = float)
    cryo_missing = df.pop('CryoSleep_nan') == 1
    indicator_columns = [column for column in df.columns if column not in columns_before]
    df.loc[cryo_missing, indicator_columns] = np.nan
    return df

def preprocess_vip(dataframe):
    """One-hot encode ``VIP`` while keeping missing values as NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``VIP`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``VIP`` is replaced by float indicator columns
        (``VIP_False`` / ``VIP_True`` for boolean data). Rows whose flag was
        missing hold NaN in every indicator.
    """
    columns_before = set(dataframe.columns)
    # dtype=float keeps the indicators numeric on every pandas version so NaN
    # can be stored in them without an object upcast or a dtype error.
    df = pd.get_dummies(dataframe, columns = ['VIP'], dummy_na = True, dtype = float)
    vip_missing = df.pop('VIP_nan') == 1
    # Blank out exactly the indicator columns that get_dummies generated.
    indicator_columns = [column for column in df.columns if column not in columns_before]
    df.loc[vip_missing, indicator_columns] = np.nan
    return df

def preprocess_name(dataframe):
    """Replace ``Name`` with a ``Surname`` column (the last word of the name).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``Name`` column of space-separated names (missing
        values allowed). The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input without ``Name`` and with ``Surname`` appended.
        ``Surname`` is missing when the name is missing or consists of a
        single word.
    """
    df = dataframe.copy()
    # Non-inplace fillna: calling an inplace method on a column selection does
    # not reliably write back to the frame (it is a no-op under copy-on-write).
    names = df['Name'].fillna('Name _Unknown')
    # Split once from the right so names with more than two words still yield
    # exactly two parts; reindex guarantees the surname column exists even if
    # no name contains a space.
    name_parts = names.str.rsplit(' ', n=1, expand=True).reindex(columns=[0, 1])
    surname = name_parts[1]
    df['Surname'] = surname.mask(surname == '_Unknown', pd.NA)
    return df.drop(columns='Name')


def preprocess_passenger_id(dataframe):
    """Derive ``GroupId``, ``PersonalId`` and ``GroupSize`` from ``PassengerId``.

    ``PassengerId`` is expected to look like ``"<group>_<person>"``; both parts
    are converted to integers. ``GroupSize`` is the number of distinct
    ``PersonalId`` values that share a ``GroupId`` within the given frame.
    The ``PassengerId`` column is removed from the returned copy; the input
    frame is left untouched.
    """
    id_parts = dataframe['PassengerId'].str.split('_', expand=True).astype(int)

    result = dataframe.copy()
    result[['GroupId', 'PersonalId']] = id_parts
    # Broadcast each group's distinct-member count back onto its rows.
    result['GroupSize'] = result.groupby('GroupId')['PersonalId'].transform('nunique')
    return result.drop(columns=['PassengerId'])

def preprocess_spendings(dataframe):
    """Add aggregated spending columns and a "spent nothing" indicator pair.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``
        and ``VRDeck``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with these columns appended, in order:

        * ``LowIncomeSpendings`` = ``FoodCourt + ShoppingMall``
        * ``HighIncomeSpendings`` = ``RoomService + VRDeck + Spa``
        * ``TotalSpendings`` = sum of the two above
        * ``PassiveSpender_False`` / ``PassiveSpender_True`` - float indicators
          of whether ``TotalSpendings`` equals 0.

        The sums use ``+``, so a missing amount makes the total NaN; for those
        rows both indicators are NaN as well.
    """
    df = dataframe.copy()
    df['LowIncomeSpendings'] = df['FoodCourt'] + df['ShoppingMall']
    df['HighIncomeSpendings'] = df['RoomService'] + df['VRDeck'] + df['Spa']
    df['TotalSpendings'] = df['LowIncomeSpendings'] + df['HighIncomeSpendings']

    total_known = df['TotalSpendings'].notna()
    spent_nothing = df['TotalSpendings'] == 0
    # Build the indicator pair directly: both columns always exist (even when
    # one class is absent from the data) and are float, so NaN marks the rows
    # whose total is unknown without any dtype conversion.
    df['PassiveSpender_False'] = (~spent_nothing).astype(float).where(total_known)
    df['PassiveSpender_True'] = spent_nothing.astype(float).where(total_known)

    return df

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import KNNImputer

In [42]:
class PreprocessorTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn adapter around a plain preprocessing function.

    Parameters
    ----------
    preprocessor_function : callable
        Called with the data passed to ``transform``; its return value is the
        transformed output.
    leave_one_out : optional
        When not ``None`` it is forwarded to ``preprocessor_function`` as a
        second positional argument.
    """

    def __init__(self, preprocessor_function, leave_one_out = None):
        self.preprocessor_function = preprocessor_function
        self.leave_one_out = leave_one_out

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Apply the wrapped function to ``X`` (``y`` is ignored)."""
        if self.leave_one_out is not None:
            return self.preprocessor_function(X, self.leave_one_out)
        return self.preprocessor_function(X)

    def fit_transform(self, X, y = None):
        """Fit (a no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X, y)

In [43]:
# Feature-engineering pipeline: every step wraps one preprocess_* function and
# the steps run in the order listed.
preprocessor_pipeline = Pipeline(steps=[
    (step_name, PreprocessorTransformer(step_function))
    for step_name, step_function in (
        ('cryo_sleep', preprocess_cryo_sleep),
        ('vip', preprocess_vip),
        ('home_planet', preprocess_home_planet),
        ('destination', preprocess_destination),
        ('cabin', preprocess_cabin),
        ('name', preprocess_name),
        ('passenger_id', preprocess_passenger_id),
        ('spendings', preprocess_spendings),
    )
])

In [44]:
# Run only the feature-engineering steps on a copy of the training data.
df = preprocessor_pipeline.fit_transform(train_dataframe.copy())
df

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CryoSleep_False,CryoSleep_True,VIP_False,...,Side_S,Surname,GroupId,PersonalId,GroupSize,LowIncomeSpendings,HighIncomeSpendings,TotalSpendings,PassiveSpender_False,PassiveSpender_True
0,39.0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,1.0,...,0.0,Ofracculy,1,1,1,0.0,0.0,0.0,0.0,1.0
1,24.0,109.0,9.0,25.0,549.0,44.0,True,1.0,0.0,1.0,...,1.0,Vines,2,1,1,34.0,702.0,736.0,1.0,0.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,1.0,0.0,0.0,...,1.0,Susent,3,1,2,3576.0,6807.0,10383.0,1.0,0.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,False,1.0,0.0,1.0,...,1.0,Susent,3,2,2,1654.0,3522.0,5176.0,1.0,0.0
4,16.0,303.0,70.0,151.0,565.0,2.0,True,1.0,0.0,1.0,...,1.0,Santantines,4,1,1,221.0,870.0,1091.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,False,1.0,0.0,0.0,...,0.0,Noxnuther,9276,1,1,6819.0,1717.0,8536.0,1.0,0.0
8689,18.0,0.0,0.0,0.0,0.0,0.0,False,0.0,1.0,1.0,...,1.0,Mondalley,9278,1,1,0.0,0.0,0.0,0.0,1.0
8690,26.0,0.0,0.0,1872.0,1.0,0.0,True,1.0,0.0,1.0,...,1.0,Connon,9279,1,1,1872.0,1.0,1873.0,1.0,0.0
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,False,1.0,0.0,1.0,...,1.0,Hontichre,9280,1,2,1049.0,3588.0,4637.0,1.0,0.0


In [45]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured set of columns from a DataFrame.

    Parameters
    ----------
    columns : iterable
        Each element is passed to ``DataFrame.drop(..., axis=1)`` in turn, so
        an element may be a single column label or a list of labels.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns (``X`` is untouched)."""
        remaining = X.copy()
        for entry in self.columns:
            remaining = remaining.drop(entry, axis = 1)
        return remaining

In [46]:
# Columns removed right after feature engineering. Kept as a flat list of
# labels: the previous list-inside-a-list only worked because DataFrame.drop
# accepts a whole list of labels as a single item.
irrelevant_columns = ['GroupId', 'Surname', 'CabinNumber', 'PersonalId']

dropping_pipeline = Pipeline([('dropping', DropColumnsTransformer(irrelevant_columns))])

In [47]:
# Feature engineering followed by removal of the irrelevant columns.
pipeline = make_pipeline(preprocessor_pipeline, dropping_pipeline)

df = pipeline.fit_transform(train_dataframe.copy())
df.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CryoSleep_False,CryoSleep_True,VIP_False,...,Destination_TRAPPIST-1e,Deck,Side_P,Side_S,GroupSize,LowIncomeSpendings,HighIncomeSpendings,TotalSpendings,PassiveSpender_False,PassiveSpender_True
0,39.0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,1.0,...,1.0,2.0,1.0,0.0,1,0.0,0.0,0.0,0.0,1.0
1,24.0,109.0,9.0,25.0,549.0,44.0,True,1.0,0.0,1.0,...,1.0,6.0,0.0,1.0,1,34.0,702.0,736.0,1.0,0.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,False,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,2,3576.0,6807.0,10383.0,1.0,0.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,False,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,2,1654.0,3522.0,5176.0,1.0,0.0
4,16.0,303.0,70.0,151.0,565.0,2.0,True,1.0,0.0,1.0,...,1.0,6.0,0.0,1.0,1,221.0,870.0,1091.0,1.0,0.0


In [48]:
class ImputerKNNTransformer(BaseEstimator, TransformerMixin):
    """DataFrame-preserving wrapper around scikit-learn's ``KNNImputer``.

    The wrapped imputer is created with ``missing_values=np.nan`` and otherwise
    default settings. ``transform`` returns a DataFrame carrying the column
    labels and the index of its input, so imputed rows stay aligned with any
    other data keyed on that index.
    """

    def __init__(self):
        self.imputer = KNNImputer(missing_values=np.nan)

    def fit(self, X, y=None):
        """Fit the underlying imputer on ``X`` and return ``self``."""
        self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        """Impute missing values in ``X`` and return them as a DataFrame.

        ``X`` must be a DataFrame (its ``columns`` and ``index`` are reused).
        All values come back as floats because the imputer works on a float
        array.
        """
        imputed_values = self.imputer.transform(X)
        # Reuse the input index; a fresh RangeIndex would lose row alignment
        # for frames that were filtered, shuffled or split beforehand.
        return pd.DataFrame(imputed_values, columns = X.columns, index = X.index)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the imputed DataFrame."""
        return self.fit(X, y).transform(X, y)

In [49]:
# KNN imputation step. The `reduntant_columns` list that used to be repeated
# here was unused in this cell; it is defined in the later cell that builds
# `reduntant_dropping_pipeline`, which is the only place it is needed.
imputer_pipeline = Pipeline([
    ('knn', ImputerKNNTransformer())
])

In [50]:
# Feature engineering -> drop irrelevant columns -> KNN imputation.
# NOTE(review): the frame still contains the `Transported` target at this
# point, so the imputer receives it like any other column - confirm this is
# intended before using the output for model evaluation.
pipeline = make_pipeline(preprocessor_pipeline, dropping_pipeline, imputer_pipeline)

df = pipeline.fit_transform(train_dataframe.copy())
df.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CryoSleep_False,CryoSleep_True,VIP_False,...,Destination_TRAPPIST-1e,Deck,Side_P,Side_S,GroupSize,LowIncomeSpendings,HighIncomeSpendings,TotalSpendings,PassiveSpender_False,PassiveSpender_True
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,24.0,109.0,9.0,25.0,549.0,44.0,1.0,1.0,0.0,1.0,...,1.0,6.0,0.0,1.0,1.0,34.0,702.0,736.0,1.0,0.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,2.0,3576.0,6807.0,10383.0,1.0,0.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,2.0,1654.0,3522.0,5176.0,1.0,0.0
4,16.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,0.0,1.0,...,1.0,6.0,0.0,1.0,1.0,221.0,870.0,1091.0,1.0,0.0


In [51]:
# One indicator column from each one-hot encoded group; these are dropped
# after imputation by the pipeline defined below.
reduntant_columns = [
    "CryoSleep_False",
    "VIP_False",
    "HomePlanet_Earth",
    "Destination_55 Cancri e",
    "Side_P",
    "PassiveSpender_False",
]

reduntant_dropping_pipeline = Pipeline(
    steps=[("dropping", DropColumnsTransformer(reduntant_columns))]
)

In [52]:
# Full preprocessing chain: feature engineering -> drop irrelevant columns ->
# KNN imputation -> drop the redundant one-hot indicators.
pipeline = make_pipeline(
    preprocessor_pipeline, dropping_pipeline, imputer_pipeline, reduntant_dropping_pipeline
)

df = pipeline.fit_transform(train_dataframe.copy())
df.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CryoSleep_True,VIP_True,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck,Side_S,GroupSize,LowIncomeSpendings,HighIncomeSpendings,TotalSpendings,PassiveSpender_True
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0
1,24.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,1.0,34.0,702.0,736.0,0.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,3576.0,6807.0,10383.0,0.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,1654.0,3522.0,5176.0,0.0
4,16.0,303.0,70.0,151.0,565.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,1.0,221.0,870.0,1091.0,0.0


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False
