In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

In [45]:
test_raw = pd.read_csv('raw_data/test.csv')
train_raw = pd.read_csv('raw_data/train.csv')

In [46]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

for train_index, test_index in split.split(train_raw, train_raw[['Survived', 'Pclass', 'Sex']]):
    train_set = train_raw.loc[train_index]
    val_set = train_raw.loc[test_index]

train_X, train_y = train_set.drop(['Survived'], axis=1), train_set['Survived']
val_X, val_y = val_set.drop(['Survived'], axis=1), val_set['Survived']

In [47]:
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch']
cat_features = ['Sex', 'Embarked', 'Pclass']

In [48]:
class FeatureDropperAndCategorySetter(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Set categorical features
        X[cat_features] = X[cat_features].astype('category')
        # Drop useless features
        return X.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

In [49]:
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy='most_frequent')),
        ("encoder", OneHotEncoder()),
    ]
)

In [50]:
numeric_transformer = Pipeline(
    steps=[
        ("imputer", KNNImputer(n_neighbors=5)), 
    ]
)

In [51]:
pipe = Pipeline(
    steps=[
        ('dropper', FeatureDropperAndCategorySetter()),
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, cat_features),
            ],
            remainder='passthrough'
        )),
        ("scaler", StandardScaler()),
    ]
)

In [52]:
transformed_train = pipe.fit_transform(train_X)
transformed_train

array([[-0.69141088,  4.54772632,  1.38159739, ...,  1.77187327,
        -0.51226102, -1.10679718],
       [ 1.14389996,  0.57381132,  0.46311248, ...,  1.77187327,
        -0.51226102, -1.10679718],
       [ 1.58437457, -0.37769641, -0.45537243, ..., -0.56437445,
         1.95212981, -1.10679718],
       ...,
       [-0.83823575, -0.47850981, -0.45537243, ..., -0.56437445,
        -0.51226102,  0.9035079 ],
       [-0.69141088, -0.48089574, -0.45537243, ..., -0.56437445,
        -0.51226102,  0.9035079 ],
       [-1.93942225, -0.30461745,  0.46311248, ..., -0.56437445,
        -0.51226102,  0.9035079 ]])

In [53]:
transformed_val = pipe.fit_transform(val_X)
transformed_val

array([[-0.40600161, -0.55448381, -0.5501797 , ..., -0.57089923,
        -0.50174521,  0.89893315],
       [ 1.64018266,  4.71210608, -0.5501797 , ...,  1.75162262,
        -0.50174521, -1.11242977],
       [ 0.80626605, -0.05540492, -0.5501797 , ...,  1.75162262,
        -0.50174521, -1.11242977],
       ...,
       [-0.05853636,  0.52531664, -0.5501797 , ..., -0.57089923,
        -0.50174521,  0.89893315],
       [-0.05853636, -0.53923361,  1.1928675 , ..., -0.57089923,
        -0.50174521,  0.89893315],
       [-0.36739436,  0.7117074 ,  0.3213439 , ..., -0.57089923,
         1.99304346, -1.11242977]])

In [54]:
transformed_test = pipe.fit_transform(test_raw)
transformed_test

array([[ 0.29802685, -0.49740241, -0.49947002, ..., -0.58655899,
        -0.5349335 ,  0.95782629],
       [ 1.26468542, -0.51226703,  0.61699237, ..., -0.58655899,
        -0.5349335 ,  0.95782629],
       [ 2.4246757 , -0.46408966, -0.49947002, ..., -0.58655899,
         1.86939125, -1.04403065],
       ...,
       [ 0.60735759, -0.50778541, -0.49947002, ..., -0.58655899,
        -0.5349335 ,  0.95782629],
       [ 0.18976109, -0.49344424, -0.49947002, ..., -0.58655899,
        -0.5349335 ,  0.95782629],
       [-0.69956479, -0.23694703,  0.61699237, ..., -0.58655899,
        -0.5349335 ,  0.95782629]])

In [55]:
transformed_train.shape, transformed_test.shape

((712, 12), (418, 12))

In [56]:
train_y

742    1
92     0
714    0
865    1
433    0
      ..
881    0
466    0
566    0
408    0
10     1
Name: Survived, Length: 712, dtype: int64

In [57]:
passenger_ids = test_raw["PassengerId"]

In [58]:
np.savetxt('preprocessed_data/train_X.csv', transformed_train, delimiter=',', fmt="%f")
np.savetxt('preprocessed_data/train_y.csv', train_y, delimiter=',', fmt='%d')
np.savetxt('preprocessed_data/val_X.csv', transformed_val, delimiter=',', fmt="%f")
np.savetxt('preprocessed_data/val_y.csv', val_y, delimiter=',', fmt='%d')
np.savetxt('preprocessed_data/test_X.csv', transformed_test, delimiter=',', fmt="%f")
np.savetxt('preprocessed_data/passenger_ids.csv', passenger_ids, delimiter=',', fmt='%d')