In [2]:
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [5]:
test_raw = pd.read_csv('raw_data/test.csv')
train_raw = pd.read_csv('raw_data/train.csv')

In [49]:
useless_features = ['Name', 'Ticket', 'Cabin']
passenger_id = 'PassengerId'
label = 'Survived'

numeric_features = ['Age', 'Fare']
cat_features = ['Pclass', 'Sex', 'SibSp', 'Embarked', 'Parch']

In [51]:
def cleanup(df, keepPassengerId=False):
    if keepPassengerId:
        df = df.drop(columns=useless_features) # drop useless columns
    else:
        df = df.drop(columns=[passenger_id, *useless_features]) # drop useless columns
        
    df = df.drop_duplicates() # drop duplicate rows
    df[cat_features] = df[cat_features].astype('category') # convert to category
    return df

In [52]:
train_clean = cleanup(train_raw)

In [53]:
train_X, train_y = train_clean.drop(columns=[label]), train_clean[label]

In [58]:
cat_features_categories = []

for column in cat_features:
    cats = set(train_raw[column].unique().tolist() + test_raw[column].unique().tolist())
    cats.discard(np.nan)
    cat_features_categories.append(sorted(list(cats)))
    
print(cat_features_categories)

[[1, 2, 3], ['female', 'male'], [0, 1, 2, 3, 4, 5, 8], ['C', 'Q', 'S'], [0, 1, 2, 3, 4, 5, 6, 9]]


In [59]:
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.NaN, strategy='most_frequent')),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, categories=cat_features_categories)),
    ]
)

In [60]:
numeric_transformer = Pipeline(
    steps=[
        ("imputer", KNNImputer(missing_values=np.NaN, n_neighbors=5)), 
        ("scaler", StandardScaler())
    ]
)

In [61]:
pipe = Pipeline(
    steps=[
        ('preprocessor', ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, cat_features),
            ] 
        )),
    ]
)

In [62]:
pipe.steps[0][1].transformers[1][1].steps[1][1].get_params()

{'categories': [[1, 2, 3],
  ['female', 'male'],
  [0, 1, 2, 3, 4, 5, 8],
  ['C', 'Q', 'S'],
  [0, 1, 2, 3, 4, 5, 6, 9]],
 'drop': None,
 'dtype': numpy.float64,
 'feature_name_combiner': 'concat',
 'handle_unknown': 'ignore',
 'max_categories': None,
 'min_frequency': None,
 'sparse': 'deprecated',
 'sparse_output': False}

In [63]:
transformed_train = pipe.fit_transform(train_X)

In [64]:
test_clean = cleanup(test_raw, keepPassengerId=True)
test_X, passenger_ids = test_clean.drop(columns=[passenger_id]), test_clean[passenger_id]
transformed_test = pipe.fit_transform(test_X)

In [66]:
transformed_train.shape, transformed_test.shape

((780, 25), (418, 25))

In [65]:
transformed_test[0]

array([ 0.37270783, -0.50073294,  0.        ,  0.        ,  1.        ,
        0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [68]:
np.savetxt('preprocessed_data/train_X.csv', transformed_train, delimiter=',', fmt=['%f'] * 2 + ['%d'] * 23)
np.savetxt('preprocessed_data/train_y.csv', train_y, delimiter=',', fmt='%d')
np.savetxt('preprocessed_data/test_X.csv', transformed_test, delimiter=',', fmt=['%f'] * 2 + ['%d'] * 23)
np.savetxt('preprocessed_data/passenger_ids.csv', passenger_ids, delimiter=',', fmt='%d')