In [12]:
import os
import sys
sys.path.insert(0, os.path.abspath('../src/'))

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion

from modules.transformers import ImputeTransformer, OneHotTransformer, SmoothMeanTransformer, DropTransformer

In [13]:
FOLDER_NAME = "../data"

validation_X = pd.read_csv(os.path.join(FOLDER_NAME, "test.csv"))
X = pd.read_csv(os.path.join(FOLDER_NAME, "train.csv"))

# Drop useless cols
drop_cols = ["Name", "PassengerId"]
validation_X = validation_X.drop(columns=drop_cols)
X = X.drop(columns=drop_cols)

Y = X["Survived"]
X = X.drop(columns="Survived")
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,female,35.0,1,0,113803,53.1000,C123,S
4,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,211536,13.0000,,S
887,1,female,19.0,0,0,112053,30.0000,B42,S
888,3,female,,1,2,W./C. 6607,23.4500,,S
889,1,male,26.0,0,0,111369,30.0000,C148,C


In [14]:
# Lets break the data into 10-fold sets

skf = StratifiedKFold(n_splits=10)
X_train_sets, X_test_sets = [], []
Y_train_sets, Y_test_sets = [], []
for train_idx, test_idx in skf.split(X, Y):
    X_train_sets.append(X.loc[train_idx, :].reset_index(drop=True))
    Y_train_sets.append(Y.loc[train_idx].reset_index(drop=True))
    X_test_sets.append(X.loc[test_idx, :].reset_index(drop=True))
    Y_test_sets.append(Y.loc[test_idx].reset_index(drop=True))

del skf

In [15]:
steps = [
    ("embarked_imp", ImputeTransformer(col="Embarked", missing_values=np.nan, strategy='most_frequent')), 
    ("fare_imp", ImputeTransformer(col="Fare", missing_values=np.nan, strategy='mean')),
    ("age_imp", ImputeTransformer(col="Age", missing_values=np.nan, strategy='mean')),
    ("emabarked_enc", OneHotTransformer(col="Embarked", sparse=False, drop="first")),
    ("sex_enc", OneHotTransformer(col="Sex", sparse=False, drop="first")),
    ("ticket_enc", SmoothMeanTransformer(col="Ticket", m=200)),
    ("drop_cols", DropTransformer(cols=["Sex", "Cabin"])), 
]

X_train = X_train_sets[0]
Y_train = Y_train_sets[0]
basic_pl = Pipeline(steps=steps, verbose=True)


[Pipeline] ...... (step 1 of 7) Processing embarked_imp, total=   0.0s
[Pipeline] .......... (step 2 of 7) Processing fare_imp, total=   0.0s
[Pipeline] ........... (step 3 of 7) Processing age_imp, total=   0.0s
[Pipeline] ..... (step 4 of 7) Processing emabarked_enc, total=   0.0s
[Pipeline] ........... (step 5 of 7) Processing sex_enc, total=   0.0s
[Pipeline] ........ (step 6 of 7) Processing ticket_enc, total=   0.0s
[Pipeline] ......... (step 7 of 7) Processing drop_cols, total=   0.0s


Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Embarked,Embarked_Q,Embarked_S,Sex_male
0,3,30.057205,0,0,0.386339,7.7875,Q,1.0,0.0,0.0
1,2,17.000000,0,0,0.386339,10.5000,S,0.0,1.0,0.0
2,3,33.000000,3,0,0.384427,15.8500,S,0.0,1.0,0.0
3,1,23.000000,3,2,0.387459,263.0000,S,0.0,1.0,0.0
4,3,59.000000,0,0,0.381364,7.2500,S,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
796,2,27.000000,0,0,0.381364,13.0000,S,0.0,1.0,1.0
797,1,19.000000,0,0,0.386339,30.0000,S,0.0,1.0,0.0
798,3,30.057205,1,2,0.379476,23.4500,S,0.0,1.0,0.0
799,1,26.000000,0,0,0.386339,30.0000,C,0.0,0.0,1.0
