In [22]:
import os
import sys
sys.path.insert(0, os.path.abspath('../src/'))

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion

from modules.transformers import ImputeTransformer, OneHotTransformer, SmoothMeanTransformer, DropTransformer, Debug, ResetIndexTransformer

In [23]:
FOLDER_NAME = "../data"

validation_X = pd.read_csv(os.path.join(FOLDER_NAME, "test.csv"))
X = pd.read_csv(os.path.join(FOLDER_NAME, "train.csv"))

# Drop useless cols
drop_cols = ["Name", "PassengerId"]
validation_X = validation_X.drop(columns=drop_cols)
X = X.drop(columns=drop_cols)

Y = X["Survived"]
X = X.drop(columns="Survived")
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,female,35.0,1,0,113803,53.1000,C123,S
4,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,211536,13.0000,,S
887,1,female,19.0,0,0,112053,30.0000,B42,S
888,3,female,,1,2,W./C. 6607,23.4500,,S
889,1,male,26.0,0,0,111369,30.0000,C148,C


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=35)

In [25]:
steps = [
    ("reset_index", ResetIndexTransformer()),
    ("embarked_imp", ImputeTransformer(col="Embarked", missing_values=np.nan, strategy='most_frequent')),
    # ("debug1", Debug()),
    ("fare_imp", ImputeTransformer(col="Fare", missing_values=np.nan, strategy='mean')),
    # ("debug2", Debug()),
    ("age_imp", ImputeTransformer(col="Age", missing_values=np.nan, strategy='mean')),
    # ("debug3", Debug()),
    ("emabarked_enc", OneHotTransformer(col="Embarked")),
    # ("debug4", Debug()),
    ("sex_enc", OneHotTransformer(col="Sex")),
    # ("debug5", Debug()),
    ("ticket_enc", SmoothMeanTransformer(col="Ticket", m=200)),
    ("drop_cols", DropTransformer(cols=["Sex", "Cabin", "Embarked"])),
    # ("debug6", Debug()),
]


transform_pl = Pipeline(steps=steps, verbose=False)

# Test the pipeline on train
transform_pl.fit_transform(X_train, Y_train)


Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Sex_male
0,3,38.0,0,0,0.382971,7.0500,0.0,1.0,1.0
1,3,34.0,0,0,0.387946,6.4958,0.0,1.0,1.0
2,2,54.0,0,0,0.387946,14.0000,0.0,1.0,1.0
3,1,49.0,1,0,0.386026,56.9292,0.0,0.0,1.0
4,1,64.0,1,4,0.392045,263.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
707,2,54.0,1,0,0.390976,26.0000,0.0,1.0,1.0
708,3,5.0,2,1,0.382241,19.2583,0.0,0.0,0.0
709,2,66.0,0,0,0.387946,10.5000,0.0,1.0,1.0
710,3,25.0,0,0,0.386026,0.0000,0.0,1.0,1.0


In [26]:
transform_pl.transform(X_test)

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Sex_male
0,3,17.000000,0,0,0.384886,8.6625,0.0,1.0,1.0
1,2,18.000000,0,2,0.384886,13.0000,0.0,1.0,0.0
2,2,42.000000,1,0,0.384886,26.0000,0.0,1.0,0.0
3,3,33.000000,0,0,0.384886,7.8958,0.0,1.0,1.0
4,3,18.000000,1,0,0.382971,17.8000,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
174,1,71.000000,0,0,0.384886,34.6542,0.0,0.0,1.0
175,2,34.000000,0,0,0.384886,13.0000,0.0,1.0,1.0
176,2,54.000000,0,0,0.382971,26.0000,0.0,1.0,1.0
177,2,21.000000,1,0,0.384886,11.5000,0.0,1.0,1.0


In [27]:
from sklearn.tree import DecisionTreeClassifier

steps = [
    ("transform", transform_pl),
    ("clf", None)
]

clf_pl = Pipeline(steps=steps, verbose=False)
clf_pl = clf_pl.fit(X_train, Y_train)

In [28]:
from sklearn.model_selection import GridSearchCV

grid_params = {
    'clf': (DecisionTreeClassifier(),),
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': np.linspace(1, 1000, 10),
    'clf__max_features': ['auto', 'sqrt', 'log2']
}
clf = GridSearchCV(clf_pl, grid_params, verbose=1, cv=10, scoring="f1")
clf.fit(X_train, Y_train)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)


Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best Score:  0.7110783752484846
Best Params:  {'clf': DecisionTreeClassifier(criterion='entropy', max_depth=334.0,
                       max_features='auto'), 'clf__criterion': 'entropy', 'clf__max_depth': 334.0, 'clf__max_features': 'auto'}
