In [10]:
import os
import sys
sys.path.insert(0, os.path.abspath('../src/'))

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

from modules.transformers import (
    ImputeTransformer, 
    OneHotTransformer, 
    SmoothMeanTransformer, 
    DropTransformer, 
    Debug, 
    ResetIndexTransformer, 
    StandardScalerTransformer
)

In [11]:
FOLDER_NAME = "../data"

# Columns removed from both CSVs before any modelling.
drop_cols = ["Name", "PassengerId"]

# test.csv is loaded alongside the training data and gets the same column drop.
validation_X = pd.read_csv(os.path.join(FOLDER_NAME, "test.csv")).drop(columns=drop_cols)

# train.csv carries the "Survived" label: peel it off into Y and keep the
# remaining columns as the feature frame X.
train_frame = pd.read_csv(os.path.join(FOLDER_NAME, "train.csv")).drop(columns=drop_cols)
Y = train_frame["Survived"]
X = train_frame.drop(columns="Survived")
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,female,35.0,1,0,113803,53.1000,C123,S
4,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,211536,13.0000,,S
887,1,female,19.0,0,0,112053,30.0000,B42,S
888,3,female,,1,2,W./C. 6607,23.4500,,S
889,1,male,26.0,0,0,111369,30.0000,C148,C


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=35,
)

In [13]:
# Preprocessing stages grouped by purpose; they execute in the order in which
# they are assembled into `steps` below.
# (A ("debugN", Debug()) step can be slotted between any two stages while
# developing -- presumably to inspect the intermediate frame.)
imputation_steps = [
    ("embarked_imp", ImputeTransformer(col="Embarked", missing_values=np.nan, strategy='most_frequent')),
    ("fare_imp", ImputeTransformer(col="Fare", missing_values=np.nan, strategy='mean')),
    ("age_imp", ImputeTransformer(col="Age", missing_values=np.nan, strategy='mean')),
]

encoding_steps = [
    ("emabarked_enc", OneHotTransformer(col="Embarked")),
    ("sex_enc", OneHotTransformer(col="Sex")),
    ("ticket_enc", SmoothMeanTransformer(col="Ticket", m=200)),
]

finishing_steps = [
    # The raw categorical columns are dropped only after they have been encoded.
    ("drop_cols", DropTransformer(cols=["Sex", "Cabin", "Embarked"])),
    ("scale", StandardScalerTransformer()),
]

steps = [
    ("reset_index", ResetIndexTransformer()),
    *imputation_steps,
    *encoding_steps,
    *finishing_steps,
]

transform_pl = Pipeline(steps=steps, verbose=False)

# Test the pipeline on train
transform_pl.fit_transform(X_train, Y_train)


Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Sex_male
0,1.0,0.472229,0.000,0.0,0.293694,0.013761,0.0,1.0,1.0
1,1.0,0.421965,0.000,0.0,0.680952,0.012679,0.0,1.0,1.0
2,0.5,0.673285,0.000,0.0,0.680952,0.027326,0.0,1.0,1.0
3,0.0,0.610455,0.125,0.0,0.531461,0.111118,0.0,0.0,1.0
4,0.0,0.798944,0.125,0.8,1.000000,0.513342,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
707,0.5,0.673285,0.125,0.0,0.916801,0.050749,0.0,1.0,1.0
708,1.0,0.057552,0.250,0.2,0.236875,0.037590,0.0,0.0,0.0
709,0.5,0.824076,0.000,0.0,0.680952,0.020495,0.0,1.0,1.0
710,1.0,0.308872,0.000,0.0,0.531461,0.000000,0.0,1.0,1.0


In [14]:
# Apply the already-fitted preprocessing to the held-out rows (no re-fitting).
X_test_transformed = transform_pl.transform(X_test)
X_test_transformed

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Sex_male
0,1.0,0.208344,0.000,0.0,0.442744,0.016908,0.0,1.0,1.0
1,0.5,0.220910,0.000,0.4,0.442744,0.025374,0.0,1.0,0.0
2,0.5,0.522493,0.125,0.0,0.442744,0.050749,0.0,1.0,0.0
3,1.0,0.409399,0.000,0.0,0.442744,0.015412,0.0,1.0,1.0
4,1.0,0.220910,0.125,0.0,0.293694,0.034743,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
174,0.0,0.886906,0.000,0.0,0.442744,0.067640,0.0,0.0,1.0
175,0.5,0.421965,0.000,0.0,0.442744,0.025374,0.0,1.0,1.0
176,0.5,0.673285,0.000,0.0,0.293694,0.050749,0.0,1.0,1.0
177,0.5,0.258608,0.125,0.0,0.442744,0.022447,0.0,1.0,1.0


In [15]:
# Preprocessing followed by an empty "clf" slot; the grid search later fills
# that slot through its 'clf' parameter.
steps = [("transform", transform_pl), ("clf", None)]

clf_pl = Pipeline(steps=steps, verbose=False).fit(X_train, Y_train)

In [29]:
%%time

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

RANDOM_STATE=45

grid_params = [
    {
        'clf': (KNeighborsClassifier(),),
        'clf__n_neighbors': np.arange(1, 20, 3),
        'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'clf__metric': ['euclidean', 'minkowski'],
    },
    {
        'clf': (DecisionTreeClassifier(random_state=RANDOM_STATE),),
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': np.linspace(1, 1000, 100),
        'clf__max_features': ['auto', 'sqrt', 'log2'],
    },
    {
        'clf': (LinearSVC(max_iter=100000, random_state=RANDOM_STATE),),
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    },
    {
        'clf': (SVC(max_iter=10000, random_state=RANDOM_STATE),),
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__kernel': ['poly', 'rbf', 'sigmoid'],
        'clf__gamma': ['scale', 'auto'],
    },
    {
        'clf': (RandomForestClassifier(random_state=RANDOM_STATE),),
        'clf__n_estimators': np.arange(10, 1000, 20),
        'clf__max_features': ['auto', 'sqrt', 'log2'],
    },

]
grid = GridSearchCV(clf_pl, 
                    grid_params, 
                    verbose=4, 
                    cv=8, 
                    scoring="f1", 
                    n_jobs=4, 
                    error_score='raise')
grid.fit(X_train, Y_train)

print("Best F1 Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)


Fitting 8 folds for each of 855 candidates, totalling 6840 fits
Best F1 Score:  0.7487104938462578
Best Params:  {'clf': RandomForestClassifier(n_estimators=250, random_state=45), 'clf__max_features': 'auto', 'clf__n_estimators': 250}
CPU times: user 18.2 s, sys: 956 ms, total: 19.2 s
Wall time: 14min 11s


As we can see, the grid search is a very expensive operation (about 14 minutes of wall time for 6,840 fits). `RandomForestClassifier` performs best, with the configuration shown above. Now let's train it on the whole training split (`X_train`) and see how well our classifier performs on the held-out test split (`X_test`).

In [32]:
from sklearn.metrics import accuracy_score, f1_score

# Re-create the forest with the hyper-parameters written out explicitly
# (250 trees, max_features='auto', shared seed).
clf = RandomForestClassifier(
    n_estimators=250,
    max_features='auto',
    random_state=RANDOM_STATE,
)

steps = [("transform", transform_pl), ("clf", clf)]

# verbose=True makes the pipeline log the time spent in each step.
final_pl = Pipeline(steps=steps, verbose=True)
final_pl.fit(X_train, Y_train)

# Score the fitted pipeline on the held-out split.
Y_pred = final_pl.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
test_f1 = f1_score(Y_test, Y_pred)

print(f"Accuracy: {test_accuracy}")
print(f"F1 scores: {test_f1}")

[Pipeline] ......... (step 1 of 2) Processing transform, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.5s
Accuracy: 0.8100558659217877
F1 scores: 0.757142857142857
