In [203]:
# (Re)load the autoreload extension and switch it to mode 2, so that imported
# modules are reloaded before each cell runs.
%reload_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

In [204]:
import sklearn
from sklearn import *
import sys, os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [205]:
# Resolve the current user and the dataset directory in pure Python, so this
# cell no longer depends on the `id` shell command or on a hard-coded /Users
# prefix.
import getpass  # imported here so the cell stays self-contained

# Login name of the current user (replaces the former `!id -un` shell call).
user = getpass.getuser()
# Titanic dataset directory under the user's home directory. With the default
# macOS home layout this is still /Users/<user>/datasets/kaggle/titanic.
data_path = os.path.join(os.path.expanduser('~'), 'datasets', 'kaggle', 'titanic')

In [206]:
# Locations of the CSV files, all directly under data_path.
train_data_path = f'{data_path}/train.csv'
test_data_path = f'{data_path}/test.csv'
test_submission_path = f'{data_path}/gender_submission.csv'
# NOTE(review): the variable name is misspelled ("sumission"). It is left as-is
# because cells outside this view may reference it — confirm before renaming.
sumission_path = f'{data_path}/submission.csv'

In [207]:
# Read the three CSV files into DataFrames, in the same order as before:
# train.csv, test.csv, then gender_submission.csv.
train_data, test_data, test_submission = map(
    pd.read_csv,
    (train_data_path, test_data_path, test_submission_path),
)

In [240]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that indexes its input by column name and returns the raw values.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Passed unchanged to ``X[...]`` inside ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values

class CategoricalEncoder(LabelEncoder):
    """``LabelEncoder`` with the ``(X, y=None)`` method signatures of a transformer.

    ``LabelEncoder`` methods take a single array. This subclass accepts an
    extra, ignored ``y`` argument so the encoder can be called the way a
    ``Pipeline`` calls its steps.
    """

    def fit(self, X, y=None):
        """Learn the set of labels present in ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        super().fit(X)
        # Return the estimator itself (previously None), as the scikit-learn
        # estimator API expects, so ``enc.fit(X).transform(X)`` works.
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the labels learned in ``fit``; ``y`` is ignored."""
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoding; ``y`` is ignored."""
        return self.fit(X).transform(X)
    
class CategoricalBinarizer(LabelBinarizer):
    """``LabelBinarizer`` with the ``(X, y=None)`` method signatures of a transformer.

    ``LabelBinarizer`` methods take a single array. This subclass accepts an
    extra, ignored ``y`` argument so the binarizer can be called the way a
    ``Pipeline`` calls its steps.
    """

    def fit(self, X, y=None):
        """Learn the set of labels present in ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        super().fit(X)
        # Return the estimator itself (previously None), as the scikit-learn
        # estimator API expects, so ``enc.fit(X).transform(X)`` works.
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` with the labels learned in ``fit``; ``y`` is ignored."""
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its binarized form; ``y`` is ignored."""
        return self.fit(X).transform(X)
    
class FirstCharacterExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that reduces every entry of an array to its first element."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No fitting required; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Flatten ``X`` and return a list holding ``entry[0]`` for each entry."""
        first_chars = []
        for entry in X.flatten():
            first_chars.append(entry[0])
        return first_chars
    
class ThreeClassesExtractor(BaseEstimator, TransformerMixin):
    """Collapse single-letter codes into three coarse groups.

    Mapping: ``A/B/C -> 'A'``, ``D/E/F -> 'B'``, ``G/T/Z -> 'C'``.

    Any value outside those nine letters is mapped to ``'C'``, the group that
    already contains the ``'Z'`` placeholder. The previous implementation
    silently dropped such values, which made the output shorter than the input
    and misaligned rows when combined with other feature branches.
    """

    # (group label, letters belonging to that group), checked in order.
    _GROUPS = (
        ('A', ('A', 'B', 'C')),
        ('B', ('D', 'E', 'F')),
        ('C', ('G', 'T', 'Z')),
    )
    # Group assigned to values that match none of the letters above.
    _FALLBACK_GROUP = 'C'

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No fitting required; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a list with exactly one group label per element of ``X``."""
        class_mapping = []
        for x in X:
            for group, letters in self._GROUPS:
                if x in letters:
                    class_mapping.append(group)
                    break
            else:
                # Unknown code: keep the row so the output length equals len(X).
                class_mapping.append(self._FALLBACK_GROUP)
        return class_mapping

In [241]:
def trainRandomForestModel(X_numeric, y, n_estimators=100, max_depth=2, random_state=0):
    """Fit a ``RandomForestClassifier`` on the given features and labels.

    Parameters
    ----------
    X_numeric : array-like
        Feature matrix passed to ``RandomForestClassifier.fit``.
    y : array-like
        Target labels passed to ``RandomForestClassifier.fit``.
    n_estimators, max_depth, random_state
        Forwarded to ``RandomForestClassifier``. The defaults are the values
        that were previously hard-coded, so existing two-argument calls behave
        as before.

    Returns
    -------
    The fitted classifier.
    """
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    # fit() returns the estimator itself, so this returns the fitted model.
    return clf.fit(X_numeric, y)

In [242]:
# Column-name groups; each list is handed to a DataFrameSelector in its own pipeline.
# Numeric columns.
num_attributes = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# Column fed to the binary pipeline.
binary_attribute = ['Sex']
# Column fed to the categorical pipeline.
cat_attributes = ['Embarked']
# Column fed to the cabin pipeline.
cabin_attribute = ['Cabin']

In [243]:
# Numeric branch: select the numeric columns, then fill missing values using
# the 'median' imputation strategy.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attribute_names=num_attributes)),
    ('imputer', SimpleImputer(strategy='median')),
])

In [244]:
# Binary branch: select the column(s) in binary_attribute and binarize the labels.
binary_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attribute_names=binary_attribute)),
    ('categorical_binarizer', CategoricalBinarizer()),
])

In [245]:
# Categorical branch: select the column(s) in cat_attributes, replace missing
# values with the constant 'Z', then binarize the labels.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attribute_names=cat_attributes)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Z')),
    ('categorical_binarizer', CategoricalBinarizer()),
])

In [246]:
# Cabin branch: select the cabin column, replace missing values with the
# constant 'Z', keep only the first character, collapse it into three groups,
# then binarize. The last step keeps the name 'categorical_encoder' (although
# it is a CategoricalBinarizer) because that name is looked up via named_steps.
cabin_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(attribute_names=cabin_attribute)),
    ('imputer', SimpleImputer(strategy='constant', fill_value='Z')),
    ('first_char_selector', FirstCharacterExtractor()),
    ('three_classes_extractor', ThreeClassesExtractor()),
    ('categorical_encoder', CategoricalBinarizer()),
])

In [247]:
# Combine the four branches into a single FeatureUnion.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
        ('binary_pipeline', binary_pipeline),
        ('cabin_pipeline', cabin_pipeline),
    ],
)

# Fit the union on the training frame and keep the transformed training features.
x_train = full_pipeline.fit_transform(train_data)

In [248]:
def createSimpleSubmission(clf, pipeline, test_data):
    """Predict on ``test_data`` and write the result to ``<ModelClassName>.csv``.

    Parameters
    ----------
    clf : fitted model exposing ``predict``
        Its class name determines the output file name.
    pipeline : fitted transformer exposing ``transform``
        Turns ``test_data`` into the feature matrix given to ``clf.predict``.
    test_data : mapping-like with a ``'PassengerId'`` column
        Frame to predict on.

    The file is written to the current working directory with a
    ``PassengerId,Survived`` header followed by one row per passenger.
    """
    import csv

    # Use the pipeline that was passed in. The previous version ignored this
    # argument and always read the notebook-global ``full_pipeline``.
    x_numeric = pipeline.transform(test_data)
    y_pred = clf.predict(x_numeric)

    file_name = f"{type(clf).__name__}.csv"
    # newline='' is what the csv module documents for files handed to
    # csv.writer; the writer's own lineterminator then controls line endings.
    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['PassengerId', 'Survived'])
        writer.writerows(zip(test_data['PassengerId'], y_pred))

In [249]:
# Target labels: the 'Survived' column of the training frame.
y_train = train_data['Survived']

# Fit the random forest on the features produced by full_pipeline.
clf = trainRandomForestModel(x_train, y_train)
# Refit the cabin branch on the training frame and keep a handle on its last
# step (registered under the name 'categorical_encoder').
# NOTE(review): this variable is not used in the visible cells — confirm it is
# needed further down before keeping the extra fit.
cabin_categorical_encoder = cabin_pipeline.fit(train_data).named_steps.categorical_encoder

# Predict on the test frame and write the CSV named after the model class.
createSimpleSubmission(clf, full_pipeline, test_data)