In [1]:
%load_ext autoreload
%autoreload 2

In [396]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

from dumbrain.ml.kaggle.lib import initCompetition, submit
from dumbrain.lib.cleaners import *
from dumbrain.lib.cleaners.column import *

%matplotlib inline

In [5]:
COMPETITION_NAME = 'titanic'
RANDOM_STATE = 4111

In [342]:
test_file, train_file, example_output = initCompetition( COMPETITION_NAME )

In [343]:
all_train_data = pd.read_csv( train_file )
all_test_data = pd.read_csv( test_file )
all_example_data = pd.read_csv( example_output )

In [344]:
train_data_uncleaned = all_train_data

In [345]:
def predictAgePerTitle( _data, _test_data ):
    lr_cleaners = [
        RemoveColumnCleaner( [ 'PassengerId', 'Survived', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Sex' ] )
    #     DummyColumnCleaner( 'Title', ['Mr', 'Mrs', 'Miss', 'Master', 'Royal', 'Officer', 'Major', 'Col','Capt'] ),
    ]
    
    def getTitle( name ):
        for token in name.split( ' ' ):
            if '.' in token:
                return token.replace( '.', '' )
        return None

    _data[ 'Title' ] = _data[ 'Name' ].apply( getTitle )
    _test_data[ 'Title' ] = _test_data[ 'Name' ].apply( getTitle )

    groups = {
        'Misc': [ 'Col', 'Major', 'Capt', 'Jonkheer', 'Don', 'Sir', 'Lady', 'Countess', 'Dr', 'Rev' ],
        'Mrs': [ 'Ms', 'Mme', 'Mlle' ]
    }
    for to_title, from_titles in groups.items():
        for from_title in from_titles:
            _data.loc[ _data[ 'Title' ] == from_title, 'Title' ] = to_title
            _test_data.loc[ _test_data[ 'Title' ] == from_title, 'Title' ] = to_title

    for title in _data.Title.unique():
        lr_cleaned = cleanData( lr_cleaners, _data[ _data.Title == title ] )
        lr_cleaned = lr_cleaned.drop( 'Title', axis=1 )

        test = lr_cleaned[ lr_cleaned.Age.isna() ]
        lr_cleaned_train = lr_cleaned.drop( test.index )

        age_model = LinearRegression()
        age_model.fit( lr_cleaned_train.drop( [ 'Age' ], axis=1 ), lr_cleaned_train[ 'Age' ] )

        if test.shape[ 0 ] != 0:
            print( 'Score', title, cross_val_score( age_model, lr_cleaned_train.drop( [ 'Age' ], axis=1 ), lr_cleaned_train[ 'Age' ], cv=2 ).mean() )
            _data.loc[ ( _data.Title == title ) & ( _data.Age.isna() ), 'Age' ] = age_model.predict( test.drop( 'Age', axis=1 ) )

        _test_filter = ( _test_data.Title == title ) & ( _test_data.Age.isna() )
        _test_cleaned = cleanData( lr_cleaners, _test_data.loc[ _test_filter ] ).drop( 'Title', axis=1 )
        print( len( _test_cleaned ) )
        if len( _test_cleaned ) != 0:
            _test_data.loc[ _test_filter, 'Age' ] = age_model.predict( _test_cleaned.drop( 'Age', axis=1 ) )

predictAgePerTitle( all_train_data, all_test_data )

Score Mr 0.16808358149081604
57
Score Mrs 0.007817496278279823
11
Score Miss 0.39980187807731593
14
Score Master -0.3181188549468281
4
Score Misc -0.23013750799417143
0


In [381]:
def getFamilySize( data ):
    return data.SibSp + data.Parch

def isAlone( data ):
    return np.where( data.FamilySize == 1, 1, 0 )

def convertToInt16( data ):
    return np.int16( data )

def getLastName( data ):
    return data.split( ',' )[ 0 ].strip()

def strlen( data ):
    if isinstance( data, float ) and  np.isnan( data ):
        return 0
    return len( data )

def nameScoreMean( data ):
    return data[ 'Name_score' ].where( ~data[ 'Name_score' ].isna(), data[ 'Name_score' ].mean() )

def fareMean( data ):
    return data[ 'Fare' ].where( ~data[ 'Fare' ].isna(), data[ 'Fare' ].mean() )

last_names = train_data_uncleaned.Name.apply( getLastName )
last_names_dataset = pd.DataFrame( { 'Name': last_names, 'Survived': train_data_uncleaned.Survived } )
name_sentiment_cleaner = BasicTokenSentimentColumnCleaner( 'Name', last_names_dataset, 'Survived' )

# titles = train_data_uncleaned.Title
# last_names_dataset = pd.DataFrame( { 'Title': last_names, 'Survived': train_data_uncleaned.Survived } )
# name_sentiment_cleaner = BasicTokenSentimentColumnCleaner( 'Name', last_names_dataset, 'Survived' )

cleaners = [
    RemoveColumnCleaner( 'PassengerId' ),
    CalculatedColumnCleaner( 'Fare', fareMean ),
    MapColumnCleaner( 'Name', getLastName ),
    DummyColumnCleaner( 'Name', last_names ),
    DummyColumnCleaner( 'Sex', [ 'male', 'female' ] ),
    RemoveColumnCleaner( 'Sex' ),
#     RemoveColumnCleaner( 'Fare' ),
#     RemoveColumnCleaner( 'Title' ),
    DummyColumnCleaner( 'Title', train_data_uncleaned.Title.unique() ),
#     DummyColumnCleaner( 'Pclass', [ 1, 2, 3 ] ),
    RemoveColumnCleaner( 'Ticket' ),                     # Todo: Use this data
#     MapColumnCleaner( 'Ticket', strlen ),
#     RemoveColumnCleaner( 'Cabin' ),                      # Todo: Use this data
    MapColumnCleaner( 'Cabin', strlen ),
    DummyColumnCleaner( 'Embarked', [ 'S', 'C', 'Q' ] ),
    CalculatedColumnCleaner( 'FamilySize', getFamilySize ),
    RemoveColumnCleaner( 'SibSp' ),
    RemoveColumnCleaner( 'Parch' ),
    CalculatedColumnCleaner( 'isAlone', isAlone ),
    RemoveColumnCleaner( 'FamilySize' ),
    ConvertDataCleaner( np.float64 ),
    
]

train_data = cleanData( cleaners, train_data_uncleaned )
# validate_data = cleanData( cleaners, validate_data_uncleaned )

# print( pd.get_dummies( train_data.Name ).shape )

train_data.head()

# name_sentiment_cleaner.sentiments.sort_values( 'scores', ascending=False ).head()
# train_data.corr()

Unnamed: 0,Survived,Pclass,Age,Fare,Cabin,_Braund_Name,_Cumings_Name,_Heikkinen_Name,_Futrelle_Name,_Allen_Name,...,_female_Sex,_Mr_Title,_Mrs_Title,_Miss_Title,_Master_Title,_Misc_Title,_S_Embarked,_C_Embarked,_Q_Embarked,isAlone
0,0.0,3.0,22.0,7.25,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,38.0,71.2833,3.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,3.0,26.0,7.925,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,1.0,35.0,53.1,4.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,3.0,35.0,8.05,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [382]:
x_train_data = train_data.drop( 'Survived', axis=1 )
y_train_data = train_data[ 'Survived' ]

In [383]:
def scoreModel( model, x_train_data, y_train_data ):
    return cv_score

def predict( model, _all_test_data ):
    test_data_cleaned = cleanData( cleaners, _all_test_data )
    predicted = model.predict( test_data_cleaned )
    output = pd.DataFrame( { 'PassengerId': all_test_data[ 'PassengerId' ], 'Survived': predicted.astype( np.int64 ) } )
    output = output.set_index( 'PassengerId' )
    return output

In [384]:
kfold = StratifiedKFold( 5, random_state=RANDOM_STATE )

In [385]:
model = RandomForestClassifier( n_estimators = 10, max_depth = 10, random_state=RANDOM_STATE )
cross_val_score( model, x_train_data, y_train_data, cv=kfold )

array([0.77094972, 0.82122905, 0.8258427 , 0.74157303, 0.83615819])

In [386]:
# With Grid Parameter Search
cv_score = cross_val_score( model, x_train_data, y_train_data, cv=kfold )

RF = RandomForestClassifier( random_state=RANDOM_STATE )
PRF = [ { 'n_estimators': [ 10, 100 ], 'max_depth': [ 3, 6, 10 ] , 'criterion': [ 'gini', 'entropy' ] } ]
GSRF = GridSearchCV( estimator=RF, param_grid=PRF, scoring='accuracy', cv=kfold )
cross_val_score( GSRF, x_train_data, y_train_data, cv=kfold )



array([0.84357542, 0.82122905, 0.80898876, 0.76404494, 0.83615819])

In [387]:
GSRF.fit( x_train_data, y_train_data )
# predict( GSRF, all_test_data )

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4111, shuffle=False),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=4111, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [10, 100], 'max_depth': [3, 6, 10], 'criterion': ['gini', 'entropy']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [388]:
cross_val_score( GSRF, x_train_data, y_train_data, cv=kfold )



array([0.84357542, 0.82122905, 0.80898876, 0.76404494, 0.83615819])

In [389]:
output = predict( GSRF, all_test_data )

In [391]:
svc_model = make_pipeline( StandardScaler(), SVC( random_state=RANDOM_STATE ) )
# r = [ 0.0001, 0.001, 0.1, 1, 10, 50, 100 ]
# svc_model.fit( x_train_data, y_train_data )

r = [ 0.0001, 0.001, 0.1, 1, 10, 50, 100 ]
parameters_svc = [
    { 'svc__C': r, 'svc__kernel': [ 'linear' ] }, 
    { 'svc__C': r, 'svc__gamma': r, 'svc__kernel': [ 'rbf' ] } 
]
gs_svc_model = GridSearchCV( estimator=svc_model, param_grid=parameters_svc, scoring='accuracy', cv=kfold, verbose=1, n_jobs=20 )

gs_svc_cv_score = cross_val_score( gs_svc_model, x_train_data.astype( np.float64 ), y_train_data, cv = kfold )
gs_svc_cv_score

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    7.9s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   29.6s
[Parallel(n_jobs=20)]: Done 280 out of 280 | elapsed:   47.5s finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    2.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   24.5s
[Parallel(n_jobs=20)]: Done 280 out of 280 | elapsed:   42.5s finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    2.5s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   24.4s
[Parallel(n_jobs=20)]: Done 280 out of 280 | elapsed:   42.5s finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    2.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   24.7s
[Parallel(n_jobs=20)]: Done 280 out of 280 | elapsed:   42.5s finished


Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    2.9s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   24.6s
[Parallel(n_jobs=20)]: Done 280 out of 280 | elapsed:   43.1s finished


array([0.86592179, 0.84916201, 0.84269663, 0.80337079, 0.86440678])

In [392]:
gs_svc_cv_score.mean()

0.8451115988548071

In [393]:
gs_svc_model.fit( x_train_data.astype( np.float64 ), y_train_data )

Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    4.7s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   42.4s
[Parallel(n_jobs=20)]: Done 280 out of 280 | elapsed:  1.2min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4111, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=4111,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=20,
       param_grid=[{'svc__C': [0.0001, 0.001, 0.1, 1, 10, 50, 100], 'svc__kernel': ['linear']}, {'svc__C': [0.0001, 0.001, 0.1, 1, 10, 50, 100], 'svc__gamma': [0.0001, 0.001, 0.1, 1, 10, 50, 100], 'svc__kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [401]:
gb_model = GradientBoostingClassifier()

r = [ 0.0001, 0.001, 0.1, 1, 10, 50, 100 ]
parameters_gb = { 'learning_rate': r }
gs_gb_model = GridSearchCV( estimator=gb_model, param_grid=parameters_gb, scoring='accuracy', cv=kfold, verbose=1, n_jobs=-1 )
cross_val_score( gs_gb_model, x_train_data, y_train_data, cv=kfold )

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    6.2s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    4.1s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    4.1s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    4.3s finished


Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    4.1s finished


array([0.8603352 , 0.83240223, 0.83707865, 0.80337079, 0.85310734])

In [402]:
gs_gb_model.fit( x_train_data, y_train_data )

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    6.4s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4111, shuffle=False),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.0001, 0.001, 0.1, 1, 10, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [417]:
test_data_cleaned = cleanData( cleaners, all_test_data )

predictions = pd.DataFrame( { 'PassengerId': all_test_data.PassengerId } )

predictions[ 'rf' ] = GSRF.predict( test_data_cleaned )
predictions[ 'svc' ] = gs_svc_model.predict( test_data_cleaned )
predictions[ 'gb' ] = gs_gb_model.predict( test_data_cleaned )

predictions.corr()

Unnamed: 0,PassengerId,rf,svc,gb
PassengerId,1.0,-0.015743,0.023574,0.01317
rf,-0.015743,1.0,0.813572,0.877715
svc,0.023574,0.813572,1.0,0.898436
gb,0.01317,0.877715,0.898436,1.0


In [422]:
predictions[ 'output' ] = 0
predictions[ 'output' ] = np.where( predictions[[ 'rf', 'svc', 'gb' ]].sum( axis=1 ) >= 2, 1, 0 )
predictions

output = pd.DataFrame( { 'PassengerId': predictions.PassengerId, 'Survived': predictions.output.astype( np.int64 ) } )
output = output.set_index( 'PassengerId' )
output.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [423]:
output.sum()

Survived    159
dtype: int64

In [425]:
svc_output = predict( gs_svc_model, all_test_data )
svc_output.sum()

Survived    149
dtype: int64

In [428]:
( output != svc_output ).sum()

Survived    16
dtype: int64

In [424]:
output_csv = 'output/results.csv'
output.to_csv( output_csv )
submit( COMPETITION_NAME, output_csv )