In [23]:
import pandas as pd
import numpy as np

In [24]:
processed = 'processed'
submissions = 'submissions'
features = 'features'
df_train = pd.read_csv(f'{processed}/train.csv')
df_test =  pd.read_csv(f'{processed}/test.csv')

In [25]:
target_column = 'Survived'

In [26]:
feature_names = ['SibSp', 'Sex_male', 'Sex_female']\
    + [col for col in df_test.columns if 'Age_is' in col]\
    + [col for col in df_test.columns if 'Pclass_is' in col]\
    + [col for col in df_test.columns if 'Name_title_is' in col]\
    + [col for col in df_test.columns if 'Name_last_odds_' in col]
feature_names

['SibSp',
 'Sex_male',
 'Sex_female',
 'Age_is(0..2]',
 'Age_is(2..6]',
 'Age_is(6..14]',
 'Age_is(14..16]',
 'Age_is(16..18]',
 'Age_is(18..25]',
 'Age_is(25..35]',
 'Age_is(35..40]',
 'Age_is(40..45]',
 'Age_is(45..60]',
 'Age_is(60..80]',
 'Pclass_is_1',
 'Pclass_is_2',
 'Pclass_is_3',
 'Name_title_is_Master.',
 'Name_title_is_Miss.',
 'Name_title_is_Mr.',
 'Name_title_is_Mrs.',
 'Name_title_is_Other.',
 'Name_last_odds_survival']

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

parameter_grid = {
             'max_depth' : [2, 3, 4, 6, 8],
             'n_estimators': [50, 10],
             'max_features': ['sqrt', 'auto', 'log2'],
             'min_samples_split': [2, 3, 10],
             'min_samples_leaf': [1, 3, 10],
             'bootstrap': [True, False],
             }
forest = RandomForestClassifier()
cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(forest,
                           scoring='accuracy',
                           param_grid=parameter_grid,
                           cv=cross_validation,
                           verbose=1,
                           n_jobs=4
                          )

grid_search.fit(df_train[feature_names], df_train[target_column])
model = grid_search
parameters = grid_search.best_params_

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 121 tasks      | elapsed:    2.9s
[Parallel(n_jobs=4)]: Done 1621 tasks      | elapsed:   15.6s


Best score: 0.9809203142536476
Best parameters: {'bootstrap': True, 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50}


[Parallel(n_jobs=4)]: Done 2700 out of 2700 | elapsed:   24.6s finished


In [28]:
dir(grid_search)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score',
 'scorer_',
 'scoring',
 '

In [29]:
[r for r in grid_search.cv_results_.keys()]

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_bootstrap',
 'param_max_depth',
 'param_max_features',
 'param_min_samples_leaf',
 'param_min_samples_split',
 'param_n_estimators',
 'params',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score']

In [34]:
grid_search.cv_results_['mean_test_score']

array([0.87542088, 0.83950617, 0.87205387, 0.87429854, 0.8956229 ,
       0.87205387, 0.89225589, 0.91470258, 0.89450056, 0.87766554,
       0.89786756, 0.90796857, 0.87878788, 0.87429854, 0.89337823,
       0.91245791, 0.91021324, 0.89450056, 0.88776655, 0.86419753,
       0.87205387, 0.86756453, 0.87991021, 0.87991021, 0.88776655,
       0.89001122, 0.89225589, 0.91470258, 0.9023569 , 0.92031425,
       0.91133558, 0.88664422, 0.91806958, 0.92031425, 0.88664422,
       0.87542088, 0.87878788, 0.8698092 , 0.87766554, 0.87429854,
       0.88552189, 0.88664422, 0.89450056, 0.86756453, 0.88776655,
       0.86083053, 0.88215488, 0.87429854, 0.88103255, 0.92143659,
       0.89113356, 0.92031425, 0.88664422, 0.85409652, 0.94837262,
       0.94837262, 0.92255892, 0.94949495, 0.9382716 , 0.92929293,
       0.95286195, 0.90347924, 0.94837262, 0.92255892, 0.94837262,
       0.94276094, 0.94725028, 0.93041526, 0.92143659, 0.9349046 ,
       0.92031425, 0.95847363, 0.94051627, 0.91358025, 0.93490

In [37]:
grid_search.cv_results_['mean_test_score'][grid_search.cv_results_['mean_test_score'].argmin()]

0.8395061728395061

In [38]:
params_worst = grid_search.cv_results_['params'][grid_search.cv_results_['mean_test_score'].argmin()]
params_worst

{'bootstrap': True,
 'max_depth': 2,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [39]:
params_best = grid_search.cv_results_['params'][grid_search.cv_results_['mean_test_score'].argmax()]
params_best

{'bootstrap': True,
 'max_depth': 8,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 50}

In [70]:
# Note to self: I wonder if worst model is just the one which is the least overfitting

In [71]:
def predict_test(params, submission_name):
    model = RandomForestClassifier(**params)
    model.fit(df_train[feature_names], df_train[target_column])
    predictions = model.predict(df_test[feature_names]).astype(int)
    print(f"Avg predicted test survival rate: {predictions.mean()}")

    passengerIds = df_test['PassengerId']
    df_submit = pd.DataFrame({'PassengerId': passengerIds, 'Survived': predictions})
    df_submit.to_csv(f'{submissions}/submission025{submission_name}.csv', index=False)

In [72]:
# LB: 0.63
predict_test(params_best, 'best')

Avg predicted test survival rate: 0.3444976076555024


In [79]:
# Note to self: this guy has much more variability than above, so it's sort of also worst in that respect
# LB: 0.66
predict_test(params_worst, 'worst')

Avg predicted test survival rate: 0.3803827751196172


In [80]:
# LB: 0.62
predict_test(params_worst, 'worst2')

Avg predicted test survival rate: 0.33014354066985646
