In [1]:
import pandas as pd
import os
import numpy as np

# Locations of the processed train/test CSVs, resolved relative to the parent
# of the current working directory.
processed_data_path = os.path.join(os.path.pardir,'data','processed')
train_file_path = os.path.join(processed_data_path,'train.csv')
test_file_path = os.path.join(processed_data_path,'test.csv')

# read train and test data as dataframes, indexed by PassengerId
train_df = pd.read_csv(train_file_path,index_col='PassengerId')
test_df = pd.read_csv(test_file_path,index_col='PassengerId')

# prepare features as a matrix of floats and the target as a 1-D array.
# `.values` is used instead of DataFrame.as_matrix() (removed in pandas 1.0)
# and Series.ravel() (deprecated in pandas 2.2); it produces the same ndarrays
# and is available in every pandas release.
# NOTE(review): the 'Age': slice assumes every column from 'Age' onward is a
# numeric feature and that 'Survived' is located before 'Age' -- confirm
# against the layout of the processed CSV.
X = train_df.loc[:,'Age':].values.astype('float')
y = train_df['Survived'].values

# Hold out 20% of the labelled rows for validation; the remaining 80% is used
# for fitting. random_state=0 keeps the split repeatable between runs.
import sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [2]:
# apply a model
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with library-default hyperparameters; the fixed
# random_state keeps results repeatable. ('clf' is the conventional short
# name for a classifier.)
clf = RandomForestClassifier(random_state=2, n_jobs=-1)

# Fit the forest on the training features and their labels
clf.fit(X_train, y_train)

# Predict labels for the held-out 20% split, which was not used for fitting
clf.predict(X_test)

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1])

In [3]:
# Report the baseline forest's score on the held-out split, to two decimals
print(
    'score for random forest: {0:.2f}'.format(
        clf.score(X_test, y_test)
    )
)

score for random forest: 0.84


In [4]:
# Hyperparam opt
from sklearn.model_selection import GridSearchCV

# Candidate values for the grid search (3 x 3 x 2 = 18 combinations)
parameters = {
    'n_estimators': [120, 300, 500],
    'max_depth': [5, 8, 15],
    'min_samples_split': [2, 5],
}

# 3-fold cross-validated search over the grid, using the baseline forest
# `clf` as the estimator
g_search_clf = GridSearchCV(estimator=clf, param_grid=parameters, cv=3)
g_search_clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=2, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 8, 15], 'n_estimators': [120, 300, 500], 'min_samples_split': [2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [5]:
def get_submission_file(model,filename,data=None,output_dir=None):
    """Predict with ``model`` and write a Kaggle submission CSV.

    The CSV has two columns, ``PassengerId`` (taken from the index of the
    feature frame) and ``Survived`` (the model's predictions), and is written
    without the dataframe index.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    data : pandas.DataFrame, optional
        Feature frame to predict on. Defaults to the module-level ``test_df``.
    output_dir : str, optional
        Directory that receives the file. Defaults to ``../data/external``
        relative to the current working directory.

    Returns
    -------
    None
    """
    if data is None:
        data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir,'data','external')
    # convert to matrix; `.values` replaces DataFrame.as_matrix(), which was
    # removed in pandas 1.0
    test_X = data.values.astype('float')
    # make predictions
    predictions = model.predict(test_X)
    # submission dataframe; column order is pinned explicitly so it does not
    # depend on dict ordering
    df_submission = pd.DataFrame(
        {'PassengerId': data.index,'Survived': predictions},
        columns=['PassengerId','Survived'],
    )
    # make sure the target directory exists, otherwise to_csv raises
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    submission_file_path = os.path.join(output_dir,filename)
    # write to file
    df_submission.to_csv(submission_file_path,index=False)

# Show the hyperparameter combination selected by the grid search
print(g_search_clf.best_params_)
# evaluate the tuned model on the held-out split
print(
    'score for random forest classifier: {0:.2f}'.format(
        g_search_clf.score(X_test,y_test)
    )
)
# create Kaggle submission file
# get_submission_file(g_search_clf,'rfc.csv')

{'max_depth': 8, 'n_estimators': 500, 'min_samples_split': 5}
score for random forest classifier: 0.82
