In [48]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import feature_engineering

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [49]:
# Load the raw train and test CSVs. `data` holds references to the same two
# DataFrame objects as `train` and `test`, so in-place changes made through
# `data` are visible through those names too.
train, test = (pd.read_csv(csv_path) for csv_path in ('./data/train.csv', './data/test.csv'))

data = [train, test]

# Sanity check: the shapes reached through `data` must match the originals.
print('Initial Data Shape : ', train.shape, test.shape, *(frame.shape for frame in data))

data[0].columns

Initial Data Shape :  (891, 12) (418, 11) (891, 12) (418, 11)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Feature Engineering

In [50]:
# Remove the columns that are not used as model inputs. The drop is in place,
# so re-running this cell without reloading the data raises KeyError.
unused_columns = ['PassengerId', 'SibSp', 'Parch', 'Cabin', 'Ticket']
for df in data:
    df.drop(columns=unused_columns, inplace=True)

# Apply the project-local feature_engineering helpers in order; each one takes
# the [train, test] list and returns its replacement.
# NOTE(review): presumably `name` derives the name-based features and the
# `impute_*` helpers fill missing Age / Embarked / Fare values -- confirm in
# feature_engineering.py.
engineering_steps = (
    feature_engineering.name,
    feature_engineering.impute_age,
    feature_engineering.impute_embarked,
    feature_engineering.impute_fare,
)
for step in engineering_steps:
    data = step(data)

# set_dummies receives both frames together and returns the pair that
# replaces them.
# NOTE(review): presumably this keeps the one-hot columns aligned between
# train and test -- confirm in feature_engineering.py.
data[0], data[1] = feature_engineering.set_dummies(data[0], data[1])

data[0].head(10)

Unnamed: 0,Survived,Age,Fare,NameLen,AgeNull,Pclass_3,Pclass_1,Pclass_2,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,NameTitle_Mr.,NameTitle_Mrs.,NameTitle_Miss.,NameTitle_Master.,NameTitle_Rare,NameTitle_Ms.
0,0,22.0,7.25,23,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0
1,1,38.0,71.2833,51,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0
2,1,26.0,7.925,22,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0
3,1,35.0,53.1,44,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0
4,0,35.0,8.05,24,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0
5,0,28.0,8.4583,16,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0
6,0,54.0,51.8625,23,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0
7,0,2.0,21.075,30,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0
8,1,27.0,11.1333,49,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0
9,1,14.0,30.0708,35,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0


In [59]:
# Grid-search the random forest hyper-parameters with 3-fold cross-validation,
# scored on accuracy.
#
# max_features='sqrt' replaces the former 'auto': for classifiers the two are
# equivalent, but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it raises an error.
model = RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=1, n_jobs=-1)

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 4, 10, 12, 16, 18],
    "n_estimators": [50, 100, 400],
}

gs = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

# Column 0 of the engineered training frame is the target ('Survived');
# every remaining column is a feature.
gs = gs.fit(data[0].iloc[:, 1:], data[0].iloc[:, 0])

# Kept as notebook-level names: the next cell reads `bestparams`.
bestscore = gs.best_score_
bestparams = gs.best_params_
cvresults = gs.cv_results_

print(bestscore)
print(bestparams)
#print(cvresults)

0.8294051627384961
{'criterion': 'gini', 'min_samples_leaf': 1, 'min_samples_split': 16, 'n_estimators': 50}


In [60]:
# Refit a random forest on the full training set with the hyper-parameters
# selected by the grid search, and report its out-of-bag accuracy.
#
# `bestparams` is unpacked directly so the final model always uses exactly the
# keys that were tuned. max_features='sqrt' replaces the former 'auto', which
# meant the same thing for classifiers but was removed in scikit-learn 1.3.
rf = RandomForestClassifier(**bestparams,
                            max_features='sqrt',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)

# Column 0 is the target ('Survived'); the remaining columns are features.
rf.fit(data[0].iloc[:, 1:], data[0].iloc[:, 0])
# NOTE(review): '{:4f}' sets a minimum width of 4 and prints six decimals;
# '{:.4f}' may have been intended -- left unchanged so the output is the same.
print('{:4f}'.format(rf.oob_score_))

0.836139


In [61]:
# Predict survival for the engineered test set and write the submission file.
predictions = rf.predict(data[1])
predictions = pd.DataFrame(predictions, columns=['Survived'])

# PassengerId was dropped during feature engineering, so reload the raw test
# file to recover it. Both frames carry a default 0..n-1 index, so the
# column-wise concat pairs each id with its prediction row by row.
test = pd.read_csv('./data/test.csv')
predictions = pd.concat((test['PassengerId'], predictions), axis=1)

# Fixed: a stray trailing 'g' after this call made the cell a SyntaxError.
predictions.to_csv('./data/y_test_rf.csv', sep=",", index=False)