In [356]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')

df_train = df.iloc[:712, :]
df_test = df.iloc[712:, :]

In [357]:
df_train = df_train.drop(['Name', 'Ticket', 'Cabin'], axis=1)

age_mean = df_train['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(age_mean)

from collections import Counter

keys = [k for k, v in Counter(df_train['Embarked']).most_common()]
df_train['Embarked'] = df_train['Embarked'].fillna(keys[0])

In [358]:
df_train['Sex'] = df_train['Sex'].map({'female':0, 'male':1})

pd.get_dummies(df_train['Embarked'], prefix='Embarked').head(5)
df_train = pd.concat([df_train, pd.get_dummies(df_train['Embarked'], prefix='Embarked')], axis=1)
df_train = df_train.drop(['Embarked'], axis=1)

x_train = df_train.iloc[:, 2:].values
y_train = df_train['Survived']

In [359]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    'max_features': [0.5, 1.],
    'max_depth': [5., None]
}

grid_search = GridSearchCV(RandomForestClassifier(n_estimators = 100), parameter_grid, cv=5, verbose=3)

In [360]:
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ................max_depth=5.0, max_features=0.5; total time=   0.0s
[CV 2/5] END ................max_depth=5.0, max_features=0.5; total time=   0.0s
[CV 3/5] END ................max_depth=5.0, max_features=0.5; total time=   0.0s
[CV 4/5] END ................max_depth=5.0, max_features=0.5; total time=   0.0s
[CV 5/5] END ................max_depth=5.0, max_features=0.5; total time=   0.0s
[CV 1/5] END ................max_depth=5.0, max_features=1.0; total time=   0.0s
[CV 2/5] END ................max_depth=5.0, max_features=1.0; total time=   0.0s
[CV 3/5] END ................max_depth=5.0, max_features=1.0; total time=   0.0s
[CV 4/5] END ................max_depth=5.0, max_features=1.0; total time=   0.0s
[CV 5/5] END ................max_depth=5.0, max_features=1.0; total time=   0.0s
[CV 1/5] END ...............max_depth=None, max_features=0.5; total time=   0.0s
[CV 2/5] END ...............max_depth=None, max_f

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5.0, None], 'max_features': [0.5, 1.0]},
             verbose=3)

In [361]:
grid_search.cv_results_['mean_test_score']

array([0.81748252, 0.82447552, 0.8020585 , 0.80899242])

In [362]:
grid_search.best_params_

{'max_depth': 5.0, 'max_features': 1.0}

In [363]:
model = RandomForestClassifier(n_estimators = 100, max_features=0.5, max_depth=5.0, random_state=0)
model = model.fit(x_train, y_train)

In [364]:
df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

df_test['Age'] = df_test['Age'].fillna(age_mean)
df_test['Embarked'] = df_test['Embarked'].fillna(keys[0])

df_test['Sex'] = df_test['Sex'].map({'female': 0, 'male':1})
df_test= pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')], axis=1)
df_test= df_test.drop(['Embarked'], axis=1)

x_test = df_test.iloc[:, 2:]
y_test = df_test['Survived']

In [365]:
y_prediction = model.predict(x_test)

In [366]:
np.sum(y_prediction == y_test)

155

In [367]:
np.sum(y_prediction == y_test) / float(len(y_test))

0.8659217877094972

In [368]:
np.sum(y_test) / float(len(y_test))

0.3575418994413408