In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import mglearn

from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
# Load the Kaggle Titanic training set, indexed by PassengerId.
# Forward slashes are accepted on Windows as well as POSIX systems, so the
# notebook no longer depends on Windows-style '\\' separators.
train = pd.read_csv('data/titanic/train.csv', index_col='PassengerId')

In [3]:
def transform_data(data):
    """Turn a raw Titanic passenger frame into a numeric feature frame.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Ticket``, ``Cabin``, ``Name``, ``Age``,
        ``Embarked``, ``Pclass``, ``Sex``, ``Parch`` and ``SibSp``. Any other
        column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the columns listed above, plus:

        * ``EmbarkedC`` / ``EmbarkedQ`` / ``EmbarkedS`` -- port flags
          (missing ``Embarked`` is treated as ``'S'``),
        * ``Pclass1`` / ``Pclass2`` / ``Pclass3`` -- class flags,
        * ``IsFemale`` / ``IsMale`` -- sex flags,
        * ``FamilySize`` -- ``Parch + SibSp + 1``,
        * ``IsChild`` (age < 18), ``IsAdult`` (18 <= age < 60),
          ``IsPensioner`` (age >= 60) -- mutually exclusive age flags.

        All flags are floats (0.0 / 1.0).
    """
    data = data.drop(['Ticket', 'Cabin'], axis=1)

    # The title is the word immediately followed by a '.' in the name.
    # Raw string: '\.' is not a valid escape in a plain string literal.
    data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

    # Fold spelling variants into the common titles, and infrequent titles
    # into a single 'Rare' bucket.
    data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'})
    rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Countess', 'Don', 'Jonkheer', 'Capt', 'Lady', 'Sir']
    data['Title'] = data['Title'].replace(rare_titles, 'Rare')

    # Fill missing ages with the mean age of passengers sharing the same title.
    # Mapping through the group means (rather than looking up a fixed list of
    # titles) works for whichever titles are present and does not raise when a
    # title group such as 'Rare' is absent from this data set.
    overall_mean_age = data['Age'].mean()
    age_by_title = data.groupby('Title')['Age'].mean()
    data['Age'] = data['Age'].fillna(data['Title'].map(age_by_title))
    # Passengers whose title group has no known age (or who have no title)
    # fall back to the overall mean so they still land in an age bucket.
    data['Age'] = data['Age'].fillna(overall_mean_age)

    data = data.drop(['Title', 'Name'], axis=1)

    data['Embarked'] = data['Embarked'].fillna('S')
    for port in ('C', 'Q', 'S'):
        data['Embarked' + port] = (data['Embarked'] == port).astype(float)

    for pclass in (1, 2, 3):
        data['Pclass' + str(pclass)] = (data['Pclass'] == pclass).astype(float)

    data['IsFemale'] = (data['Sex'] == 'female').astype(float)
    data['IsMale'] = (data['Sex'] == 'male').astype(float)

    data['FamilySize'] = data['Parch'] + data['SibSp'] + 1

    # Half-open age buckets: a passenger aged exactly 60 is a pensioner only,
    # not both an adult and a pensioner.
    data['IsChild'] = (data['Age'] < 18).astype(float)
    data['IsAdult'] = ((data['Age'] >= 18) & (data['Age'] < 60)).astype(float)
    data['IsPensioner'] = (data['Age'] >= 60).astype(float)

    data = data.drop(['Sex', 'Pclass', 'Embarked', 'Parch', 'SibSp', 'Age'], axis=1)
    return data

In [4]:
# Work on a deep copy so that `train` keeps the raw data exactly as loaded.
working_train = train.copy(deep=True)

In [5]:
# Build the feature frame from the raw `train` frame rather than from
# `working_train` itself. transform_data starts by dropping columns into a new
# frame and never writes to its argument, so the result is the same, but this
# cell can now be re-run without failing on the already-removed columns.
working_train = transform_data(train)

In [6]:
# Separate the target column from the feature columns.
y = working_train['Survived']
X = working_train.drop(columns=['Survived'])

# Hold-out split with a fixed seed, stratified on the target so both parts
# keep the same survival ratio; the split proportion is train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [7]:
# Random forest with a fixed seed, trained on the training part of the split.
rfc = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=15,
    n_jobs=2,
    random_state=10,
)
rfc = rfc.fit(X_train, y_train)

In [8]:
# Load the Kaggle Titanic test set, indexed by PassengerId.
# Forward slashes are accepted on Windows as well as POSIX systems, so the
# notebook no longer depends on Windows-style '\\' separators.
test = pd.read_csv('data/titanic/test.csv', index_col='PassengerId')

In [9]:
# Apply the same feature engineering as for the training data.
# NOTE: this rebinds `test`, so re-run the cell that loads test.csv before
# running this cell a second time.
test = test.pipe(transform_data)

In [10]:
# Replace missing fares with the mean fare of the test set.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [11]:
# Predict survival for every passenger in the test set.
predictions = rfc.predict(test)

# `test` was loaded with index_col='PassengerId', so its index already holds
# the passenger ids in prediction order; there is no need to read test.csv a
# second time. rename() returns a new Index, leaving `test` untouched.
frame = pd.DataFrame(
    {'Survived': predictions},
    index=test.index.rename('PassengerId'),
)

# Forward slashes keep the output path portable across operating systems.
frame.to_csv('data/titanic/predictions.csv')
frame.head()

PassengerId,Survived
892,0
893,1
894,0
895,0
896,0
