# Load Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
data = pd.concat([train_data, test_data])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


# Feature Engineering

In [3]:
features = pd.DataFrame()
# features['PassengerId'] = data.PassengerId

## Age

In [4]:
features['Age'] = data.Age.fillna(data.Age.median())

## Fare

In [5]:
features['Fare'] = data.Fare.fillna(data.Fare.mean())

## Title

In [6]:
titles = {"Capt" : "Officer", "Col" : "Officer", "Major" : "Officer", "Dr" : "Officer", "Rev" : "Officer",
          "Jonkheer" : "Royalty", "Don" : "Royalty", "Sir" : "Royalty", 
          "the Countess" : "Royalty", "Dona" : "Royalty", "Lady" : "Royalty",
          "Mme" : "Mrs", "Ms" : "Mrs", "Mrs" : "Mrs",
          "Mlle": "Miss", "Miss" : "Miss",
          "Mr" : "Mr",  
          "Master" : "Master"}
title_feature = data.Name.map(lambda name: titles[name.split('.')[0].split(',')[1].strip()])
title_dummies = pd.get_dummies(title_feature, prefix = 'Title')
features = pd.concat([features, title_dummies], axis = 1)

## Pclass

In [7]:
pclass_dummies = pd.get_dummies(data.Pclass, prefix = 'Pclass')
features = pd.concat([features, pclass_dummies], axis = 1)

## Family Size

In [8]:
features['Parch'] = data.Parch
features['SibSp'] = data.SibSp
features['FamilySize'] = data.Parch + data.SibSp + 1

## Sex

In [9]:
features['Sex'] = data.Sex.map({'female' : 0, 'male' : 1})

# Modeling

## Feature Selection

In [10]:
Xtrain = features[:train_data.PassengerId.count()]
ytrain = train_data.Survived
Xtest = features[train_data.PassengerId.count():]

In [11]:
from sklearn.ensemble import ExtraTreesClassifier
extra_classifier = ExtraTreesClassifier(n_estimators = 200)
extra_classifier.fit(Xtrain, ytrain)
importances = pd.DataFrame()
importances['FeatureName'] = Xtrain.columns
importances['Importance'] = extra_classifier.feature_importances_
importances.sort_values('Importance', ascending = False)

Unnamed: 0,FeatureName,Importance
1,Fare,0.25134
0,Age,0.207879
4,Title_Mr,0.129258
14,Sex,0.121268
10,Pclass_3,0.054431
3,Title_Miss,0.03864
5,Title_Mrs,0.038583
13,FamilySize,0.037176
8,Pclass_1,0.034728
12,SibSp,0.029645


In [None]:
from sklearn.feature_selection import SelectFromModel
select_model = SelectFromModel(extra_classifier, prefit = True)
Xtrain_selected = select_model.transform(Xtrain)
Xtest_selected = select_model.transform(Xtest)
Xtest_selected.shape

(418, 4)

## Parameters Tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

random_forest = RandomForestClassifier()
parameter_grid = {'max_depth' : [4, 5, 6, 7, 8], 
                  'n_estimators' : [200, 210, 240, 250],
                  'criterion' : ['gini', 'entropy']}
grid_search = GridSearchCV(random_forest, param_grid = parameter_grid)
grid_search.fit(Xtrain_selected, ytrain)
print('Best Score: {}'.format(grid_search.best_score_))
print('Best Parameter: {}'.format(grid_search.best_params_))