# Load Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
data = pd.concat([train_data, test_data])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


# Feature Engineering

In [3]:
features = pd.DataFrame()
# features['PassengerId'] = data.PassengerId

## Title

In [4]:
titles = {"Capt" : "Officer", "Col" : "Officer", "Major" : "Officer", "Dr" : "Officer", "Rev" : "Officer",
          "Jonkheer" : "Royalty", "Don" : "Royalty", "Sir" : "Royalty", 
          "the Countess" : "Royalty", "Dona" : "Royalty", "Lady" : "Royalty",
          "Mme" : "Mrs", "Ms" : "Mrs", "Mrs" : "Mrs",
          "Mlle": "Miss", "Miss" : "Miss",
          "Mr" : "Mr",  
          "Master" : "Master"}
data['Title'] = data.Name.map(lambda name: titles[name.split('.')[0].split(',')[1].strip()])
title_dummies = pd.get_dummies(data.Title, prefix = 'Title')
features = pd.concat([features, title_dummies], axis = 1)

## Age

In [5]:
grouped = data.groupby(['Sex', 'Pclass', 'Title'])
grouped.median()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Fare,Parch,PassengerId,SibSp,Survived
Sex,Pclass,Title,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
female,1,Miss,30.0,99.9625,0.0,529.5,0.0,1.0
female,1,Mrs,45.0,78.1125,0.0,853.5,1.0,1.0
female,1,Officer,49.0,25.9292,0.0,797.0,0.0,1.0
female,1,Royalty,39.0,86.5,0.0,760.0,0.0,1.0
female,2,Miss,20.0,20.25,0.0,606.5,0.0,1.0
female,2,Mrs,30.0,26.0,0.0,533.0,1.0,1.0
female,3,Miss,18.0,8.05,0.0,603.5,0.0,0.5
female,3,Mrs,31.0,15.5,1.0,668.5,1.0,0.5
male,1,Master,6.0,134.5,2.0,803.0,1.0,1.0
male,1,Mr,41.5,47.1,0.0,634.0,0.0,0.0


In [6]:
def fillAges(row):
        if row['Sex']=='female' and row['Pclass'] == 1:
            if row['Title'] == 'Miss':
                return 30
            elif row['Title'] == 'Mrs':
                return 45
            elif row['Title'] == 'Officer':
                return 49
            elif row['Title'] == 'Royalty':
                return 39

        elif row['Sex']=='female' and row['Pclass'] == 2:
            if row['Title'] == 'Miss':
                return 20
            elif row['Title'] == 'Mrs':
                return 30

        elif row['Sex']=='female' and row['Pclass'] == 3:
            if row['Title'] == 'Miss':
                return 18
            elif row['Title'] == 'Mrs':
                return 31

        elif row['Sex']=='male' and row['Pclass'] == 1:
            if row['Title'] == 'Master':
                return 6
            elif row['Title'] == 'Mr':
                return 41.5
            elif row['Title'] == 'Officer':
                return 52
            elif row['Title'] == 'Royalty':
                return 40

        elif row['Sex']=='male' and row['Pclass'] == 2:
            if row['Title'] == 'Master':
                return 2
            elif row['Title'] == 'Mr':
                return 30
            elif row['Title'] == 'Officer':
                return 41.5

        elif row['Sex']=='male' and row['Pclass'] == 3:
            if row['Title'] == 'Master':
                return 6
            elif row['Title'] == 'Mr':
                return 26
    
features['Age'] = data.apply(lambda r : fillAges(r) if np.isnan(r['Age']) else r['Age'], axis = 1)

## Cabin

In [7]:
cabin_dummies = pd.get_dummies(data.Cabin.fillna('U').map(lambda cabin: cabin[0]), prefix = 'Cabin')
features = pd.concat([features, cabin_dummies], axis = 1)

## Embarked

In [8]:
embarked_dummies = pd.get_dummies(data.Embarked.fillna('S'), prefix = 'Embarked')
features = pd.concat([features, embarked_dummies], axis = 1)

## Fare

In [9]:
features['Fare'] = data.Fare.fillna(data.Fare.mean())

## Pclass

In [10]:
pclass_dummies = pd.get_dummies(data.Pclass, prefix = 'Pclass')
features = pd.concat([features, pclass_dummies], axis = 1)

## Family Size

In [11]:
features['Parch'] = data.Parch
features['SibSp'] = data.SibSp
features['FamilySize'] = data.Parch + data.SibSp + 1
features['Singleton'] = features.FamilySize.map(lambda fs: 1 if fs == 1 else 0)
features['SmallFamily'] = features.FamilySize.map(lambda fs: 1 if 2 <= fs <= 4 else 0)
features['LargeFamily'] = features.FamilySize.map(lambda fs: 1 if fs >= 5 else 0)

## Sex

In [12]:
features['Sex'] = data.Sex.map({'female' : 0, 'male' : 1})

## Ticket

In [13]:
def cleanTicket(ticket):
    ticket = ticket.replace('/', '')
    ticket = ticket.replace('.', '')
    ticket = ticket.split()
    ticket = [t.strip() for t in ticket]
    ticket = [t if not t.isdigit() else 'XXX' for t in ticket]
    return ticket[0]
ticket_feature = data.Ticket.map(cleanTicket)
ticket_dummies = pd.get_dummies(ticket_feature, prefix = 'Ticket')
features = pd.concat([features, ticket_dummies], axis = 1)

## Normalize

In [14]:
# features = features.apply(lambda x: x / x.max())
# features.info()

# Modeling

## Feature Selection

In [15]:
Xtrain = features[:train_data.PassengerId.count()]
ytrain = train_data.Survived
Xtest = features[train_data.PassengerId.count():]

In [16]:
from sklearn.ensemble import ExtraTreesClassifier
extra_classifier = ExtraTreesClassifier(n_estimators = 200)
extra_classifier.fit(Xtrain, ytrain)
importances = pd.DataFrame()
importances['FeatureName'] = Xtrain.columns
importances['Importance'] = extra_classifier.feature_importances_
importances.sort_values('Importance', ascending = False)

Unnamed: 0,FeatureName,Importance
6,Age,0.182361
19,Fare,0.155640
2,Title_Mr,0.119916
29,Sex,0.102207
3,Title_Mrs,0.041880
1,Title_Miss,0.039991
22,Pclass_3,0.037469
15,Cabin_U,0.034439
25,FamilySize,0.022264
27,SmallFamily,0.021469


In [17]:
from sklearn.feature_selection import SelectFromModel
select_model = SelectFromModel(extra_classifier, prefit = True)
Xtrain_selected = select_model.transform(Xtrain)
Xtest_selected = select_model.transform(Xtest)
Xtrain_selected.shape, Xtest_selected.shape, select_model

((891, 13),
 (418, 13),
 SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
         prefit=True, threshold=None))

## Parameters Tuning

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

random_forest = RandomForestClassifier(max_features='sqrt')
parameter_grid = {'max_depth' : [4, 5, 6, 7, 8], 
                  'n_estimators' : [200, 210, 240, 250],
                  'criterion' : ['gini', 'entropy']}
cross_validation = StratifiedKFold(ytrain, n_folds = 5)
grid_search = GridSearchCV(random_forest, param_grid = parameter_grid, cv = cross_validation)
grid_search.fit(Xtrain_selected, ytrain)
print('Best Score: {}'.format(grid_search.best_score_))
print('Best Parameter: {}'.format(grid_search.best_params_))

Best Score: 0.835016835016835
Best Parameter: {'max_depth': 4, 'criterion': 'entropy', 'n_estimators': 250}


## Predict

In [19]:
survived = grid_search.predict(Xtest_selected)
predict_data = pd.DataFrame()
predict_data['PassengerId'] = test_data.PassengerId
predict_data['Survived'] = survived.astype(int)
predict_data.to_csv('predict.csv', index = False)
survived.sum(), len(survived)

(165, 418)

## Cross Validation

In [20]:
from sklearn.cross_validation import cross_val_score
random_forest = RandomForestClassifier(n_estimators = 250, criterion = 'gini', max_depth = 4)
score = cross_val_score(random_forest, Xtrain_selected, ytrain, cv = 5)
print(score)
print(score.mean())

[ 0.84357542  0.81564246  0.8258427   0.81460674  0.85310734]
0.830554931986


In [21]:
random_forest.fit(Xtrain_selected, ytrain)
survived = random_forest.predict(Xtest_selected)
predict_data = pd.DataFrame()
predict_data['PassengerId'] = test_data.PassengerId
predict_data['Survived'] = survived.astype(int)
predict_data.to_csv('predict_validation.csv', index = False)
survived.sum(), len(survived)

(164, 418)