In [1]:
import pandas as pd
import numpy as np
import pylab as plt

In [2]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')

In [3]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC

In [4]:
raw_dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [11]:
def process_data(in_df):
    df = in_df.copy().fillna(-1)

    age_average = {' Major': 48.5, ' the Countess': 33.0, ' Don': 40.0, ' Sir': 49.0, ' Miss': 21.773972602739725, 
                   ' Mlle': 24.0, ' Mrs': 35.898148148148145, ' Capt': 70.0, ' Rev': 43.166666666666664, ' Dr': 42.0, 
                   ' Master': 4.5741666666666667, ' Mr': 32.368090452261306, ' Ms': 28.0, ' Jonkheer': 38.0, 
                   ' Col': 58.0, ' Lady': 48.0, ' Mme': 24.0, ' Dona': 39}
    title_convert = {' Major': 'Army', ' the Countess': 'Upper', ' Don': 'Mr', ' Miss': 'Miss', ' Sir': 'Upper', ' Mlle': 'Upper', 
                        ' Mrs': 'Mrs', ' Capt': 'Upper', ' Rev': 'Rev', ' Dr': 'Dr', ' Master': 'Master', ' Mr': 'Mr', ' Ms': 'Miss', 
                        ' Jonkheer': 'Upper', ' Col': 'Army', ' Lady': 'Upper', ' Mme': 'Upper', ' Dona': 'Upper'}
    
    # feature transformation
#     df['Family'] = df['SibSp'] + df['Parch']
    df['orgTitle'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0]) # extract "Title" from "Name"
    df['Title'] = df['orgTitle'].map(lambda x: title_convert[x]) # then also merge some rare Title into commom ones 
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0])
    # df['Cabin'] = df['Cabin'].map(lambda x: x if x != 'T' else 'n')
    df['Sex'] = df['Sex'].map(lambda x: 0 if x == 'male' else 1) # male: 0 female: 1
    
    # deal with NaN and 0
    #df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.fillna(g.mean())) # the average Pclass fare
    #df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.replace(0, g.mean())) # the average Pclass fare
    df['Embarked'] = df['Embarked'].fillna('n')  # the most frequent item
    df['Age'] = df['Age'].groupby(df['orgTitle']).apply(lambda g: g.fillna(age_average[g.name])) # average age of Title
    
    # normalization
#     if training:
#         mean_train = df[['Age','SibSp','Parch','Fare']].mean()
#         std_train = df[['Age','SibSp','Parch','Fare']].std()
        
#     df[['Age','SibSp','Parch','Fare']]= (df[['Age','SibSp','Parch','Fare']]- mean_train) / std_train
    
    
    # transfer category feature into dummy feature   
    df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Pclass'], prefix='Pclass')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Cabin'], prefix='Cabin')], axis=1)
    # df = pd.concat([df, pd.get_dummies(df['Title'], prefix='Title')], axis=1)
    
    # drop features we don't need 
    df = df.drop(['orgTitle'], axis = 1)
#     df = df.drop(['Embarked', 'Name', 'SibSp', 'Parch', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1)  
    df = df.drop(['Embarked', 'Name', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1)
    
    return df

In [13]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')
dtrain = process_data(raw_dtrain)
print dtrain.isnull().sum()
dtrain.head()

Survived       0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_-1    0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
Pclass_1       0
Pclass_2       0
Pclass_3       0
Cabin_-        0
Cabin_A        0
Cabin_B        0
Cabin_C        0
Cabin_D        0
Cabin_E        0
Cabin_F        0
Cabin_G        0
Cabin_T        0
dtype: int64


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_-1,Embarked_C,Embarked_Q,Embarked_S,...,Pclass_3,Cabin_-,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T
0,0,0,22,1,0,7.25,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0
1,1,1,38,1,0,71.2833,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,1,26,0,0,7.925,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0
3,1,1,35,1,0,53.1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,35,0,0,8.05,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0


In [7]:
dtest = process_data(raw_dtest)

In [8]:
dtest

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_n
0,0,34.500000,0,0,7.8292,0,1,0,0,0,1,0,0,0,0,0,0,0,1
1,1,47.000000,1,0,7.0000,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,0,62.000000,0,0,9.6875,0,1,0,0,1,0,0,0,0,0,0,0,0,1
3,0,27.000000,0,0,8.6625,0,0,1,0,0,1,0,0,0,0,0,0,0,1
4,1,22.000000,1,1,12.2875,0,0,1,0,0,1,0,0,0,0,0,0,0,1
5,0,14.000000,0,0,9.2250,0,0,1,0,0,1,0,0,0,0,0,0,0,1
6,1,30.000000,0,0,7.6292,0,1,0,0,0,1,0,0,0,0,0,0,0,1
7,0,26.000000,1,1,29.0000,0,0,1,0,1,0,0,0,0,0,0,0,0,1
8,1,18.000000,0,0,7.2292,1,0,0,0,0,1,0,0,0,0,0,0,0,1
9,0,21.000000,2,0,24.1500,0,0,1,0,0,1,0,0,0,0,0,0,0,1


In [14]:
def test_model(model, data, cv = 10):
    X = data.drop(['Survived'], axis = 1)
    y = data['Survived']
    model.fit(X, y)
    training = model.score(X, y)
    validation = cross_validation.cross_val_score(model, X, y, cv=cv).mean()
    print 'Training accuracy:\t\t\t', training
    print '%s-fold cross-validation accuracy:\t' % cv, validation
    print 'Delta(training - validation): \t\t', training - validation
    return model

In [15]:
for depth in np.arange(1,25,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(RandomForestClassifier(n_estimators=100, max_depth=depth), dtrain)

Training accuracy:			0.717171717172
10-fold cross-validation accuracy:	0.720655146975
Delta(training - validation): 		-0.00348342980365
Training accuracy:			0.799102132435
10-fold cross-validation accuracy:	0.774627170582
Delta(training - validation): 		0.0244749618532
Training accuracy:			0.806958473625
10-fold cross-validation accuracy:	0.77797298831
Delta(training - validation): 		0.0289854853151
Training accuracy:			0.833894500561
10-fold cross-validation accuracy:	0.782492055385
Delta(training - validation): 		0.0514024451759
Training accuracy:			0.846240179574
10-fold cross-validation accuracy:	0.807049426853
Delta(training - validation): 		0.0391907527207
Training accuracy:			0.867564534231
10-fold cross-validation accuracy:	0.814952332312
Delta(training - validation): 		0.0526122019193
Training accuracy:			0.876543209877
10-fold cross-validation accuracy:	0.812654919986
Delta(training - validation): 		0.0638882898902
Training accuracy:			0.900112233446
10-fold cross-validation 

In [16]:
for n in np.arange(50,700,50): 
    print '=========Test on n_estimators = %s=========' % n
    test_model(GradientBoostingClassifier(n_estimators=n), dtrain)

Training accuracy:			0.868686868687
10-fold cross-validation accuracy:	0.820457099081
Delta(training - validation): 		0.0482297696062
Training accuracy:			0.887766554433
10-fold cross-validation accuracy:	0.829458914993
Delta(training - validation): 		0.0583076394406
Training accuracy:			0.909090909091
10-fold cross-validation accuracy:	0.828348087618
Delta(training - validation): 		0.0807428214732
Training accuracy:			0.915824915825
10-fold cross-validation accuracy:	0.830570026104
Delta(training - validation): 		0.0852548897212
Training accuracy:			0.933782267116
10-fold cross-validation accuracy:	0.821580978323
Delta(training - validation): 		0.112201288793
Training accuracy:			0.947250280584
10-fold cross-validation accuracy:	0.82273011009
Delta(training - validation): 		0.124520170494
Training accuracy:			0.952861952862
10-fold cross-validation accuracy:	0.830557825445
Delta(training - validation): 		0.122304127416
Training accuracy:			0.956228956229
10-fold cross-validation accur

In [17]:
for depth in np.arange(5,16,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.850729517396
10-fold cross-validation accuracy:	0.812642435592
Delta(training - validation): 		0.0380870818043
Training accuracy:			0.86531986532
10-fold cross-validation accuracy:	0.819384292362
Delta(training - validation): 		0.045935572958
Training accuracy:			0.883277216611
10-fold cross-validation accuracy:	0.822742594484
Delta(training - validation): 		0.0605346221264
Training accuracy:			0.903479236813
10-fold cross-validation accuracy:	0.822754795142
Delta(training - validation): 		0.0807244416701
Training accuracy:			0.92480359147
10-fold cross-validation accuracy:	0.822754511406
Delta(training - validation): 		0.102049080064
Training accuracy:			0.942760942761
10-fold cross-validation accuracy:	0.818310350698
Delta(training - validation): 		0.124450592063
Training accuracy:			0.950617283951
10-fold cross-validation accuracy:	0.821656168426
Delta(training - validation): 		0.128961115525
Training accuracy:			0.961840628507
10-fold cross-validation accurac