# Improving Titanic prediction accuracy
### --By Jiancheng
Work on [Kaggle Titanic](https://www.kaggle.com/c/titanic)

Started on 2016/03/25

# Summary:
1. Data processing module changes
1. Testing separate models (SVC, GBC, etc.)
1. Voting test

In [1]:
import pandas as pd
import numpy as np
import pylab as plt

In [2]:
# Load the raw Kaggle Titanic training and test splits from the local data/ folder.
raw_dtrain, raw_dtest = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [78]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC

In [16]:
# Peek at the first rows of the raw training data before any processing.
raw_dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


## Add new features into data processing module:
1. Remove the 'Family' feature and use the original 'SibSp' and 'Parch' features instead
1. Normalize the numeric features of the training and test sets using the training-set mean and standard deviation

In [35]:
def process_data(in_df, mean_train = None, std_train = None, training = False):
    """Turn a raw Titanic frame into a numeric feature frame.

    The function extracts the passenger title from ``Name``, reduces ``Cabin``
    to its first character, encodes ``Sex`` as 0/1, imputes ``Fare``,
    ``Embarked`` and ``Age``, z-scores Age/SibSp/Parch/Fare and one-hot encodes
    Embarked/Pclass/Cabin/Title. ``SibSp`` and ``Parch`` are kept as separate
    features. Columns that are not explicitly dropped (e.g. ``Survived`` when
    present) pass through unchanged.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Raw data containing Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin,
        Embarked, Pclass and PassengerId. The input is copied, not modified.
    mean_train, std_train : pandas.Series or None
        Per-column mean / standard deviation of Age, SibSp, Parch and Fare used
        for normalization. Recomputed from ``in_df`` when ``training`` is True;
        required when ``training`` is False.
    training : bool
        When True, the normalization statistics are computed from ``in_df``.

    Returns
    -------
    tuple
        ``(df, mean_train, std_train)``: the processed frame and the
        statistics that were used to normalize it.

    Raises
    ------
    KeyError
        If a title extracted from ``Name`` is not in the lookup table.
    ValueError
        If ``training`` is False and ``mean_train`` or ``std_train`` is None.

    Notes
    -----
    The dummy columns are derived from the values present in ``in_df``, so two
    frames containing different category values yield different columns.
    """
    df = in_df.copy()
    numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']

    age_average = {' Major': 48.5, ' the Countess': 33.0, ' Don': 40.0, ' Sir': 49.0, ' Miss': 21.773972602739725, 
                   ' Mlle': 24.0, ' Mrs': 35.898148148148145, ' Capt': 70.0, ' Rev': 43.166666666666664, ' Dr': 42.0, 
                   ' Master': 4.5741666666666667, ' Mr': 32.368090452261306, ' Ms': 28.0, ' Jonkheer': 38.0, 
                   ' Col': 58.0, ' Lady': 48.0, ' Mme': 24.0, ' Dona': 39}
    title_convert = {' Major': 'Army', ' the Countess': 'Upper', ' Don': 'Mr', ' Miss': 'Miss', ' Sir': 'Upper', ' Mlle': 'Upper', 
                        ' Mrs': 'Mrs', ' Capt': 'Upper', ' Rev': 'Rev', ' Dr': 'Dr', ' Master': 'Master', ' Mr': 'Mr', ' Ms': 'Miss', 
                        ' Jonkheer': 'Upper', ' Col': 'Army', ' Lady': 'Upper', ' Mme': 'Upper', ' Dona': 'Upper'}

    # feature transformation
    df['orgTitle'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0]) # extract "Title" from "Name" (keeps the leading space)
    df['Title'] = df['orgTitle'].map(lambda x: title_convert[x]) # then merge some rare titles into common ones
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0]) # first character only; NaN becomes 'n' via str(nan)
    df['Cabin'] = df['Cabin'].map(lambda x: x if x != 'T' else 'n') # fold 'T' into the 'n' bucket
    df['Sex'] = df['Sex'].map(lambda x: 0 if x == 'male' else 1) # male: 0 female: 1

    # deal with NaN and 0
    # transform() always returns a result aligned to the original index, whereas
    # groupby().apply() may prepend the group key to the index (pandas >= 2.0),
    # which breaks the assignment back into the column.
    fare_groups = df['Fare'].groupby(df['Pclass'])
    df['Fare'] = fare_groups.transform(lambda g: g.fillna(g.mean())) # the average Pclass fare
    fare_groups = df['Fare'].groupby(df['Pclass'])
    df['Fare'] = fare_groups.transform(lambda g: g.replace(0, g.mean())) # the average Pclass fare
    df['Embarked'] = df['Embarked'].fillna(value = df['Embarked'].value_counts().index[0])  # the most frequent item
    df['Age'] = df['Age'].fillna(df['orgTitle'].map(age_average)) # average age of Title

    # normalization
    if training:
        mean_train = df[numeric_cols].mean()
        std_train = df[numeric_cols].std()
    elif mean_train is None or std_train is None:
        raise ValueError('mean_train and std_train must be provided when training is False')

    df[numeric_cols] = (df[numeric_cols] - mean_train) / std_train

    # transfer category feature into dummy feature
    for column in ['Embarked', 'Pclass', 'Cabin', 'Title']:
        df = pd.concat([df, pd.get_dummies(df[column], prefix=column)], axis=1)

    # drop features we don't need
    df = df.drop(['orgTitle'], axis = 1)
    df = df.drop(['Embarked', 'Name', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1)
    return df, mean_train, std_train

In [43]:
# Re-read the raw CSVs, then build the processed training frame together with
# the normalization statistics computed from it.
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')
dtrain, mean_train, std_train = process_data(raw_dtrain, training = True)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(dtrain.isnull().sum())
dtrain.head()

Survived        0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Cabin_A         0
Cabin_B         0
Cabin_C         0
Cabin_D         0
Cabin_E         0
Cabin_F         0
Cabin_G         0
Cabin_n         0
Title_Army      0
Title_Dr        0
Title_Master    0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
Title_Rev       0
Title_Upper     0
dtype: int64


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,...,Cabin_G,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
0,0,0,-0.584059,0.43255,-0.473408,-0.515736,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
1,1,1,0.621016,0.43255,-0.473408,0.772917,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,1,1,-0.28279,-0.474279,-0.473408,-0.502152,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
3,1,1,0.395064,0.43255,-0.473408,0.406983,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0.395064,-0.474279,-0.473408,-0.499636,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [44]:
# Means of Age/SibSp/Parch/Fare returned by process_data(..., training=True).
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [45]:
# Standard deviations of Age/SibSp/Parch/Fare returned by process_data(..., training=True).
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [46]:
# Process the test set with the statistics obtained from the training set.
# training defaults to False, so process_data normalizes with the values passed
# in and returns them unchanged.
dtest, mean_train, std_train = process_data(raw_dtest, mean_train, std_train)

In [47]:
# Display mean_train again after the test-set call above rebound it.
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [48]:
# Display std_train again after the test-set call above rebound it.
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [63]:
# Peek at the first rows of the processed test features.
dtest.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,...,Cabin_G,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
0,0,0.357406,-0.474279,-0.473408,-0.50408,0,1,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,1,1.298871,0.43255,-0.473408,-0.520767,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
2,0,2.428629,-0.474279,-0.473408,-0.466682,0,1,0,0,1,...,0,1,0,0,0,0,1,0,0,0
3,0,-0.207473,-0.474279,-0.473408,-0.48731,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
4,1,-0.584059,0.43255,0.767199,-0.414358,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0


In [60]:
def test_model(model, data, cv = 10):
    X = data.drop(['Survived'], axis = 1)
    y = data['Survived']
    model.fit(X, y)
    training = model.score(X, y)
    validation = cross_validation.cross_val_score(model, X, y, cv=cv).mean()
    print 'Training accuracy:\t\t\t', training
    print '%s-fold cross-validation accuracy:\t' % cv, validation
    print 'Delta(training - validation): \t\t', training - validation
    return model

In [61]:
# Baseline: a 300-tree random forest limited to depth 3, evaluated on the processed training data.
test_model(RandomForestClassifier(n_estimators=300, max_depth=3), dtrain)

Training accuracy:			0.820426487093
10-fold cross-validation accuracy:	0.811443649983
Delta(training - validation): 		0.00898283711018


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [62]:
# Sweep max_depth from 1 to 24 for a 300-tree random forest and report the
# training / cross-validation accuracy at each depth.
for depth in np.arange(1,25,1):
    # print(...) with one pre-formatted string works on Python 2 and Python 3.
    print('=========Test on depth = %s=========' % depth)
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.785634118967
10-fold cross-validation accuracy:	0.781218079673
Delta(training - validation): 		0.00441603929432
Training accuracy:			0.801346801347
10-fold cross-validation accuracy:	0.791192827148
Delta(training - validation): 		0.0101539741989
Training accuracy:			0.817059483726
10-fold cross-validation accuracy:	0.8136783566
Delta(training - validation): 		0.00338112712645
Training accuracy:			0.833894500561
10-fold cross-validation accuracy:	0.826013222109
Delta(training - validation): 		0.00788127845244
Training accuracy:			0.851851851852
10-fold cross-validation accuracy:	0.830558109182
Delta(training - validation): 		0.0212937426701
Training accuracy:			0.872053872054
10-fold cross-validation accuracy:	0.832830268982
Delta(training - validation): 		0.0392236030719
Training accuracy:			0.885521885522
10-fold cross-validation accuracy:	0.829459482465
Delta(training - validation): 		0.0560624030568
Training accuracy:			0.901234567901
10-fold cross-validation 

In [66]:
# Sweep max_depth from 1 to 24 for a 300-tree ExtraTrees classifier and report
# the training / cross-validation accuracy at each depth.
for depth in np.arange(1,25,1):
    # print(...) with one pre-formatted string works on Python 2 and Python 3.
    print('=========Test on depth = %s=========' % depth)
    test_model(ExtraTreesClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.791245791246
10-fold cross-validation accuracy:	0.784550845534
Delta(training - validation): 		0.0066949457118
Training accuracy:			0.791245791246
10-fold cross-validation accuracy:	0.797909431393
Delta(training - validation): 		-0.00666364014679
Training accuracy:			0.803591470258
10-fold cross-validation accuracy:	0.810257065032
Delta(training - validation): 		-0.00666559477421
Training accuracy:			0.828282828283
10-fold cross-validation accuracy:	0.818197991147
Delta(training - validation): 		0.0100848371354
Training accuracy:			0.840628507295
10-fold cross-validation accuracy:	0.818223527409
Delta(training - validation): 		0.0224049798863
Training accuracy:			0.850729517396
10-fold cross-validation accuracy:	0.829510271252
Delta(training - validation): 		0.0212192461443
Training accuracy:			0.874298540965
10-fold cross-validation accuracy:	0.828386675746
Delta(training - validation): 		0.045911865219
Training accuracy:			0.886644219978
10-fold cross-validatio

In [67]:
# Sweep max_depth from 1 to 19 for a 300-tree random forest using the entropy
# split criterion and report the training / cross-validation accuracy.
for depth in np.arange(1,20,1):
    # print(...) with one pre-formatted string works on Python 2 and Python 3.
    print('=========Test on depth = %s=========' % depth)
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth, criterion='entropy'), dtrain)

Training accuracy:			0.784511784512
10-fold cross-validation accuracy:	0.784601066848
Delta(training - validation): 		-8.92823364733e-05
Training accuracy:			0.793490460157
10-fold cross-validation accuracy:	0.788970604926
Delta(training - validation): 		0.00451985523147
Training accuracy:			0.820426487093
10-fold cross-validation accuracy:	0.812554477358
Delta(training - validation): 		0.00787200973531
Training accuracy:			0.835016835017
10-fold cross-validation accuracy:	0.82716207014
Delta(training - validation): 		0.00785476487724
Training accuracy:			0.846240179574
10-fold cross-validation accuracy:	0.830558109182
Delta(training - validation): 		0.0156820703918
Training accuracy:			0.867564534231
10-fold cross-validation accuracy:	0.833966632618
Delta(training - validation): 		0.0335979016129
Training accuracy:			0.878787878788
10-fold cross-validation accuracy:	0.83617608671
Delta(training - validation): 		0.0426117920781
Training accuracy:			0.896745230079
10-fold cross-validati

In [74]:
# Sweep n_estimators from 50 to 750 (step 50) for gradient boosting and report
# the training / cross-validation accuracy at each setting.
for n in np.arange(50,800,50):
    # print(...) with one pre-formatted string works on Python 2 and Python 3.
    print('=========Test on n_estimators = %s=========' % n)
    test_model(GradientBoostingClassifier(n_estimators=n), dtrain)

Training accuracy:			0.878787878788
10-fold cross-validation accuracy:	0.832817500851
Delta(training - validation): 		0.0459703779367
Training accuracy:			0.894500561167
10-fold cross-validation accuracy:	0.830595278629
Delta(training - validation): 		0.0639052825382
Training accuracy:			0.913580246914
10-fold cross-validation accuracy:	0.838473499035
Delta(training - validation): 		0.0751067478783
Training accuracy:			0.918069584736
10-fold cross-validation accuracy:	0.837337419135
Delta(training - validation): 		0.0807321656011
Training accuracy:			0.937149270483
10-fold cross-validation accuracy:	0.838423561457
Delta(training - validation): 		0.0987257090253
Training accuracy:			0.945005611672
10-fold cross-validation accuracy:	0.837312450346
Delta(training - validation): 		0.107693161326
Training accuracy:			0.953984287318
10-fold cross-validation accuracy:	0.83505277494
Delta(training - validation): 		0.118931512377
Training accuracy:			0.957351290685
10-fold cross-validation accu

In [80]:
# Target vector and feature matrix (every column except the label).
ytrain = dtrain['Survived']
Xtrain = dtrain.drop(['Survived'], axis = 1)

# Two sub-grids for the SVC search: an RBF kernel over gamma and C,
# and a linear kernel over C only.
tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]

In [None]:
# 5-fold grid search over tuned_parameters using an SVC estimator.
# C=1 on the SVC passed here is only a starting value: C is one of the keys
# in both sub-grids of tuned_parameters.
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit(Xtrain, ytrain)

In [None]:
# Marker output; print(...) with a single argument is valid on Python 2 and Python 3.
print(1)