# To improve Titanic accuracy
### --By Jiancheng
Work on [Kaggle Titanic](https://www.kaggle.com/c/titanic)

Start on 2016/03/25

# Summary:
1. Data processing module changes
1. Testing separate model on SVC, GBC, etc
1. Voting test

In [1]:
import pandas as pd
import numpy as np
import pylab as plt

In [2]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')

In [3]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC

In [4]:
raw_dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


## Add new features into data processing module:
1. Delete 'Family' features, replace with the original features 'SibSp' and 'Parch'
1. Normalizing the features on the training and testing data set
1. Delete 'Title feature'

In [5]:
def process_data(in_df, mean_train = None, std_train = None, training = False):
    df = in_df.copy()

    age_average = {' Major': 48.5, ' the Countess': 33.0, ' Don': 40.0, ' Sir': 49.0, ' Miss': 21.773972602739725, 
                   ' Mlle': 24.0, ' Mrs': 35.898148148148145, ' Capt': 70.0, ' Rev': 43.166666666666664, ' Dr': 42.0, 
                   ' Master': 4.5741666666666667, ' Mr': 32.368090452261306, ' Ms': 28.0, ' Jonkheer': 38.0, 
                   ' Col': 58.0, ' Lady': 48.0, ' Mme': 24.0, ' Dona': 39}
    title_convert = {' Major': 'Army', ' the Countess': 'Upper', ' Don': 'Mr', ' Miss': 'Miss', ' Sir': 'Upper', ' Mlle': 'Upper', 
                        ' Mrs': 'Mrs', ' Capt': 'Upper', ' Rev': 'Rev', ' Dr': 'Dr', ' Master': 'Master', ' Mr': 'Mr', ' Ms': 'Miss', 
                        ' Jonkheer': 'Upper', ' Col': 'Army', ' Lady': 'Upper', ' Mme': 'Upper', ' Dona': 'Upper'}
    
    # feature transformation
#     df['Family'] = df['SibSp'] + df['Parch']
    df['orgTitle'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0]) # extract "Title" from "Name"
    df['Title'] = df['orgTitle'].map(lambda x: title_convert[x]) # then also merge some rare Title into commom ones 
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0])
    df['Cabin'] = df['Cabin'].map(lambda x: x if x != 'T' else 'n')
    df['Sex'] = df['Sex'].map(lambda x: 0 if x == 'male' else 1) # male: 0 female: 1
    
    # deal with NaN and 0
    df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.fillna(g.mean())) # the average Pclass fare
    df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.replace(0, g.mean())) # the average Pclass fare
    df['Embarked'] = df['Embarked'].fillna('n')  # the most frequent item
    df['Age'] = df['Age'].groupby(df['orgTitle']).apply(lambda g: g.fillna(age_average[g.name])) # average age of Title
    
    # normalization
    if training:
        mean_train = df[['Age','SibSp','Parch','Fare']].mean()
        std_train = df[['Age','SibSp','Parch','Fare']].std()
        
    df[['Age','SibSp','Parch','Fare']]= (df[['Age','SibSp','Parch','Fare']]- mean_train) / std_train
    
    
    # transfer category feature into dummy feature   
    df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Pclass'], prefix='Pclass')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Cabin'], prefix='Cabin')], axis=1)
#     df = pd.concat([df, pd.get_dummies(df['Title'], prefix='Title')], axis=1)
    
    # drop features we don't need 
    df = df.drop(['orgTitle'], axis = 1)
#     df = df.drop(['Embarked', 'Name', 'SibSp', 'Parch', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1)  
    df = df.drop(['Embarked', 'Name', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title'], axis = 1) 
    return df, mean_train, std_train

In [6]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')
dtrain, mean_train, std_train = process_data(raw_dtrain, training = True)
print dtrain.isnull().sum()
dtrain.head()

Survived      0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
Embarked_n    0
Pclass_1      0
Pclass_2      0
Pclass_3      0
Cabin_A       0
Cabin_B       0
Cabin_C       0
Cabin_D       0
Cabin_E       0
Cabin_F       0
Cabin_G       0
Cabin_n       0
dtype: int64


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_n,...,Pclass_2,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_n
0,0,0,-0.584059,0.43255,-0.473408,-0.515736,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
1,1,1,0.621016,0.43255,-0.473408,0.772917,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,1,-0.28279,-0.474279,-0.473408,-0.502152,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,1,1,0.395064,0.43255,-0.473408,0.406983,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0.395064,-0.474279,-0.473408,-0.499636,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [7]:
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [8]:
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [9]:
dtest, mean_train, std_train = process_data(raw_dtest, mean_train, std_train)

In [10]:
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [11]:
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [12]:
dtest.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_n
0,0,0.357406,-0.474279,-0.473408,-0.50408,0,1,0,0,0,1,0,0,0,0,0,0,0,1
1,1,1.298871,0.43255,-0.473408,-0.520767,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,0,2.428629,-0.474279,-0.473408,-0.466682,0,1,0,0,1,0,0,0,0,0,0,0,0,1
3,0,-0.207473,-0.474279,-0.473408,-0.48731,0,0,1,0,0,1,0,0,0,0,0,0,0,1
4,1,-0.584059,0.43255,0.767199,-0.414358,0,0,1,0,0,1,0,0,0,0,0,0,0,1


In [13]:
def test_model(model, data, cv = 10):
    X = data.drop(['Survived'], axis = 1)
    y = data['Survived']
    model.fit(X, y)
    training = model.score(X, y)
    validation = cross_validation.cross_val_score(model, X, y, cv=cv).mean()
    print 'Training accuracy:\t\t\t', training
    print '%s-fold cross-validation accuracy:\t' % cv, validation
    print 'Delta(training - validation): \t\t', training - validation
    return model

In [14]:
test_model(RandomForestClassifier(n_estimators=300, max_depth=3), dtrain)

Training accuracy:			0.806958473625
10-fold cross-validation accuracy:	0.790257065032
Delta(training - validation): 		0.0167014085928


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
for depth in np.arange(3,20,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.808080808081
10-fold cross-validation accuracy:	0.791380944274
Delta(training - validation): 		0.0166998638066
Training accuracy:			0.848484848485
10-fold cross-validation accuracy:	0.818310634434
Delta(training - validation): 		0.0301742140506
Training accuracy:			0.859708193042
10-fold cross-validation accuracy:	0.826126432868
Delta(training - validation): 		0.0335817601735
Training accuracy:			0.870931537598
10-fold cross-validation accuracy:	0.830633299285
Delta(training - validation): 		0.0402982383132
Training accuracy:			0.885521885522
10-fold cross-validation accuracy:	0.829559925094
Delta(training - validation): 		0.0559619604283
Training accuracy:			0.910213243547
10-fold cross-validation accuracy:	0.830670752469
Delta(training - validation): 		0.0795424910781
Training accuracy:			0.920314253648
10-fold cross-validation accuracy:	0.825065259335
Delta(training - validation): 		0.0952489943127
Training accuracy:			0.94051627385
10-fold cross-validation ac

In [17]:
for depth in np.arange(3,20,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(ExtraTreesClassifier(n_estimators=300, max_depth=depth), dtrain)

Training accuracy:			0.79797979798
10-fold cross-validation accuracy:	0.790332538872
Delta(training - validation): 		0.00764725910793
Training accuracy:			0.837261503928
10-fold cross-validation accuracy:	0.803753546703
Delta(training - validation): 		0.0335079572252
Training accuracy:			0.841750841751
10-fold cross-validation accuracy:	0.809334354784
Delta(training - validation): 		0.032416486967
Training accuracy:			0.849607182941
10-fold cross-validation accuracy:	0.816038474634
Delta(training - validation): 		0.0335687083065
Training accuracy:			0.861952861953
10-fold cross-validation accuracy:	0.817149585745
Delta(training - validation): 		0.0448032762078
Training accuracy:			0.874298540965
10-fold cross-validation accuracy:	0.818285665645
Delta(training - validation): 		0.05601287532
Training accuracy:			0.894500561167
10-fold cross-validation accuracy:	0.82390392691
Delta(training - validation): 		0.0705966342577
Training accuracy:			0.905723905724
10-fold cross-validation accur

In [18]:
for depth in np.arange(3,20,1): 
    print '=========Test on depth = %s=========' % depth
    test_model(RandomForestClassifier(n_estimators=300, max_depth=depth, criterion='entropy'), dtrain)

Training accuracy:			0.810325476992
10-fold cross-validation accuracy:	0.792517024174
Delta(training - validation): 		0.0178084528178
Training accuracy:			0.846240179574
10-fold cross-validation accuracy:	0.81047071842
Delta(training - validation): 		0.0357694611534
Training accuracy:			0.859708193042
10-fold cross-validation accuracy:	0.828373623879
Delta(training - validation): 		0.0313345691623
Training accuracy:			0.867564534231
10-fold cross-validation accuracy:	0.831756894791
Delta(training - validation): 		0.0358076394406
Training accuracy:			0.881032547699
10-fold cross-validation accuracy:	0.835127681307
Delta(training - validation): 		0.0459048663918
Training accuracy:			0.900112233446
10-fold cross-validation accuracy:	0.829521904438
Delta(training - validation): 		0.0705903290079
Training accuracy:			0.920314253648
10-fold cross-validation accuracy:	0.827274713426
Delta(training - validation): 		0.0930395402212
Training accuracy:			0.933782267116
10-fold cross-validation ac

In [19]:
for n in np.arange(100,700,50): 
    print '=========Test on n_estimators = %s=========' % n
    test_model(GradientBoostingClassifier(n_estimators=n), dtrain)

Training accuracy:			0.892255892256
10-fold cross-validation accuracy:	0.830608614232
Delta(training - validation): 		0.0616472780237
Training accuracy:			0.90684624018
10-fold cross-validation accuracy:	0.830570593576
Delta(training - validation): 		0.0762756466034
Training accuracy:			0.919191919192
10-fold cross-validation accuracy:	0.828323402565
Delta(training - validation): 		0.0908685166269
Training accuracy:			0.933782267116
10-fold cross-validation accuracy:	0.824990069232
Delta(training - validation): 		0.108792197884
Training accuracy:			0.945005611672
10-fold cross-validation accuracy:	0.819359039837
Delta(training - validation): 		0.125646571836
Training accuracy:			0.953984287318
10-fold cross-validation accuracy:	0.821606514584
Delta(training - validation): 		0.132377772734
Training accuracy:			0.960718294052
10-fold cross-validation accuracy:	0.818235728067
Delta(training - validation): 		0.142482565984
Training accuracy:			0.961840628507
10-fold cross-validation accura

In [80]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
Xtrain = dtrain.drop(['Survived'], axis = 1)
ytrain = dtrain['Survived']

In [None]:
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit(Xtrain, ytrain)

In [None]:
print 1