# To improve Titanic accuracy
### --By Jiancheng
Work on [Kaggle Titanic](https://www.kaggle.com/c/titanic)

Start on 2016/03/25

# Summary:
1. Data processing module changes
1. Using grid search to test separate model on SVC, GBC, etc
1. Voting test

In [1]:
import pandas as pd
import numpy as np
import pylab as plt

In [208]:
raw_dtrain = pd.read_csv('data/train.csv',index_col = 0)
raw_dtest = pd.read_csv('data/test.csv',index_col = 0)

In [191]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [4]:
raw_dtrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


## Add new features into data processing module:
1. Delete 'Family' features, replace with the original features 'SibSp' and 'Parch'
1. Normalizing the features on the training and testing data set
1. Delete 'Title feature'

In [173]:
def process_data(in_df, mean_train = None, std_train = None, training = False):
    df = in_df.copy()

    age_average = {' Major': 48.5, ' the Countess': 33.0, ' Don': 40.0, ' Sir': 49.0, ' Miss': 21.773972602739725, 
                   ' Mlle': 24.0, ' Mrs': 35.898148148148145, ' Capt': 70.0, ' Rev': 43.166666666666664, ' Dr': 42.0, 
                   ' Master': 4.5741666666666667, ' Mr': 32.368090452261306, ' Ms': 28.0, ' Jonkheer': 38.0, 
                   ' Col': 58.0, ' Lady': 48.0, ' Mme': 24.0, ' Dona': 39}
    title_convert = {' Major': 'Army', ' the Countess': 'Upper', ' Don': 'Mr', ' Miss': 'Miss', ' Sir': 'Upper', ' Mlle': 'Upper', 
                        ' Mrs': 'Mrs', ' Capt': 'Upper', ' Rev': 'Rev', ' Dr': 'Dr', ' Master': 'Master', ' Mr': 'Mr', ' Ms': 'Miss', 
                        ' Jonkheer': 'Upper', ' Col': 'Army', ' Lady': 'Upper', ' Mme': 'Upper', ' Dona': 'Upper'}
    
    # feature transformation
    df['orgTitle'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0]) # extract "Title" from "Name"
    df['Title'] = df['orgTitle'].map(lambda x: title_convert[x]) # then also merge some rare Title into commom ones 
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0])
    df['Cabin'] = df['Cabin'].map(lambda x: x if x != 'T' else 'n')
    df['Sex'] = df['Sex'].map(lambda x: 0 if x == 'male' else 1) # male: 0 female: 1
    
    # deal with NaN and 0
    df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.fillna(g.mean())) # the average Pclass fare
    df['Fare'] = df['Fare'].groupby(df['Pclass']).apply(lambda g: g.replace(0, g.mean())) # the average Pclass fare
    df['Embarked'] = df['Embarked'].fillna('n')  # the most frequent item
    df['Age'] = df['Age'].groupby(df['orgTitle']).apply(lambda g: g.fillna(age_average[g.name])) # average age of Title
    
    # normalization
    if training:
        mean_train = df[['Age','SibSp','Parch','Fare']].mean()
        std_train = df[['Age','SibSp','Parch','Fare']].std()
        
    df[['Age','SibSp','Parch','Fare']]= (df[['Age','SibSp','Parch','Fare']]- mean_train) / std_train
    
    
    # transfer category feature into dummy feature   
    df = pd.concat([df, pd.get_dummies(df['Embarked'], prefix='Embarked')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Pclass'], prefix='Pclass')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Cabin'], prefix='Cabin')], axis=1)
    df = pd.concat([df, pd.get_dummies(df['Title'], prefix='Title')], axis=1)
    
    # drop features we don't need 
    df = df.drop(['Embarked', 'Name', 'Ticket', 'PassengerId', 'Pclass', 'Cabin','Title', 'orgTitle'], axis = 1) 
    
    if 'Embarked_n' not in df.columns:
        df['Embarked_n'] = 0
        
    
    return df, mean_train, std_train

In [174]:
raw_dtrain = pd.read_csv('data/train.csv')
raw_dtest = pd.read_csv('data/test.csv')
dtrain, mean_train, std_train = process_data(raw_dtrain, training = True)
print dtrain.isnull().sum()
dtrain.head()

Survived        0
Sex             0
Age             0
SibSp           0
Parch           0
Fare            0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Embarked_n      0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Cabin_A         0
Cabin_B         0
Cabin_C         0
Cabin_D         0
Cabin_E         0
Cabin_F         0
Cabin_G         0
Cabin_n         0
Title_Army      0
Title_Dr        0
Title_Master    0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
Title_Rev       0
Title_Upper     0
dtype: int64


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_n,...,Cabin_G,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper
0,0,0,-0.584059,0.43255,-0.473408,-0.515736,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
1,1,1,0.621016,0.43255,-0.473408,0.772917,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,-0.28279,-0.474279,-0.473408,-0.502152,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
3,1,1,0.395064,0.43255,-0.473408,0.406983,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0.395064,-0.474279,-0.473408,-0.499636,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [175]:
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [176]:
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [177]:
dtest, mean_train, std_train = process_data(raw_dtest, mean_train, std_train)

In [178]:
mean_train

Age      29.754659
SibSp     0.523008
Parch     0.381594
Fare     32.876990
dtype: float64

In [179]:
std_train

Age      13.277179
SibSp     1.102743
Parch     0.806057
Fare     49.690114
dtype: float64

In [180]:
dtest.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,...,Cabin_n,Title_Army,Title_Dr,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rev,Title_Upper,Embarked_n
0,0,0.357406,-0.474279,-0.473408,-0.50408,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,1,1.298871,0.43255,-0.473408,-0.520767,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
2,0,2.428629,-0.474279,-0.473408,-0.466682,0,1,0,0,1,...,1,0,0,0,0,1,0,0,0,0
3,0,-0.207473,-0.474279,-0.473408,-0.48731,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
4,1,-0.584059,0.43255,0.767199,-0.414358,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0


In [181]:
class grid_searcher(object):
    # self.model = sklearn model wish to test, build GridSearchCV() with params_grid = list consisting of parameters
    # self.X = features
    # self.y = target
    
    from sklearn.grid_search import GridSearchCV    
    
    def __init__(self, model, params, X, y, cv = 10, n_jobs = -1):
        self.model = GridSearchCV(model, param_grid=params, cv = 5, n_jobs = n_jobs)
        self.X = X
        self.y = y
        self.fitted = False
        
    def get_fit(self):
        self.model.fit(self.X, self.y)
        print('Fitted succeed!\n')
        self.fitted = True
        
    def report(self, n_top = 5):
        if not self.fitted:
            self.get_fit()
        grid_scores = self.model.grid_scores_
        top_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)[:n_top]
        for i, score in enumerate(top_scores):
            print("=====================================================")
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(score.mean_validation_score, np.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print('')
            
    def get_best_model(self):
        if not self.fitted:
            self.get_fit()
        return self.model.best_estimator_

In [182]:
Xtrain = dtrain.drop(['Survived'], axis = 1)
ytrain = dtrain['Survived']

# Work on test SVC (with awful result: 0.44498!!!)
* kernel: linear, sigmoid, poly and rbf
* C = 0.01 * np.logspace(0, 5, 10, base = 4)
* degree = np.arange(1, 10)
* gamma = np.arange(0, 0.3, 0.006)

In [183]:
model = SVC()
params = [{'kernel': ['linear'], 'C': 0.01 * np.logspace(0, 5, 10, base = 4), 'cache_size': [2000]},
          {'kernel': ['sigmoid'], 'C': 0.01 * np.logspace(0, 5, 10, base = 4), 'cache_size': [2000]},
          {'kernel': ['poly'], 'C': 0.01 * np.logspace(0, 5, 10, base = 4), 'degree': np.arange(1, 10), 'cache_size': [2000]}, 
          {'kernel': ['rbf'], 'C': 0.01 * np.logspace(0, 5, 10, base = 4), 'gamma': np.arange(0, 0.3, 0.006), 'cache_size': [2000]}, 
         ]
grid_model = grid_searcher(model = model, params = params, X = Xtrain, y = ytrain, cv = 10, n_jobs = -1)
best_model = grid_model.get_best_model()
grid_model.report(n_top = 20)
best_model

Fitted succeed!

Model with rank: 1
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.114}

Model with rank: 2
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.12}

Model with rank: 3
Mean validation score: 0.836 (std: 0.023)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 1.015936673259648, 'gamma': 0.126}

Model with rank: 4
Mean validation score: 0.836 (std: 0.022)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 2.1945445961038677, 'gamma': 0.048000000000000001}

Model with rank: 5
Mean validation score: 0.836 (std: 0.024)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 4.7404785269109286, 'gamma': 0.0}

Model with rank: 6
Mean validation score: 0.836 (std: 0.024)
Parameters: {'cache_size': 2000, 'kernel': 'rbf', 'C': 4.7404785269109286, 'gamma': 0.036000000000000004}

Model with rank: 7
Mean validation score:

SVC(C=1.015936673259648, cache_size=2000, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.114, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [146]:
model_svc_best = SVC(C=1.015936673259648, cache_size=2000, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.114, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
model_svc_best.fit(Xtrain, ytrain)

SVC(C=1.01593667326, cache_size=2000, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.114, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [170]:
best_model.score(Xtrain, ytrain)

0.84287317620650959

In [171]:
submit_svc = pd.DataFrame()
submit_svc['PassengerId'] = raw_dtest['PassengerId']
submit_svc['Survived'] = best_model.predict(dtest)
submit_svc.head()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1


In [172]:
submit_svc.to_csv('submit_svc_new.csv', index=False)

# Find best RF model:
### detect on 
* 'n_estimators': [30,80,150,300,500], 
*          'criterion': ['gini','entropy'],
*           'min_samples_leaf': [1,2,3,4,5],
*           'max_features': ['sqrt','log2',None], 
*           'max_depth': np.arange(5,20,1),

In [157]:
model = RandomForestClassifier()
params = [{'n_estimators': [30,80,150,300,500], 
          'criterion': ['gini','entropy'],
           'min_samples_leaf': [1,2,3,4,5],
           'max_features': ['sqrt','log2',None], 
           'max_depth': np.arange(5,20,1),
          }]
grid_model = grid_searcher(model = model, params = params, X = Xtrain, y = ytrain, cv = 10, n_jobs = -1)
best_model = grid_model.get_best_model()
grid_model.report(n_top = 20)
best_model

Fitted succeed!

Model with rank: 1
Mean validation score: 0.842 (std: 0.027)
Parameters: {'max_features': None, 'n_estimators': 80, 'criterion': 'gini', 'max_depth': 13, 'min_samples_leaf': 5}

Model with rank: 2
Mean validation score: 0.842 (std: 0.027)
Parameters: {'max_features': None, 'n_estimators': 150, 'criterion': 'gini', 'max_depth': 13, 'min_samples_leaf': 5}

Model with rank: 3
Mean validation score: 0.842 (std: 0.023)
Parameters: {'max_features': None, 'n_estimators': 150, 'criterion': 'gini', 'max_depth': 17, 'min_samples_leaf': 3}

Model with rank: 4
Mean validation score: 0.842 (std: 0.023)
Parameters: {'max_features': None, 'n_estimators': 30, 'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 3}

Model with rank: 5
Mean validation score: 0.841 (std: 0.022)
Parameters: {'max_features': None, 'n_estimators': 500, 'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 1}

Model with rank: 6
Mean validation score: 0.841 (std: 0.022)
Parameters: {'max_features'

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [158]:
best_model.score(Xtrain, ytrain)

0.89113355780022452

In [159]:
submit = pd.DataFrame()
submit['PassengerId'] = raw_dtest['PassengerId']
submit['Survived'] = best_model.predict(dtest)
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [160]:
submit.to_csv('submit_rf.csv', index=False)

# Find best GBM model: 0.65550
### detect on 
* {'loss': ['deviance', 'exponential'],
*         'n_estimators': np.arange(100,2000,100), 
*          'min_samples_leaf': [1,2,3,4,5],
*          'max_features': ['sqrt','log2',None], 
*          'max_depth': [1,2,3,4,5],
          }

In [186]:
model = GradientBoostingClassifier()
params = [{'loss': ['deviance', 'exponential'],
          'n_estimators': np.arange(100,2000,100), 
           'min_samples_leaf': [1,2,3,4,5],
           'max_features': ['sqrt','log2',None], 
           'max_depth': [1,2,3,4,5],
          }]
grid_model_gbdt = grid_searcher(model = model, params = params, X = Xtrain, y = ytrain, cv = 5, n_jobs = -1)
grid_model_gbdt.report(n_top = 30)

Fitted succeed!

Model with rank: 1
Mean validation score: 0.847 (std: 0.021)
Parameters: {'max_features': None, 'loss': 'deviance', 'n_estimators': 300, 'max_depth': 3, 'min_samples_leaf': 2}

Model with rank: 2
Mean validation score: 0.845 (std: 0.014)
Parameters: {'max_features': None, 'loss': 'deviance', 'n_estimators': 200, 'max_depth': 3, 'min_samples_leaf': 3}

Model with rank: 3
Mean validation score: 0.845 (std: 0.018)
Parameters: {'max_features': None, 'loss': 'exponential', 'n_estimators': 200, 'max_depth': 3, 'min_samples_leaf': 3}

Model with rank: 4
Mean validation score: 0.844 (std: 0.029)
Parameters: {'max_features': 'sqrt', 'loss': 'deviance', 'n_estimators': 500, 'max_depth': 3, 'min_samples_leaf': 5}

Model with rank: 5
Mean validation score: 0.844 (std: 0.016)
Parameters: {'max_features': None, 'loss': 'deviance', 'n_estimators': 300, 'max_depth': 3, 'min_samples_leaf': 1}

Model with rank: 6
Mean validation score: 0.843 (std: 0.017)
Parameters: {'max_features': Non

In [187]:
gbc = GradientBoostingClassifier(**{'max_features': None, 'loss': 'deviance', 'n_estimators': 200, 'max_depth': 3, 'min_samples_leaf': 3})
gbc.fit(Xtrain, ytrain)
gbc.score(Xtrain, ytrain)

0.91694725028058366

In [188]:
submit = pd.DataFrame()
submit['PassengerId'] = raw_dtest['PassengerId']
submit['Survived'] = gbc.predict(dtest)
submit.to_csv('submit_gbc.csv', index=False)

In [189]:
gbc

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=3, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [190]:
svc1= SVC(**{'cache_size': 2000, 'kernel': 'rbf', 'C': 4.7404785269109286, 'gamma': 0.053999999999999999})
svc1.fit(Xtrain, ytrain)
svc1.score(Xtrain, ytrain)

0.856341189674523

In [195]:
bayes = GaussianNB().fit(Xtrain, ytrain)
bayes.score(Xtrain, ytrain)

0.77665544332211001

In [196]:
submit = pd.DataFrame()
submit['PassengerId'] = raw_dtest['PassengerId']
submit['Survived'] = bayes.predict(dtest)
submit.to_csv('submit_bayes.csv', index=False)

In [197]:
bayes

GaussianNB()

In [203]:
def submit_csv(model, title):
    submit = pd.DataFrame()
    submit['PassengerId'] = raw_dtest['PassengerId']
    submit['Survived'] = model.predict(dtest)
    submit.to_csv(title + '.csv', index=False)

In [204]:
submit_csv(RandomForestClassifier(300, max_depth = 10).fit(Xtrain, ytrain), 'rf_naive')

In [205]:
RandomForestClassifier(300, max_depth = 10).fit(Xtrain, ytrain).score(Xtrain, ytrain)

0.93153759820426485

In [207]:
RandomForestClassifier(300, max_depth = 10)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)