# Training 
Tips and tricks: https://inclass.kaggle.com/c/deloitte-tackles-titanic/forums/t/9841/getting-high-scores-without-looking-at-actual-data-set

In [74]:
#Import pandas and scikit-learn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import ensemble, svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import grid_search
from sklearn import metrics

#Plots config
%matplotlib inline
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [75]:
#Add plots module to path
import os
import sys

# Directory that provides the `plots` module. The original author's checkout
# stays the default; set SKLEARN_MODEL_EVALUATION_PATH to point elsewhere
# when running on another machine.
PLOTS_MODULE_DIR = os.environ.get(
    'SKLEARN_MODEL_EVALUATION_PATH',
    '/Users/Edu/Development/open-source/sklearn-model-evaluation')

# Only append once so re-running this cell does not keep growing sys.path
if PLOTS_MODULE_DIR not in sys.path:
    sys.path.append(PLOTS_MODULE_DIR)
import plots as p

## Data loading

In [76]:
#Load the train and test CSV files, using PassengerId as the row index
train, test = (pd.read_csv(csv_name, index_col='PassengerId')
               for csv_name in ("train_clean.csv", 'test_clean.csv'))

In [77]:
#Preview the first five rows of the training table
train.head(n=5)

Unnamed: 0_level_0,Fare,Parch,Pclass,SibSp,Survived,EstimatedAge,FamSize,female,male,embarked_C,...,the Countess,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,deck_U
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7.25,0,3,1,0,22,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,71.2833,0,1,1,1,38,1,1,0,1,...,0,0,0,1,0,0,0,0,0,0
3,7.925,0,3,0,1,26,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,53.1,0,1,1,1,35,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
5,8.05,0,3,0,0,35,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [78]:
#Preview the first five rows of the test table (it has no Survived column)
test.head(n=5)

Unnamed: 0_level_0,Fare,Parch,Pclass,SibSp,EstimatedAge,FamSize,female,male,embarked_C,embarked_Q,...,the Countess,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,deck_U
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,7.8292,0,3,0,34.5,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
893,7.0,0,3,1,47.0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
894,9.6875,0,2,0,62.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
895,8.6625,0,3,0,27.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
896,12.2875,1,3,1,22.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Model training (with cross-validation)

In [79]:
# Fixed seed for the stochastic ensembles so that the grid-search winners
# (and the predictions derived from them) are reproducible after a kernel restart
SEED = 42

# NOTE: no `scoring` is passed to GridSearchCV below, so candidates are ranked
# with each estimator's own `score` method (mean accuracy for classifiers).

#RandomForest
rf_param_grid = [{'criterion': ['gini', 'entropy'],
                 'n_estimators': [10, 100, 1000],
                 'max_features' : ['auto', 'log2'],
                 'bootstrap' : [True, False]
                }]
# `rf` is bound directly to the grid-search wrapper; calling fit() on it tunes
# the forest and exposes the winner through `best_params_` / `predict`
rf = grid_search.GridSearchCV(
    ensemble.RandomForestClassifier(n_jobs = -1, random_state = SEED),
    rf_param_grid)

#AdaBoost
ab_param_grid = {'n_estimators': [10, 50, 100, 1000],
                }
# The n_estimators given to the base estimator is only a starting value;
# the grid above overrides it for every candidate
ab = grid_search.GridSearchCV(
    ensemble.AdaBoostClassifier(n_estimators = 50, random_state = SEED),
    ab_param_grid)


#SVC
# Two sub-grids: `gamma` is only searched together with the rbf kernel
svc_param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
 ]
svc = grid_search.GridSearchCV(svm.SVC(), svc_param_grid)

In [80]:
# Features are every training column except the target
train_features = train.drop(['Survived'], axis=1)
train_x = train_features.values
train_y = train['Survived']
# Select the test columns by name, in the training order. Taking `test.values`
# directly would silently feed misaligned features to the models if the two
# files ever differed in column order; this raises a KeyError instead when a
# training column is missing from the test file.
test_x = test[train_features.columns].values

#SVC needs feature scaling
# NOTE: the scaler is fit on the whole training set, so the folds used later by
# the SVC grid search are not fully independent of it. Wrap scaler + SVC in a
# Pipeline if unbiased cross-validation scores are needed.
scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled  = scaler.transform(test_x)

In [81]:
#Run the SVC grid search on the standardized training features
svc.fit(X=train_x_scaled, y=train_y)
#Show the winning hyper-parameter combination
svc.best_params_
#NOTE(review): a previously recorded result here was
#{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}, which differs from the output
#shown below - presumably it came from an earlier run; confirm which is current

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [82]:
#Run the random forest grid search on the unscaled training features
rf.fit(X=train_x, y=train_y)
#Show the winning hyper-parameter combination
rf.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_features': 'log2',
 'n_estimators': 100}

In [83]:
#Run the AdaBoost grid search on the unscaled training features
ab.fit(X=train_x, y=train_y)
#Show the winning hyper-parameter combination
ab.best_params_

{'n_estimators': 10}

## Model evaluation

In [84]:
#svc_scores = cross_validation.cross_val_score(svc, train_x_scaled, train_y, cv=5)
#print("SVC accuracy: %0.2f (+/- %0.2f)" % (svc_scores.mean(), svc_scores.std() * 2))
#SVC accuracy: 0.83 (+/- 0.06)

In [85]:
#rf_scores  = cross_validation.cross_val_score(rf, train_x, train_y, cv=5)
#print("RandomForest accuracy: %0.2f (+/- %0.2f)" % (rf_scores.mean(), rf_scores.std() * 2))
#RandomForest accuracy: 0.79 (+/- 0.06)

In [86]:
#ab_scores  = cross_validation.cross_val_score(ab, train_x, train_y, cv=5)
#print("AdaBoost accuracy: %0.2f (+/- %0.2f)" % (ab_scores.mean(), ab_scores.std() * 2))
#AdaBoost accuracy: 0.81 (+/- 0.02)

## Model evaluation plots

In [87]:
#svc_train_pred = cross_validation.cross_val_predict(svc, train_x_scaled, train_y, cv=5)

In [88]:
#p.plot_confusion_matrix(train_y, svc_train_pred, target_names=[0,1])

## Predictions on test set

In [89]:
#RandomForest and AdaBoost were fit on the unscaled features, so they
#predict from the unscaled test matrix
rf_pred, ab_pred = (model.predict(test_x) for model in (rf, ab))
#SVC was fit on standardized features and must be fed the standardized test matrix
svc_pred = svc.predict(test_x_scaled)

## Save predictions to csv

In [90]:
#Write one CSV per model: PassengerId plus the predicted Survived value cast to int
submissions = [
    ("rf_result.csv", rf_pred),
    ("svc_result.csv", svc_pred),
    ("ab_result.csv", ab_pred),
]
for filename, predictions in submissions:
    result = pd.DataFrame(data={'PassengerId':test.index, 'Survived':predictions.astype('int')})
    result.to_csv(filename, index=False)