# Kaggle Competition Titanic Dataset Learning
https://github.com/ramansah/kaggle-titanic/blob/master/Analysis.ipynb

https://corpocrat.com/2014/08/29/tutorial-titanic-dataset-machine-learning-for-kaggle/

In [1]:
TRAINNAME = "train.csv"
TESTNAME = "test.csv"

## Load Data

In [2]:
import pandas as pd
import numpy as np

%matplotlib inline

def readTrainData():
    return pd.read_csv(TRAINNAME)
def readTestData():
    return pd.read_csv(TESTNAME)

In [3]:
X_train_all = readTrainData()
X_train_len = X_train_all.shape[0]

In [4]:
X_result = readTestData()

In [5]:
survived = X_train_all['Survived']
X_train_all.drop('Survived', axis=1, inplace=True)

In [6]:
data = pd.concat([X_train_all, X_result], ignore_index=True, sort =False)

## Data Statistics

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [8]:
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,80.0,8.0,9.0,512.3292


## Cleaning Data
https://corpocrat.com/2014/08/29/tutorial-titanic-dataset-machine-learning-for-kaggle/

Need to convert the text fields into numbers to train on

In [10]:
data.drop('PassengerId', axis=1, inplace=True)
data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1046.0,1309.0,1309.0,1308.0
mean,2.294882,29.881138,0.498854,0.385027,33.295479
std,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.17,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,39.0,1.0,0.0,31.275
max,3.0,80.0,8.0,9.0,512.3292


In [11]:
#data = data.dropna()

In [12]:
from sklearn.preprocessing import LabelEncoder
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

In [13]:
data['Name'] = data['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
titles = data['Name'].unique()
titles

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [14]:
data['Age'].fillna(-1, inplace=True)

medians = dict()
for title in titles:
    median = data.Age[(data["Age"] != -1) & (data['Name'] == title)].median()
    medians[title] = median
    
for index, row in data.iterrows():
    if row['Age'] == -1:
        data.loc[index, 'Age'] = medians[row['Name']]

data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,Mr,1,22.0,1,0,A/5 21171,7.25,,S
1,1,Mrs,0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,Miss,0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,Mrs,0,35.0,1,0,113803,53.1,C123,S
4,3,Mr,1,35.0,0,0,373450,8.05,,S


In [15]:
replacement = {
    'Don': 0,
    'Rev': 0,
    'Jonkheer': 0,
    'Capt': 0,
    'Mr': 1,
    'Dr': 2,
    'Col': 3,
    'Major': 3,
    'Master': 4,
    'Miss': 5,
    'Mrs': 6,
    'Mme': 7,
    'Ms': 7,
    'Mlle': 7,
    'Sir': 7,
    'Lady': 7,
    'the Countess': 7
}

data['Name'] = data['Name'].apply(lambda x: replacement.get(x))

from sklearn.preprocessing import StandardScaler
data['Name'] = StandardScaler().fit_transform(data['Name'].values.reshape(-1, 1))

In [16]:
data.head()[['Pclass', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']]

Unnamed: 0,Pclass,Age,SibSp,Parch,Ticket,Cabin,Embarked
0,3,22.0,1,0,A/5 21171,,S
1,1,38.0,1,0,PC 17599,C85,C
2,3,26.0,0,0,STON/O2. 3101282,,S
3,1,35.0,1,0,113803,C123,S
4,3,35.0,0,0,373450,,S


In [17]:
from sklearn.preprocessing import StandardScaler
data['Age'] = StandardScaler().fit_transform(data['Age'].values.reshape(-1, 1))

In [18]:
data['Fare'].fillna(-1, inplace=True)
medians = dict()
for pclass in data['Pclass'].unique():
    median = data.Fare[(data["Fare"] != -1) & (data['Pclass'] == pclass)].median()
    medians[pclass] = median
for index, row in data.iterrows():
    if row['Fare'] == -1:
        data.loc[index, 'Fare'] = medians[row['Pclass']]
data['Fare'] = StandardScaler().fit_transform(data['Fare'].values.reshape(-1, 1))

In [19]:
data['Pclass'] = StandardScaler().fit_transform(data['Pclass'].values.reshape(-1, 1))

In [20]:
replacement = {
    6: 0,
    4: 0,
    5: 1,
    0: 2,
    2: 3,
    1: 4,
    3: 5
}
data['Parch'] = data['Parch'].apply(lambda x: replacement.get(x))
data['Parch'] = StandardScaler().fit_transform(data['Parch'].values.reshape(-1, 1))

In [21]:
data.drop('Ticket', axis=1, inplace=True)

In [22]:
data['Embarked'].fillna('S', inplace=True)

In [23]:
replacement = {
    'S': 0,
    'Q': 1,
    'C': 2
}

data['Embarked'] = data['Embarked'].apply(lambda x: replacement.get(x))
data['Embarked'] = StandardScaler().fit_transform(data['Embarked'].values.reshape(-1, 1))
data.head()['Embarked']

0   -0.622279
1    1.834926
2   -0.622279
3   -0.622279
4   -0.622279
Name: Embarked, dtype: float64

In [24]:
data['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8])

In [25]:
replacement = {
    5: 0,
    8: 0,
    4: 1,
    3: 2,
    0: 3,
    2: 4,
    1: 5
}

data['SibSp'] = data['SibSp'].apply(lambda x: replacement.get(x))
data['SibSp'] = StandardScaler().fit_transform(data['SibSp'].values.reshape(-1, 1))
data.head()['SibSp']

0    1.562046
1    1.562046
2   -0.435725
3    1.562046
4   -0.435725
Name: SibSp, dtype: float64

In [26]:
data['Cabin'].fillna('U', inplace=True)
data['Cabin'] = data['Cabin'].apply(lambda x: x[0])
data['Cabin'].unique()

array(['U', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [27]:
replacement = {
    'T': 0,
    'U': 1,
    'A': 2,
    'G': 3,
    'C': 4,
    'F': 5,
    'B': 6,
    'E': 7,
    'D': 8
}

data['Cabin'] = data['Cabin'].apply(lambda x: replacement.get(x))
data['Cabin'] = StandardScaler().fit_transform(data['Cabin'].values.reshape(-1, 1))
data.head()['Cabin']

0   -0.489356
1    1.000334
2   -0.489356
3    1.000334
4   -0.489356
Name: Cabin, dtype: float64

## Split Data

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    data.head(X_train_len), survived, test_size=150, random_state=42)

In [29]:
X_test = data.head(data.shape[0]- X_train_len)

In [30]:
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
584,0.841916,-0.805647,1,-0.032869,-0.435725,-0.461223,-0.474901,-0.489356,1.834926
514,0.841916,-0.805647,1,-0.412845,-0.435725,-0.461223,-0.498424,-0.489356,-0.622279
688,0.841916,-0.805647,1,-0.868816,-0.435725,-0.461223,-0.492624,-0.489356,-0.622279
109,0.841916,1.055344,0,-0.564835,1.562046,-0.461223,-0.176441,-0.489356,0.606323
77,0.841916,-0.805647,1,-0.032869,-0.435725,-0.461223,-0.487709,-0.489356,-0.622279


In [31]:
X_test.describe()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.053529,0.049162,0.617225,-0.081003,-0.065325,0.007872,-0.01459,-0.031995,-0.031492
std,0.981475,0.99439,0.486647,0.985755,1.055403,0.978795,0.953682,0.962404,0.978207
min,-1.546098,-1.270895,0.0,-2.173652,-3.43238,-3.110977,-0.643344,-0.985919,-0.622279
25%,-0.352091,-0.805647,0.0,-0.564835,-0.435725,-0.461223,-0.490126,-0.489356,-0.622279
50%,0.841916,-0.805647,1.0,-0.032869,-0.435725,-0.461223,-0.363894,-0.489356,-0.622279
75%,0.841916,1.055344,1.0,0.451599,0.563161,-0.461223,-0.045477,-0.489356,0.606323
max,0.841916,1.98584,1.0,3.158926,1.562046,3.513407,9.261749,2.986588,1.834926


## Learning

### Using GridSearchCV on RandomForestClassifier

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rdm_fst = RandomForestClassifier()

In [108]:
num_trees = [500,1000,5000]
num_samples = [2]
tuned_parameters = [{'n_estimators': num_trees,'min_samples_split':num_samples}]
grid_search = GridSearchCV(rdm_fst, tuned_parameters, scoring = 'accuracy', cv=20, verbose=3)

In [109]:
grid_search.fit(X_train, y_train)

Fitting 20 folds for each of 3 candidates, totalling 60 fits
[CV] min_samples_split=2, n_estimators=500 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  min_samples_split=2, n_estimators=500, score=0.763, total=   0.7s
[CV] min_samples_split=2, n_estimators=500 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  min_samples_split=2, n_estimators=500, score=0.865, total=   0.6s
[CV] min_samples_split=2, n_estimators=500 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV]  min_samples_split=2, n_estimators=500, score=0.811, total=   0.6s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV]  min_samples_split=2, n_estimators=500, score=0.865, total=   0.7s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV]  min_samples_split=2, n_estimators=500, score=0.622, total=   0.7s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV]  min_samples_split=2, n_estimators=500, score=0.811, total=   0.6s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV]  min_samples_split=2, n_estimators=500, score=0.838, total=   0.6s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV]  min_samples_split=2, n_estimators=500, score=0.838, total=   0.5s
[CV] min_samples_split=2, n_estimators=500 ...........................
[CV]  min_samples_split=2, n_estimators=500, score=0.892, total=   0.5s
[CV] min_samples_split=2, n_estimators=500 ...........................

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.5min finished


GridSearchCV(cv=20, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [110]:
grid_search.best_params_

{'min_samples_split': 2, 'n_estimators': 5000}

In [111]:
grid_search.best_score_

0.8245614035087719

.1, 1000

0.08, 500

In [112]:
final_model = grid_search.best_estimator_
final_model.score(X_train,y_train)

0.9811066126855601

In [113]:
final_model.score(X_val,y_val)

0.84

### Using GradientBoostingClassifier

In [39]:
from sklearn import ensemble
clf = ensemble.GradientBoostingClassifier(n_estimators=50)
clf.fit(X_train,y_train)
clf.score(X_train, y_train)

0.8852901484480432

In [40]:
clf.score(X_val,y_val)

0.84

### Using an ensemble learner with RandomForest, ExtraTrees and MLPClassifier

In [41]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

In [42]:
random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [43]:
estimators = [random_forest_clf, extra_trees_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

Training the RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Training the ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)
Training th



In [44]:
[estimator.score(X_val, y_val) for estimator in estimators]

[0.8533333333333334, 0.8266666666666667, 0.84]

In [45]:
from sklearn.ensemble import VotingClassifier

In [46]:
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("mlp_clf", mlp_clf),
]

In [47]:
voting_clf = VotingClassifier(named_estimators)

In [48]:
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=10,
                                                     n_jobs=No

In [49]:
voting_clf.score(X_train, y_train)

0.9689608636977058

In [50]:
voting_clf.score(X_val, y_val)

0.84

In [51]:
voting_clf.voting = "soft"

In [52]:
voting_clf.score(X_val, y_val)

0.8466666666666667

## Predicting

In [53]:
y_results_search = final_model.predict(X_test)
y_results_boost = final_model.predict(X_test)
y_results_ensemble = final_model.predict(X_test)

final_predictions = y_results_ensemble

### Saving Predictions

In [54]:
output = np.column_stack((X_result["PassengerId"],final_predictions))
df_results = pd.DataFrame(output.astype('int'),columns=['PassengerID','Survived'])
df_results.to_csv('titanic_results.csv',index=False)