# Imports

In [None]:
import sys
import pandas as pd
import numpy as np
import scipy as sp
import IPython
from IPython import display
import sklearn

import random
import time

import warnings
warnings.filterwarnings('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt # Collection of functions for scientific and publication-ready visualization
%matplotlib inline
import seaborn as sns

from collections import Counter

# Loading Data

In [None]:
# Read the train/test CSV files from the working directory.
data_raw = pd.read_csv('train.csv')
data_val = pd.read_csv('test.csv')

# Keep the passenger ids: PassengerId is dropped from the feature frames later,
# but the submission file is written with the test ids.
train_ids = data_raw['PassengerId']
test_ids = data_val['PassengerId']

# Work on a deep copy so data_raw stays untouched by the cleaning steps.
data_c = data_raw.copy(deep = True)
# Both frames in one list so each cleaning step can loop over train and test.
data_all = [data_c, data_val]

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data_c.info()
data_val.info()
data_c.head()

In [None]:
# Count of missing values per column, first for the training copy and then
# for the validation frame, separated by a dashed line.
for frame, trailer in ((data_c, "-"*10), (data_val, None)):
    print(frame.isnull().sum())
    if trailer is not None:
        print(trailer)

# Summary statistics over every training column (numeric and non-numeric).
data_c.describe(include = 'all')

# Cleaning Data

### Names and Titles

In [None]:
# Collect every distinct title that appears in the Name column of either frame,
# so the titles can be grouped into more general categories.

def _title_of(full_name):
    """Return the text between the first ', ' and the next '. ' of a name string."""
    after_comma = full_name.split(', ')[1]
    return after_comma.split('. ')[0]

titles = {_title_of(name) for dataset in data_all for name in dataset['Name']}
print(titles)

In [None]:
# Broader title groups, written group-first and then inverted into the
# raw-title -> group lookup used by Series.map.
_group_members = {
    'Officer': ['Col', 'Capt', 'Dr', 'Major', 'Rev'],
    'Royalty': ['Dona', 'Don', 'Lady', 'Jonkheer', 'the Countess', 'Sir'],
    'Mrs': ['Mme', 'Mrs', 'Ms'],
    'Mr': ['Mr'],
    'Master': ['Master'],
    'Miss': ['Miss', 'Mlle'],
}
title_groups = {
    raw_title: group
    for group, members in _group_members.items()
    for raw_title in members
}

for dataset in data_all:
    # Pull the raw title out of each name, then translate it to its group.
    # Titles missing from title_groups become NaN.
    raw_titles = dataset['Name'].map(lambda full_name: full_name.split(', ')[1].split('. ')[0])
    dataset['Title'] = raw_titles.map(title_groups)

# One-hot encoding the Title column may help, depending on the model.
# The Name column will most likely be dropped later.

### Age

In [None]:
# Age has missing values. Compute the median Age (and median Survived) per
# sex, class and title in the train dataset.
# The two numeric columns are selected BEFORE aggregating: calling .median() on
# the whole grouped frame also tries to aggregate the text columns, which raises
# a TypeError on pandas >= 2.0.
age_medians = (
    data_c.groupby(['Sex','Pclass','Title'])[['Age','Survived']]
    .median()
    .reset_index()
)
age_medians
# The Survived column also shows how survival differs across these groups.

In [None]:
# Fallback used when a passenger's (Sex, Pclass, Title) group has no usable
# median. Taken from data_raw so re-running this cell gives the same value.
overall_age_median = data_raw['Age'].median()


def get_age_median(row):
    """Return the median training Age for the row's (Sex, Pclass, Title) group.

    Falls back to the overall training median when the group does not exist in
    `age_medians` (e.g. a combination only present in the test set, or a NaN
    Title) or when the group's median is itself NaN. Previously the first case
    raised IndexError and the second silently returned NaN.
    """
    same_group = (
        (age_medians['Sex'] == row['Sex'])
        & (age_medians['Pclass'] == row['Pclass'])
        & (age_medians['Title'] == row['Title'])
    )
    group_age = age_medians.loc[same_group, 'Age'].dropna()
    if group_age.empty:
        return overall_age_median
    return group_age.values[0]


for dataset in data_all:
    # Only rows with a missing Age need the lookup.
    missing_age = dataset['Age'].isna()
    if missing_age.any():
        dataset.loc[missing_age, 'Age'] = dataset.loc[missing_age].apply(get_age_median, axis=1)

### Fare

In [None]:
# Raw Fare distribution, split by survival.
fig, ax = plt.subplots(figsize=(20, 5))
g = sns.distplot(data_c["Fare"][data_c["Survived"] == 0], color="r", ax=ax)
g = sns.distplot(data_c["Fare"][data_c["Survived"] == 1], color="b", ax=ax)
g = g.legend(["Not Survived","Survived"])

for dataset in data_all:
    # Impute missing fares BEFORE taking the log. Previously a NaN fare failed
    # the `i > 0` test and was silently encoded as logFare = 0 (same as a free
    # ticket), and the Fare imputation done in a later cell never reached
    # logFare because Fare is dropped afterwards.
    fare = dataset["Fare"].fillna(dataset["Fare"].mean())
    # Non-positive fares map to 0, as before (log(1) == 0).
    dataset['logFare'] = np.log(fare.where(fare > 0, 1.0))

# Same comparison on the log scale.
fig, ax = plt.subplots(figsize=(20, 5))
g = sns.distplot(data_c["logFare"][data_c["Survived"] == 0], color="r", ax=ax)
g = sns.distplot(data_c["logFare"][data_c["Survived"] == 1], color="b", ax=ax)
g = g.legend(["Not Survived","Survived"])

In [None]:
# logFare distribution per passenger class, one curve per Pclass value.
fig, ax = plt.subplots(figsize=(20, 5))
class_colors = {1: "b", 2: "orange", 3: "g"}
for pclass, color in class_colors.items():
    g = sns.distplot(data_c.loc[data_c["Pclass"] == pclass, "logFare"], color=color)
g = g.legend(["First Class","Second Class","Third Class"])

As we can see above, using a log scale for the Fare lets us better visualize how higher fares go together with increased survival. This is because, as the second graph shows, people who paid more were probably upper class, who we presume had an advantage over lower-class people. We'll see in a bit that that's the case.

### Other variables

In [None]:
# Fill the remaining NAs and derive the family-related features.
# Every column is assigned back explicitly: `dataset[col].fillna(..., inplace=True)`
# and `dataset[col].loc[mask] = value` are chained assignments, which only warn
# on older pandas and silently do nothing under copy-on-write.
for dataset in data_all:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())

    # Keep only the first character of the cabin string; 'U' marks a missing cabin.
    dataset['Cabin'] = dataset['Cabin'].fillna('U').str[0]

    # The passenger counts too, hence the + 1.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset['IsAlone'] = np.where(dataset['FamilySize'] > 1, 'No', 'Yes')

    #dataset['FareBin'] = pd.qcut(dataset['Fare'], 5)
    #dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

## Some useful visuals

In [None]:
# 2x3 overview grid: an Age violin split by survival, then survival-rate bars
# for several categorical features.
fig, ax = plt.subplots(2,3,figsize=(20,10))
sns.violinplot(x="Sex",y="Age",hue='Survived',data=data_c,split=True,ax=ax[0,0])

# (x column, hue column or None, target axes) for each bar panel.
bar_panels = [
    ("Pclass", None, ax[0,1]),
    ("Cabin", None, ax[0,2]),
    ("Embarked", None, ax[1,0]),
    ("Title", "Sex", ax[1,1]),
    ("FamilySize", "Sex", ax[1,2]),
]
for x_col, hue_col, panel in bar_panels:
    sns.barplot(x=x_col, y="Survived", hue=hue_col, data=data_c, ax=panel)

# Separate small figure for IsAlone; drawn on the axes that was just created.
fig, ax = plt.subplots(figsize=(4.65,3.5))
sns.barplot(x="IsAlone",y="Survived",hue='Sex',data=data_c)

From this, we can see some characteristics we already presumed:
- Men have lower chances of survival than women, in general (Misters, male officers and male royalty all die far more often). Young men (20-40) are all more likely to die, while women in the same range have far better chances of survival.
- Except for when they're young boys, as we can see the difference in the first chart, and also that the Master title (reserved for young boys) survives much more than any other male title.
- We confirm that the higher the social status (Pclass), the higher the chances for survival. When combining this with sex, we can see that female royalty and female officers have the biggest survival chances. Male royalty also seem to have more chances than Misters, but the uncertainty is too big to draw this conclusion (as seen in the barplot).
- The cabin doesn't seem to impact as much, except for when the person didn't have an informed cabin (U). These were much more likely to die.
- Embarked only seems to make a difference when looking at if the person embarked at C or not.
- Finally, men with smaller families tended to die more, while women with smaller families tended to die less. It escapes me why, but perhaps, for men, the need to protect a family leads to increased survival, and, for women, it's because families of size above 4 seem to be mainly women, adding more weight to their total death toll.

In [None]:
# Columns that are no longer needed once the engineered features exist.
# Defined once, outside the loop.
drop_column = ['PassengerId','Name', 'Ticket','Fare']

for dataset in data_all:
    # The maps also accept the already-encoded values (1 -> 1, 0 -> 0), so
    # re-running the cell does not turn the columns into NaN.
    dataset['Sex'] = dataset['Sex'].map({'male' : 1, 'female' : 0, 1:1, 0:0})

    dataset['IsAlone'] = dataset['IsAlone'].map({'Yes' : 1, 'No' : 0, 1:1, 0:0})

    # Embarked, Cabin and Title stay as strings here; they are one-hot encoded
    # in a later cell instead of being mapped to ordinal integers:
    #dataset['Embarked'] = dataset['Embarked'].map({'S' : 0, 'C' : 1, 'Q' :2})
    #dataset['Cabin'] = dataset['Cabin'].map({'U':0,'C':1,'E':2,'G':3,'D':4,'A':5,'B':6,'F':7,'T':8})
    #dataset['Title'] = dataset['Title'].map({'Mr':0,'Mrs':1,'Miss':2,'Master':3,'Royalty':4,'Officer':5})

    # errors='ignore' keeps the drop re-runnable, matching the maps above;
    # without it a second run raised KeyError because the columns were gone.
    # In-place on purpose: data_c / data_val must keep pointing at these objects.
    dataset.drop(columns=drop_column, inplace=True, errors='ignore')

In [None]:
def categorize_row(dataset,row_name):
    """One-hot encode the column `row_name` of `dataset`.

    Returns a new frame in which the column is replaced by its indicator
    columns (named `<row_name>_<value>`), appended after the remaining
    columns. The frame passed in is not modified.
    """
    indicator_cols = pd.get_dummies(dataset[row_name], prefix=row_name)
    remaining_cols = dataset.drop(columns=row_name)
    return pd.concat([remaining_cols, indicator_cols], axis=1)

# One-hot encode the remaining categorical columns in both frames.
for column in ('Embarked', 'Cabin', 'Title'):
    data_c = categorize_row(data_c, column)
    data_val = categorize_row(data_val, column)

# The two frames are encoded independently, so a category seen in only one of
# them yields a column the other lacks (e.g. Cabin_T exists only in train).
# Rather than hard-coding that one column, drop every train feature that the
# test frame cannot provide and put the test columns in the train order, so the
# positional arrays handed to the models line up.
train_only = [c for c in data_c.columns
              if c != 'Survived' and c not in data_val.columns]
data_c = data_c.drop(columns=train_only)
data_val = data_val.reindex(columns=data_c.columns.drop('Survived'))

In [None]:
# Sanity check after cleaning: remaining missing values per column in each
# frame, then a preview of the training frame.
remaining_nulls_train = data_c.isnull().sum()
remaining_nulls_val = data_val.isnull().sum()
print(remaining_nulls_train)
print("-"*10)
print(remaining_nulls_val)
data_c.head()

In [None]:
# Preview of the validation frame, to compare its columns with the training frame.
data_val.head(n=5)

In [None]:
from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as cv_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.mixture import BayesianGaussianMixture as BGMM
from sklearn.model_selection import GridSearchCV as GridS

In [None]:
# Split the training frame into the target vector and the feature matrix.
feature_frame = data_c.drop(columns='Survived')
X = feature_frame.values
y = data_c['Survived'].values
(y.shape, X.shape)

In [None]:
# Grid search for the random forest on all features.
# 'auto' was dropped from max_features: for classifiers it meant the same as
# 'sqrt' (so a third of the grid was duplicated work), and recent scikit-learn
# releases reject the value altogether.
RFC_parameter_grid = {'max_depth' : [4, 6, 8, 10],
                 'n_estimators': [100, 50, 10],
                 'max_features' : ['sqrt', 'log2'],
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],}

# Fixed random_state so the selected parameters and the submission are
# reproducible from run to run.
RFC_grid = GridS(RFC(random_state=0),
                    scoring='accuracy',
                    param_grid=RFC_parameter_grid,
                    cv=SKF(n_splits=5),
                    verbose=1)

RFC_grid.fit(X, y)

RFC_params = RFC_grid.best_params_

print('Best score: {}'.format(RFC_grid.best_score_))
print('Best parameters: {}'.format(RFC_params))

# GridSearchCV refits the best estimator on the full training data, so the
# grid object can predict directly.
pred = RFC_grid.predict(data_val.values).astype(int)
output = pd.DataFrame()
output['PassengerId'] = test_ids.values
output['Survived'] = pred
output[['PassengerId','Survived']].to_csv('./RFC_prediction.csv', index=False)

#max score on kaggle so far: {'bootstrap': True, 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}

In [None]:
# Refit a forest with the best parameters and rank the features by importance.
RFC_clf = RFC(**RFC_params)
RFC_clf.fit(X,y)

importances = pd.DataFrame()
# Feature names come from the frame X was built from (training columns minus
# the target), not from the test frame, so names and importances are
# guaranteed to line up position by position.
importances['Feature'] = data_c.drop('Survived', axis=1).columns.tolist()
importances['Importance'] = RFC_clf.feature_importances_.tolist()
importances.sort_values(by='Importance',inplace=True,ascending=False)
importances.reset_index(drop=True,inplace=True)
# Running total of the sorted importances (replaces re-summing a growing slice
# for every row).
importances['Accumulated Importance'] = importances['Importance'].cumsum()
importances

In [None]:
# Keep the 15 highest-ranked features (importances is sorted in descending
# order) and build the reduced training matrix from them.
top_features = importances['Feature'].head(15)
selected = top_features.values
X_s = data_c[selected].values
(y.shape, X_s.shape)

In [None]:
# Same grid search as before, this time on the selected features only (X_s).
# 'auto' was dropped from max_features: for classifiers it meant the same as
# 'sqrt', and recent scikit-learn releases reject the value altogether.
RFC_parameter_grid = {'max_depth' : [4, 6, 8, 10],
                 'n_estimators': [100, 50, 10],
                 'max_features' : ['sqrt', 'log2'],
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],}

# Fixed random_state so the selected parameters and the submission are
# reproducible from run to run.
RFC_grid = GridS(RFC(random_state=0),
                    scoring='accuracy',
                    param_grid=RFC_parameter_grid,
                    cv=SKF(n_splits=5),
                    verbose=1)

RFC_grid.fit(X_s, y)

RFC_params = RFC_grid.best_params_

print('Best score: {}'.format(RFC_grid.best_score_))
print('Best parameters: {}'.format(RFC_params))

# The model was fit on the selected columns only, so the test frame must be
# reduced to the same columns in the same order. Predicting on
# data_val.values (all columns) fails with a feature-count mismatch.
pred = RFC_grid.predict(data_val[selected].values).astype(int)
output = pd.DataFrame()
output['PassengerId'] = test_ids.values
output['Survived'] = pred
# NOTE: same file name as the all-features run, so that submission is overwritten.
output[['PassengerId','Survived']].to_csv('./RFC_prediction.csv', index=False)

#max score on kaggle so far: {'bootstrap': True, 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}

In [None]:
# BayesianGaussianMixture is an unsupervised density model: fit() ignores y and
# predict() returns mixture-component indices, not Survived labels. Scoring it
# with 'accuracy' therefore compared component indices against y, and the
# submission contained component indices. To use the mixture as a classifier we
# fit one mixture per class and assign each sample to the class with the
# highest (log-likelihood + log-prior).
from sklearn.base import BaseEstimator, ClassifierMixin


class BGMMClassifier(ClassifierMixin, BaseEstimator):
    """Generative classifier built from one Bayesian Gaussian mixture per class.

    Parameters mirror the BayesianGaussianMixture arguments being tuned.
    `n_components` is capped at the number of samples of each class, since a
    mixture cannot have more components than samples.
    """

    def __init__(self, n_components=1, covariance_type='full', max_iter=100, random_state=None):
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Fit one mixture on the rows of each class and store the class priors."""
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.mixtures_ = []
        self.log_priors_ = []
        for label in self.classes_:
            X_label = X[y == label]
            mixture = BGMM(n_components=min(self.n_components, len(X_label)),
                           covariance_type=self.covariance_type,
                           max_iter=self.max_iter,
                           random_state=self.random_state)
            self.mixtures_.append(mixture.fit(X_label))
            self.log_priors_.append(np.log(len(X_label) / len(X)))
        return self

    def predict(self, X):
        """Return, for each row, the class whose mixture gives the highest joint log-score."""
        X = np.asarray(X, dtype=float)
        joint = np.column_stack([mixture.score_samples(X) + log_prior
                                 for mixture, log_prior in zip(self.mixtures_, self.log_priors_)])
        return self.classes_[joint.argmax(axis=1)]


BGMM_parameter_grid = {'n_components':[10,20,40,80,100],
                      'covariance_type':['full','tied','diag','spherical'],
                      'max_iter':[10,50,100,200],
                      }

BGMM_grid = GridS(BGMMClassifier(random_state=0),
                    scoring='accuracy',
                    param_grid=BGMM_parameter_grid,
                    cv=SKF(n_splits=5),
                    verbose=1)

BGMM_grid.fit(X, y)

BGMM_params = BGMM_grid.best_params_

print('Best score: {}'.format(BGMM_grid.best_score_))
print('Best parameters: {}'.format(BGMM_params))

# GridSearchCV already refit the best configuration on the full training data;
# reuse that estimator instead of fitting a second, unused model.
BGMM_model = BGMM_grid.best_estimator_

pred = BGMM_model.predict(data_val.values).astype(int)
output = pd.DataFrame()
output['PassengerId'] = test_ids.values
output['Survived'] = pred
output[['PassengerId','Survived']].to_csv('./GMM_prediction.csv', index=False)

In [None]:
# Bagging ensemble built on the tuned random forest.
# This cell previously used names that are never defined in this notebook
# (rf, seed, X_train, y_train, cross_val_score, bold, and the best scores of
# GBC/DT/KNN/LR models) and called `display`, which here is the IPython
# *module* (`from IPython import display`), not the display function. It now
# uses the objects this notebook actually creates.
from sklearn.ensemble import BaggingClassifier

# The base estimator is passed positionally because its keyword was renamed
# between scikit-learn releases (base_estimator -> estimator).
bagg = BaggingClassifier(RFC(**RFC_params), verbose = 0, n_jobs = -1, random_state = 0)

# RFC_params / RFC_grid come from the search on the selected features, so the
# ensemble is trained and scored on the same matrix (X_s) for a fair comparison.
print('Fitting Bagging Ensemble...')
display.display(bagg.fit(X_s, y))
print('Done.')

# Bagging cross validation score (mean accuracy, in percent).
print('\nComputing Bagging X Val Score..')
bagg_x_val_score = cv_score(bagg, X_s, y, cv = 10, scoring = 'accuracy')
bagg_x_val_score = np.round(bagg_x_val_score.mean()*100, 2)
print('Done.')

# Compare the bagging score with the tuned random forest's best grid-search
# score. Only the random forest is tuned in this notebook, so it is the only
# base model available for comparison.
rf_best_score = np.round(RFC_grid.best_score_*100, 2)
bagg_vs_base_score = pd.DataFrame({'Bagging_vs_base_score(%)': [bagg_x_val_score, rf_best_score]})
bagg_vs_base_score.index = ['Bagg', 'RF']
display.display(display.Markdown('**Bagging vs Base Models Scores:**'))
display.display(bagg_vs_base_score)