In [None]:
import pandas as pd
import numpy as np
import os
import re
from time import time
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
from scipy.stats import norm, skew

In [None]:
# Read the raw train and test CSV files into DataFrames
train, test = (pd.read_csv(csv_path)
               for csv_path in ('../input/train.csv', '../input/test.csv'))

In [None]:
# Work on copies so the raw `train` / `test` frames stay untouched
# (the submission cell still reads PassengerId from `test`).
df_train, df_test = train.copy(), test.copy()

In [None]:
# Separate the label from the features: `target` holds 'Survived' and
# `df_train` continues without that column.
target = df_train['Survived']
df_train = df_train.drop(columns='Survived')

In [None]:
# Tag every row with its origin so the two sets can be separated again
# once the shared preprocessing is done.
df_train['training_set'] = True
df_test['training_set'] = False
# Stack train on top of test and renumber the rows 0..n-1 in a single call
df_full = pd.concat([df_train, df_test], sort=False, ignore_index=True)

In [None]:
# Preview the first rows of the combined train + test frame
df_full.head()

In [None]:
# Feature selection: remove the columns that are not used as model inputs
df_full = df_full.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [None]:
# Preview the frame after PassengerId, Name and Ticket were removed
df_full.head()

In [None]:
# Feature Engineering
# InCabin: 1 when a Cabin value is recorded, 0 when it is missing.
# notna() tests for missingness directly; inferring it from the Python type
# of each cell would misread None as "has a cabin" and any genuine float
# value as "no cabin".
df_full['InCabin'] = df_full['Cabin'].notna().astype(int)
df_full = df_full.drop('Cabin', axis=1)
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself, so a value of 1 means nobody else travelled with them).
family_size = df_full['SibSp'] + df_full['Parch'] + 1
# IsAlone is added before FamilySize so the resulting column order stays
# (..., InCabin, IsAlone, FamilySize).
df_full['IsAlone'] = (family_size == 1).astype(int)
df_full['FamilySize'] = family_size

In [None]:
# Preview the frame with the new InCabin, IsAlone and FamilySize columns
df_full.head()

In [None]:
# The previous cell's output suggests there are missing values.
# Count them per column to verify.
df_full.isna().sum()

In [None]:
# Replace NaNs: the placeholder category "U" for Embarked, and the column
# median (computed over the combined frame) for Fare and Age.
df_full = df_full.fillna({
    'Embarked': "U",
    'Fare': df_full['Fare'].median(),
    'Age': df_full['Age'].median(),
})

In [None]:
# Re-count missing values per column; every count should now be zero
df_full.isna().sum()

In [None]:
# Preview the frame after the missing values were filled in
df_full.head()

In [None]:
# New categorical feature for age, which may provide better classification.
# Bucket codes: 0 for Age < 10, then one step up at each lower bound below,
# ending at 7 for Age >= 65.
df_full['Categ_Age'] = 0
for bucket_code, lower_bound in enumerate((10, 18, 25, 35, 45, 55, 65), start=1):
    # Later (higher) bounds overwrite earlier ones, so each row ends up with
    # the code of the highest bound it reaches.
    df_full.loc[df_full['Age'] >= lower_bound, 'Categ_Age'] = bucket_code

In [None]:
# Age is now represented by Categ_Age, so the raw column is removed
df_full = df_full.drop(columns='Age')
df_full.head()

In [None]:
# Convert the categorical variables into dummy (indicator) columns.
# No `columns` argument is passed, so pandas decides which columns to encode
# from their dtypes; the remaining columns are carried over as they are.
df_full = pd.get_dummies(df_full)

In [None]:
# Preview the frame after dummy encoding
df_full.head()

### Correlation Map

In [None]:
# Heatmap of the pairwise correlations between the columns of df_full.
# The target ('Survived') was split off earlier, so it is not part of this map.
corrmat = df_full.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrmat, square=True, cmap="YlGnBu", ax=ax);

## Building Machine Learning Model(s)

In [None]:
# import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# import 'train_test_split'
from sklearn.model_selection import train_test_split 
# import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# import metrics from sklearn
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer
# Search over specified parameter values for an estimator
from sklearn.model_selection import GridSearchCV
# Randomized search over sampled parameter settings for an estimator
from sklearn.model_selection import RandomizedSearchCV
# Random permutation cross-validator
from sklearn.model_selection import ShuffleSplit

In [None]:
# Rescale Fare linearly into the [0, 5] range.
# (Age is not scaled here: the raw column was dropped after it was bucketed.)
scaler = MinMaxScaler(feature_range=(0, 5))
fare_scaled = scaler.fit_transform(df_full[['Fare']])
# fit_transform returns a 2-D array with one column; take that column
df_full['Fare'] = fare_scaled[:, 0]

In [None]:
# Preview the frame after Fare was rescaled
df_full.head()

In [None]:
# Undo the earlier concatenation using the training_set flag, then drop the
# flag so it is not passed to the models as a feature.
is_training_row = df_full['training_set'] == True
df_train = df_full[is_training_row].drop(columns='training_set')
df_test = df_full[~is_training_row].drop(columns='training_set')

In [None]:
# Sanity check: (rows, columns) of the processed train and test frames;
# both should report the same number of columns.
(df_train.shape, df_test.shape)

In [None]:
# Hold out part of the labelled data for validation. No test_size is passed,
# so scikit-learn's default split proportion applies; random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df_train, target, random_state=42)

### AdaBoostClassifier

In [None]:
from sklearn.ensemble  import AdaBoostClassifier

In [None]:
# Untuned AdaBoost estimator; in this notebook it is referenced only by the
# commented-out search below.
ada_classifier = AdaBoostClassifier(random_state=42)
# Disabled hyper-parameter search (RandomizedSearchCV scored with F1), kept
# for reference.
# NOTE(review): presumably this search produced the learning_rate /
# n_estimators hard-coded for the final AdaBoost model — confirm.
#cv_sets = ShuffleSplit(random_state = 4) # shuffling our data for cross-validation
#parameters = {'n_estimators':[500, 1000, 1500, 2000], 
#              'learning_rate':[0.05, 0.1, 0.15, 0.2]}
#scorer = make_scorer(f1_score)
#ada_obj = RandomizedSearchCV(ada_classifier, 
#                              parameters, 
#                              scoring = scorer, 
#                              cv = cv_sets,
#                              random_state= 99)
#ada_fit = ada_obj.fit(X_train, y_train)
#ada_opt = ada_fit.best_estimator_

In [None]:
#ada_fit.best_params_

In [None]:
# Final AdaBoost model with fixed hyper-parameters, fitted on the training
# part of the split. fit() returns the estimator itself, so ada_opt and
# ada_obj refer to the same fitted model.
ada_obj = AdaBoostClassifier(learning_rate = 0.1,
                             n_estimators = 2000,
                             random_state=42)
ada_opt = ada_obj.fit(X_train, y_train)
# Score the model on the hold-out split (X_test / y_test), which nothing else
# in the notebook uses, so the fit is sanity-checked before its predictions
# go into the submission.
ada_holdout_accuracy = accuracy_score(y_test, ada_opt.predict(X_test))
print("AdaBoost hold-out accuracy: {:.4f}".format(ada_holdout_accuracy))

### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Untuned gradient-boosting estimator; in this notebook it is referenced only
# by the commented-out search below.
GBC_classifier = GradientBoostingClassifier(random_state=42)
# Disabled hyper-parameter search (RandomizedSearchCV scored with F1), kept
# for reference.
# NOTE(review): presumably this search produced the hyper-parameters
# hard-coded for the final gradient-boosting model — confirm.
#cv_sets = ShuffleSplit(random_state = 4) # shuffling our data for cross-validation
#parameters = {'n_estimators':[500, 1000, 1500], 
#              'learning_rate':[0.01, 0.03, 0.05],
#              'min_samples_split':[2,4,6],
#              'min_samples_leaf':[3,5,7]}
#scorer = make_scorer(f1_score)
#GBC_obj = RandomizedSearchCV(GBC_classifier, 
#                             parameters, 
#                             scoring = scorer, 
#                             cv = cv_sets,
#                             random_state= 99)
#GBC_fit = GBC_obj.fit(X_train, y_train)
# The best estimator must come from GBC_fit; reading it from ada_fit would
# hand back the AdaBoost model instead.
#GBC_opt = GBC_fit.best_estimator_

In [None]:
#GBC_fit.best_params_

In [None]:
# Final gradient-boosting model with fixed hyper-parameters, fitted on the
# training part of the split. fit() returns the estimator itself, so GBC_opt
# and GBC_obj refer to the same fitted model.
GBC_obj = GradientBoostingClassifier(learning_rate = 0.05,
                                     max_depth = 3,
                                     min_samples_leaf = 5,
                                     min_samples_split = 4,
                                     n_estimators = 500,
                                     random_state=42)
GBC_opt = GBC_obj.fit(X_train, y_train)
# Score the model on the hold-out split (X_test / y_test), which nothing else
# in the notebook uses, so the fit is sanity-checked before its predictions
# go into the submission.
GBC_holdout_accuracy = accuracy_score(y_test, GBC_opt.predict(X_test))
print("GradientBoosting hold-out accuracy: {:.4f}".format(GBC_holdout_accuracy))

### Submission (Ensemble GBC & ADA)

In [None]:
# Class predictions of both fitted models for the competition test set (df_test)
y_pred_GBC, y_pred_ada = (fitted_model.predict(df_test)
                          for fitted_model in (GBC_opt, ada_opt))

In [None]:
# Combine the two models' 0/1 predictions. Averaging the labels and then
# truncating to int maps a disagreement (0.5) to 0, so the final label is 1
# only when BOTH models predict 1; that rule is spelled out directly here.
# NOTE(review): this is a unanimity vote, not a probability average — confirm
# that resolving every disagreement to 0 is the intended ensemble rule.
both_predict_survival = (y_pred_GBC == 1) & (y_pred_ada == 1)
y_pred_final = both_predict_survival.astype(int)

In [None]:
# Final submission: pair each test PassengerId (taken from the untouched raw
# `test` frame) with the ensemble prediction and write the CSV without the index.
my_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_pred_final,
})
my_submission.to_csv('submission-160518.csv', index=False)