# Import Modules

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, probplot, skew
from scipy.special import boxcox1p
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RepeatedStratifiedKFold, RepeatedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import auc, accuracy_score

from IPython.core.display import HTML

# Read Data

In [2]:
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

# Univariate Search

In [3]:
def multi_table(table_list):
    """Render several tables next to each other in a single HTML row.

    Parameters
    ----------
    table_list : iterable
        Objects exposing ``_repr_html_()`` (e.g. pandas DataFrames).

    Returns
    -------
    HTML
        An IPython ``HTML`` object wrapping one ``<table>`` whose single row
        holds one ``<td>`` per input table, in input order.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')
    row_html = ''.join(cells)
    return HTML(f"<table><tr> {row_html} </tr></table>")

In [4]:
# Stack train and test rows into one frame so that every preprocessing step
# is applied to both; the target column is removed and the index renumbered.
data = pd.concat([train, test])
data = data.drop(columns=['Survived'])
data = data.reset_index(drop=True)

In [5]:
# Preview the first rows of the training set.
train.head()

In [6]:
# Preview the first rows of the test set.
test.head()

In [7]:
# Columns that exist in train but not in test, i.e. the dependent variable
# (Survived).
set(train.columns) - set(test.columns)

In [8]:
# Summarise column dtypes and non-null counts of the training set.
train.info()

In [9]:
# Show the value counts of every column of the combined data side by side.
multi_table([
    pd.DataFrame(data[column].value_counts())
    for column in data.columns
])

In [10]:
# Mark the nominal variables as categorical.
# The ordinal variable (Pclass) is already numeric, so it is left unchanged.
nominal_vars = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
data[nominal_vars] = data[nominal_vars].astype('category')

In [11]:
# TODO: check each variable's distribution (skewness, histograms, bar plots, ...).
# TODO: look for variables that could be derived from the existing ones.

# Missing values and Create Derivative variables

In [12]:
# Number of missing values per column, most-missing first.
miss_cnt = data.isnull().sum().sort_values(ascending=False)
# Share of missing values per column; note this is a fraction (0-1) of all
# rows in the combined data, even though the column is labelled 'percent'.
miss_pct = miss_cnt / data.shape[0]
# One row per data column, with 'count' and 'percent' as the two columns.
miss_table = pd.DataFrame([miss_cnt, miss_pct], index=['count', 'percent']).T
# Keep only the columns that actually contain missing values.
miss_table = miss_table[miss_table['count'] > 0]
miss_table

Cabin: delete columns

In [13]:
# Remove the Cabin column instead of imputing it.
data.drop(columns=['Cabin'], inplace=True)

Embarked: fill na by mode

In [14]:
# Replace missing ports of embarkation with the most frequent one.
most_common_port = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_common_port)

Fare: fill missing values with the median / create derived variable (FareBin)

In [15]:
# Impute missing fares with the median fare of the combined data.
median_fare = data['Fare'].median()
data['Fare'] = data['Fare'].fillna(median_fare)
# Discretise the fare into five quantile-based bins.
data['FareBin'] = pd.qcut(data['Fare'], q=5)

Name: create derived variable (Title)

In [16]:
# Extract the title from each name: the run of letters that follows a space
# and is terminated by a period (e.g. " Mr."). The pattern is a raw string so
# that `\.` is not treated as an (invalid) Python string escape.
titles = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Titles that occur fewer than 10 times are merged into a single 'Other' level.
value_count_title = titles.value_counts()
other_titles = value_count_title.index[value_count_title.values < 10]
data['Title'] = titles.replace(other_titles, 'Other')

Age: fill missing values with the median age per Title / create derived variable (AgeBin)

In [17]:
# Impute missing ages with the median age of passengers sharing the same Title.
title_median_age = data.groupby('Title').Age.transform('median')
data.loc[data['Age'].isnull(), 'Age'] = title_median_age

# Bucket Age into 11 ordered bins (codes 0-10). Intervals are right-closed:
# (-inf, 10] -> 0, (10, 16] -> 1, ..., (50, 60] -> 9, (60, inf) -> 10.
age_edges = [-np.inf, 10, 16, 20, 26, 30, 36, 40, 46, 50, 60, np.inf]
age_codes = pd.cut(data['Age'], bins=age_edges, labels=False)
# Any age that is still missing falls into bin 0.
data['AgeBin'] = age_codes.fillna(0).astype('int64')

SibSp, Parch: create derived variable (Family)

In [18]:
# Family = number of relatives aboard (siblings/spouses plus parents/children);
# the passenger is not counted.
data['Family'] = data['SibSp'].add(data['Parch'])

Family: create derived variable (Solo)

In [19]:
# Solo flags passengers travelling without any relatives aboard (1 = alone).
# Family is SibSp + Parch and does not count the passenger, so a solo
# traveller has Family == 0. Comparing against 1 would instead flag
# passengers accompanied by exactly one relative.
data['Solo'] = (data['Family'] == 0).astype('int')

# Select Variables and Transform

In [20]:
# Drop the columns that are not fed to the model: identifiers, free text, and
# the raw columns that the derived features were built from.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Age', 'Fare']
data.drop(columns=unused_columns, inplace=True)

In [21]:
# Label encoding: replace each categorical value with its integer category code.
data['Sex'] = data['Sex'].cat.codes
data['FareBin'] = data['FareBin'].cat.codes

In [22]:
# One-hot encoding: pd.get_dummies expands the remaining object/category
# columns into indicator columns and leaves numeric columns as they are.
data = pd.get_dummies(data)

In [23]:
# Undo the earlier concatenation: the first len(train) rows are the training
# features, the remainder are the test features.
n_train_rows = len(train)
X_train = data.iloc[:n_train_rows]
X_test = data.iloc[n_train_rows:]
y_train = train['Survived']

In [24]:
# Sanity check: feature/label row counts for train, and the test feature shape.
X_train.shape, y_train.shape, X_test.shape

# Modeling

In [25]:
# Hold out 20% of the labelled data; the lower-case names are the tuning
# split, the upper-case X_train / y_train remain the full training set.
x_train, x_test, y_tmp_train, y_tmp_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

XGBoost

1) learning_rate and estimators

In [26]:
# Step 1: search over the learning rate while every other parameter is fixed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

model_xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

params_xgb = {
    'n_estimators': [1000],
    'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.15, 0.2],
    'max_depth': [5],
    'min_child_weight': [1],
    'gamma': [0],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
}

search_xgb = GridSearchCV(
    estimator=model_xgb,
    param_grid=params_xgb,
    scoring='neg_log_loss',
    cv=cv,
    verbose=0,
)
# Early stopping is monitored on the hold-out split (x_test, y_tmp_test) for
# every fit of the search.
search_xgb = search_xgb.fit(
    x_train,
    y_tmp_train,
    early_stopping_rounds=15,
    eval_set=[[x_test, y_tmp_test]],
    eval_metric='logloss',
    verbose=False,
)
search_xgb.best_params_

2) max_depth and min_child_weight

In [28]:
# Step 2: search over max_depth (3, 5, 7, 9) and min_child_weight (1, 3, 5)
# with the learning rate fixed at 0.01.
model_xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

params_xgb = {
    'n_estimators': [1000],
    'learning_rate': [0.01],
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
    'gamma': [0],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
}

search_xgb = GridSearchCV(
    estimator=model_xgb,
    param_grid=params_xgb,
    scoring='neg_log_loss',
    cv=cv,
    verbose=0,
)
# Early stopping is monitored on the hold-out split (x_test, y_tmp_test).
search_xgb = search_xgb.fit(
    x_train,
    y_tmp_train,
    early_stopping_rounds=15,
    eval_set=[[x_test, y_tmp_test]],
    eval_metric='logloss',
    verbose=False,
)
search_xgb.best_params_

3) gamma

In [33]:
# Step 3: search over gamma with max_depth=3 and min_child_weight=1 fixed.
model_xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

params_xgb = {
    'n_estimators': [1000],
    'learning_rate': [0.01],
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1, 2],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
}

search_xgb = GridSearchCV(
    estimator=model_xgb,
    param_grid=params_xgb,
    scoring='neg_log_loss',
    cv=cv,
    verbose=0,
)
# Early stopping is monitored on the hold-out split (x_test, y_tmp_test).
search_xgb = search_xgb.fit(
    x_train,
    y_tmp_train,
    early_stopping_rounds=15,
    eval_set=[[x_test, y_tmp_test]],
    eval_metric='logloss',
    verbose=False,
)
search_xgb.best_params_

4) subsample and colsample_bytree

In [34]:
# Step 4: search over the row (subsample) and column (colsample_bytree)
# sampling ratios with gamma fixed at 2.
model_xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

params_xgb = {
    'n_estimators': [1000],
    'learning_rate': [0.01],
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [2],
    'subsample': [.6, .7, .8, .9],
    'colsample_bytree': [.6, .7, .8, .9],
}

search_xgb = GridSearchCV(
    estimator=model_xgb,
    param_grid=params_xgb,
    scoring='neg_log_loss',
    cv=cv,
    verbose=0,
)
# Early stopping is monitored on the hold-out split (x_test, y_tmp_test).
search_xgb = search_xgb.fit(
    x_train,
    y_tmp_train,
    early_stopping_rounds=15,
    eval_set=[[x_test, y_tmp_test]],
    eval_metric='logloss',
    verbose=False,
)
search_xgb.best_params_

5) learning_rate

In [35]:
# Step 5: re-tune the learning rate on a finer grid (0.01, 0.02, ..., 0.19)
# with the other parameters fixed.
model_xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

params_xgb = {
    'n_estimators': [1000],
    'learning_rate': [step / 100.0 for step in range(1, 20)],
    'max_depth': [3],
    'min_child_weight': [1],
    'gamma': [2],
    'subsample': [0.9],
    'colsample_bytree': [0.8],
}

search_xgb = GridSearchCV(
    estimator=model_xgb,
    param_grid=params_xgb,
    scoring='neg_log_loss',
    cv=cv,
    verbose=0,
)
# Early stopping is monitored on the hold-out split (x_test, y_tmp_test).
search_xgb = search_xgb.fit(
    x_train,
    y_tmp_train,
    early_stopping_rounds=15,
    eval_set=[[x_test, y_tmp_test]],
    eval_metric='logloss',
    verbose=False,
)
search_xgb.best_params_

6) estimators

In [49]:
# Step 6: choose the number of boosting rounds with xgboost's built-in
# cross-validation, using n_estimators=1000 as the upper bound.
xgb_params = {
    'learning_rate': .03,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 2,
    'subsample': .7,
    'colsample_bytree': .9,
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'eval_metric': 'logloss'
}

model_xgb = XGBClassifier(**xgb_params)
# The features and labels must come from the same split. x_train is the 80%
# tuning split, so it is paired with y_tmp_train; pairing it with y_train
# (labels of the full training set) gives mismatched row counts.
cvresult = xgboost.cv(model_xgb.get_xgb_params(),
                      xgboost.DMatrix(x_train, label=y_tmp_train),
                      early_stopping_rounds=50, metrics='auc', nfold=5,
                      verbose_eval=0,
                      num_boost_round=model_xgb.get_params()['n_estimators'])

In [50]:
# Show the boosting round(s) with the highest mean test AUC.
# Value chosen from this lookup: estimators = 38
cvresult.loc[cvresult['test-auc-mean'] == cvresult['test-auc-mean'].max()]

In [55]:
# Estimate the log loss of the final configuration with repeated 10-fold CV
# on the full training set.
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = 42)
params_xgb_final = {
    'learning_rate': .03,
    'n_estimators': 38,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 2,
    'subsample': .7,
    'colsample_bytree': .9,
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'eval_metric': 'logloss'
}
# Build the model from the final parameters defined in this cell (38 trees),
# not from the search-time `xgb_params` (1000 trees), and score it with the
# RepeatedKFold splitter defined above; passing the integer 42 as `cv` would
# request a plain 42-fold split instead.
model_xgb = XGBClassifier(**params_xgb_final)
cross_val_xgb = cross_val_score(model_xgb, X_train, y_train, scoring='neg_log_loss', cv=cv).mean()

In [56]:
# The scorer returns the negated log loss; flip the sign to report the mean
# cross-validated log loss, rounded to three decimals.
round(-cross_val_xgb, 3)

In [43]:
# Train on the complete training set, then predict survival for the test set.
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

In [46]:
# Pair each test passenger id with its predicted label and write the
# submission file without the index column.
my_submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': y_pred_xgb,
    }
)
my_submission.to_csv('submission.csv', index=False)