In [1]:
# Import modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from matplotlib import pyplot
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from datetime import datetime
import warnings
from bayes_opt import BayesianOptimization
import xgboost as xgb
from math import log
from statistics import stdev

import os, sys
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# For print supression
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout
        
# Turn off warning messages
warnings.filterwarnings('ignore') 

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Convert train.csv to DataFrame
pd.set_option('display.max_rows',5)
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
# train_data.head(5)

# Define output vector for training data
y = train_data['Survived']

# Convert test.csv to DataFrame
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# test_data.head()

# Combine test and train data for preprocessing
train_shape = train_data.shape
train_rows = train_shape[0]
data = pd.concat([train_data,test_data],ignore_index=True)

# Remove useless columns from input data
data.drop(columns = ['PassengerId','Survived'], inplace = True)

# Separate column names by numerical/categorical for preprocessing
num_features = ['Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'Ticket']
cat_features = ['Embarked','Name','Sex','Cabin']

# Extract titles from 'Name' column, and replace values in column
def name_transform(X):
    names = X['Name']
    titles = ['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Don.', 'Dr.', 'Rev.', 'Mme.', 'Ms.', 'Major.', 'Lady.', 'Sir.', 'Mlle.', 'Col.', 'Capt.']
    for i in range(len(names)):
        for title in titles:
            if title in names[i]:
                X['Name'][i] = title
        if not X['Name'][i] in titles:
            X['Name'][i] = 'Other'
    return X

# Transform 'Ticket' column values into natural log of the ticket number
def ticket_transform(X):
    tickets = X['Ticket']
    for i in range(len(tickets)):
        ticket = X['Ticket'][i]
        numeric_filter = filter(str.isdigit, ticket)
        numeric_string = "".join(numeric_filter)
        if numeric_string:
            X['Ticket'][i] = log(int(float(numeric_string)))
        else:
            X['Ticket'][i] = 0
    return X

# Transform 'Cabin' column values into cabin letter
def cabin_transform(X):
    cabin = X['Cabin']
    for i in range(len(cabin)):
        cabin = X['Cabin'][i]
        alpha_filter = filter(str.isalpha, cabin)
        alpha_string = "".join(alpha_filter)
        if alpha_string:
            X['Cabin'][i] = alpha_string
        else:
            X['Cabin'][i] = 'Other'
    return X

data = name_transform(data)
data = ticket_transform(data)

# Fill numerical NaN values with the mode of each column
for c in list(train_data[num_features].columns):
    data[c].fillna(data[c].mode(dropna=True)[0], inplace = True)
# for c in list(data[num_features].columns):
#     data[c].fillna(0, inplace = True)

# Fill string NaN values with the 'Other'                  
for c in list(data[cat_features].columns):
    data[c].fillna('Other', inplace = True)

data = cabin_transform(data)

# Wrapper for one hot encoding of columns in DataFrame
def one_hot(X,col_names):
    for n in col_names:
        col_name = n
        # creating instance of labelencoder
        labelencoder = LabelEncoder()
        # Assigning numerical values and storing in another column
        col_cat_name = col_name + 'Cat'
        X[col_cat_name] = labelencoder.fit_transform(X[col_name])

        # creating instance of one-hot-encoder
        enc = OneHotEncoder(handle_unknown='ignore')
        # passing cat column (label encoded values of col_name)
        enc_df = pd.DataFrame(enc.fit_transform(X[[col_name]]).toarray())
        enc_df.columns = enc_df.columns.map(str)
        # merge with main df, X, on key values
        X = X.drop(columns=[col_name, col_cat_name])
        X = X.join(enc_df,rsuffix='_' + col_name)
    return X

data_X = one_hot(data,cat_features)

# Separate data back into train and test data DataFrames
X = data_X.iloc[:train_rows]
X_test = data_X.iloc[train_rows:]

# Iterate to cross-validate model performance with randomized test/val set splits
errors = []
for _ in range(20):
    train_X, val_X, train_y, val_y = train_test_split(X, y, train_size = .8)

#     print('Train Survival Rate = {}'.format(sum(train_y)/len(train_y)*100))
#     print('Val Survival Rate = {}'.format(sum(val_y)/len(val_y)*100))
    
    # Extreme Gradient Boost Classifier model definition, fitting, and prediction
    model = XGBClassifier(n_estimators=3000,max_depth=6,gamma=1,learning_rate=0.001,eval_metric='error',
                            subsample=.8,colsample_bytree=.5)
    with HiddenPrints():
        model.fit(train_X, train_y, eval_set = [(val_X, val_y)],early_stopping_rounds=500)
    predicts = model.predict(val_X)

    # Plotting error vs. epoch number
    results = model.evals_result()

    epochs = len(results['validation_0']['error'])
    x_axis = range(0, epochs)

#     fig, ax = pyplot.subplots()
#     ax.plot(x_axis, results['validation_0']['error'], label='Val')
#     ax.legend()
#     pyplot.ylabel('Error')
#     pyplot.title('XGBoost Error')
#     pyplot.show()

#     print('Val Set Error = {:,.2f}'.format(results['validation_0']['error'][-1]))

    errors.append(results['validation_0']['error'][-1])

# Compute performance statistics
print('Mean Val Set Error = {:,.2f}'.format(sum(errors)/len(errors)))
print('Std Val Set Error = {:,.3f}'.format(stdev(errors)))

# Predict output for test data
predictions = model.predict(X_test)

# Output test data predictions to a .csv file
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission6.csv', index=False)
print("Your submission was successfully saved!")

Mean Val Set Error = 0.17
Std Val Set Error = 0.024
Your submission was successfully saved!
