In [1]:
import re
import os

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import xgboost as xgb

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

## Read training and test data

In [None]:
# Load the training and test splits of the Titanic dataset into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', "/kaggle/input/titanic/test.csv")
)


## Feature preprocessing

In [None]:
def features_preprocessing(df):
    """Data preprocessing pipeline after EDA

    Works on a copy, so the DataFrame passed in is left untouched.

    Derived columns:
    - NameTitle: honorific taken from Name ("Mr", "Mrs", "Miss"), anything else becomes 'None'.
    - RelativesCat: "Unsafe" when SibSp + Parch is 0 or at least 4, otherwise "Safe".
    - NumCabins: 1 when Cabin holds a single space-separated token (or is missing), otherwise 2.
    - CabinLettersCat: "Safe" when the first cabin character is B, C, E or F, otherwise "Unsafe".
    - AgeBins: Age (missing values replaced by this frame's median) cut into fixed intervals.
    - FareLog: log(Fare + 1); a missing Fare stays NaN.

    Missing Embarked values are replaced by 'S'.

    Parameters:
    df (pd.DataFrame): DataFrame containing raw features for training/test data. Must provide the
        columns Name, SibSp, Parch, Embarked, Age, Cabin, Fare and Ticket.

    Returns:
    df (pd.DataFrame): DataFrame containing preprocessed features for training/test data. The raw
        columns Name, Age, Ticket, Cabin, Fare, SibSp and Parch are removed.

   """
    df = df.copy()  # never mutate the caller's frame; keeps the notebook cell re-runnable

    # Extract the title sitting between the comma and the first period of the name
    df["NameTitle"] = df["Name"].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    # Collapse any title different from Mr, Mrs or Miss into a single bucket
    df.loc[~df["NameTitle"].isin(["Miss", "Mrs", "Mr"]), "NameTitle"] = 'None'

    # Original author's EDA note: people with 0 or with 4+ relatives had a low chance of survival
    relatives = df["SibSp"] + df["Parch"]
    df["RelativesCat"] = np.where((relatives == 0) | (relatives >= 4), "Unsafe", "Safe")

    # Plain assignment instead of chained `inplace=True`, which is unreliable under copy-on-write
    df["Embarked"] = df["Embarked"].fillna('S')  # Imputation with the mode
    age = df["Age"].fillna(df["Age"].median())  # Imputation with the median
    cabin = df["Cabin"].fillna('NaN')  # Missing cabins become the literal string 'NaN'

    # Number of cabins booked, capped to the two levels "one" / "more than one"
    cabin_count = cabin.apply(lambda x: len(x.split(" ")))
    df["NumCabins"] = cabin_count.where(cabin_count == 1, 2)

    def _first_cabin_char(value):
        # Leading word character of the cabin string; empty when there is none (avoids IndexError)
        found = re.match(r"\w", value)
        return found.group(0) if found else ""

    # Original author's EDA note: decks B, C, E and F were the safer sections
    cabin_letters = cabin.apply(_first_cabin_char)
    df["CabinLettersCat"] = np.where(cabin_letters.isin(["B", "C", "E", "F"]), "Safe", "Unsafe")

    df['AgeBins'] = pd.cut(age, [0, 1, 2, 5, 18, 30, 45, 60, np.inf], include_lowest=True)  # Age binning
    df["FareLog"] = np.log(df["Fare"] + 1)  # Fare log to improve distribution

    # Remove raw features that have been replaced by the engineered ones
    cols_to_drop = ["Name", "Age", "Ticket", "Cabin", "Fare", "SibSp", "Parch"]
    return df.drop(cols_to_drop, axis=1)

def apply_one_hot(df, cols):
    """One-hot encoding on categorical variables

    Each column listed in `cols` is replaced by its dummy columns, named `<column>_<category>`.
    The dummy columns are appended after the untouched columns, in the order given by `cols`.
    Rows are never matched through the index, so a non-unique index cannot duplicate rows.

    Parameters:
    df (pd.DataFrame): DataFrame containing preprocessed features for training/test data containing categorical variables.
    cols (list): Names of the columns of `df` to encode.

    Returns:
    df (pd.DataFrame): DataFrame containing preprocessed features for training/test data containing dummy variables for each category.

   """
    cols = list(cols)
    if not cols:
        # Nothing to encode: hand back an independent copy, as dropping no columns would
        return df.copy()
    return pd.get_dummies(df, columns=cols)

# Columns requiring encoding
cols_to_onehot = ['Pclass', 'Sex', 'Embarked', 'NameTitle', 'CabinLettersCat', 'AgeBins', "RelativesCat"]
# Redundant columns to be removed
cols_to_drop = ['Pclass_3', 'Sex_male', 'Embarked_S', "RelativesCat_Unsafe", "CabinLettersCat_Unsafe"]

# Preprocess, one-hot encode, then drop the redundant dummy columns in a single pipeline.
train_data = (
    train_data
    .pipe(features_preprocessing)
    .pipe(apply_one_hot, cols_to_onehot)
    .drop(cols_to_drop, axis=1)
)


## Model

In [None]:
# Feature matrix and target vector. The explicit float cast keeps X numeric even when the
# dummy columns are boolean (a frame mixing bool and numeric columns converts to an object array).
X = train_data.drop("Survived", axis=1).to_numpy(dtype=float)
y = train_data.Survived.values
# Hold out 20% of the labelled rows for validation; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# 'binary:logistic' is the binary-classification objective; seed fixed for reproducibility
xg_clf = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, random_state=42)

# Candidate values sampled by the randomized search
params_grid = {'learning_rate': np.logspace(-3, -1, num=6),
    'n_estimators': np.arange(50, 151, 10),
    'max_depth': [2,3,4,5],
    'subsample': np.arange(0.01, 1.01, 0.05), 
    'colsample_bytree': np.arange(0.01, 1.01, 0.05)
}

# 100 random parameter combinations, each scored by 4-fold cross-validated ROC AUC.
# random_state makes the sampled combinations repeatable across runs.
randomized_auc = RandomizedSearchCV(estimator=xg_clf,
    param_distributions=params_grid, 
    cv=4, 
    scoring='roc_auc',
    verbose=1,
    n_iter=100,
    random_state=42)

randomized_auc.fit(X_train, y_train)

In [None]:
# One-row table with the best hyper-parameters found and their cross-validated score
params_df = pd.DataFrame(randomized_auc.best_params_, index=[0]).assign(
    score=[randomized_auc.best_score_]
)
params_df.head()

### Training

In [None]:
# Best hyper-parameters found by the randomized search
params = randomized_auc.best_params_

# The tuned values must be unpacked as constructor keyword arguments: `params=`, `dtrain=`,
# `evals=` and `obj=` belong to the low-level `xgb.train` function, not to XGBClassifier, so
# passing them here left the model on its default hyper-parameters.
# Early stopping is not used: it needs a validation set supplied to `fit`, and the only
# held-out data here is the set used for validation below. `n_estimators` is already tuned.
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    random_state=42,
    **params
    )

xg_cl.fit(X_train, y_train) # Fitting the classifier on the training set using the best parameters found

### Validation

In [None]:
# Hard class predictions and positive-class probabilities on the held-out split
y_pred = xg_cl.predict(X_test)
y_pred_prob = xg_cl.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Summarise the held-out performance numerically as well as graphically
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_auc = roc_auc_score(y_test, y_pred_prob)
print(f"Hold-out accuracy: {holdout_accuracy:.3f} | ROC AUC: {holdout_auc:.3f}")

plt.plot([0,1], [0,1], 'k--', label='Chance')  # diagonal = random classifier
plt.plot(fpr, tpr, label=f'XGBoost (AUC = {holdout_auc:.3f})')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve on the held-out split')
plt.legend()  # without this call the curve labels are never displayed
plt.show()

### Inference

In [None]:
# Apply the same preprocessing and encoding steps that were applied to the training data
test_data = features_preprocessing(test_data)
cols_to_onehot = ['Pclass', 'Sex', 'Embarked', 'NameTitle', 'CabinLettersCat', 'AgeBins', "RelativesCat"]
test_data = apply_one_hot(test_data, cols_to_onehot)
cols_to_drop = ['Pclass_3', 'Sex_male', 'Embarked_S', "RelativesCat_Unsafe", "CabinLettersCat_Unsafe"]
# errors="ignore": a category absent from the test split must not abort the cell
test_data = test_data.drop(cols_to_drop, axis=1, errors="ignore")

# Align the test columns (names and order) with the features the model was trained on:
# dummy columns missing from the test split are added as all-zero, unseen ones are discarded.
feature_columns = train_data.columns.drop("Survived")
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

In [None]:
# The model was fitted on a plain array, so predict on a plain float array as well. This also
# avoids handing xgboost a DataFrame whose interval-named dummy columns contain '[' / ']'.
predictions = xg_cl.predict(test_data.to_numpy(dtype=float))

# Two-column submission file: passenger identifier and predicted survival label
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")