In [47]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import xgboost as xgb
from xgboost import plot_importance

import logging

# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module.
# NOTE(review): `logger` is not referenced by any of the cells visible here.
logger = logging.getLogger(__name__)

# Load the dataset

In [35]:
# Directory that holds the CSV files (trailing slash kept so plain
# concatenation below yields a valid path).
data_path = "dataset/"
train_csv_path = data_path + "train.csv"

# Read the training split and preview the first rows.
df_train = pd.read_csv(train_csv_path)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
# Show dtypes and non-null counts per column; the counts reveal which columns
# contain missing values and will need handling during preprocessing.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Preprocessing

In [38]:
# FamilySize is the sum of the SibSp and Parch counts (the passenger is not
# counted, so someone travelling alone gets 0).
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']

# Impute missing ages with the column mean. The result is assigned back to the
# frame instead of calling `fillna(..., inplace=True)` on the selected column:
# the in-place form operates on an intermediate object, triggers a
# chained-assignment FutureWarning in pandas 2.x and does not update
# `df_train` at all once Copy-on-Write is enabled.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

In [39]:
# Columns kept for modelling: the target ('Survived') followed by the features.
input_cols = [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Embarked',
    'FamilySize',
]

In [40]:
# Keep only the modelling columns, then one-hot encode the string columns.
# `drop_first=True` drops the first level of each categorical, so that level
# becomes the implicit baseline (all of its dummy columns are 0).
#
# NOTE: `df_train` is overwritten here, so this cell is not re-runnable on its
# own -- after the first execution the frame no longer has the raw 'Sex' /
# 'Embarked' columns listed in `input_cols`. Re-run from the load cell instead.
# NOTE(review): rows with a missing 'Embarked' value get 0 in every Embarked_*
# column and are therefore indistinguishable from the dropped baseline level --
# confirm this is acceptable.
selected_features = df_train[input_cols]
df_train = pd.get_dummies(selected_features, drop_first=True)
df_train

Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.000000,7.2500,1,1,0,1
1,1,1,38.000000,71.2833,1,0,0,0
2,1,3,26.000000,7.9250,0,0,0,1
3,1,1,35.000000,53.1000,1,0,0,1
4,0,3,35.000000,8.0500,0,1,0,1
...,...,...,...,...,...,...,...,...
886,0,2,27.000000,13.0000,0,1,0,1
887,1,1,19.000000,30.0000,0,0,0,1
888,0,3,29.699118,23.4500,3,0,0,1
889,1,1,26.000000,30.0000,0,1,0,0


In [41]:
# Feature matrix: every column except the target.
X = df_train.drop(columns=["Survived"])
# Target vector as a plain NumPy array.
y = df_train["Survived"].values

In [42]:
# Hold out 20% of the rows for validation.
# `random_state` pins the shuffle so that a kernel restart reproduces the same
# split (and therefore comparable MLflow runs); `stratify=y` keeps the class
# ratio of the target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Train

In [48]:
def metrics(actual, pred):
    """Score predictions against ground-truth labels.

    Args:
        actual: true labels.
        pred: predicted labels, aligned element-wise with ``actual``.

    Returns:
        A 4-tuple ``(accuracy, recall, precision, f1)`` computed with the
        corresponding scikit-learn scoring functions using their defaults.
    """
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    return tuple(score(actual, pred) for score in scorers)

In [43]:
# Turn on MLflow autologging for XGBoost before any model is trained.
# NOTE(review): only `mlflow.sklearn` is imported explicitly in the import
# cell; this call relies on `mlflow.xgboost` being resolvable as an attribute
# of `mlflow` -- confirm for the installed MLflow version.
mlflow.xgboost.autolog()

In [54]:
def run_model(params):
    """Train an XGBoost classifier inside an MLflow run and record its scores.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on both the training data and the held-out ``X_test`` /
    ``y_test``. Accuracy, recall, precision and F1 for both partitions are
    logged to the active MLflow run as ``train_*`` and ``val_*`` metrics.

    Args:
        params: mapping with the keys ``'max_depth'`` and ``'learning_rate'``,
            forwarded to ``xgb.XGBClassifier``.

    Returns:
        Tuple ``(experiment_id, run_id)`` identifying the MLflow run.

    Raises:
        KeyError: if ``params`` lacks one of the required keys.
    """
    with mlflow.start_run(run_name="Xgboost Titanic Model") as run:
        bst = xgb.XGBClassifier(
            objective='binary:logistic',
            booster="gbtree",
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            colsample_bytree=0.8,
            eval_metric='logloss',
        )
        bst.fit(X_train, y_train)

        y_train_hat = bst.predict(X_train)
        y_val_hat = bst.predict(X_test)

        # `metrics` returns its scores in exactly this order.
        metric_names = ("accuracy", "recall", "precision", "f1")
        train_scores = metrics(y_train, y_train_hat)
        val_scores = metrics(y_test, y_val_hat)

        # Previously these scores were computed and then thrown away; log them
        # explicitly so runs can be compared in the MLflow UI.
        # Hyperparameters are deliberately not logged by hand here.
        # NOTE(review): this assumes the autologging enabled elsewhere in the
        # notebook records them -- confirm for the installed MLflow version.
        scores = {}
        for name, train_value, val_value in zip(metric_names, train_scores, val_scores):
            scores["train_" + name] = float(train_value)
            scores["val_" + name] = float(val_value)
        mlflow.log_metrics(scores)
    return (run.info.experiment_id, run.info.run_id)

In [55]:
# Small manual grid: every (max_depth, learning_rate) pair gets its own MLflow run.
hyperparameter_grid = [(5, 0.1), (5, 0.01), (8, 0.1), (8, 0.01)]

for max_depth, learning_rate in hyperparameter_grid:
    params = dict(max_depth=max_depth, learning_rate=learning_rate)
    run_model(params)

