In [1]:
import os
import warnings
import sys

import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

import xgboost as xgb
from xgboost import plot_importance

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the dataset

In [2]:
data_path = "dataset/"
df_train = pd.read_csv(data_path + "train.csv")
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df_train.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

# Preprosessing

In [5]:
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']
#df_train['Age'].fillna(df_train.Age.mean(), inplace=True)

In [6]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0


In [7]:
#df_train.Name.apply(lambda n: n) 
df_train['Title'] = df_train.Name.str.extract('([A-Za-z]+)\.', expand=False).astype(str)

In [8]:
df_train.groupby(['Title'])['PassengerId'].count()

Title
Capt          1
Col           2
Countess      1
Don           1
Dr            7
Jonkheer      1
Lady          1
Major         2
Master       40
Miss        182
Mlle          2
Mme           1
Mr          517
Mrs         125
Ms            1
Rev           6
Sir           1
Name: PassengerId, dtype: int64

In [9]:
titles_dict = {'Capt': 'Other',
               'Major': 'Other',
               'Jonkheer': 'Other',
               'Don': 'Other',
               'Sir': 'Other',
               'Dr': 'Other',
               'Rev': 'Other',
               'Countess': 'Other',
               'Dona': 'Other',
               'Mme': 'Mrs',
               'Mlle': 'Miss',
               'Ms': 'Miss',
               'Mr': 'Mr',
               'Mrs': 'Mrs',
               'Miss': 'Miss',
               'Master': 'Master',
               'Lady': 'Other'}

In [10]:
df_train['Title'] = df_train['Title'].map(titles_dict)
df_train.Title.fillna('Mr', inplace=True)

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  FamilySize   891 non-null    int64  
 13  Title        891 non-null    object 
dtypes: float64(2), int64(6), object(6)
memory usage: 97.6+ KB


In [12]:
means = df_train.groupby('Title')['Age'].mean().to_dict()
means

{'Master': 4.574166666666667,
 'Miss': 21.845637583892618,
 'Mr': 32.49625,
 'Mrs': 35.788990825688074,
 'Other': 44.3}

In [13]:
id_nan_age = df_train.loc[np.isnan(df_train['Age'])].index
df_train.loc[id_nan_age,'Age'].loc[id_nan_age] = df_train['Title'].loc[id_nan_age].map(means)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,Mr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,Other
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0,Miss
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3,Miss
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0,Mr


In [17]:
df_train.loc[df_train.Age.isna(),'Age'] = df_train['Title'].loc[df_train.Age.isna()].map(means)

In [13]:
input_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']

In [18]:
#df_train = pd.get_dummies(df_train[input_cols], drop_first=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  FamilySize   891 non-null    int64  
 13  Title        891 non-null    object 
dtypes: float64(2), int64(6), object(6)
memory usage: 97.6+ KB


In [8]:
X = df_train.drop(["Survived"], axis=1)
y = df_train.Survived.values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Train

In [10]:
def metrics(actual, pred):
    acc = accuracy_score(actual, pred)
    recall = recall_score(actual, pred)
    precision = precision_score(actual, pred)
    f1 = f1_score(actual, pred)
    return acc, recall, precision, f1

In [11]:
#mlflow.xgboost.autolog()

In [12]:
def run_model(params):
    with mlflow.start_run(run_name="Xgboost Titanic Model") as run:
        bst = xgb.XGBClassifier(objective='binary:logistic', booster="gbtree", max_depth=params['max_depth'], learning_rate=params['learning_rate'], colsample_bytree=0.8, eval_metric='logloss')
        bst.fit(X_train, y_train)
        
        y_train_hat = bst.predict(X_train)
        y_val_hat = bst.predict(X_test)
        
        (train_accuracy, train_recall, train_precision, train_f1) = metrics(y_train, y_train_hat)
        (val_accuracy, val_recall, val_precision, val_f1) = metrics(y_test, y_val_hat)
        
        mlflow.log_param("max_depth", params['max_depth'])
        mlflow.log_param("learning_rate", params['learning_rate'])
        mlflow.log_metric("train_accuracy", train_accuracy)
        mlflow.log_metric("train_recall", train_recall)
        mlflow.log_metric("train_precision", train_precision)
        mlflow.log_metric("train_f1", train_f1)
        mlflow.log_metric("val_accuracy", val_accuracy)
        mlflow.log_metric("val_recall", val_recall)
        mlflow.log_metric("val_precision", val_precision)
        mlflow.log_metric("val_f1", val_f1)
        
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        if tracking_url_type_store != "file":
            mlflow.sklearn.log_model(bst, "model", registered_model_name="xgboost_titanic_"+params['max_depth']+ "_"+params['learning_rate'])
        else:
            mlflow.sklearn.log_model(bst, "model")
    return (run.info.experiment_id, run.info.run_id)

In [13]:
for max_depth, learning_rate in [[5,0.1],[5,0.01],[8,0.1],[8,0.01]]:
    params = {
        'max_depth': max_depth,
        'learning_rate': learning_rate
    }
    run_model(params)