In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import xgboost as xgb

import optuna

# Lift pandas' display caps so DataFrames are rendered with every row and
# every column (None means "no limit").
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
import warnings

# Silence all warnings for the rest of the session through the standard filter
# mechanism. Unlike overwriting ``warnings.warn`` with a stub, this leaves the
# stdlib function intact, so code that relies on ``warnings.catch_warnings``
# keeps working and the suppression can be undone with
# ``warnings.resetwarnings()``.
warnings.filterwarnings("ignore")

# **FUNCTIONS**

## **Stratified K-Fold**

In [5]:
def StratKFold(df, features, target, n_splits, metric, model):
    """Score ``model`` with shuffled stratified K-fold CV and print the mean result(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    features : list of str
        Columns of ``df`` used as model inputs.
    target : str
        Column of ``df`` used as the label.
    n_splits : int
        Number of folds passed to ``StratifiedKFold`` (shuffle on, seed 13).
    metric : bool
        Truthy: print mean accuracy, precision, recall, F1 and ROC AUC.
        Falsy: print mean accuracy only.
    model : estimator
        Object exposing ``fit`` and ``predict``. ``predict_proba`` is only
        required (and only called) when ``metric`` is truthy.

    Returns
    -------
    None
        Results are printed, not returned. The same ``model`` object is fitted
        in every fold, so after the call it holds the last fold's fit.
    """
    x_data = df[features]
    y_data = df[target]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

    full_report = bool(metric)
    if full_report:
        scores = {
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'auc': []
        }
    else:
        scores = []

    for train_idx, val_idx in skf.split(x_data, y_data):
        x_train, x_test = x_data.iloc[train_idx].copy(), x_data.iloc[val_idx].copy()
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[val_idx]

        model.fit(x_train, y_train)
        yhat = model.predict(x_test)

        if full_report:
            # Probabilities are only needed for ROC AUC, so they are computed
            # here rather than on every fold of an accuracy-only run.
            y_proba = model.predict_proba(x_test)[:, 1]
            scores['accuracy'].append(accuracy_score(y_test, yhat))
            scores['precision'].append(precision_score(y_test, yhat))
            scores['recall'].append(recall_score(y_test, yhat))
            scores['f1'].append(f1_score(y_test, yhat))
            scores['auc'].append(roc_auc_score(y_test, y_proba))
        else:
            scores.append(accuracy_score(y_test, yhat))

    if full_report:
        for name, values in scores.items():
            print(f"{name}: {np.mean(values)}")
    else:
        print(f"Acurracy: {np.mean(scores)}")

In [6]:
def StratKFold_Select(df, features, target, n_splits, model, verbose):
    """Pick the best-scoring prefix of an ordered feature list.

    Evaluates ``features[:1]``, ``features[:2]``, ... with shuffled stratified
    K-fold CV (seed 13) and keeps the prefix with the highest mean accuracy.
    The comparison is strict, so on a tie the shorter, earlier prefix is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    features : list of str
        Candidate columns, in the order they should be added.
    target : str
        Column of ``df`` used as the label.
    n_splits : int
        Number of folds passed to ``StratifiedKFold``.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in every fold.
    verbose : bool
        Truthy: print the old and new accuracy whenever a prefix improves on
        the best so far.

    Returns
    -------
    list of str
        The winning prefix. An empty list is returned when ``features`` is
        empty or no prefix reaches a mean accuracy above 0.
    """
    y_data = df[target]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

    best_accuracy = 0
    # Defined up front so the function returns a value even when the loop
    # never improves on the starting accuracy (or never runs).
    features_selected = []

    for i, feature in enumerate(features):
        candidate = features[0: i+1]
        x_data = df[candidate]
        fold_accuracies = []

        for train_idx, val_idx in skf.split(x_data, y_data):
            x_train, x_test = x_data.iloc[train_idx].copy(), x_data.iloc[val_idx].copy()
            y_train, y_test = y_data.iloc[train_idx], y_data.iloc[val_idx]

            model.fit(x_train, y_train)
            yhat = model.predict(x_test)
            fold_accuracies.append(accuracy_score(y_test, yhat))

        mean_accuracy = np.mean(fold_accuracies)
        if mean_accuracy > best_accuracy:
            features_selected = candidate
            if verbose:
                print("Old Ac:", best_accuracy, "\nNew Ac:", mean_accuracy, "\nFeature:", feature, "\nFeatures:", candidate, "\n")
            best_accuracy = mean_accuracy

    return features_selected

# **DATASET LOADING**

In [8]:
# Read the processed train and test CSVs into df and df_test respectively.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/processed/2.EDA_train.csv", "../data/processed/2.EDA_test.csv")
)

# **MODEL DEVELOPMENT AND EVALUATION**

In [10]:
# Column set used under the "K-FOLD TARGET ENCODING" heading
# (single Pclass / Embarked columns).
features_KFold = [
    'IsMale', 'Title', 'FarePerPerson', 'Age', 'Pclass', 'Fare_log',
    'SibSp', 'Deck', 'Embarked', 'FamilySize', 'Stage', 'Parch',
    'HasCabin', 'IsSingle', 'IsAgeEstimated', 'FreeFare', 'IsAlone',
]

# Column set used under the "ONE HOT ENCODING" heading
# (Pclass and Embarked expanded into indicator columns).
Features_OHE = [
    'Age', 'Deck',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'FamilySize', 'Fare_log', 'FarePerPerson', 'FreeFare', 'HasCabin',
    'IsAgeEstimated', 'IsAlone', 'IsMale', 'IsSingle', 'Parch',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'SibSp', 'Stage', 'Title',
]

## **XGBOOST CLASSIFIER**

### **MODEL DEVELOPMENT**

#### **K-FOLD TARGET ENCODING**

In [14]:
# Baseline: XGBoost with default hyperparameters (seed 13), scored with
# 10-fold stratified CV on features_KFold and reported with all five metrics.
xgbc = xgb.XGBClassifier(random_state=13)

StratKFold(df=df, features=features_KFold, target='Survived', n_splits=10, metric=True, model=xgbc)

accuracy: 0.8181897627965045
precision: 0.7843604761805174
recall: 0.7341176470588235
f1: 0.7559811549340355
auc: 0.8753361344537816


In [16]:
# Pair every feature name with the importance stored on the fitted classifier,
# strongest first. xgbc was last fitted inside StratKFold, so these values
# come from the final CV fold's fit.
feature_importances = (
    pd.DataFrame({'Feature': features_KFold, 'XGBC_Importance': xgbc.feature_importances_})
    .sort_values(by='XGBC_Importance', ascending=False)
)

feature_importances

Unnamed: 0,Feature,XGBC_Importance
1,Title,0.320998
9,FamilySize,0.132304
4,Pclass,0.086753
0,IsMale,0.075823
10,Stage,0.06224
7,Deck,0.055978
5,Fare_log,0.043445
2,FarePerPerson,0.041004
3,Age,0.038027
13,IsSingle,0.037587


In [17]:
# Feature names ordered from highest to lowest XGBC_Importance.
features_xgbc = feature_importances.sort_values(by='XGBC_Importance', ascending=False).loc[:, 'Feature'].tolist()

In [18]:
# Fresh classifier: search the importance-ranked feature prefixes (5 folds),
# then print the full metric report for the selected subset.
xgbc = xgb.XGBClassifier(random_state=13)

features_xgbc_sel_KFold = StratKFold_Select(
    df=df, features=features_xgbc, target='Survived', n_splits=5, model=xgbc, verbose=False
)
StratKFold(df=df, features=features_xgbc_sel_KFold, target='Survived', n_splits=5, metric=True, model=xgbc)

accuracy: 0.8316740945326722
precision: 0.7866468709785395
recall: 0.7721653878942881
f1: 0.778509729577841
auc: 0.8854029821668667


#### **ONE HOT ENCODING**

In [21]:
# Baseline on the Features_OHE column set: default XGBoost (seed 13),
# 5-fold stratified CV, accuracy only.
xgbc = xgb.XGBClassifier(random_state=13)

StratKFold(df=df, features=Features_OHE, target='Survived', n_splits=5, metric=False, model=xgbc)

Acurracy: 0.8137342288619672


In [23]:
# Importance table for the Features_OHE columns, strongest first. The values
# are read from xgbc as it was left by the last fit inside StratKFold.
feature_importances = (
    pd.DataFrame({'Feature': Features_OHE, 'XGBC_Importance': xgbc.feature_importances_})
    .sort_values(by='XGBC_Importance', ascending=False)
)

feature_importances

Unnamed: 0,Feature,XGBC_Importance
17,Pclass_3,0.245963
20,Title,0.222327
5,FamilySize,0.118044
12,IsMale,0.064797
19,Stage,0.041164
6,Fare_log,0.038154
1,Deck,0.037362
4,Embarked_S,0.034678
0,Age,0.028842
7,FarePerPerson,0.028137


In [24]:
# Rank the feature names in feature_importances, then rerun the prefix search
# and the full metric report with a fresh classifier (5 folds each).
features_xgbc = feature_importances.sort_values(by='XGBC_Importance', ascending=False).loc[:, 'Feature'].tolist()

xgbc = xgb.XGBClassifier(random_state=13)

features_xgbc_sel_OHE = StratKFold_Select(
    df=df, features=features_xgbc, target='Survived', n_splits=5, model=xgbc, verbose=False
)
StratKFold(df=df, features=features_xgbc_sel_OHE, target='Survived', n_splits=5, metric=True, model=xgbc)

accuracy: 0.8283158621555458
precision: 0.7907913195817893
recall: 0.7575021312872975
f1: 0.7720250197881777
auc: 0.8740513549583447


### **MODEL REFINEMENT**

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold CV accuracy of an XGBoost classifier.

    Samples eight hyperparameters from ``trial``, builds an ``XGBClassifier``
    with them plus three fixed settings, and scores it with
    ``cross_val_score`` on ``df[features_xgbc_sel_KFold]`` against
    ``df['Survived']``. Both ``df`` and ``features_xgbc_sel_KFold`` are read
    from the notebook's global namespace.

    Returns the mean accuracy over the folds (higher is better).
    """
    # Search space; the suggest calls run in the order written.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
    }
    # Settings held constant for every trial.
    fixed = {
        'random_state': 13,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
    }

    classifier = xgb.XGBClassifier(**sampled, **fixed)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)
    fold_scores = cross_val_score(
        classifier,
        df[features_xgbc_sel_KFold],
        df['Survived'],
        cv=folds,
        scoring='accuracy',
    )
    return fold_scores.mean()

# Seed the sampler so the search proposes the same trial sequence on every
# sequential run; without a seed each Restart & Run All explores different
# hyperparameters and the reported best trial cannot be regenerated.
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler(seed = 13))
study.optimize(objective, n_trials = 500)


print('\n\nBest trial:', study.best_trial.params)

In [44]:
# Hard-coded tuned hyperparameters (presumably the best trial of the Optuna
# study above; confirm against its output before changing the search).
xgbc_op_params = {
    'random_state': 13,
    'n_estimators': 471,
    'max_depth': 6,
    'learning_rate': 0.111482401319574,
    'subsample': 0.8407428963803871,
    'colsample_bytree': 0.8369739021081172,
    'gamma': 0.3770767545748295,
    'reg_alpha': 1.1312401823000522,
    'reg_lambda': 4.219245922048798,
}
xgbc_op = xgb.XGBClassifier(**xgbc_op_params)

# 10-fold stratified CV with the full metric report for the tuned model.
StratKFold(df=df, features=features_xgbc_sel_KFold, target='Survived', n_splits=10, metric=True, model=xgbc_op)

accuracy: 0.8474032459425717
precision: 0.8403013159085674
recall: 0.7457983193277311
f1: 0.7886036637604111
auc: 0.8891940695470106


# **PREDICTING TEST DATASET**

In [39]:
# StratKFold fits the model once per fold, so xgbc_op would otherwise carry
# only the last fold's fit (a subset of the training rows). Refit on the whole
# training frame so the submission uses all labelled data and does not depend
# on which cell fitted the model last.
xgbc_op.fit(df[features_xgbc_sel_KFold], df['Survived'])

# Build the two-column submission (PassengerId index, predicted Survived).
submission = df_test[['PassengerId']].copy()
submission['Survived'] = xgbc_op.predict(df_test[features_xgbc_sel_KFold])
submission = submission.set_index("PassengerId")
submission.to_csv('../data/processed/submission.csv')