In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer

from catboost import CatBoostClassifier

import xgboost as xgb

from lightgbm import LGBMClassifier

import optuna

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# **FUNCTIONS**

## **Scatter Plot**

In [5]:
def PlotScatter(x, y, title, xunit, yunit, format_x, x_size):
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)
    plt.scatter(x, y, alpha=0.7)

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.gca().set_xlabel(xunit, fontsize = 10)
    plt.gca().xaxis.set_label_coords(1.05, -0.025)
    plt.gca().set_ylabel(yunit, fontsize = 10)
    plt.gca().yaxis.set_label_coords(-0.16, .98)

    plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_yticks()])

    if (format_x == 1):
        plt.gca().set_xticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_xticks()])

    if (x_size == 0):
        plt.gcf().set_size_inches(6, 5)
    else:
        plt.gcf().set_size_inches(x_size, 5)
    plt.show()
    plt.close()

## **Box Plot**

In [7]:
def PlotBoxPlot(df_x, df_y, title, yunit):
    plt.figure(figsize=(6, 5))
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)
    sns.boxplot(x = df_x, y = df_y, palette = "mako")

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.gca().set_ylabel(yunit, fontsize = 10)
    plt.gca().yaxis.set_label_coords(-0.16, .98)

    plt.gca().set_xlabel("", fontsize = 1)

    plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_yticks()])

    plt.show()
    plt.close()

In [8]:
def PlotBoxPlotV2(df_x, df_y, title, yunit, size_x, size_y):
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)
    sns.boxplot(x = df_x, y = df_y, palette = "mako")

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.gca().set_ylabel(yunit, fontsize = 10)
    plt.gca().yaxis.set_label_coords(-0.16, .98)

    plt.gca().set_xlabel("", fontsize = 1)

    plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_yticks()])

    plt.gcf().set_size_inches(size_x, size_y)

    plt.show()
    plt.close()

## **Regression Plot**

In [10]:
def PlotRegPlot(df_x, df_y, title, xunit, yunit):
    plt.figure(figsize=(6, 5))
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)
    sns.regplot(x = df_x, y = df_y, line_kws={"color": "#31273F"})

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.gca().set_ylabel(yunit, fontsize = 10)
    plt.gca().yaxis.set_label_coords(-0.16, .98)

    plt.gca().set_xlabel(xunit, fontsize = 10)
    plt.gca().xaxis.set_label_coords(1.05, -0.025)

    plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_yticks()])
    plt.gca().set_xticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_xticks()])

    plt.show()
    plt.close()

## **Histogram**

In [12]:
def HistPlot(x, title, xunit, x_size):
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)
    sns.histplot(x = x)

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.gca().set_xlabel(xunit, fontsize = 10)
    plt.gca().xaxis.set_label_coords(0.98, -0.08)

    plt.ylabel("Count")

    if (x_size == 0):
        plt.gcf().set_size_inches(6, 5)
    else:
        plt.gcf().set_size_inches(x_size, 5)

    plt.show()
    plt.close()

## **Predicted vs Actual Values**

In [14]:
def PlotFitted(y_test, yhat, title):
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")
    
    plt.title(title)

    y_test1 = np.exp(y_test)
    yhat1 = np.exp(yhat)

    sns.scatterplot(x=y_test1.to_numpy().flatten(), y=yhat1, alpha=0.6)
    plt.plot([y_test1.min(), y_test1.max()], [y_test1.min(), y_test1.max()], 'r--')

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")
    
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")

    plt.gcf().set_size_inches(7, 5)

    plt.ylim(0,)
    plt.xlim(0,)
    
    plt.show()
    plt.close()
    
    
    print("\n- Mean Squared Error:", mean_squared_error(y_test, yhat))
    print("- Root Mean Squared Error:", root_mean_squared_error(y_test, yhat))

In [15]:
def PlotFittedV(yhat, y_test, model, x_data, y_data, title, xlabel):
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)

    ax2 = sns.distplot(y_test, hist=False, color="r", label = "Actual Value")
    sns.distplot(yhat, hist=False, color="b", label="Predicted Value", ax = ax2)

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.ylabel("Density")
    plt.xlabel(xlabel)
    plt.legend(['Actual Value', 'Predicted Value'], framealpha=0)

    plt.gcf().set_size_inches(7, 5)

    plt.show()
    plt.close()

    print("\n- Mean Squared Error:", mean_squared_error(y_test, yhat))
    print("- Root Mean Squared Error:", root_mean_squared_error(y_test, yhat))
    print("- Cross-Validation Score:", cross_val_score(model, x_data, y_data, cv=5).mean())

In [16]:
def PlotFittedV2(yhat, y_test, title, xlabel):
    plt.figure().patch.set_facecolor("0.85")
    plt.axes().set(facecolor = "0.85")

    plt.title(title)

    ax2 = sns.distplot(np.exp(y_test), hist=False, color="r", label = "Actual Value")
    sns.distplot(np.exp(yhat), hist=False, color="b", label="Predicted Value", ax = ax2)

    plt.grid(False)
    plt.gca().spines['bottom'].set_visible(True)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['bottom'].set_color("black")
    plt.gca().spines['left'].set_color("black")

    plt.ylabel("Density")
    plt.xlabel(xlabel)
    plt.legend(['Actual Value', 'Predicted Value'], framealpha=0)

    plt.gcf().set_size_inches(7, 5)

    plt.show()
    plt.close()

    print("\n- Mean Squared Error:", mean_squared_error(y_test, yhat))
    print("- Root Mean Squared Error:", root_mean_squared_error(y_test, yhat))

## **Line Plot**

In [18]:
def PlotLine(y, title, yunit, x_size, y_size):
  plt.figure().patch.set_facecolor("0.85")
  plt.axes().set(facecolor = "0.85")

  plt.title(title)
  plt.plot(y)

  plt.grid(False)
  plt.gca().spines['bottom'].set_visible(True)
  plt.gca().spines['left'].set_visible(True)
  plt.gca().spines['top'].set_visible(False)
  plt.gca().spines['right'].set_visible(False)
  plt.gca().spines['bottom'].set_color("black")
  plt.gca().spines['left'].set_color("black")

  plt.gca().set_xlabel("", fontsize = 0)
  #plt.gca().xaxis.set_label_coords(1.05, -0.025)
  plt.gca().set_ylabel(yunit, fontsize = 10)
  plt.gca().yaxis.set_label_coords(-0.16, .98)

  plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in plt.gca().get_yticks()])

  plt.gcf().set_size_inches(x_size, y_size)

  plt.show()
  plt.close()

## **K-Fold Target Encoding**

In [20]:
def KFTE(df, df_test, categ_f, target):
  kf = KFold(n_splits = 5, shuffle = True, random_state = 13)

  nome = categ_f + "_E"

  df[nome] = 0.0

  for train_idx, val_idx in kf.split(df):
      fold_train = df.iloc[train_idx]
      fold_valid  = df.iloc[val_idx]

      medias_feature = fold_train.groupby(categ_f)[target].mean()
      media_global = fold_train[target].mean()

      feature_encoded = fold_valid[categ_f].map(medias_feature).fillna(media_global)
      df.loc[val_idx, nome] = feature_encoded

  mapping = df.groupby(categ_f)[target].mean().to_dict()
  global_mean = df[target].mean()

  df_test[categ_f] = df_test[categ_f].map(mapping).fillna(global_mean)

## **Stratified K-Fold**

In [22]:
def StratKFold(df, features, target, n_splits, metric, model):
    x_data = df[features]
    y_data = df[target]
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

    if (metric == True):
        metrics = {
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'auc': []
        }
    else:
        metrics = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(x_data, y_data)):
        
        x_train, x_test = x_data.iloc[train_idx].copy(), x_data.iloc[val_idx].copy()
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[val_idx]
    
        model.fit(x_train, y_train)
        yhat = model.predict(x_test)
        y_proba = model.predict_proba(x_test)[:, 1]

        if (metric == True):
            metrics['accuracy'].append(accuracy_score(y_test, yhat))
            metrics['precision'].append(precision_score(y_test, yhat))
            metrics['recall'].append(recall_score(y_test, yhat))
            metrics['f1'].append(f1_score(y_test, yhat))
            metrics['auc'].append(roc_auc_score(y_test, y_proba))
        else:
            metrics.append(accuracy_score(y_test, yhat))
    
    if (metric == True):
        for k, v in metrics.items():
            print(f"{k}: {np.mean(v)}")
    else:
        print(f"Acurracy: {np.mean(metrics)}")

In [23]:
def StratKFold_Select(df, features, target, n_splits, model, verbose):
    y_data = df[target]
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

    #features_selected = []
    accuracy = 0

    for i in range(len(features)):
        f = features[0: i+1]
        x_data = df[f]
        acc = []
        
        for fold, (train_idx, val_idx) in enumerate(skf.split(x_data, y_data)):
            
            x_train, x_test = x_data.iloc[train_idx].copy(), x_data.iloc[val_idx].copy()
            y_train, y_test = y_data.iloc[train_idx], y_data.iloc[val_idx]
        
            model.fit(x_train, y_train)
            yhat = model.predict(x_test)
            acc.append(accuracy_score(y_test, yhat))
    
        if (np.mean(acc) > accuracy):
            features_selected = f
            if (verbose == True):
                print("Old Ac:", accuracy, "\nNew Ac:", np.mean(acc), "\nFeature:", features[i], "\nFeatures:", f, "\n")
            accuracy = np.mean(acc)


    return features_selected

# **DATASET LOADING**

In [25]:
df = pd.read_csv("../data/processed/2.EDA_train.csv")
df_test = pd.read_csv("../data/processed/2.EDA_test.csv")

# **MODEL DEVELOPMENT AND EVALUATION**

In [27]:
features_KFold = ['IsMale', 'Title', 'FarePerPerson', 'Age', 'Pclass', 'Fare_log', 'SibSp', 'Deck', 'Embarked', 'FamilySize', 'Stage', 'Parch', 'HasCabin',
            'IsSingle', 'IsAgeEstimated', 'FreeFare', 'IsAlone']

Features_OHE = ['Age', 'Deck', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize', 'Fare_log', 'FarePerPerson', 'FreeFare', 'HasCabin', 'IsAgeEstimated', 'IsAlone',
                'IsMale', 'IsSingle', 'Parch', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'SibSp', 'Stage', 'Title']

## **XGBOOST CLASSIFIER**

### **MODEL DEVELOPMENT**

#### **K-FOLD TARGET ENCODING**

In [31]:
xgbc = xgb.XGBClassifier(random_state=13)

StratKFold(df, features_KFold, 'Survived', 5, False, xgbc)

Acurracy: 0.8204506936162199


In [32]:
feature_importances = pd.DataFrame({'Feature': features_KFold,
                                    'XGBC_Importance': xgbc.feature_importances_
                                   })
feature_importances = feature_importances.sort_values(by='XGBC_Importance', ascending=False)

feature_importances

Unnamed: 0,Feature,XGBC_Importance
1,Title,0.31469
9,FamilySize,0.149316
0,IsMale,0.082611
4,Pclass,0.06825
10,Stage,0.055283
7,Deck,0.04951
2,FarePerPerson,0.041254
14,IsAgeEstimated,0.040605
5,Fare_log,0.039984
8,Embarked,0.039087


In [33]:
features_xgbc = feature_importances.sort_values(by='XGBC_Importance', ascending=False)['Feature'].to_list()

In [34]:
xgbc = xgb.XGBClassifier(random_state=13)

features_xgbc_sel = StratKFold_Select(df, features_xgbc, 'Survived', 5, xgbc, False)
StratKFold(df, features_xgbc_sel, 'Survived', 5, True, xgbc)

accuracy: 0.8204569706860838
precision: 0.7754976706673467
recall: 0.7516624040920716
f1: 0.7623375889654943
auc: 0.8803014625003645


#### **ONE HOT ENCODING**

In [36]:
xgbc = xgb.XGBClassifier(random_state=13)

StratKFold(df, Features_OHE, 'Survived', 5, False, xgbc)

Acurracy: 0.8182160567447117


In [37]:
feature_importances = pd.DataFrame({'Feature': Features_OHE,
                                    'XGBC_Importance': xgbc.feature_importances_
                                   })
feature_importances = feature_importances.sort_values(by='XGBC_Importance', ascending=False)

feature_importances

Unnamed: 0,Feature,XGBC_Importance
17,Pclass_3,0.225388
20,Title,0.217528
5,FamilySize,0.111809
12,IsMale,0.066219
18,SibSp,0.040779
19,Stage,0.037544
6,Fare_log,0.036963
1,Deck,0.033966
10,IsAgeEstimated,0.033168
4,Embarked_S,0.030122


In [38]:
features_xgbc = feature_importances.sort_values(by='XGBC_Importance', ascending=False)['Feature'].to_list()

xgbc = xgb.XGBClassifier(random_state=13)

features_xgbc_sel = StratKFold_Select(df, features_xgbc, 'Survived', 5, xgbc, False)
StratKFold(df, features_xgbc_sel, 'Survived', 5, True, xgbc)

accuracy: 0.8249325214989642
precision: 0.7917470315546288
recall: 0.7371696504688832
f1: 0.7626018817445888
auc: 0.8638359847329438


### **MODEL REFINEMENT**

In [40]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 13,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**params)
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 13)
    score = cross_val_score(model, df[features_xgbc], df['Survived'], cv = skf, scoring = 'roc_auc').mean()
    return score

study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 500)


print('\n\nBest trial:', study.best_trial.params)

[I 2025-03-25 17:10:59,422] A new study created in memory with name: no-name-b474e6fc-f40c-461a-a133-96cee88730b1
[I 2025-03-25 17:11:00,447] Trial 0 finished with value: 0.878686217921512 and parameters: {'n_estimators': 302, 'max_depth': 9, 'learning_rate': 0.17450116296069715, 'subsample': 0.5194025304422454, 'colsample_bytree': 0.5314325716434716, 'gamma': 1.0192329825643602, 'reg_alpha': 2.2938198346551126, 'reg_lambda': 3.5288908503298893}. Best is trial 0 with value: 0.878686217921512.
[I 2025-03-25 17:11:01,269] Trial 1 finished with value: 0.8673149704326175 and parameters: {'n_estimators': 363, 'max_depth': 8, 'learning_rate': 0.13016760667164487, 'subsample': 0.8331597082901379, 'colsample_bytree': 0.5977694065423955, 'gamma': 2.563092121241945, 'reg_alpha': 2.829245235742267, 'reg_lambda': 3.350248442583873}. Best is trial 0 with value: 0.878686217921512.
[I 2025-03-25 17:11:01,624] Trial 2 finished with value: 0.8695104263927792 and parameters: {'n_estimators': 83, 'max_de



Best trial: {'n_estimators': 204, 'max_depth': 7, 'learning_rate': 0.03463098825538474, 'subsample': 0.7864170344039825, 'colsample_bytree': 0.7748569547367724, 'gamma': 0.3526595396237495, 'reg_alpha': 0.16266251550416774, 'reg_lambda': 0.29844380155476014}


In [41]:
xgbc_op = xgb.XGBClassifier(
    random_state = 13,
    n_estimators = 486,
    max_depth = 8,
    learning_rate = 0.0639314751572629,
    subsample = 0.8785631452997261,
    colsample_bytree = 0.6797172480023336,
    gamma = 0.5865646778192904,
    reg_alpha = 0.2744587219560853,
    reg_lambda = 0.18122570448115347
)

StratKFold(df, features_xgbc_sel, 'Survived', 10, True, xgbc_op)

accuracy: 0.8362172284644196
precision: 0.8168014258614237
recall: 0.7371428571428571
f1: 0.7723550797670095
auc: 0.8727015816427581


# **PREDICTING TEST DATASET**

In [43]:
submission = df_test[['PassengerId']].copy()
submission['Survived'] = xgbc_op.predict(df_test[features_xgbc_sel])
submission = submission.set_index("PassengerId")
submission.to_csv('../data/processed/submission.csv')