In [182]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import optuna as opt
import xgboost as xgb 
import re

In [183]:
to_test = pd.read_csv("test.csv", index_col="PassengerId")
df = pd.read_csv("train.csv", index_col="PassengerId")
df

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,C


In [184]:
df = df.convert_dtypes()
df["Embarked"] = df["Embarked"].astype("category")
df.dtypes

Survived             Int64
Pclass               Int64
Name        string[python]
Age                Float64
SibSp                Int64
Parch                Int64
Ticket      string[python]
Fare               Float64
Cabin       string[python]
Embarked          category
dtype: object

In [185]:
X_train,X_test,y_train,y_test = train_test_split(df.drop("Survived", axis=1), df["Survived"], test_size=0.2, random_state=42)
print(df.columns[df.isna().any(axis=0)].tolist())

['Age', 'Cabin', 'Embarked']


### Data Pre-Processing after splitting the data to prevent data - leakage

In [186]:
#Data Imputation for the Age column
data = [X_train, X_test, to_test]
for dataset in data:
    mean = X_train["Age"].mean()
    std = X_test["Age"].std()
    is_null = dataset["Age"].isna().sum()
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    rand_age_series = pd.Series(rand_age, index=dataset[dataset["Age"].isnull()].index)
    dataset["Age"].fillna(rand_age_series, inplace=True)
    dataset["Age"] = dataset["Age"].astype(int)

In [187]:
#Feature Engineering the Deck Column 
#Cabins are mapped to a numerical Deck feature
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [X_train, X_test, to_test]

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)
X_train = X_train.drop(['Cabin'], axis=1)
X_test = X_test.drop(['Cabin'], axis=1)
to_test = to_test.drop(['Cabin'], axis=1)

In [188]:
X_train["Embarked"].describe()
for dataset in [X_train, X_test, to_test]:
    dataset["Embarked"].fillna("S", inplace=True)

In [189]:
X_train.isna().any()

Pclass      False
Name        False
Age         False
SibSp       False
Parch       False
Ticket      False
Fare        False
Embarked    False
Deck        False
dtype: bool

In [190]:
X_train

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
332,1,"Partner, Mr. Austen",45,0,0,113043,28.5,S,3
734,2,"Berriman, Mr. William John",23,0,0,28425,13.0,S,8
383,3,"Tikkanen, Mr. Juho",32,0,0,STON/O 2. 3101293,7.925,S,8
705,3,"Hansen, Mr. Henrik Juul",26,1,0,350025,7.8542,S,8
814,3,"Andersson, Miss. Ebba Iris Alfrida",6,4,2,347082,31.275,S,8
...,...,...,...,...,...,...,...,...,...
107,3,"Salkjelsvik, Miss. Anna Kristine",21,0,0,343120,7.65,S,8
271,1,"Cairns, Mr. Alexander",21,0,0,113798,31.0,S,8
861,3,"Hansen, Mr. Claus Peter",41,2,0,350026,14.1083,S,8
436,1,"Carter, Miss. Lucile Polk",14,1,2,113760,120.0,S,2


In [191]:
#Feature Engineering Title Column
#X_train["Title"] = X_train["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())
#X_test["Title"] = X_test["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())

In [192]:
#Feature Engineering Family Size Column
X_train["Fam_size"] = X_train["SibSp"] + X_train["Parch"] + 1
X_test["Fam_size"] = X_test["SibSp"] + X_test["Parch"] + 1

X_train["Fam_type"] = pd.cut(X_train.Fam_size, [0,1,4,7,11], labels=["Solo", "Small", "Big", "Very big"])
X_test["Fam_type"] = pd.cut(X_test.Fam_size, [0,1,4,7,11], labels=["Solo", "Small", "Big", "Very big"])
X_train.drop(["SibSp","Parch","Fam_size"], axis=1, inplace=True)
X_test.drop(["SibSp","Parch","Fam_size"], axis=1, inplace=True)

In [193]:
X_train

Unnamed: 0_level_0,Pclass,Name,Age,Ticket,Fare,Embarked,Deck,Fam_type
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
332,1,"Partner, Mr. Austen",45,113043,28.5,S,3,Solo
734,2,"Berriman, Mr. William John",23,28425,13.0,S,8,Solo
383,3,"Tikkanen, Mr. Juho",32,STON/O 2. 3101293,7.925,S,8,Solo
705,3,"Hansen, Mr. Henrik Juul",26,350025,7.8542,S,8,Small
814,3,"Andersson, Miss. Ebba Iris Alfrida",6,347082,31.275,S,8,Big
...,...,...,...,...,...,...,...,...
107,3,"Salkjelsvik, Miss. Anna Kristine",21,343120,7.65,S,8,Solo
271,1,"Cairns, Mr. Alexander",21,113798,31.0,S,8,Solo
861,3,"Hansen, Mr. Claus Peter",41,350026,14.1083,S,8,Small
436,1,"Carter, Miss. Lucile Polk",14,113760,120.0,S,2,Small


### Encode all Labels to String in order to pass through the GBM model

In [194]:
X_train = X_train.apply(LabelEncoder().fit_transform)
X_test = X_test.apply(LabelEncoder().fit_transform)
to_test = to_test.apply(LabelEncoder().fit_transform)
X_train

Unnamed: 0_level_0,Pclass,Name,Age,Ticket,Fare,Embarked,Deck,Fam_type
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
332,0,499,45,15,133,2,3,2
734,1,63,23,188,74,2,8,2
383,2,642,32,543,37,2,8,2
705,2,250,26,332,33,2,8,1
814,2,19,6,277,141,2,8,0
...,...,...,...,...,...,...,...,...
107,2,574,21,245,23,2,8,2
271,0,96,21,39,140,2,8,2
861,2,249,41,333,81,2,8,1
436,0,108,14,27,205,2,2,1


### Cross-Fold Validation

In [195]:
model = XGBClassifier() # classifier used to tune hyperparameters
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0]
}

k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1, cv=k_fold)
grid_result = grid_search.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.727484 using {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'subsample': 1.0}


In [196]:
param_distributions = {
    'max_depth': randint(3, 10),  
    'learning_rate': uniform(0.001, 0.299),  
    'n_estimators': randint(50, 1000),
    'subsample': uniform(0.5, 0.5),  
    'colsample_bytree': uniform(0.3, 0.7),  
    'min_child_weight': randint(0, 10),
    'gamma': uniform(0, 5),
    'reg_lambda': uniform(1e-5, 10 - 1e-5),  
    'reg_alpha': uniform(0, 1),
    'scale_pos_weight': [1, 10, 25, 50, 75, 100]
}

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(model, param_distributions, n_iter=100, scoring='accuracy', n_jobs=-1, cv=kfold, random_state=42)
random_search.fit(X_train, y_train)

print("Best: %f using %s" % (random_search.best_score_, random_search.best_params_))

Best: 0.720520 using {'colsample_bytree': 0.6983570284160233, 'gamma': 0.3175591479500067, 'learning_rate': 0.012009738423161422, 'max_depth': 9, 'min_child_weight': 0, 'n_estimators': 454, 'reg_alpha': 0.056375496650927115, 'reg_lambda': 8.64722511532677, 'scale_pos_weight': 1, 'subsample': 0.7671731375147315}


In [197]:
def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',  # or 'multi:softprob' for multiclass and set 'num_class'
        'eval_metric': 'logloss',  # or another suitable metric
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100.0),
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    model.set_params(early_stopping_rounds=100)

    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

In [198]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)


[I 2024-03-02 19:49:14,281] A new study created in memory with name: no-name-48b1d503-eec7-4a2b-82b9-4ab020baa5ee
[I 2024-03-02 19:49:14,829] Trial 0 finished with value: 0.7094972067039106 and parameters: {'learning_rate': 0.035170417243975, 'n_estimators': 625, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.8384941928080936, 'colsample_bytree': 0.5388103559391851, 'reg_alpha': 0.5131911100564954, 'reg_lambda': 58.064482735308616}. Best is trial 0 with value: 0.7094972067039106.
[I 2024-03-02 19:49:15,195] Trial 1 finished with value: 0.7094972067039106 and parameters: {'learning_rate': 0.04955162092049209, 'n_estimators': 694, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.86111415047129, 'colsample_bytree': 0.5100220641236789, 'reg_alpha': 0.27025897191251524, 'reg_lambda': 76.48971790489543}. Best is trial 0 with value: 0.7094972067039106.
[I 2024-03-02 19:49:15,346] Trial 2 finished with value: 0.6983240223463687 and parameters: {'learning_rate': 0.2776587306633664, 

Number of finished trials: 100
Best trial: {'learning_rate': 0.017620125163183186, 'n_estimators': 953, 'max_depth': 8, 'min_child_weight': 10, 'subsample': 0.9509984957416191, 'colsample_bytree': 0.6358230957636279, 'reg_alpha': 0.8263766431412558, 'reg_lambda': 45.60824557768336}


In [199]:
best_trial = study.best_trial
print(f"Best trial accuracy: {best_trial.value}")

Best trial accuracy: 0.7374301675977654


### Generate Results

In [200]:
model.set_params(**best_trial.params)
model.fit(X_train, y_train)

In [201]:
y_pred = model.predict(X_test)

In [202]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.74


In [203]:
submission_test = model.predict(to_test)
submission_test = pd.Series(submission_test, to_test.index, name='Survived')
# save submission
submission_test.to_csv('submission_test.csv') 

ValueError: Feature shape mismatch, expected: 8, got 9