In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import optuna as opt
import xgboost as xgb 
import re

In [24]:
to_test = pd.read_csv("test.csv", index_col="PassengerId")
df = pd.read_csv("train.csv", index_col="PassengerId")
df

Unnamed: 0_level_0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,C


In [25]:
df = df.convert_dtypes()
df["Embarked"] = df["Embarked"].astype("category")
df.dtypes

Survived             Int64
Pclass               Int64
Name        string[python]
Age                Float64
SibSp                Int64
Parch                Int64
Ticket      string[python]
Fare               Float64
Cabin       string[python]
Embarked          category
dtype: object

In [ ]:
'''
random_id = 17 #ai23m017 - 17 is the id
# a RandomState object or the id may be used, choose a variant
random_state = np.random.RandomState(random_id)
random_state = random_id
'''
#TODO implement this for all random states

In [26]:
X_train,X_test,y_train,y_test = train_test_split(df.drop("Survived", axis=1), df["Survived"], test_size=0.2, random_state=42)
print(df.columns[df.isna().any(axis=0)].tolist())

['Age', 'Cabin', 'Embarked']


### Data Pre-Processing after splitting the data to prevent data - leakage

In [27]:
#Data Imputation for the Age column
data = [X_train, X_test, to_test]
for dataset in data:
    mean = X_train["Age"].mean()
    std = X_test["Age"].std()
    is_null = dataset["Age"].isna().sum()
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    rand_age_series = pd.Series(rand_age, index=dataset[dataset["Age"].isnull()].index)
    dataset["Age"].fillna(rand_age_series, inplace=True)
    dataset["Age"] = dataset["Age"].astype(int)
    

In [28]:
#Feature Engineering the Deck Column 
#Cabins are mapped to a numerical Deck feature
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)
    dataset.drop(["Cabin"], axis=1, inplace=True)


In [29]:
X_train["Embarked"].describe()
for dataset in [X_train, X_test, to_test]:
    dataset["Embarked"].fillna("S", inplace=True)

In [30]:
X_train.isna().any()

Pclass      False
Name        False
Age         False
SibSp       False
Parch       False
Ticket      False
Fare        False
Embarked    False
Deck        False
dtype: bool

In [32]:
X_train

Unnamed: 0_level_0,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
332,1,"Partner, Mr. Austen",45,0,0,113043,28.5,S,3
734,2,"Berriman, Mr. William John",23,0,0,28425,13.0,S,8
383,3,"Tikkanen, Mr. Juho",32,0,0,STON/O 2. 3101293,7.925,S,8
705,3,"Hansen, Mr. Henrik Juul",26,1,0,350025,7.8542,S,8
814,3,"Andersson, Miss. Ebba Iris Alfrida",6,4,2,347082,31.275,S,8
...,...,...,...,...,...,...,...,...,...
107,3,"Salkjelsvik, Miss. Anna Kristine",21,0,0,343120,7.65,S,8
271,1,"Cairns, Mr. Alexander",31,0,0,113798,31.0,S,8
861,3,"Hansen, Mr. Claus Peter",41,2,0,350026,14.1083,S,8
436,1,"Carter, Miss. Lucile Polk",14,1,2,113760,120.0,S,2


In [33]:
#Feature Engineering Title Column
for dataset in data:
    dataset["Title"] = dataset["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())

In [34]:
#Feature Engineering Sex Column 
def map_to_sex(value):
    if value in ["Miss","Mrs"]:
        return "female"
    else:
        return "male"

for dataset in data:
    dataset["Sex"] = dataset["Title"].apply(map_to_sex)
    dataset.drop("Title", axis=1, inplace=True)

In [35]:
#Feature Engineering Family Size Column
for dataset in data:
    dataset["Fam_size"] = dataset["SibSp"] + dataset["Parch"] + 1
    dataset["Fam_type"] = pd.cut(dataset.Fam_size, [0,1,4,7,11], labels=["Solo", "Small", "Big", "Very big"])
    dataset.drop(["SibSp","Parch","Fam_size"], axis=1, inplace=True)


In [36]:
#Feature Engineering Ticket_2letter column
for dataset in data:
    dataset["Ticket_2letter"] = dataset.Ticket.apply(lambda x: x[:2])
    dataset.drop("Ticket", axis=1, inplace=True)

In [37]:
X_train

Unnamed: 0_level_0,Pclass,Name,Age,Fare,Embarked,Deck,Sex,Fam_type,Ticket_2letter
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
332,1,"Partner, Mr. Austen",45,28.5,S,3,male,Solo,11
734,2,"Berriman, Mr. William John",23,13.0,S,8,male,Solo,28
383,3,"Tikkanen, Mr. Juho",32,7.925,S,8,male,Solo,ST
705,3,"Hansen, Mr. Henrik Juul",26,7.8542,S,8,male,Small,35
814,3,"Andersson, Miss. Ebba Iris Alfrida",6,31.275,S,8,female,Big,34
...,...,...,...,...,...,...,...,...,...
107,3,"Salkjelsvik, Miss. Anna Kristine",21,7.65,S,8,female,Solo,34
271,1,"Cairns, Mr. Alexander",31,31.0,S,8,male,Solo,11
861,3,"Hansen, Mr. Claus Peter",41,14.1083,S,8,male,Small,35
436,1,"Carter, Miss. Lucile Polk",14,120.0,S,2,female,Small,11


### Encode all Labels to String in order to pass through the GBM model

In [38]:
X_train = X_train.apply(LabelEncoder().fit_transform)
X_test = X_test.apply(LabelEncoder().fit_transform)
to_test = to_test.apply(LabelEncoder().fit_transform)
X_train

Unnamed: 0_level_0,Pclass,Name,Age,Fare,Embarked,Deck,Sex,Fam_type,Ticket_2letter
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
332,0,499,45,133,2,3,1,2,0
734,1,63,23,74,2,8,1,2,14
383,2,642,32,37,2,8,1,2,48
705,2,250,26,33,2,8,1,1,20
814,2,19,6,141,2,8,0,0,19
...,...,...,...,...,...,...,...,...,...
107,2,574,21,23,2,8,0,2,19
271,0,96,31,140,2,8,1,2,0
861,2,249,41,81,2,8,1,1,20
436,0,108,14,205,2,2,0,1,0


### Cross-Fold Validation

In [39]:
model = XGBClassifier() # classifier used to tune hyperparameters

def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',  # or 'multi:softprob' for multiclass and set 'num_class'
        'eval_metric': 'logloss',  # or another suitable metric
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100.0),
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    model.set_params(early_stopping_rounds=100 ) # TODO random state

    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

In [40]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)


[I 2024-03-03 20:31:53,069] A new study created in memory with name: no-name-7556f0d9-c8a7-4532-8bcc-07f0dbfcaba8
[I 2024-03-03 20:31:53,269] Trial 0 finished with value: 0.7932960893854749 and parameters: {'learning_rate': 0.02597043225203985, 'n_estimators': 342, 'max_depth': 5, 'min_child_weight': 8, 'subsample': 0.911558217380901, 'colsample_bytree': 0.8549474038073508, 'reg_alpha': 0.15699348707765126, 'reg_lambda': 39.39630824893548}. Best is trial 0 with value: 0.7932960893854749.
[I 2024-03-03 20:31:53,964] Trial 1 finished with value: 0.770949720670391 and parameters: {'learning_rate': 0.2453102468475491, 'n_estimators': 849, 'max_depth': 7, 'min_child_weight': 9, 'subsample': 0.604567019716912, 'colsample_bytree': 0.5061485173838973, 'reg_alpha': 0.4944060510143551, 'reg_lambda': 18.66937627190101}. Best is trial 0 with value: 0.7932960893854749.
[I 2024-03-03 20:31:54,351] Trial 2 finished with value: 0.7597765363128491 and parameters: {'learning_rate': 0.2522329180138892, '

Number of finished trials: 100
Best trial: {'learning_rate': 0.05076725610038065, 'n_estimators': 513, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.5234593365457169, 'colsample_bytree': 0.9039714759184293, 'reg_alpha': 0.5931634534883738, 'reg_lambda': 92.5791140113161}


In [41]:
best_trial = study.best_trial
print(f"Best trial accuracy: {best_trial.value}")

Best trial accuracy: 0.8268156424581006


### Generate Results

In [42]:
model.set_params(**best_trial.params)
model.fit(X_train, y_train)

In [43]:
y_pred = model.predict(X_test)

In [48]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.83


In [49]:
submission_test = model.predict(to_test)
submission_test = pd.Series(submission_test, to_test.index, name='Survived')
# save submission
submission_test.to_csv('submission_test.csv') 

## RandomForestClassifier

In [61]:
from sklearn.ensemble import RandomForestClassifier


def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    model = RandomForestClassifier(**param, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

In [62]:
study = opt.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print(f"Best trial accuracy: {study.best_trial.value}")

[I 2024-03-03 03:37:25,468] A new study created in memory with name: no-name-33bfe8fc-3f68-4a4e-bbef-4fde57370c64
[I 2024-03-03 03:37:26,810] Trial 0 finished with value: 0.6927374301675978 and parameters: {'n_estimators': 771, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': False}. Best is trial 0 with value: 0.6927374301675978.
[I 2024-03-03 03:37:30,340] Trial 1 finished with value: 0.6815642458100558 and parameters: {'n_estimators': 921, 'max_depth': 11, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 0.6927374301675978.
[I 2024-03-03 03:37:30,942] Trial 2 finished with value: 0.6927374301675978 and parameters: {'n_estimators': 405, 'max_depth': 26, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.6927374301675978.
[I 2024-03-03 03:37:32,564] Trial 3 finished with value: 0.670391061452513

Number of finished trials: 100
Best trial: {'n_estimators': 886, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}
Best trial accuracy: 0.7374301675977654


In [63]:
model = RandomForestClassifier(random_state=0, n_estimators=500, max_depth=5)
model.set_params(**study.best_trial.params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


### Compare accuracy to supposed real data

In [64]:
actual_data = pd.read_csv("corrected_passenger_survived.csv", index_col="PassengerId")
predicted_data = pd.read_csv("submission_test.csv", index_col="PassengerId")
to_drop = actual_data[actual_data['survived'].isna()].index.tolist()

data = [actual_data, predicted_data]
for dataset in data:
    dataset.drop(to_drop, axis="index", inplace=True)
accuracy_score(actual_data, predicted_data)

0.6565656565656566