In [65]:
# import all related packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBClassifier
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform


In [14]:
# Reading the data
train_data_path = "titanic-data/train.csv"
data = pd.read_csv(train_data_path)
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [94]:
# splitting the data into training and validation sets
y = data.Survived
X = data.drop(['Survived', 'PassengerId'], axis=1)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=42)
X_train_full.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
382,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
704,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
813,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [96]:
# getting categorical/numerical columns
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [97]:
X_train.head()

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
331,male,S,1,45.5,0,0,28.5
733,male,S,2,23.0,0,0,13.0
382,male,S,3,32.0,0,0,7.925
704,male,S,3,26.0,1,0,7.8542
813,female,S,3,6.0,4,2,31.275


# Define Preprocessing Steps

In [98]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for categorical data
categorical_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Creating a XGBoost Model

In [99]:
# initializing a XGBoost model
my_classifier = XGBClassifier(n_estimators=1000, learning_rate=0.005)

# creating model pipeline
my_pipeline = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('xgbclassifier', my_classifier)
    ]
)

my_pipeline.fit(X_train, y_train)

In [100]:
scores = cross_val_score(my_pipeline, 
                             X_valid, 
                             y_valid, 
                             cv=5, 
                             scoring='accuracy')
print(f"Mean error for validation sets: {scores.mean() * 100}%")

Mean error for validation sets: 76.57142857142858%


In [119]:
# Define the hyperparameter search space
param_dist = {
    'xgbclassifier__n_estimators': [50, 100, 200, 500, 1000],
    'xgbclassifier__learning_rate': uniform(0.001, 0.1),
    'xgbclassifier__max_depth': [3, 4, 5, 6],
    'xgbclassifier__min_child_weight': [1, 3, 5],
    'xgbclassifier__subsample': [0.6, 0.8, 1.0],
    'xgbclassifier__colsample_bytree': [0.6, 0.8, 1.0]
}

# Perform Randomized Search
random_search = RandomizedSearchCV(estimator=my_pipeline, param_distributions=param_dist, n_iter=200, cv=5, scoring='accuracy')
random_search.fit(X_train, y_train)

print("Best hyperparameters:", random_search.best_params_)

Best hyperparameters: {'xgbclassifier__colsample_bytree': 0.6, 'xgbclassifier__learning_rate': 0.07423685155899373, 'xgbclassifier__max_depth': 4, 'xgbclassifier__min_child_weight': 5, 'xgbclassifier__n_estimators': 500, 'xgbclassifier__subsample': 0.6}


In [120]:
# initializing a XGBoost model
my_classifier2 = XGBClassifier(n_estimators=500, 
                               learning_rate=0.07423685155899373,
                               colsample_bytree=0.6,
                               max_depth=4,
                               min_child_weight=5,
                               subsample=0.6)

# creating model pipeline
my_pipeline2 = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('xgbclassifier', my_classifier2)
    ]
)

In [125]:
my_pipeline2.fit(X_train, y_train)
scores2 = cross_val_score(my_pipeline2, 
                             X_valid, 
                             y_valid, 
                             cv=5, 
                             scoring='accuracy')
print(f"Mean error for validation sets: {scores2.mean() * 100}%")

Mean error for validation sets: 81.01587301587301%


# Make Predictions

In [126]:
test_data = pd.read_csv("titanic-data/test.csv")
X_test = test_data.copy()
X_test = X_test.drop(['PassengerId'], axis=1)

categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

X_test = X_test[categorical_cols + numerical_cols]

In [127]:
# Submitting the output
predictions = my_pipeline2.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
