<a href="https://colab.research.google.com/github/christinajoslin/titanic-prediction-xgboost/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import essential libraries for data manipulation and numerical computations
import pandas as pd  # For handling dataframes and data processing
import numpy as np  # For numerical operations and array handling

# Import the XGBoost Classifier for building and training the machine learning model
from xgboost import XGBClassifier  # XGBoost classifier implementation

# Import tools for hyperparameter tuning and evaluation metrics
from sklearn.model_selection import RandomizedSearchCV  # Randomized search for hyperparameter optimization
from scipy.stats import randint, uniform, loguniform  # Distributions for random hyperparameter sampling
from sklearn.metrics import accuracy_score  # Metric for evaluating model accuracy

# Import a utility for splitting datasets into training, development, and test sets
from sklearn.model_selection import train_test_split

# Import the KNN Imputer for handling missing values in datasets
from sklearn.impute import KNNImputer  # Imputation using K-Nearest Neighbors

# Import the StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler  # Standardizes features to have zero mean and unit variance


In [None]:
# Import the 'drive' module from 'google.colab' to work with Google Drive
from google.colab import drive
# Mount Google Drive to the Colab environment to access files stored in my Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
test_set = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Titanic/test.csv") #Modify according to your file path
train_set = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Titanic/train.csv") #Modify according to your file path

# Adjust the Pandas display settings to prevent truncation of content in dataframe outputs
# This ensures that all content in columns is fully visible, especially useful for columns with long text
pd.set_option('display.max_colwidth',None)

In [None]:
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# # Data Processing # #

# Extract the features (independent variables) and target (dependent variable) for the training set
# 'Survived' is the target variable, while the rest are relevant predictors.
X_train = train_set[["Age","Pclass","Sex","SibSp","Parch","Fare","Embarked"]]
y_train = train_set["Survived"]

# Extract the features for the test set (no target variable as it will be used for predictions)
X_test = test_set[["Age","Pclass","Sex","SibSp","Parch","Fare","Embarked"]]

# Convert the 'Pclass' column (passenger class) from numeric to categorical (category) type
# as it represents categories rather than numeric values.
X_train.loc[:,'Pclass'] = X_train.loc[:,'Pclass'].astype('category')
X_test.loc[:,'Pclass'] = X_test.loc[:,'Pclass'].astype('category')

# Split the training data into a new training set and a development set
# The development set is 15% of the original training data, used for validation during training.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, # Features for training
    y_train, # Target for training
    test_size=0.1, # Proportion of data allocated to the development set
    random_state=42, # Ensures reproducibility of the split
    stratify=y_train) # Maintains the distribution of the target variable across the split


In [None]:
# Before we start processing the information, let's see how many NaN values there are.

nan_train__count = X_train.isna().sum() #161
nan_dev_count = X_dev.isna().sum() #16

print(f"NaN Values in Training Set: \n{nan_train__count}")
print(f"NaN Values in Dev Set: \n{nan_dev_count}")


# Since there are a lot of NaN values in the 'Age' column of both the Dev and Training Sets,
# we will use K-Nearest Neighbors (KNN) Imputation to estimate the missing values.

# Create a KNN imputer object with 5 nearest neighbors
imputer = KNNImputer(n_neighbors =5)

# Impute the missing values in the 'Age' column for the training, development, and test sets
X_train['Age'] = imputer.fit_transform(X_train[['Age']])[:,0]
X_dev['Age'] = imputer.transform(X_dev[['Age']])[:,0]
X_test['Age'] = imputer.transform(X_test[['Age']])[:,0]

NaN Values in Training Set: 
Age         161
Pclass        0
Sex           0
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64
NaN Values in Dev Set: 
Age         16
Pclass       0
Sex          0
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Age'] = imputer.transform(X_test[['Age']])[:,0]


In [None]:


# Select the categorical features for one-hot encoding
# - These features contain non-numeric values that need to be converted into numeric format
categorical_features = ['Sex','Pclass','Embarked']

# Convert categorical features into binary (dummy) variables
# - One-hot encoding creates a separate column for each category
# - `drop_first=True` removes one category per feature to avoid multicollinearity (redundant features)
X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)
X_dev = pd.get_dummies(X_dev, columns=categorical_features, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_features, drop_first = True)

# Create a StandardScaler object for feature scaling
# - StandardScaler normalizes features to have zero mean and unit variance
scaler = StandardScaler()

# Select numerical features to apply scaling
# - Scaling ensures that features with large values (e.g., 'Fare') don't dominate smaller ones
numerical_features = ['Age','SibSp','Parch','Fare']

# Scale the numerical features in the training set
# - `fit_transform` computes the scaling parameters (mean, variance) and applies scaling
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])

# Scale the numerical features in the development set
# - `transform` applies the scaling using the parameters computed from the training set
X_dev[numerical_features] = scaler.transform(X_dev[numerical_features])

# Scale the numerical features in the test set
# - Ensures consistent scaling across all datasets using training set parameters
X_test[numerical_features] = scaler.transform(X_test[numerical_features])



In [None]:
X_train.head() #Verify that the data processing was successful

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
86,-1.045111,0.437238,3.15195,0.051433,True,False,True,False,True
329,-1.045111,-0.472444,0.732485,0.546842,False,False,False,False,False
517,-2.721948e-16,-0.472444,-0.477248,-0.163171,True,False,True,True,False
844,-0.9684947,-0.472444,-0.477248,-0.488225,True,False,True,False,True
408,-0.6620306,-0.472444,-0.477248,-0.506852,True,False,True,False,True


In [None]:
# Initialize an XGBoost classifier with a fixed random state for reproducibility
# and an early stopping condition to prevent overfitting during training.

xgb_clf = XGBClassifier(random_state=42, early_stopping_rounds=10)

# Define a parameter distribution for hyperparameter tuning.
# This will be used to find the best combination of parameters using RandomSearchCV.

param_dist_xgb = {
    'n_estimators': randint(50,1000), # Number of boosting rounds (trees), sampled between 50 and 1000.
    'learning_rate': loguniform(1e-4,0.1), # Learning rate, chosen on a logarithmic scale between 0.0001 and 0.1.
    'max_depth':randint(3,8), # Maximum depth of each tree, sampled between 3 and 8.
    'colsample_bytree':uniform(0.5,0.5), # Fraction of features used to train each tree, between 0.5 and 1.0.
    'subsample':uniform(0.5,0.5), # Fraction of training samples used for each tree, between 0.5 and 1.0.
    'reg_alpha':loguniform(1e-5,10), # L1 regularization term, chosen on a logarithmic scale.
    'reg_lambda':loguniform(1e-5,10),  # L2 regularization term, chosen on a logarithmic scale.
    }

In [None]:
# Perform Randomized Search Cross-Validation for XGBoost Classifier

random_search_xgb = RandomizedSearchCV(
    estimator=xgb_clf, # The base XGBoost classifier to optimize
    param_distributions = param_dist_xgb, # The hyperparameter distributions defined earlier
    n_iter = 100, # Number of random combinations of hyperparameters to try
    cv=5,  #Use 5-fold cross-validation to evaluate each combination
    scoring='accuracy', # Optimization metric to maximize (here, classification accuracy)
    verbose=2,  # Verbosity level; displays progress for each combination during training
    n_jobs=-1, #Use all available CPU cores to parallelize the search
    random_state = 42 #Ensures reproducibility of the random search
    )

In [None]:
#!pip install scikit-learn==1.5.2 In case you run into issues with scikit-learn compatibility

In [None]:
# Perform hyperparameter tuning and model training using RandomizedSearchCV
# - Fits the model on the training data (X_train, y_train)
# - Evaluates the model's performance on the development set (X_dev, y_dev) during training

random_search_xgb.fit(
    X_train,  # Features for training
    y_train, # Target labels for training
    eval_set=[(X_dev,y_dev)]) # Validation set used for monitoring and early stopping

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[0]	validation_0-logloss:0.63899
[1]	validation_0-logloss:0.61166
[2]	validation_0-logloss:0.58643
[3]	validation_0-logloss:0.56844
[4]	validation_0-logloss:0.56194
[5]	validation_0-logloss:0.54459
[6]	validation_0-logloss:0.53151
[7]	validation_0-logloss:0.51964
[8]	validation_0-logloss:0.50892
[9]	validation_0-logloss:0.50601
[10]	validation_0-logloss:0.50288
[11]	validation_0-logloss:0.50199
[12]	validation_0-logloss:0.50400
[13]	validation_0-logloss:0.50216
[14]	validation_0-logloss:0.49978
[15]	validation_0-logloss:0.49171
[16]	validation_0-logloss:0.48289
[17]	validation_0-logloss:0.47900
[18]	validation_0-logloss:0.47485
[19]	validation_0-logloss:0.46910
[20]	validation_0-logloss:0.46322
[21]	validation_0-logloss:0.46507
[22]	validation_0-logloss:0.46112
[23]	validation_0-logloss:0.45876
[24]	validation_0-logloss:0.45532
[25]	validation_0-logloss:0.45337
[26]	validation_0-logloss:0.45538
[27]	validation_0-logloss:0.4

In [None]:
# Evaluate the accuracy of the best model on the development set

# - Retrieves the best estimator (model) found during RandomizedSearchCV
best_model_xgb = random_search_xgb.best_estimator_ # Extract the best-performing model

# Compute the accuracy of the best model on the development set (X_dev, y_dev)
# The development set acts as unseen data to measure model generalization
dev_score = best_model_xgb.score(X_dev, y_dev)

# Print the accuracy score for the development set to assess initial performance
print(f"Development Set Accuracy Before Further Tuning: {dev_score}")

# Compute the accuracy of the best model on the training set (X_train, y_train)
# This provides insight into how well the model performs on the data it was trained on
train_score = best_model_xgb.score(X_train,y_train)

# Print the training set accuracy for comparison with the development set accuracy
print(f"Training Set Accuracy Before Further Tuning: {train_score}")

Development Set Accuracy Before Further Tuning: 0.8222222222222222
Training Set Accuracy Before Further Tuning: 0.9051186017478152


In [None]:
# Extract and analyze results from the initial RandomizedSearchCV process

# Store all cross-validation results from the random search into a DataFrame for easier analysis
xgb_results = pd.DataFrame(random_search_xgb.cv_results_)

# Identify the top 5 hyperparameter combinations based on the highest mean test scores
# Includes columns for the mean test score, standard deviation of test scores, and hyperparameter configurations
top_results_xgb = xgb_results[['mean_test_score','std_test_score','params']].sort_values('mean_test_score',ascending=False).head()

# Display the top 5 hyperparameter combinations and their corresponding performance metrics
top_results_xgb

Unnamed: 0,mean_test_score,std_test_score,params
38,0.852671,0.029798,"{'colsample_bytree': 0.873859386948707, 'learning_rate': 0.07231286044841394, 'max_depth': 7, 'n_estimators': 546, 'reg_alpha': 0.020729546493770245, 'reg_lambda': 0.027149061106928746, 'subsample': 0.9901657918580229}"
67,0.847694,0.03248,"{'colsample_bytree': 0.918481937118687, 'learning_rate': 0.03986631701572259, 'max_depth': 7, 'n_estimators': 217, 'reg_alpha': 2.732612421694094e-05, 'reg_lambda': 0.853415164671324, 'subsample': 0.8531211135782482}"
12,0.846436,0.035067,"{'colsample_bytree': 0.8645035840204937, 'learning_rate': 0.020597335357437203, 'max_depth': 7, 'n_estimators': 539, 'reg_alpha': 0.0014151235919053699, 'reg_lambda': 4.956947932799954e-05, 'subsample': 0.9315517129377968}"
50,0.843929,0.031695,"{'colsample_bytree': 0.6399669484729714, 'learning_rate': 0.07321428778380787, 'max_depth': 7, 'n_estimators': 839, 'reg_alpha': 0.24001362671293255, 'reg_lambda': 1.933288938757823e-05, 'subsample': 0.7830186052470381}"
73,0.843921,0.03317,"{'colsample_bytree': 0.901054624513367, 'learning_rate': 0.08584243064474285, 'max_depth': 7, 'n_estimators': 878, 'reg_alpha': 0.0008632137104961323, 'reg_lambda': 1.821397347548555e-05, 'subsample': 0.9623216651117981}"


In [None]:
# Retrain the model on the best parameters using the combined training and development set

# Use the original, unscaled data from the training set (this time with no separate dev set)
# Extract features and the target variable for final training
X_train_final = train_set[['Age','Pclass','Sex','SibSp','Parch','Fare','Embarked']]
y_train_final = train_set['Survived']

# Convert the 'Pclass' column to categorical (category) type
X_train_final.loc[:,'Pclass'] = X_train_final.loc[:,'Pclass'].astype('category')

# Re-apply KNN imputation to handle missing values in the 'Age' column
X_train_final['Age'] = imputer.fit_transform(X_train_final[['Age']])[:,0]

# Re-apply scaling to numerical features
X_train_final[numerical_features] = scaler.fit_transform(X_train_final[numerical_features])

# Re-apply one-hot encoding to categorical features
X_train_final = pd.get_dummies(X_train_final, columns=categorical_features, drop_first=True)

# Retrain the final model using the best parameters found by RandomizedSearchCV
best_params_xgb = random_search_xgb.best_params_   # Retrieve the best hyperparameter combination
final_model = XGBClassifier(**best_params_xgb, random_state=42) # Create a new model with the best parameters
final_model.fit(X_train_final, y_train_final) # Fit the final model on the preprocessed combined training data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_final['Age'] = imputer.fit_transform(X_train_final[['Age']])[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_final[numerical_features] = scaler.fit_transform(X_train_final[numerical_features])


In [None]:
# Evaluate the accuracy of the final model on the development set
dev_score_xgb = final_model.score(X_dev, y_dev)
print(f"Development Set Accuracy After Tuning {dev_score_xgb}") # Print the development set accuracy after retraining

# Evaluate the accuracy of the final model on the training set
train_score_xgb = final_model.score(X_train,y_train)
print(f"Training Set Accuracy After Tuning {train_score_xgb}") # Print the training set accuracy after retraining

Development Set Accuracy After Tuning 0.8666666666666667
Training Set Accuracy After Tuning 0.8951310861423221


In [None]:
# Generate predictions on the test set using the final trained model
y_pred = final_model.predict(X_test)

# Create a DataFrame for the submission file
# - 'PassengerId' is taken from the test set to ensure alignment with the dataset
# - 'Survived' contains the predicted labels from the final model
submissions_df = pd.DataFrame({'PassengerId': test_set['PassengerId'],  # Unique identifier for each passenger
                               'Survived': y_pred # Predicted survival outcomes
                               })

# Save the predictions to a CSV file for submission
# - The file 'submissions.csv' will contain the required format for uploading
# - `index=False` ensures the DataFrame index is not included in the output file
submissions_df.to_csv('submissions.csv', index=False)


In [None]:
#Copy csv file to current Google Drive
#!cp submissions.csv /content/drive/MyDrive


#Save the final trained model
#import joblib
#joblib.dump(final_model, 'best_rgb_model.pk1')

#Load a reuse the saved model
#loaded_model = joblib.load('best_rgb_model.pk1')

#Making predictions from the newly loaded model
#new_predictions = loaded_model.predict(X_test)
