<a href="https://colab.research.google.com/github/ddo2024/ggcolab/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports important libraries
!pip install opendatasets
!pip install xgboost
!pip install tensorflow

import pandas as pd
import opendatasets as od
import os
import numpy as np
import tensorflow as tf
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.optimizers import Adam

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [2]:
# Use the opendatasets library to download the Titanic dataset
dataset_url = "https://www.kaggle.com/c/titanic/data"
od.download(dataset_url)


Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: ddo2024
Your Kaggle Key: ··········
Downloading titanic.zip to ./titanic


100%|██████████| 34.1k/34.1k [00:00<00:00, 14.8MB/s]


Extracting archive ./titanic/titanic.zip to ./titanic





In [4]:
# List the contents of the downloaded dataset folder
print(os.listdir("titanic"))


['train.csv', 'gender_submission.csv', 'test.csv']


In [5]:
# Load and display the first few rows of the Titanic training and test datasets
train_data = pd.read_csv("titanic/train.csv")
display(train_data.head())

test_data = pd.read_csv("titanic/test.csv")
display(test_data.head())


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Select relevant features
features = ["Pclass","Sex", "SibSp", "Parch"]
data_with_features = train_data[features]

# Apply one-hot encoding to the categorical variables in the selected features to convert them into numerical format.
X = pd.get_dummies(data_with_features)
y = train_data["Survived"]

# Split the dataset into training(80%) and testing(20%) sets
X_train, X_test, y_train,y_test = train_test_split(X,y,train_size = 0.8)

X_train = X_train.astype(int)
X_test = X_test.astype(int)

# Check shapes, data
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print("X_train\n", X_train)
print("X_test\n", X_test)
print("y_train\n", y_train)
print("y_test\n", y_test)


(712, 5) (179, 5)
(712,) (179,)
(712, 5)
(179, 5)
(712,)
(179,)
X_train
      Pclass  SibSp  Parch  Sex_female  Sex_male
99        2      1      0           0         1
724       1      1      0           0         1
499       3      0      0           0         1
40        3      1      0           1         0
236       2      1      0           0         1
..      ...    ...    ...         ...       ...
393       1      1      0           1         0
816       3      0      0           1         0
15        2      0      0           1         0
837       3      0      0           0         1
288       2      0      0           0         1

[712 rows x 5 columns]
X_test
      Pclass  SibSp  Parch  Sex_female  Sex_male
666       2      0      0           0         1
185       1      0      0           0         1
55        1      0      0           0         1
0         3      1      0           0         1
186       3      1      0           1         0
..      ...    ...    ...      

In [7]:
# Train the Random Forest Classifier model
rfc_model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 42)
rfc_model.fit(X_train,y_train)

# Calculate the test accuracy of the rfc_model by comparing predicted values with actual values from the test set
rfc_test_predictions = rfc_model.predict(X_test)
rfc_test_accuracy = accuracy_score(y_test, rfc_test_predictions)
print(f"RandomForestClassifier model test accuracy: {rfc_test_accuracy:.4f}")

# Calculate the training accuracy of the rfc_model by comparing predicted values with actual values from the training set
rfc_train_prediction = rfc_model.predict(X_train)
rfc_train_accuracy = accuracy_score(y_train, rfc_train_prediction)
print(f"RandomForestClassifier model train accuracy: {rfc_train_accuracy:.4f}")


RandomForestClassifier model test accuracy: 0.7709
RandomForestClassifier model train accuracy: 0.8258


In [8]:
# Hyperparameter tuning
param_dic = {"learning_rate":uniform(0.01, 0.1).rvs(10),
             "max_depth":[2,3,4,5,6],
             "subsample":uniform(0,0.1).rvs(10),
             "n_estimators":[50,100,150,200,250]
}
random_search = RandomizedSearchCV(xgb.XGBClassifier(random_state = 42), param_dic, n_iter = 20,scoring = "accuracy", cv = 5)
random_search.fit(X_train,y_train)
print(random_search.best_params_)


{'subsample': 0.068213339292072, 'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.09628478182502448}


In [9]:
# Train the XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators = random_search.best_params_["n_estimators"],
                              max_depth = random_search.best_params_["max_depth"],
                              random_state = 42,
                              subsample = random_search.best_params_["subsample"],
                              learning_rate = random_search.best_params_["learning_rate"])
xgb_model.fit(X_train,y_train)

# Calculate the test accuracy of the xgb_model by comparing predicted values with actual values from the test set
xgb_test_predictions = xgb_model.predict(X_test)
xgb_test_accuracy = accuracy_score(y_test, xgb_test_predictions)
print(f"XGBClassifier model test accuracy: {xgb_test_accuracy:.4f}")

# Calculate the training accuracy of the xgb_model by comparing predicted values with actual values from the training set
xgb_train_predictions = xgb_model.predict(X_train)
xgb_train_accuracy = accuracy_score(y_train, xgb_train_predictions)
print(f"XGBClassifier model train accuracy: {xgb_train_accuracy:.4f}")

XGBClassifier model test accuracy: 0.7933
XGBClassifier model train accuracy: 0.8062


In [10]:
# Train Artificial Neural Network model
ANN_model = Sequential(
    [
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu', name = "L1"),
    Dense(16, activation='relu', name = "L2"),
    Dense(1, activation='sigmoid', name = "L3")
    ], name = "ANN_model"
)
ANN_model.compile(optimizer=Adam(learning_rate = 0.005), loss='binary_crossentropy', metrics=['accuracy'])
ANN_model.fit(X_train, y_train)

# Display model summary
ANN_model.summary()

# Calculate the test accuracy of the ANN_model by comparing predicted values with actual values from the test set
ANN_test_predictions = ANN_model.predict(X_test)
ANN_test_predictions = (ANN_test_predictions > 0.5).astype(int) # Convert the predicted probabilities to binary class labels (0 or 1)
ANN_test_accuracy = accuracy_score(y_test, ANN_test_predictions)
print(f"ANN_model test accuracy: {ANN_test_accuracy:.4f}")

# Calculate the training accuracy of the ANN_model by comparing predicted values with actual values from the training set
ANN_train_predictions = ANN_model.predict(X_train)
ANN_train_predictions = (ANN_train_predictions > 0.5).astype(int) # Convert the predicted probabilities to binary class labels (0 or 1)
ANN_train_accuracy = accuracy_score(y_train, ANN_train_predictions)
print(f"ANN_model training accuracy: {ANN_train_accuracy:.4f}")

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6707 - loss: 0.6083


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
ANN_model test accuracy: 0.7765
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
ANN_model training accuracy: 0.7949
