In [1]:
# import all related packages
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

import matplotlib.pyplot as plt

In [2]:
torch.manual_seed(42)

<torch._C.Generator at 0x327ea3710>

# Loading the data

In [3]:
# Reading the data
train_data_path = "titanic-data/train.csv"
data = pd.read_csv(train_data_path)
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
# splitting the data into training and validation sets
y = data.copy().Survived
X = data.copy().drop(['Survived', 'PassengerId'], axis=1)

X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# getting categorical/numerical columns
categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X = X[my_cols].copy()
X.head()

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
0,male,S,3,22.0,1,0,7.25
1,female,C,1,38.0,1,0,71.2833
2,female,S,3,26.0,0,0,7.925
3,female,S,1,35.0,1,0,53.1
4,male,S,3,35.0,0,0,8.05


# Preprocessing data

In [6]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for categorical data
categorical_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [7]:
X_processed = preprocessor.fit_transform(X)
X_processed = torch.tensor(X_processed)
y = torch.tensor(y)

In [8]:
dataset = TensorDataset(X_processed, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Creating a model

In [9]:
X_processed.size()

torch.Size([891, 10])

In [18]:
input_size = 10
hidden_size = 64
output_size = 1

# creating a feedforward model
model1 = nn.Sequential(
    nn.Linear(in_features=input_size, out_features=hidden_size),
    nn.LeakyReLU(negative_slope=0.01),
    nn.BatchNorm1d(hidden_size),
    # nn.Dropout(0.2),  # Add Dropout with a rate of 50%
    nn.Linear(in_features=hidden_size, out_features=hidden_size),
    nn.LeakyReLU(negative_slope=0.01),
    nn.BatchNorm1d(hidden_size),
    #nn.Dropout(0.2),  # Dropout in second layer as well
    nn.Linear(in_features=hidden_size, out_features=hidden_size),
    nn.LeakyReLU(negative_slope=0.01),
    nn.BatchNorm1d(hidden_size),
    #nn.Dropout(0.2),  # Dropout in second layer as well
    nn.Linear(in_features=hidden_size, out_features=hidden_size),
    nn.LeakyReLU(negative_slope=0.01),
    nn.BatchNorm1d(hidden_size),
    #nn.Dropout(0.2),  # Dropout in second layer as well
    nn.Linear(in_features=hidden_size, out_features=hidden_size),
    nn.LeakyReLU(negative_slope=0.01),
    nn.BatchNorm1d(hidden_size),
    nn.Dropout(0.01),  # Dropout in second layer as well
    nn.Linear(in_features=hidden_size, out_features=output_size),
)

In [19]:
# initialize loss and optimizer functions
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model1.parameters(), lr=0.0002, weight_decay=1e-5)

# Training the model

In [20]:
epochs = 750
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.3)

for epoch in range(epochs):
    model1.train()
    epoch_loss = 0.0
    total_batches = len(train_loader)
    
    for batch_idx, (inputs, targets) in enumerate(train_loader, start=1):
        inputs = inputs.float()
        targets = targets.float()
        
        # 1. forward pass
        y_logits = model1(inputs).squeeze()
        
        # 2. Calculate the loss
        loss = loss_fn(y_logits, targets)
        
        # 3. Zero the optimizer
        optimizer.zero_grad()
        
        # 4. Backprop
        loss.backward()
        
        # 5. Gradient descent
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if epoch % 100 == 0 and batch_idx == total_batches:
            print(f"Epoch {epoch}/{epochs}, Loss: {epoch_loss / len(train_loader):.4f}")
            
    # Call the scheduler with validation loss (use your validation data)
    model1.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.float()
            targets = targets.float()
            y_logits = model1(inputs).squeeze()
            val_loss += loss_fn(y_logits, targets).item()

    # Step the scheduler with validation loss
    scheduler.step(val_loss / len(val_loader))
    

Epoch 0/750, Loss: 0.7202
Epoch 100/750, Loss: 0.4216
Epoch 200/750, Loss: 0.4322
Epoch 300/750, Loss: 0.4829
Epoch 400/750, Loss: 0.4118
Epoch 500/750, Loss: 0.4546
Epoch 600/750, Loss: 0.4268
Epoch 700/750, Loss: 0.4535


In [21]:
def calculate_accuracy(model, data_loader, threshold=0.5):
    correct = 0
    total = 0
    
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # No need to calculate gradients
        for inputs, targets in data_loader:
            inputs = inputs.float()
            targets = targets.float()
            
            # Forward pass: Get model predictions
            y_logits = model(inputs).squeeze()
            
            # Convert logits to predictions using the threshold
            y_preds = torch.round(torch.sigmoid(y_logits))
            
            # Calculate number of correct predictions
            correct += (y_preds == targets).sum().item()
            total += targets.size(0)
    
    accuracy = correct / total * 100
    return accuracy


# Example usage during training or validation
train_accuracy = calculate_accuracy(model1, train_loader)
val_accuracy = calculate_accuracy(model1, val_loader)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Validation Accuracy: {val_accuracy:.2f}%")

Training Accuracy: 40.59%
Validation Accuracy: 29.61%


In [14]:
test_data = pd.read_csv("titanic-data/test.csv")
X_test = test_data.copy()
X_test = X_test.drop(['PassengerId'], axis=1)

X_test = X_test[my_cols]
X_test = preprocessor.transform(X_test)
X_test = torch.tensor(X_test)
X_test = X_test.float()

In [15]:
print(X_processed[0])
print(X_test[0])

tensor([ 3.0000, 22.0000,  1.0000,  0.0000,  7.2500,  0.0000,  1.0000,  0.0000,
         0.0000,  1.0000], dtype=torch.float64)
tensor([ 3.0000, 34.5000,  0.0000,  0.0000,  7.8292,  0.0000,  1.0000,  0.0000,
         1.0000,  0.0000])


In [16]:
predictions = torch.round(torch.sigmoid(model1(X_test)))
predictions = predictions.int().detach().numpy().flatten()
print(predictions)

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
 0 1 1 1 1 0 0 1 0 0 0]


In [17]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission1.csv', index=False)
print("Your submission was successfully saved!")# Submitting the output

Your submission was successfully saved!
