In [None]:
## **Trying to apply PyTorch deep learning to solve Titanic on Kaggle - Work in progress**

In [None]:
#I know using deep learning is a bit overkill for this, but I'm trying to learn PyTorch, so...



In [9]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch.optim as optim


In [16]:
# Create the train and test Dataset classes

class TitanicDataset(Dataset):
    """Labelled Titanic split exposed as ``(features, label)`` tensor pairs.

    The CSV is loaded with pandas, eight feature columns and the ``Survived``
    column are pulled out, the features are passed through a scaler, and both
    arrays are stored as float32 tensors.

    Args:
        csv_file: Location of the CSV file, forwarded to ``pd.read_csv``.
        scaler: Optional object exposing ``transform``. When given, it is used
            as-is to transform the features. When omitted, a new
            ``StandardScaler`` is fitted on this file's features.

    Attributes:
        data: The DataFrame read from ``csv_file``.
        scaler: The scaler that was applied (the one passed in, or the newly
            fitted one), so it can be reused for another split.
        X: float32 tensor of scaled features, one row per CSV row.
        y: float32 tensor holding the ``Survived`` column.
    """

    def __init__(self, csv_file, scaler=None):
        self.data = pd.read_csv(csv_file)
        # NOTE(review): assumes every feature column is already numeric in the
        # CSV (e.g. Sex/Cabin/Embarked pre-encoded) - confirm against the file.
        feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
        raw_features = self.data[feature_cols].values
        labels = self.data['Survived'].values

        if scaler is not None:
            # Reuse the caller's scaler without refitting it.
            self.scaler = scaler
            scaled_features = self.scaler.transform(raw_features)
        else:
            # No scaler supplied: fit a fresh one on this file's features.
            self.scaler = StandardScaler()
            scaled_features = self.scaler.fit_transform(raw_features)

        self.X = torch.tensor(scaled_features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

class TitanicTestDataset(Dataset):
    """Unlabelled Titanic split exposed as scaled feature tensors.

    The CSV is loaded with pandas, the same eight feature columns used by the
    training dataset are extracted, transformed with the supplied scaler and
    stored as a float32 tensor. No ``Survived`` column is read.

    Args:
        csv_file: Location of the CSV file, forwarded to ``pd.read_csv``.
        scaler: Object exposing ``transform`` (typically the scaler held by the
            training dataset). Although the parameter keeps its ``None``
            default for signature compatibility, a scaler is required.

    Raises:
        ValueError: If ``scaler`` is ``None``.

    Attributes:
        data: The DataFrame read from ``csv_file``.
        scaler: The scaler that was applied to the features.
        X: float32 tensor of scaled features, one row per CSV row.
    """

    def __init__(self, csv_file, scaler=None):
        # Fail early with a clear message instead of the AttributeError that
        # calling ``None.transform`` would produce further down.
        if scaler is None:
            raise ValueError(
                "TitanicTestDataset requires a fitted scaler "
                "(e.g. the training dataset's .scaler); got scaler=None"
            )

        self.data = pd.read_csv(csv_file)
        self.scaler = scaler
        feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

        # Give the scaler the raw NumPy array (as the training dataset does)
        # and build the tensor exactly once, after scaling.
        scaled_features = self.scaler.transform(self.data[feature_cols].values)
        self.X = torch.tensor(scaled_features, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the scaled feature tensor stored at position ``idx``."""
        return self.X[idx]


In [17]:
# Build the train/test datasets and wrap each one in a batched DataLoader.
# Training batches are shuffled; test batches keep the CSV row order.
BATCH_SIZE = 32

train_dataset = TitanicDataset('train.NUMO.csv')
# The test features are transformed with the scaler held by the training dataset.
test_dataset = TitanicTestDataset('test.NUMO.csv', scaler=train_dataset.scaler)

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)



In [27]:
# Let's make a simple architecture for the model
# I'm thinking Input - Linear - ReLU - Dropout - Linear - ReLU - Dropout - Linear - ReLU - Dropout - Linear - Sigmoid (64 - 32 - 16 - 1)


class TitanicNet(nn.Module):
    """Small fully connected binary classifier.

    Three hidden Linear layers (64, 32 and 16 units), each followed by ReLU
    and Dropout, then a single-unit Linear layer squashed by a sigmoid.

    Args:
        input_size: Number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Layers are created in the same order as they are applied in forward().
        self.fc1 = nn.Linear(input_size, 64)
        self.dt1 = nn.Dropout(p=0.4)
        self.fc2 = nn.Linear(64, 32)
        self.dt2 = nn.Dropout(p=0.3)
        self.fc3 = nn.Linear(32, 16)
        self.dt3 = nn.Dropout(p=0.4)
        self.fc4 = nn.Linear(16, 1)

    def forward(self, X):
        """Map a batch of feature rows to one sigmoid output per row."""
        hidden_stages = (
            (self.fc1, self.dt1),
            (self.fc2, self.dt2),
            (self.fc3, self.dt3),
        )
        activations = X
        for linear, dropout in hidden_stages:
            # Linear -> ReLU -> Dropout for every hidden stage.
            activations = dropout(torch.relu(linear(activations)))
        return torch.sigmoid(self.fc4(activations))

# Width of the input layer: the network is fed 8 features per row.
input_size = 8
model = TitanicNet(input_size=input_size)

        

In [28]:
# Print the module's repr to list its layers and their sizes.
print(model)

TitanicNet(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (dt1): Dropout(p=0.4, inplace=False)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (dt2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (dt3): Dropout(p=0.4, inplace=False)
  (fc4): Linear(in_features=16, out_features=1, bias=True)
)


In [29]:
# Loss function: binary cross-entropy, the natural fit for a
# survived / did-not-survive target.
loss_func = nn.BCELoss()

# Optimizer: Adam over every parameter of the model.
# The learning rate could arguably be even smaller; tuning a neural net on a
# dataset this small (8 features) is a little unusual anyway.
LEARNING_RATE = 0.001
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


In [32]:
# Training loop: iterate over the shuffled training batches for a fixed number
# of epochs, tracking mean batch loss and training accuracy per epoch.

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss, n_correct, n_seen = 0.0, 0, 0

    for features, labels in train_loader:
        # The model emits one column per row, so the labels are reshaped to match.
        targets = labels.reshape(-1, 1)

        # Forward pass
        outputs = model(features)
        loss = loss_func(outputs, targets)

        # Backward pass: reset stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Metrics: accumulate the batch loss and count outputs that land on
        # the correct side of the 0.5 threshold.
        running_loss += loss.item()
        hits = (outputs > 0.5).float() == targets
        n_correct += hits.sum().item()
        n_seen += labels.size(0)

    train_accuracy = 100 * n_correct / n_seen
    avg_train_loss = running_loss / len(train_loader)

    # Report progress every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%')

print("Training complete")


    

Epoch [10/100], Loss: 0.3961, Accuracy: 84.51%
Epoch [20/100], Loss: 0.3850, Accuracy: 84.85%
Epoch [30/100], Loss: 0.3855, Accuracy: 84.62%
Epoch [40/100], Loss: 0.3741, Accuracy: 84.29%
Epoch [50/100], Loss: 0.3739, Accuracy: 85.63%
Epoch [60/100], Loss: 0.3895, Accuracy: 84.74%
Epoch [70/100], Loss: 0.3755, Accuracy: 85.52%
Epoch [80/100], Loss: 0.3716, Accuracy: 85.75%
Epoch [90/100], Loss: 0.3699, Accuracy: 85.52%
Epoch [100/100], Loss: 0.3638, Accuracy: 86.20%
Training complete


In [33]:
# Run the trained model over the test set and write the Kaggle submission file.

model.eval()  # evaluation mode, so the Dropout layers are inactive

survived_flags = []
with torch.no_grad():
    for batch_X in test_loader:
        # Threshold each output at 0.5 and store the result as plain 0/1 ints.
        above_threshold = model(batch_X) > 0.5
        survived_flags.extend(int(flag) for flag in above_threshold.flatten().tolist())

# Pair every prediction with the PassengerId read from the test CSV.
test_data = pd.read_csv('test.NUMO.csv')
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': survived_flags,
})

submission.to_csv('titanic_submission.csv', index=False)
print('Submission created')

Submission created


In [None]:
# Submission #1 got 77% accuracy, an improvement over the 67% achieved with a random forest model


# for this task, a gradient boosting model or ensemble methods with simpler models will probably outperform neural nets because the dataset is so small.
