In [None]:
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

In [None]:
def clean_dataset(path, test = False):
    """Load a Titanic-style CSV and return it as a scaled, NaN-free float array.

    Steps performed:
      1. Read the CSV at ``path``.
      2. Drop the ``PassengerId``, ``Name``, ``Ticket``, ``Cabin`` and
         ``Embarked`` columns.
      3. Encode ``Sex`` as 0 for ``'male'`` and 1 for anything else.
      4. Divide every remaining column by its largest non-NaN value.
      5. Replace NaNs in each column by that column's (scaled) median.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    test : bool, default False
        Kept for backward compatibility. It used to select the positional
        index of the ``Sex`` column (1 for files without a label column,
        2 for files with one). The column is now located by name, so the
        flag no longer influences the result.

    Returns
    -------
    numpy.ndarray
        2-D float array whose columns are the CSV columns that were not
        dropped, in their original order.

    Raises
    ------
    KeyError
        If one of the dropped columns or ``Sex`` is missing from the file.
    ValueError
        If a remaining column cannot be converted to float.
    """
    df = pd.read_csv(path)
    df = df.drop(columns=[
        'PassengerId',  # not necessary
        'Name',         # not necessary
        'Ticket',       # author's note: can be inferred from Pclass and Fare
        'Cabin',        # author's note: can be inferred from Pclass and Fare
        'Embarked',     # author's note: can be inferred from Pclass and Fare
    ])

    # Encode by column name instead of by position so the same code works
    # whether or not the file carries a leading label column.
    df['Sex'] = np.where(df['Sex'] == 'male', 0, 1)

    arr = df.to_numpy().astype(float)
    for i in range(arr.shape[1]):
        column = arr[:, i]
        if np.isnan(column).all():
            # Nothing to scale by and no median to fill with.
            continue
        # The builtin max() returns NaN when the first element is NaN, which
        # would wipe out the whole column; nanmax ignores missing values.
        peak = np.nanmax(column)
        if peak != 0:
            # A zero peak would produce 0/0 = NaN, so such columns stay as-is.
            column = column / peak
        median = np.nanmedian(column)
        arr[:, i] = np.where(np.isnan(column), median, column)  # replace NaNs
    return arr
# Build the training matrix from the train CSV (clean_dataset's `test` flag keeps its default).
arr = clean_dataset('../dataset/titanic/train.csv')

In [None]:
# Column 0 of `arr` is used as the training target; every other column is a model input.
Xtrain = torch.tensor(arr[:, 1:], dtype=torch.float32)
ytrain = torch.tensor(arr[:, 0], dtype=torch.long)  # already 1-D, one class index per row

# Pair inputs with targets and serve them in shuffled mini-batches of 10.
dataset = TensorDataset(Xtrain, ytrain)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)
Xtrain

In [None]:
class MLP(nn.Module):
    """Fully connected classifier: Linear+ReLU hidden stages, then a linear output layer.

    With the default arguments the network is 6 -> 128 -> 64 -> 2 and its
    ``state_dict`` keys are ``layers.0``, ``layers.2`` and ``layers.4``
    (weights and biases), so ``MLP()`` builds the same architecture as the
    previous fixed-size definition.

    Parameters
    ----------
    in_features : int, default 6
        Number of input features per sample.
    hidden_sizes : sequence of int, default (128, 64)
        Width of each hidden layer, in order. May be empty.
    num_classes : int, default 2
        Number of output logits.
    """
    def __init__(self, in_features: int = 6, hidden_sizes=(128, 64), num_classes: int = 2) -> None:
        super().__init__()
        stages = []
        width = in_features
        for hidden in hidden_sizes:
            stages.append(nn.Linear(width, hidden))
            stages.append(nn.ReLU())
            width = hidden
        # Final projection to raw logits; no activation is applied here.
        stages.append(nn.Linear(width, num_classes))
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Return the logits of shape ``(batch, num_classes)`` for input ``x``."""
        return self.layers(x)

model = MLP()

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
loss_function = torch.nn.CrossEntropyLoss()

# Make the training mode explicit in case the model was switched to eval() by another cell.
model.train()
for k in range(1000):
    current_loss = 0.0  # sum of per-sample losses over the epoch
    seen = 0            # number of samples processed in the epoch
    for x_batch, y_batch in dataloader:
        optimizer.zero_grad()
        # The model returns (batch, num_classes) logits, which is the shape
        # CrossEntropyLoss expects alongside (batch,) class indices.
        output = model(x_batch)
        loss = loss_function(output, y_batch)
        loss.backward()
        optimizer.step()
        # loss is the mean over the batch; weight it by the batch size so a
        # short final batch does not count as much as a full one.
        batch_size = x_batch.size(0)
        current_loss += loss.item() * batch_size
        seen += batch_size
    print(f'Epoch {k}, Avg Loss: {current_loss / max(seen, 1)}')
print("Training complete")

In [None]:
TEST_CSV = "../dataset/titanic/test.csv"

# Keep the test matrix under its own name: reusing `arr` would overwrite the
# training array and break re-running the cell that builds Xtrain/ytrain.
# NOTE: clean_dataset rescales each column using the file it is given, so
# these features are scaled by test-file maxima, not the training ones.
test_arr = clean_dataset(path=TEST_CSV, test=True)
print(test_arr[:5])  # preview only
test_features = torch.tensor(test_arr, dtype = torch.float32)

# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(test_features)
predicted_class = outputs.argmax(dim=1)

# Take the ids from the file itself instead of assuming they start at 892
# and are contiguous.
passenger_ids = pd.read_csv(TEST_CSV, usecols=['PassengerId'])['PassengerId']
submission_df = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),
    'Survived': predicted_class.numpy(),
})

submission_df.to_csv('submission.csv', index=False)

print("The predictions have been saved to 'submission.csv'")




In [14]:
# Serialise the trained `model` object to ../models/titanic.pth.
# NOTE(review): this stores the whole module object rather than model.state_dict(),
# so loading presumably needs the MLP class definition to be available — confirm
# how the file is consumed before changing the saved format.
torch.save(model, '../models/titanic.pth')