# Titanic - Machine Learning from Disaster

### Load data

In [395]:
import pandas as pd
import numpy as np

# Load data from files
test_X = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Split data into X:parameters and y:output
train_y = train["Survived"]
train_X = train.drop(columns="Survived")

### Clean data

In [396]:
# Function for cleaning data

def clean_data(data: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    
    # Drop unnecessary columns
    to_drop = ["PassengerId","Name","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
    data.drop(columns=to_drop, inplace=True)

    # Rephrase male and female into discrete values between 0 and 1
    data = data.replace("male", 0)
    data = data.replace("female", 1)

    # Calculate avg age to replace null values with
    avg_age = round(sum(data["Age"].dropna()) / len(data["Age"].dropna()))
    data["Age"] = data["Age"].fillna(avg_age)

    # Calculate max values for normalization of columns
    max_age = max(data["Age"])
    max_pclass = max(data["Pclass"])

    # Normalize values
    data["Pclass"] /= max_pclass
    data["Age"] /= max_age

    return data

# Clean both train and test data
ctrain = clean_data(train_X)
ctest = clean_data(test_X)

# Get labels
train_labels = ctrain.columns

In [397]:
import torch

# Set seed for torch
torch.manual_seed(42)

# Put data onto tensors
train_X_tensor = torch.tensor(ctrain.to_numpy())
train_y_tensor = torch.tensor(train_y.to_numpy())
test_X_tensor = torch.tensor(ctest.to_numpy())

# Split data into training- and validation sets
train_split = int(0.8 * len(train_X_tensor))
X_train, y_train = train_X_tensor[:train_split], train_y_tensor[:train_split]
X_valid, y_valid = train_X_tensor[train_split:], train_y_tensor[train_split:]

### Create model

In [398]:
# Load neural network
from torch import nn

# Select device (aka cpu/gpu)
device = "cuda" if torch.cuda.is_available() else "cpu"
#print(f"Using {device} device")

# Defining the neural network class
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()

        self.layer_1 = nn.Linear(in_features=3, out_features=16)
        self.layer_2 = nn.Linear(in_features=16, out_features=16)
        self.layer_3 = nn.Linear(in_features=16, out_features=1)
    
    def forward(self, x):
        x = self.layer_1(x)
        #x = torch.relu(x)
        x = self.layer_2(x)
        #x = torch.relu(x)
        x = self.layer_3(x)
        x = torch.sigmoid(x)
        return x

model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layer_1): Linear(in_features=3, out_features=16, bias=True)
  (layer_2): Linear(in_features=16, out_features=16, bias=True)
  (layer_3): Linear(in_features=16, out_features=1, bias=True)
)


### Train model

In [399]:
# Set loss function
loss_fn = nn.BCEWithLogitsLoss()

# Optimizer
optim = torch.optim.SGD(params=model.parameters(), lr=1)

# Define accuracy function
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_true)) * 100
    return acc

# Amount of training epochs to run
epochs = 100

# Training loop
for epoch in range(epochs):
    # Forward pass
    y_pred = model(X_train.float()).flatten()
    loss = loss_fn(y_pred, y_train.float())

    # Backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

    # Calculate accuracy
    y_pred = torch.round(y_pred)
    acc = accuracy_fn(y_train, y_pred)
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss: {loss.item()}, Accuracy: {acc}")

Epoch 0: Loss: 0.7835756540298462, Accuracy: 39.04494382022472
Epoch 10: Loss: 0.6971967220306396, Accuracy: 60.95505617977528
Epoch 20: Loss: 0.687463641166687, Accuracy: 60.95505617977528
Epoch 30: Loss: 0.6756638288497925, Accuracy: 60.95505617977528
Epoch 40: Loss: 0.656712532043457, Accuracy: 77.9494382022472
Epoch 50: Loss: 0.6472126245498657, Accuracy: 78.23033707865169
Epoch 60: Loss: 0.6432842016220093, Accuracy: 78.65168539325843
Epoch 70: Loss: 0.6410934925079346, Accuracy: 78.51123595505618
Epoch 80: Loss: 0.6395926475524902, Accuracy: 77.9494382022472
Epoch 90: Loss: 0.6384506225585938, Accuracy: 77.9494382022472


### Validate model

In [400]:
with torch.inference_mode():
    y_valid_pred = model(X_valid.float()).flatten()

print(f"Final training: Loss: {loss.item()}, Accuracy: {acc}")


loss = loss_fn(y_valid_pred, y_valid.float())
y_valid_pred = torch.round(y_valid_pred)
acc = accuracy_fn(y_valid, y_valid_pred)
print(f"Validation: Loss: {loss.item()}, Accuracy: {acc}")

Final training: Loss: 0.6376174092292786, Accuracy: 77.9494382022472
Validation: Loss: 0.6380630135536194, Accuracy: 81.56424581005587


### Compute predictions for test data

In [401]:
with torch.inference_mode():
    y_pred_test = model(test_X_tensor.float()).flatten()

y_pred_test = torch.round(y_pred_test)

### Save output to csv

In [402]:
# Revert predictions to NumPy array
y_pred_np = y_pred_test.int().detach().numpy()

# Create a list with the correct indicies
min_i = max(train['PassengerId'])
y_indicies = [min_i+i+1 for i in range(len(y_pred_test))]

# Make a combined list of indicies and predictions
y_data = {'PassengerId': y_indicies, 'Survived': y_pred_np}

# Convert to Pandas Dataframe
y_pred_df = pd.DataFrame(data=y_data)

# Select file name
import os.path

filename = "basicnn_submission.csv"
while os.path.isfile(filename):
    filename = filename[:len(filename)-4] + "(1).csv"

# Send to csv-file
y_pred_df.to_csv(filename,index=False)