# Titanic - Machine Learning from Disaster

### Load data

In [3]:
import pandas as pd
import numpy as np

# Load data from files
test_X = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Split data into X:parameters and y:output
train_y = train["Survived"]
train_X = train.drop(columns="Survived")

### Clean data

In [4]:
# Function for cleaning data

def clean_data(data: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    
    # Drop unnecessary columns
    to_drop = ["PassengerId","Name","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
    data.drop(columns=to_drop, inplace=True)

    # Rephrase male and female into discrete values between 0 and 1
    data = data.replace("male", 0)
    data = data.replace("female", 1)

    # Calculate avg age to replace null values with
    avg_age = round(sum(data["Age"].dropna()) / len(data["Age"].dropna()))
    data["Age"] = data["Age"].fillna(avg_age)

    # Calculate max values for normalization of columns
    max_age = max(data["Age"])
    max_pclass = max(data["Pclass"])

    # Normalize values
    data["Pclass"] /= max_pclass
    data["Age"] /= max_age

    return data

# Clean both train and test data
ctrain = clean_data(train_X)
ctest = clean_data(test_X)

# Get labels
train_labels = ctrain.columns

In [5]:
import torch

# Set seed for torch
torch.manual_seed(42)

# Put data onto tensors
train_X_tensor = torch.tensor(ctrain.to_numpy())
train_y_tensor = torch.tensor(train_y.to_numpy())
test_X_tensor = torch.tensor(ctest.to_numpy())

# Split data into training- and validation sets
train_split = int(0.8 * len(train_X_tensor))
X_train, y_train = train_X_tensor[:train_split], train_y_tensor[:train_split]
X_valid, y_valid = train_X_tensor[train_split:], train_y_tensor[train_split:]

### Create model

In [6]:
# Load neural network
from torch import nn

# Select device (aka cpu/gpu)
device = "cuda" if torch.cuda.is_available() else "cpu"
#print(f"Using {device} device")

# Defining the neural network class
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()

        self.layer_1 = nn.Linear(in_features=3, out_features=32)
        self.layer_2 = nn.Linear(in_features=32, out_features=32)
        self.layer_3 = nn.Linear(in_features=32, out_features=1)
    
    def forward(self, x):
        layer1 = self.layer_1(x)
        layer2 = self.layer_2(layer1)
        layer3 = self.layer_3(layer2)
        return layer3

model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layer_1): Linear(in_features=3, out_features=32, bias=True)
  (layer_2): Linear(in_features=32, out_features=32, bias=True)
  (layer_3): Linear(in_features=32, out_features=1, bias=True)
)


### Train model

In [7]:
# Set loss function
loss_fn = nn.BCEWithLogitsLoss()

# Optimizer
optim = torch.optim.SGD(params=model.parameters(), lr=1)

# Define accuracy function
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_true)) * 100
    return acc

# Amount of training epochs to run
epochs = 100

# Training loop
for epoch in range(epochs):
    # Forward pass
    y_pred = model(X_train.float()).flatten()
    loss = loss_fn(y_pred, y_train.float())

    # Backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

    # Calculate accuracy
    y_pred = torch.round(torch.sigmoid(y_pred))
    acc = accuracy_fn(y_train, y_pred)
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss: {loss.item()}, Accuracy: {acc}")

Epoch 0: Loss: 0.7491713762283325, Accuracy: 39.04494382022472
Epoch 10: Loss: 0.5588670372962952, Accuracy: 75.84269662921348
Epoch 20: Loss: 0.47447526454925537, Accuracy: 78.37078651685393
Epoch 30: Loss: 0.4704074263572693, Accuracy: 78.37078651685393
Epoch 40: Loss: 0.4677979052066803, Accuracy: 78.37078651685393
Epoch 50: Loss: 0.46601659059524536, Accuracy: 78.37078651685393
Epoch 60: Loss: 0.4648061990737915, Accuracy: 78.51123595505618
Epoch 70: Loss: 0.46401742100715637, Accuracy: 78.51123595505618
Epoch 80: Loss: 0.46357131004333496, Accuracy: 78.37078651685393
Epoch 90: Loss: 0.5146408677101135, Accuracy: 77.24719101123596


### Validate model

In [8]:
with torch.inference_mode():
    y_valid_pred = model(X_valid.float()).flatten()

print(f"Final training: Loss: {loss.item()}, Accuracy: {acc}")


loss = loss_fn(y_valid_pred, y_valid.float())
y_valid_pred = torch.round(torch.sigmoid(y_valid_pred))
acc = accuracy_fn(y_valid, y_valid_pred)
print(f"Validation: Loss: {loss.item()}, Accuracy: {acc}")

Final training: Loss: 0.46467265486717224, Accuracy: 78.51123595505618
Validation: Loss: 0.42052748799324036, Accuracy: 80.44692737430168


### Compute predictions for test data

In [9]:
with torch.inference_mode():
    y_pred_test = model(test_X_tensor.float()).flatten()

y_pred_test = torch.round(torch.sigmoid(y_pred_test))

### Save output to csv

In [29]:
# Revert predictions to NumPy array
y_pred_np = y_pred_test.int().detach().numpy()

# Create a list with the correct indicies
min_i = max(train['PassengerId'])
y_indicies = [min_i+i+1 for i in range(len(y_pred_test))]

# Make a combined list of indicies and predictions
y_data = {'PassengerId': y_indicies, 'Survived': y_pred_np}

y_data

{'PassengerId': [892,
  893,
  894,
  895,
  896,
  897,
  898,
  899,
  900,
  901,
  902,
  903,
  904,
  905,
  906,
  907,
  908,
  909,
  910,
  911,
  912,
  913,
  914,
  915,
  916,
  917,
  918,
  919,
  920,
  921,
  922,
  923,
  924,
  925,
  926,
  927,
  928,
  929,
  930,
  931,
  932,
  933,
  934,
  935,
  936,
  937,
  938,
  939,
  940,
  941,
  942,
  943,
  944,
  945,
  946,
  947,
  948,
  949,
  950,
  951,
  952,
  953,
  954,
  955,
  956,
  957,
  958,
  959,
  960,
  961,
  962,
  963,
  964,
  965,
  966,
  967,
  968,
  969,
  970,
  971,
  972,
  973,
  974,
  975,
  976,
  977,
  978,
  979,
  980,
  981,
  982,
  983,
  984,
  985,
  986,
  987,
  988,
  989,
  990,
  991,
  992,
  993,
  994,
  995,
  996,
  997,
  998,
  999,
  1000,
  1001,
  1002,
  1003,
  1004,
  1005,
  1006,
  1007,
  1008,
  1009,
  1010,
  1011,
  1012,
  1013,
  1014,
  1015,
  1016,
  1017,
  1018,
  1019,
  1020,
  1021,
  1022,
  1023,
  1024,
  1025,
  1026,
  1027,
  102

In [31]:

y_pred_df = pd.DataFrame(data=y_data)
y_pred_df.to_csv("basicnn_submission",index=False)