### Setup - Titanic problem

In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
import os

# Directory holding the Titanic CSV files, relative to this notebook.
datasets_base_path = "../../../../datasets"
# Build both file paths from the same base directory in one pass.
trainset_path, testset_path = (
    os.path.join(datasets_base_path, csv_name)
    for csv_name in ("titanic_train.csv", "titanic_test.csv")
)

#### Pre-processing

In [3]:
df_train = pd.read_csv(trainset_path)
# We will impute the missing values with the mode (most frequent value) of
# each column. `mode()` can return several rows when there are ties, so
# `.iloc[0]` keeps the first one per column.
modes = df_train.mode().iloc[0]
# Reassign rather than use `inplace=True`: same result, plain data flow.
df_train = df_train.fillna(modes)


# Log-transform the fare; the +1 keeps the logarithm defined for a fare of 0.
df_train["LogFare"] = np.log(df_train["Fare"] + 1)


# Clearly we need to change the string values by numeric ones. We will use dummy variables.
# `dtype=int` forces 0/1 integer columns. Newer pandas versions default to
# boolean dummies, and mixing bool with float columns makes `.values` an
# object array that torch cannot convert into a tensor.
df_train = pd.get_dummies(
    data=df_train, columns=["Sex", "Pclass", "Embarked"], dtype=int
)
# Cabin, Name, and Ticket have too many unique values for it to make sense creating
# dummy variables for them.

# Names of the dummy columns produced by the call above.
added_columns = [
    "Sex_male",
    "Sex_female",
    "Pclass_1",
    "Pclass_2",
    "Pclass_3",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S"
]

df_train[added_columns].head()

Unnamed: 0,Sex_male,Sex_female,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,0,0,1,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,0,0,1
3,0,1,1,0,0,0,0,1
4,1,0,0,0,1,0,0,1


### Create Pytorch tensors

In [4]:
# Dependent variable. Convert through NumPy explicitly instead of handing the
# Series to torch, so the conversion does not depend on how torch iterates a
# pandas object (or on the Series' index labels).
y = torch.tensor(df_train["Survived"].to_numpy())


# The independent variables are all the continuous variables and all the dummy
# variables just created
independent_cols = ["Age", "SibSp", "Parch", "LogFare"] + added_columns

# Request a float array up front: if the dummy columns are boolean, a plain
# `.values` on this mixed frame yields an object array that torch rejects.
x = torch.tensor(df_train[independent_cols].to_numpy(dtype=float), dtype=torch.float)
x[:2]

tensor([[22.0000,  1.0000,  0.0000,  2.1102,  1.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  1.0000],
        [38.0000,  1.0000,  0.0000,  4.2806,  0.0000,  1.0000,  1.0000,  0.0000,
          0.0000,  1.0000,  0.0000,  0.0000]])

In [5]:
# One coefficient per input feature, i.e. per column of x.
N_COEFFS = x.size(1)

In [6]:
# normalization
vals, indices = x.max(dim=0)
x = x / vals # is dividing a matrix by a vector, using broadcasting

x[:1]

tensor([[0.2750, 0.1250, 0.0000, 0.3381, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000,
         0.0000, 0.0000, 1.0000]])

In [7]:
from fastai.data.transforms import RandomSplitter

# Fixed seed so the train/validation partition is the same on every run.
splitter = RandomSplitter(seed=42)
trn_split, val_split = splitter(df_train)

type(trn_split)

fastcore.foundation.L

In [8]:
# Pick the rows of each split out of the feature matrix and the target vector.
trn_x, trn_y = x[trn_split], y[trn_split]
val_x, val_y = x[val_split], y[val_split]

len(trn_x), len(val_x)

(713, 178)

In [9]:
# Turn our dependent variable into a column vector so that it has one row per
# sample and a single column.
# `reshape(-1, 1)` is used instead of `[:, None]` so the cell is idempotent:
# re-running it keeps the (n_rows, 1) shape instead of appending another axis.

trn_y = trn_y.reshape(-1, 1)
val_y = val_y.reshape(-1, 1)

val_y[:2]

tensor([[1],
        [0]])

### Neural Network

In this notebook we will build a neural network (NN) with one hidden layer to tackle the Titanic problem (from the Kaggle competition).

The simple neural network has a first layer whose number of inputs equals the number of coefficients (`n_coeffs`)
and which feeds `n_hidden` neurons in the hidden layer -- `MATRIX W_0=(n_coeffs, n_hidden)`. The second layer then takes those `n_hidden` values and produces a single output, to which a scalar bias term is added -- `MATRIX W_1=(n_hidden, 1), BIAS B = <SCALAR>`.

In [10]:
def init_coeffs(n_coeffs, n_hidden: int = 10):
    """Create random starting parameters for a network with one hidden layer.

    Args:
        n_coeffs: number of input features (rows of the first weight matrix).
        n_hidden: number of neurons in the hidden layer.

    Returns:
        A tuple ``(layer_1, layer_2, bias)`` of tensors that track gradients:
        ``layer_1`` has shape ``(n_coeffs, n_hidden)``, ``layer_2`` has shape
        ``(n_hidden, 1)`` and ``bias`` is a 0-dimensional tensor.
    """
    # Uniform in [-0.5, 0.5), scaled down by the hidden layer width.
    hidden_weights = torch.rand(n_coeffs, n_hidden).sub(0.5).div(n_hidden)
    # Uniform in [-0.3, 0.7).
    output_weights = torch.rand(n_hidden, 1).sub(0.3)
    # Indexing the single element yields a scalar (0-dim) tensor.
    scalar_bias = torch.rand(1)[0]

    params = (hidden_weights, output_weights, scalar_bias)
    return tuple(param.requires_grad_() for param in params)

In [11]:
import torch.nn.functional as F


def calc_preds(coeffs, indeps):
    """Run the one-hidden-layer network forward and return its predictions.

    Args:
        coeffs: ``(layer_1, layer_2, const)`` as produced by ``init_coeffs``.
        indeps: input feature matrix, one row per sample.

    Returns:
        Tensor of values in (0, 1), one row per input row.
    """
    hidden_weights, output_weights, bias = coeffs
    hidden_activations = (indeps @ hidden_weights).relu()
    logits = hidden_activations @ output_weights + bias
    # The sigmoid squashes the output layer so every prediction lies
    # between 0 and 1.
    return logits.sigmoid()

In [12]:
def update_coeffs(coeffs, lr):
    """Apply one gradient-descent step to every parameter, then clear its gradient.

    Each tensor in ``coeffs`` is modified in place, so callers are expected to
    invoke this inside ``torch.no_grad()``.
    """
    for param in coeffs:
        step = param.grad * lr
        param.sub_(step)
        # Gradients accumulate across backward passes, so reset them here.
        param.grad.zero_()

#### Train our model

In [13]:
def calc_loss(coeffs, indeps, deps):
    """Return the mean absolute error between the predictions and ``deps``."""
    residuals = calc_preds(coeffs, indeps) - deps
    return residuals.abs().mean()

In [14]:
def one_epoch(coeffs, lr, train_x, train_y):
    """Run one full-batch training step and return the loss measured before it.

    Computes the loss on the training data, back-propagates it, and then
    updates the coefficients with learning rate ``lr``.
    """
    epoch_loss = calc_loss(coeffs, train_x, train_y)
    epoch_loss.backward()

    # The parameter update itself must not be recorded by autograd.
    with torch.no_grad():
        update_coeffs(coeffs, lr)

    return epoch_loss

In [15]:
def train_model(epochs: int = 30, lr: float = 0.01):
    """Train the one-hidden-layer network on the training split.

    The torch RNG is seeded with 42 first, so the initial coefficients are the
    same on every call. The loss of each epoch is printed as training runs.

    Args:
        epochs: number of passes over the training data.
        lr: learning rate used for every update.

    Returns:
        The trained coefficients.
    """
    torch.manual_seed(42)
    coeffs = init_coeffs(N_COEFFS, n_hidden=10)

    for epoch in range(epochs):
        loss = one_epoch(coeffs, lr, trn_x, trn_y)
        print(f"epoch: {epoch},", f"{loss:.3f}")

    return coeffs

In [16]:
# Train the single-hidden-layer network with a learning rate of 10, using
# train_model's default number of epochs.
coeffs = train_model(lr=10)

epoch: 0, 0.554
epoch: 1, 0.515
epoch: 2, 0.436
epoch: 3, 0.311
epoch: 4, 0.245
epoch: 5, 0.225
epoch: 6, 0.217
epoch: 7, 0.212
epoch: 8, 0.209
epoch: 9, 0.206
epoch: 10, 0.204
epoch: 11, 0.202
epoch: 12, 0.201
epoch: 13, 0.199
epoch: 14, 0.199
epoch: 15, 0.198
epoch: 16, 0.197
epoch: 17, 0.197
epoch: 18, 0.196
epoch: 19, 0.196
epoch: 20, 0.196
epoch: 21, 0.196
epoch: 22, 0.195
epoch: 23, 0.195
epoch: 24, 0.195
epoch: 25, 0.195
epoch: 26, 0.195
epoch: 27, 0.194
epoch: 28, 0.194
epoch: 29, 0.194


In [17]:
def get_accuracy(coeffs, val_x, val_y, threshold: float = 0.5):
    """Return the fraction of rows whose thresholded prediction matches ``val_y``.

    A prediction counts as positive when it is strictly greater than
    ``threshold``; ``val_y`` is converted to booleans for the comparison.
    """
    actual = val_y.bool()
    predicted = calc_preds(coeffs, val_x) > threshold
    hits = actual == predicted
    return hits.float().mean()

In [18]:
# Accuracy of the trained coefficients on the validation split, at
# get_accuracy's default threshold.
get_accuracy(coeffs, val_x, val_y)

tensor(0.8258)

### Deep Neural Network


We will add more hidden layers in order to make our neural network deep.

You'll notice here that there are a lot of messy constants to get the random numbers in just the right ranges. When you train the model in a moment, you'll see that the tiniest changes to these initialisations can cause our model to fail to train at all! This is a key reason that deep learning failed to make much progress in the early days. We will look at initialisation in more detail later.

In [19]:
def init_coeffs(n_coeffs, hiddens = (10, 10), debug: bool = False):
    """Create random starting weights and biases for a multi-layer network.

    Args:
        n_coeffs: number of input features.
        hiddens: sizes of the hidden layers, in order. Any sequence of ints
            (list or tuple) is accepted.
        debug: when True, print how many layer sizes, weight matrices and
            biases are created.

    Returns:
        ``(layers, consts)``: two lists of equal length. ``layers[i]`` is the
        weight matrix from layer ``i`` to layer ``i + 1`` and ``consts[i]`` is
        the scalar bias added after it. Every tensor tracks gradients.
    """
    # Unpacking (rather than list concatenation) accepts tuples as well as
    # lists and never aliases the default argument.
    sizes = [n_coeffs, *hiddens, 1]
    n = len(sizes)

    if debug:
        print("N:", n)

    # One weight matrix per consecutive pair of layer sizes. The constants
    # shift and scale the uniform samples into a range that trains.
    layers = [
        (torch.rand(n_in, n_out) - 0.3) / n_out * 4
        for n_in, n_out in zip(sizes[:-1], sizes[1:])
    ]
    # One small scalar bias per weight matrix, uniform in [-0.05, 0.05).
    consts = [(torch.rand(1)[0] - 0.5) * 0.1 for _ in layers]

    if debug:
        print("LAYERS:", len(layers))
        print("BIASES", len(consts))

    for tensor in layers + consts:
        tensor.requires_grad_()

    return layers, consts

In [90]:
# Sanity check: with debug=True, init_coeffs prints how many layer sizes,
# weight matrices and biases it builds. The returned coefficients are discarded.
_ = init_coeffs(N_COEFFS, debug=True)

N: 4
LAYERS: 3
BIASES 3


In [20]:
def calc_preds(coeffs, indeps):
    """Run the multi-layer network forward and return its predictions.

    Args:
        coeffs: ``(layers, consts)`` with one weight matrix and one bias per
            layer.
        indeps: input feature matrix, one row per sample.

    Returns:
        Tensor of values in (0, 1) produced by the final sigmoid.
    """
    layers, consts = coeffs
    last_index = len(layers) - 1
    activations = indeps

    for depth, weights in enumerate(layers):
        activations = activations @ weights + consts[depth]

        # ReLU follows every layer except the output layer.
        is_output_layer = depth == last_index
        if not is_output_layer:
            activations = F.relu(activations)

    return torch.sigmoid(activations)

In [21]:
def update_coeffs(coeffs, lr):
    """Apply one gradient-descent step to every weight matrix and bias.

    ``coeffs`` is the ``(layers, consts)`` pair; every tensor in both lists is
    updated in place and its gradient is reset afterwards, so callers are
    expected to invoke this inside ``torch.no_grad()``.
    """
    weights, biases = coeffs

    for param in weights + biases:
        step = param.grad * lr
        param.sub_(step)
        # Gradients accumulate across backward passes, so reset them here.
        param.grad.zero_()

In [23]:
def train_model(epochs: int = 20, lr: float = 0.01):
    """Train the deep network on the training split.

    The torch RNG is seeded with 42 first, so the initial coefficients are the
    same on every call. The loss of each epoch is printed as training runs.

    Args:
        epochs: number of passes over the training data.
        lr: learning rate used for every update.

    Returns:
        The trained ``(layers, consts)`` coefficients.
    """
    torch.manual_seed(42)
    coeffs = init_coeffs(N_COEFFS)

    for epoch in range(epochs):
        loss = one_epoch(coeffs, lr, trn_x, trn_y)
        print(f"epoch: {epoch},", f"{loss:.3f}")

    return coeffs

In [24]:
# Train the deep network for 60 epochs with a learning rate of 1.4.
coeffs = train_model(epochs=60, lr=1.4)

epoch: 0, 0.548
epoch: 1, 0.496
epoch: 2, 0.490
epoch: 3, 0.482
epoch: 4, 0.463
epoch: 5, 0.395
epoch: 6, 0.378
epoch: 7, 0.374
epoch: 8, 0.370
epoch: 9, 0.354
epoch: 10, 0.324
epoch: 11, 0.308
epoch: 12, 0.312
epoch: 13, 0.349
epoch: 14, 0.243
epoch: 15, 0.217
epoch: 16, 0.214
epoch: 17, 0.210
epoch: 18, 0.206
epoch: 19, 0.204
epoch: 20, 0.202
epoch: 21, 0.201
epoch: 22, 0.200
epoch: 23, 0.199
epoch: 24, 0.199
epoch: 25, 0.198
epoch: 26, 0.198
epoch: 27, 0.197
epoch: 28, 0.197
epoch: 29, 0.196
epoch: 30, 0.196
epoch: 31, 0.196
epoch: 32, 0.196
epoch: 33, 0.195
epoch: 34, 0.195
epoch: 35, 0.195
epoch: 36, 0.195
epoch: 37, 0.195
epoch: 38, 0.195
epoch: 39, 0.195
epoch: 40, 0.194
epoch: 41, 0.194
epoch: 42, 0.194
epoch: 43, 0.194
epoch: 44, 0.194
epoch: 45, 0.194
epoch: 46, 0.194
epoch: 47, 0.194
epoch: 48, 0.194
epoch: 49, 0.194
epoch: 50, 0.194
epoch: 51, 0.194
epoch: 52, 0.194
epoch: 53, 0.194
epoch: 54, 0.194
epoch: 55, 0.194
epoch: 56, 0.193
epoch: 57, 0.193
epoch: 58, 0.193
epoch: 

In [25]:
# Accuracy of the deep network's coefficients on the validation split, at
# get_accuracy's default threshold.
get_accuracy(coeffs, val_x, val_y)

tensor(0.8258)

The accuracy has not improved compared with the single-hidden-layer network or the simple linear model. Deep (and even shallow) neural networks are designed to perform well when there is much more data and the underlying problem is more complex than this one.