# Lesson 5: Linear model and Neural Network From Scratch
A notebook based on Lesson 5 of the Fast AI course.

In this notebook, we will train a linear model from scratch using basic pytorch tensors.
This follows a similar process to the spreadsheet approach from Lesson 3, [here](https://docs.google.com/spreadsheets/d/1hma4bTEFuiS483djqE5dPoLlbsSQOTioqMzsesZGUGI/edit?usp=sharing).

## 0. Set up
Download the source data from Kaggle.

In [1]:
import os
from pathlib import Path
import zipfile

import kaggle


DATA_DIR = Path().absolute().parents[1] / 'datasets' / 'fastai' / 'lesson5'
DATA_FNAME = 'titanic'

if not DATA_DIR.exists():
    kaggle.api.competition_download_cli(str(DATA_FNAME), path=DATA_DIR)
    zipfile.ZipFile(f'{DATA_DIR / DATA_FNAME}.zip').extractall(DATA_DIR)



## 1. Feature engineering
Clean the data, impute missing values and calcualte some meaningful features to train on.

In [2]:
import numpy as np
import pandas as pd

In [3]:
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
continuous_cols = ['AgeNorm', 'SibSp', 'Parch', 'Family', 'Alone', 'LogFare', 'TicketFreq']
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Deck', 'Title']
    

def calculate_features(df):
    """Add engineered features for the titanic dataset."""
    df = df.copy()

    # Min-max scaler on Age
    df['AgeNorm'] = df['Age'].fillna(df['Age'].mean()) / df['Age'].max()

    # Log of Fares to tame the long tail
    df['LogFare'] = np.log1p(df['Fare'])

    # Group Cabins by Deck
    df['Cabin'].fillna(df['Cabin'].mode().iloc[0], inplace=True)
    df['Deck'] = df['Cabin'].str[0].map(dict(A="ABC", B="ABC", C="ABC", D="DE", E="DE", F="FG", G="FG")).fillna("Other")

    # Features based on family members
    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0

    # Did multiple people travel on the same ticket
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # Use just the title portion of the Name field
    df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('. ', expand=True)[0]
    df['Title'] = df['Title'].map({'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master'}).fillna("Other")

    # One-hot encode categorical variables
    categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Deck', 'Title']
    df['Embarked'].fillna(df['Embarked'].mode().iloc[0], inplace=True)
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    df['const'] = 1

    # Filter the output columns
    categorical_cols_output = []
    for cat_col in categorical_cols:
        categorical_cols_output.extend([k for k in df.columns if f'{cat_col}_' in k])
    output_cols = continuous_cols + categorical_cols_output

    return df[output_cols] * 1.

In [6]:
X_train = calculate_features(train_df)
X_train.head()

Unnamed: 0,AgeNorm,SibSp,Parch,Family,Alone,LogFare,TicketFreq,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,Deck_DE,Deck_FG,Deck_Other,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.275,1.0,0.0,1.0,0.0,2.110213,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.475,1.0,0.0,1.0,0.0,4.280593,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.325,0.0,0.0,0.0,1.0,2.188856,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.4375,1.0,0.0,1.0,0.0,3.990834,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.4375,0.0,0.0,0.0,1.0,2.202765,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
y_train = train_df['Survived'] * 1.
y_train.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

## 2. Train a linear model
Train a linear model "from scratch".

### 2.1. Initialise random weights

In [8]:
import torch

In [9]:
num_features = X_train.shape[1]
num_layers = 1

In [10]:
def initialise_weights(num_layers, num_features):
    weights = torch.rand([num_layers, num_features]) - 0.5
    weights.requires_grad_()
    return weights

In [11]:
weights = initialise_weights(num_layers, num_features)
weights

tensor([[-0.3417,  0.4206,  0.1409,  0.4548,  0.2176,  0.3695,  0.2355, -0.0815,
          0.3598,  0.3943,  0.1020, -0.3897,  0.1433,  0.3704, -0.1163, -0.2272,
          0.0212,  0.3592,  0.1844]], requires_grad=True)

### 2.2. Define the model and loss function
This is just a linear model with MAE loss function to start with.

In [12]:
def calculate_loss(predictions, actual):
    """Mean Absolute Error"""
    return (predictions - actual).abs().mean()
    

def calculate_predictions(features, weights):
    """Linear model to calculate predictions."""
    return (features @ weights.T).squeeze()

In [13]:
t_independent = torch.tensor(X_train.values, dtype=torch.float)
t_independent

tensor([[0.2750, 1.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.4750, 1.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.3250, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.3712, 1.0000, 2.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3250, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.4000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000]])

In [14]:
t_dependent = torch.tensor(y_train.values, dtype=torch.float)
t_dependent

tensor([0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1.,
        0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
        0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1.,
        1., 0., 1., 0., 1., 0., 0., 0., 

In [15]:
preds = calculate_predictions(t_independent, weights)
preds[:5]

tensor([2.1823, 2.8894, 0.8938, 2.6410, 1.5032], grad_fn=<SliceBackward0>)

In [16]:
preds.shape

torch.Size([891])

In [17]:
(preds-t_dependent).shape

torch.Size([891])

In [18]:
loss = calculate_loss(preds, t_dependent)
loss

tensor(2.0370, grad_fn=<MeanBackward0>)

### 2.3 Gradient descent step
Calculate an epoch of gradient descent.

In [19]:
LEARNING_RATE = 5e-3

In [20]:
weights = initialise_weights(num_layers, num_features)
weights

tensor([[-0.2009,  0.0197, -0.0083,  0.3054,  0.1181, -0.3794,  0.3723,  0.3662,
          0.2323, -0.0597,  0.0644, -0.2248, -0.4564,  0.2257,  0.4999, -0.4412,
         -0.0720, -0.2849,  0.2854]], requires_grad=True)

In [21]:
# Calculate the loss using the current weights
preds = calculate_predictions(t_independent, weights)
loss = calculate_loss(preds, t_dependent)
print(loss)

# Calculate the gradients
loss.backward()

with torch.no_grad():
    # Take a gradient descent step
    weights.subtract_(weights.grad * LEARNING_RATE)

    # Reset gradients ready for the next step. Otherwise pytorch adds gradients on the next epoch.
    weights.grad.zero_()

    preds = calculate_predictions(t_independent, weights)
    loss = calculate_loss(preds, t_dependent)
    print(loss)

tensor(1.0689, grad_fn=<MeanBackward0>)
tensor(1.0371)


The loss reduced as we hoped.

In [22]:
def calculate_epoch(weights, t_independent, t_dependent, loss=None):
    """Run one epoch and return the weights and loss"""
    
    if loss is None:
        # Use the loss from the previous epoch if passed in, otherwise calculate the loss using the current weights
        preds = calculate_predictions(t_independent, weights)
        loss = calculate_loss(preds, t_dependent)

    # Calculate the gradients
    loss.backward()

    with torch.no_grad():
        # Take a gradient descent step
        weights.subtract_(weights.grad * LEARNING_RATE)

        # Reset gradients ready for the next step. Otherwise pytorch adds gradients on the next epoch.
        weights.grad.zero_()

    preds = calculate_predictions(t_independent, weights)
    loss = calculate_loss(preds, t_dependent)

    return weights, loss

In [23]:
weights = initialise_weights(num_layers, num_features)
weights

tensor([[-0.2225, -0.2552, -0.0402, -0.3125, -0.3008, -0.2943,  0.3436, -0.4569,
          0.2370, -0.2990, -0.1736,  0.2465,  0.3021, -0.0868,  0.4510,  0.3277,
          0.3800, -0.0745, -0.1727]], requires_grad=True)

In [24]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=None)
print(float(loss))

1.0088685750961304


In [25]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.9469995498657227


In [26]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.9004923701286316


In [27]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.8665304780006409


In [28]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.8376653790473938


### 2.4. Loop through multiple epochs
Run multiple epochs of gradient descent.

In [29]:
NUM_EPOCHS = 20

In [30]:
weights = initialise_weights(num_layers, num_features)
loss = None

for epoch in range(NUM_EPOCHS):
    weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
    print(float(loss))

0.9272493720054626
0.9168375730514526
0.906815230846405
0.8972134590148926
0.8881422877311707
0.8793549537658691
0.8708175420761108
0.8626936674118042
0.8548486232757568
0.847531795501709
0.8406753540039062
0.8341706991195679
0.8279553651809692
0.8219183087348938
0.8159963488578796
0.8102097511291504
0.804700493812561
0.7993839979171753
0.7942628264427185
0.7892581820487976


In [31]:
def train_linear_model(t_independent, t_dependent, num_layers, num_epochs):
    num_features = t_independent.shape[1]
    weights = initialise_weights(num_layers, num_features)
    loss = None

    for epoch in range(num_epochs):
        weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
        if epoch % 10 == 0: print(epoch, float(loss))

    return weights, loss

In [32]:
weights, loss = train_linear_model(t_independent, t_dependent, num_layers=1, num_epochs=101)

0 1.2284592390060425
10 0.5932685732841492
20 0.4637145400047302
30 0.45600515604019165
40 0.45087799429893494
50 0.44623124599456787
60 0.4419229030609131
70 0.4378982484340668
80 0.43414369225502014
90 0.430573970079422
100 0.42713913321495056


In [33]:
dict(zip(X_train.columns, *weights.requires_grad_(False)))

{'AgeNorm': tensor(0.3430),
 'SibSp': tensor(-0.0613),
 'Parch': tensor(-0.0990),
 'Family': tensor(-0.0413),
 'Alone': tensor(-0.4125),
 'LogFare': tensor(0.0958),
 'TicketFreq': tensor(0.0947),
 'Pclass_2': tensor(-0.2630),
 'Pclass_3': tensor(-0.3663),
 'Sex_male': tensor(-0.4655),
 'Embarked_Q': tensor(-0.2820),
 'Embarked_S': tensor(0.4286),
 'Deck_DE': tensor(-0.0938),
 'Deck_FG': tensor(0.1596),
 'Deck_Other': tensor(-0.1830),
 'Title_Miss': tensor(0.2965),
 'Title_Mr': tensor(0.4060),
 'Title_Mrs': tensor(0.1030),
 'Title_Other': tensor(-0.4593)}

### 2.5. Measure accuracy

In [34]:
def calculate_accuracy(predictions, actuals):
    return (predictions == actuals).mean()

In [35]:
predictions_prob = calculate_predictions(t_independent, weights)
predictions_classifications = predictions_prob > 0.5
predictions_classifications = pd.Series(predictions_classifications.squeeze()).astype(int)

calculate_accuracy(predictions_classifications, y_train.astype(int))

0.6823793490460157

In [36]:
pd.DataFrame(predictions_prob).describe()

Unnamed: 0,0
count,891.0
mean,0.246126
std,0.451054
min,-1.294903
25%,-0.007012
50%,0.197942
75%,0.570054
max,1.501532


### 2.6. Add a sigmoid
The output layer of a binary classifier should be bounded between 0 and 1.
A sigmoid function handles this.

In [37]:
def calculate_predictions(features, weights):
    """Linear model to calculate predictions."""
    return torch.sigmoid(features @ weights.T).squeeze()
    # return (features @ weights.T).squeeze()
    # return torch.clamp(features @ weights.T, 0, 1).squeeze()

### 2.7 Train the model on a training set and compare validation set accuracy

In [38]:
import random

In [39]:
validation_pct=0.2

In [40]:
def validation_set_split(data, validation_pct):
    """Given a collection of input data, return the indexes of the training and validation sets."""
    num_observations = data.shape[0]
    validation_size = int(num_observations * validation_pct)
    validation_idxs = sorted(random.sample(range(num_observations), validation_size))
    training_idxs = list(set(range(num_observations)).difference(set(validation_idxs)))
    return training_idxs, validation_idxs

In [41]:
training_idxs, validation_idxs = validation_set_split(t_independent, validation_pct=validation_pct)

def train_linear_model(t_independent, t_dependent, num_layers, num_epochs):
    # Split training/validation data
    X_train = t_independent[training_idxs]
    y_train = t_dependent[training_idxs]
    X_validation = t_independent[validation_idxs]
    y_validation = t_dependent[validation_idxs]

    # Initialise weights
    num_features = t_independent.shape[1]
    weights = initialise_weights(num_layers, num_features)
    loss = None

    # Gradient descent
    for epoch in range(num_epochs):
        weights, loss = calculate_epoch(weights, X_train, y_train, loss=loss)
        
        if epoch % 10 == 0:
            training_predictions = calculate_predictions(X_train, weights)
            training_loss = calculate_loss(training_predictions, y_train)

            validation_predictions = calculate_predictions(X_validation, weights)
            validation_loss = calculate_loss(validation_predictions, y_validation)
            print(epoch, float(training_loss), float(validation_loss))

    return weights, loss

In [42]:
LEARNING_RATE = 0.05
weights, loss = train_linear_model(t_independent, t_dependent, num_layers=1, num_epochs=201)

0 0.5023815631866455 0.48137643933296204
10 0.4803462028503418 0.4576272964477539
20 0.45871680974960327 0.4345540702342987
30 0.43934422731399536 0.41457486152648926
40 0.42293399572372437 0.39863744378089905
50 0.4091772437095642 0.3863670229911804
60 0.39735060930252075 0.3768177628517151
70 0.38685426115989685 0.36903780698776245
80 0.37758153676986694 0.36243802309036255
90 0.36982017755508423 0.3569122552871704
100 0.36362501978874207 0.35240206122398376
110 0.358633816242218 0.3485826253890991
120 0.3543953001499176 0.3450676500797272
130 0.3505760729312897 0.34159815311431885
140 0.34697502851486206 0.33805081248283386
150 0.3434840440750122 0.33439019322395325
160 0.3400515615940094 0.33062809705734253
170 0.3366592526435852 0.3267989754676819
180 0.3333069086074829 0.322945773601532
190 0.33000338077545166 0.31911081075668335
200 0.32676100730895996 0.31533166766166687


In [43]:
pd.DataFrame(calculate_predictions(t_independent[validation_idxs], weights).detach().numpy()).describe()

Unnamed: 0,0
count,178.0
mean,0.268347
std,0.21441
min,0.014413
25%,0.093924
50%,0.147523
75%,0.420582
max,0.820426


In [44]:
def calculate_predicted_classifications(predictions_prob):
    predictions_classifications = predictions_prob > 0.5
    return pd.Series(predictions_classifications.squeeze()).astype(int)
    

In [45]:
predictions_prob = calculate_predictions(t_independent[validation_idxs], weights)
predictions_classifications = calculate_predicted_classifications(predictions_prob)

calculate_accuracy(predictions_classifications, y_train[validation_idxs].reset_index(drop=True).astype(int))

0.7359550561797753

### 2.8. Submit to Kaggle

In [46]:
test_feature_df = calculate_features(test_df)
test_feature_df['Deck_Other'] = 0.
test_feature_df = test_feature_df[X_train.columns]
test_feature_df

Unnamed: 0,AgeNorm,SibSp,Parch,Family,Alone,LogFare,TicketFreq,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,Deck_DE,Deck_FG,Deck_Other,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.453947,0.0,0.0,0.0,1.0,2.178064,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.618421,1.0,0.0,1.0,0.0,2.079442,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.815789,0.0,0.0,0.0,1.0,2.369075,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.355263,0.0,0.0,0.0,1.0,2.268252,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.289474,1.0,1.0,2.0,0.0,2.586824,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.398324,0.0,0.0,0.0,1.0,2.202765,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
414,0.513158,0.0,0.0,0.0,1.0,4.699571,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
415,0.506579,0.0,0.0,0.0,1.0,2.110213,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
416,0.398324,0.0,0.0,0.0,1.0,2.202765,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [47]:
test_tensor = torch.tensor(test_feature_df.values, dtype=torch.float)
test_predictions_prob = calculate_predictions(test_tensor, weights)
test_predictions_classifications = calculate_predicted_classifications(test_predictions_prob)

In [48]:
test_predictions_classifications.shape

(418,)

In [49]:
output_df = test_df[['PassengerId']]
output_df['Survived'] = test_predictions_classifications
output_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df['Survived'] = test_predictions_classifications


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [50]:
output_df.to_csv('linear_model_output_sigmoid2.csv', index=False)

In [51]:
output_df['Survived'].value_counts()

Survived
0    320
1     98
Name: count, dtype: int64

## 3. Train a neural network from scratch
Repeat the previous process but with multiple hidden layers

### 3.1. Initialise weights
We will now have weights per layer of the NN, where the size of each depends on the size of the hidder layers.

In [52]:
num_features = X_train.shape[1]
num_hidden_1 = 20

In [53]:
def initialise_weights_nn(num_features, num_hidden_1):
    # Divide by num_hidden so the magnitudes are comparable to the linear model when they're summed at the end
    layer_1 = (torch.rand([num_features, num_hidden_1]) - 0.5) / num_hidden_1
    layer_1.requires_grad_()
    
    # The 0.3 magic constant is a heuristic value to make the model train better
    layer_2 = torch.rand([num_hidden_1, 1]) - 0.3
    layer_2.requires_grad_()
    
    const = torch.rand(1)[0]
    const.requires_grad_()
    
    return layer_1, layer_2, const

In [54]:
weights = initialise_weights_nn(num_features, num_hidden_1)
layer_1, layer_2, const = weights

### 3.2. Implement the new model

In [55]:
import torch.nn.functional as F

In [56]:
def calculate_predictions_nn(features, weights):
    layer_1, layer_2, const = weights
    res = F.relu(features @ layer_1)
    res = res @ layer_2 + const
    # return torch.sigmoid(res)
    return torch.clamp(res, 0, 1)

In [57]:
calculate_predictions_nn(t_independent, (layer_1, layer_2, const))

tensor([[0.7752],
        [0.7802],
        [0.7261],
        [0.8186],
        [0.7652],
        [0.7065],
        [0.7683],
        [0.8691],
        [0.7491],
        [0.7442],
        [0.7893],
        [0.7513],
        [0.7620],
        [0.9077],
        [0.7247],
        [0.7505],
        [0.8967],
        [0.7385],
        [0.8048],
        [0.7021],
        [0.7586],
        [0.7039],
        [0.6708],
        [0.7881],
        [0.8950],
        [0.9357],
        [0.7134],
        [0.9482],
        [0.6714],
        [0.7636],
        [0.7131],
        [0.8061],
        [0.6710],
        [0.7356],
        [0.7788],
        [0.8111],
        [0.7134],
        [0.7622],
        [0.8019],
        [0.7336],
        [0.7882],
        [0.7703],
        [0.7150],
        [0.7741],
        [0.6706],
        [0.7640],
        [0.7408],
        [0.6710],
        [0.7815],
        [0.7998],
        [0.9509],
        [0.7613],
        [0.7747],
        [0.7798],
        [0.7371],
        [0

### 3.3. Reimplement gradient descent
Only the weights.subtract part needs to change, as we now need to loop through the multiple layers. 

In the original notebook, this part of the function is encapsulated in an update_coeffs which can be overwritten.

But I wanted to explicitly see the change.

In [58]:
def calculate_epoch_nn(weights, t_independent, t_dependent, loss=None):
    """Run one epoch and return the weights and loss"""
    if loss is None:
        # Use the loss from the previous epoch if passed in, otherwise calculate the loss using the current weights
        preds = calculate_predictions_nn(t_independent, weights)
        loss = calculate_loss(preds, t_dependent)

    # Calculate the gradients
    loss.backward()

    with torch.no_grad():
        # Take a gradient descent step
        for layer in weights:  # This for loop is the only change compared to the previous calculate_epoch function
            layer.sub_(layer.grad * LEARNING_RATE)
            layer.grad.zero_()

    preds = calculate_predictions_nn(t_independent, weights)
    loss = calculate_loss(preds, t_dependent)

    return weights, loss

### 3.4. Train the neural net

In [59]:
training_idxs, validation_idxs = validation_set_split(t_independent, validation_pct=validation_pct)

def train_nn_model(t_independent, t_dependent, num_hidden_1, num_epochs):
    # Split training/validation data
    X_train = t_independent[training_idxs]
    y_train = t_dependent[training_idxs]
    X_validation = t_independent[validation_idxs]
    y_validation = t_dependent[validation_idxs]

    # Initialise weights
    num_features = t_independent.shape[1]
    weights = initialise_weights_nn(num_features, num_hidden_1)
    loss = None

    # Gradient descent
    for epoch in range(num_epochs):
        weights, loss = calculate_epoch_nn(weights, X_train, y_train, loss=loss)

        # Periodically print loss
        if epoch % 10 == 0:
            training_predictions = calculate_predictions_nn(X_train, weights)
            training_loss = calculate_loss(training_predictions, y_train)
            validation_predictions = calculate_predictions_nn(X_validation, weights)
            validation_loss = calculate_loss(validation_predictions, y_validation)
            print(epoch, float(training_loss), float(validation_loss))

    return weights, loss

In [61]:
LEARNING_RATE = 0.5
weights, loss = train_nn_model(t_independent, t_dependent, num_hidden_1, num_epochs=301)

0 0.409184992313385 0.42848849296569824
10 0.37873926758766174 0.4046975374221802
20 0.3787175118923187 0.4046306014060974
30 0.3786957263946533 0.404563307762146
40 0.378683477640152 0.4045388400554657
50 0.37868162989616394 0.40453559160232544
60 0.37868162989616394 0.40453559160232544
70 0.37868162989616394 0.40453559160232544
80 0.37868162989616394 0.40453559160232544
90 0.37868162989616394 0.40453559160232544
100 0.37868162989616394 0.40453559160232544
110 0.37868162989616394 0.40453559160232544
120 0.37868162989616394 0.40453559160232544
130 0.37868162989616394 0.40453559160232544
140 0.37868162989616394 0.40453559160232544
150 0.37868162989616394 0.40453559160232544
160 0.37868162989616394 0.40453559160232544
170 0.37868162989616394 0.40453559160232544
180 0.37868162989616394 0.40453559160232544
190 0.37868162989616394 0.40453559160232544
200 0.37868162989616394 0.40453559160232544
210 0.37868162989616394 0.40453559160232544
220 0.37868162989616394 0.40453559160232544
230 0.3786

### 3.5. Use the model for predictions

In [62]:
predictions_prob = calculate_predictions_nn(t_independent[validation_idxs], weights)
predictions_classifications = calculate_predicted_classifications(predictions_prob)

calculate_accuracy(predictions_classifications, y_train[validation_idxs].reset_index(drop=True).astype(int))

0.5955056179775281

In [63]:
pd.DataFrame(predictions_prob.detach().numpy()).describe()

Unnamed: 0,0
count,178.0
mean,0.000215
std,0.002875
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.038355


### 3.6. Submit to Kaggle

In [64]:
test_predictions_prob = calculate_predictions_nn(test_tensor, weights)
test_predictions_classifications = calculate_predicted_classifications(test_predictions_prob)

In [65]:
test_predictions_classifications.shape

(418,)

In [66]:
output_df = test_df[['PassengerId']]
output_df['Survived'] = test_predictions_classifications
output_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df['Survived'] = test_predictions_classifications


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [67]:
output_df.to_csv('nn_model_output.csv', index=False)

In [68]:
output_df['Survived'].value_counts()

Survived
0    418
Name: count, dtype: int64