# Lesson 5: Linear model and Neural Network From Scratch
A notebook based on Lesson 5 of the Fast AI course.

In this notebook, we will train a linear model from scratch using basic pytorch tensors.
This follows a similar process to the spreadsheet approach from Lesson 3, [here](https://docs.google.com/spreadsheets/d/1hma4bTEFuiS483djqE5dPoLlbsSQOTioqMzsesZGUGI/edit?usp=sharing).

## 0. Set up
Download the source data from Kaggle.

In [1]:
import os
from pathlib import Path
import zipfile

import kaggle


DATA_DIR = Path().absolute().parents[1] / 'datasets' / 'fastai' / 'lesson5'
DATA_FNAME = 'titanic'

if not DATA_DIR.exists():
    kaggle.api.competition_download_cli(str(DATA_FNAME), path=DATA_DIR)
    zipfile.ZipFile(f'{DATA_DIR / DATA_FNAME}.zip').extractall(DATA_DIR)



## 1. Feature engineering
Clean the data, impute missing values and calcualte some meaningful features to train on.

In [2]:
import numpy as np
import pandas as pd

In [3]:
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
continuous_cols = ['AgeNorm', 'SibSp', 'Parch', 'Family', 'Alone', 'LogFare', 'TicketFreq']
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Deck', 'Title']
    

def calculate_features(df):
    """Add engineered features for the titanic dataset."""
    df = df.copy()

    # Min-max scaler on Age
    df['AgeNorm'] = df['Age'].fillna(df['Age'].mean()) / df['Age'].max()

    # Log of Fares to tame the long tail
    df['LogFare'] = np.log1p(df['Fare'])

    # Group Cabins by Deck
    df['Cabin'].fillna(df['Cabin'].mode().iloc[0], inplace=True)
    df['Deck'] = df['Cabin'].str[0].map(dict(A="ABC", B="ABC", C="ABC", D="DE", E="DE", F="FG", G="FG")).fillna("Other")

    # Features based on family members
    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0

    # Did multiple people travel on the same ticket
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # Use just the title portion of the Name field
    df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('. ', expand=True)[0]
    df['Title'] = df['Title'].map({'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master'}).fillna("Other")

    # One-hot encode categorical variables
    categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Deck', 'Title']
    df['Embarked'].fillna(df['Embarked'].mode().iloc[0], inplace=True)
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    df['const'] = 1

    # Filter the output columns
    categorical_cols_output = []
    for cat_col in categorical_cols:
        categorical_cols_output.extend([k for k in df.columns if f'{cat_col}_' in k])
    output_cols = continuous_cols + categorical_cols_output

    return df[output_cols] * 1.

In [7]:
X_train = calculate_features(train_df)
X_train.head()

Unnamed: 0,AgeNorm,SibSp,Parch,Family,Alone,LogFare,TicketFreq,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,Deck_DE,Deck_FG,Deck_Other,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.275,1.0,0.0,1.0,0.0,2.110213,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.475,1.0,0.0,1.0,0.0,4.280593,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.325,0.0,0.0,0.0,1.0,2.188856,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.4375,1.0,0.0,1.0,0.0,3.990834,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.4375,0.0,0.0,0.0,1.0,2.202765,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
y_train = train_df['Survived'] * 1.
y_train.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

## 2. Train a linear model
Train a linear model "from scratch".

### 2.1. Initialise random weights

In [9]:
import torch

In [10]:
num_features = X_train.shape[1]
num_layers = 1

In [11]:
def initialise_weights(num_layers, num_features):
    weights = torch.rand([num_layers, num_features]) - 0.5
    weights.requires_grad_()
    return weights

In [12]:
weights = initialise_weights(num_layers, num_features)
weights

tensor([[-0.0361, -0.4260, -0.0935,  0.3016, -0.4976, -0.0761, -0.4685,  0.4407,
          0.3553,  0.2118, -0.1979, -0.4992,  0.2750, -0.0418,  0.1658,  0.3786,
          0.2520, -0.1228,  0.4572]], requires_grad=True)

### 2.2. Define the model and loss function
This is just a linear model with MAE loss function to start with.

In [13]:
def calculate_loss(predictions, actual):
    """Mean Absolute Error"""
    return (predictions - actual).abs().mean()
    

def calculate_predictions(features, weights):
    """Linear model to calculate predictions."""
    return features @ weights.T

In [14]:
t_independent = torch.tensor(X_train.values, dtype=torch.float)
t_independent

tensor([[0.2750, 1.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.4750, 1.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.3250, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.3712, 1.0000, 2.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3250, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.4000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000]])

In [15]:
t_dependent = torch.tensor(y_train.values, dtype=torch.float)
t_dependent

tensor([0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1.,
        0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
        0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
        0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1.,
        1., 0., 1., 0., 1., 0., 0., 0., 

In [16]:
preds = calculate_predictions(t_independent, weights)
preds[:5]

tensor([[-0.4434],
        [-1.0585],
        [-0.9097],
        [-2.0028],
        [-0.8296]], grad_fn=<SliceBackward0>)

In [17]:
preds.shape

torch.Size([891, 1])

In [18]:
loss = calculate_loss(preds, t_dependent)
loss

tensor(1.4634, grad_fn=<MeanBackward0>)

### 2.3 Gradient descent step
Calculate an epoch of gradient descent.

In [19]:
LEARNING_RATE = 1e-3

In [20]:
weights = initialise_weights(num_layers, num_features)
weights

tensor([[-0.0293, -0.2715,  0.0638,  0.0363, -0.0516, -0.3514, -0.1332,  0.2533,
          0.1105, -0.2790, -0.2712, -0.2183,  0.0045, -0.4709,  0.4644,  0.3106,
          0.3412,  0.2424, -0.1394]], requires_grad=True)

In [21]:
# Calculate the loss using the current weights
preds = calculate_predictions(t_independent, weights)
loss = calculate_loss(preds, t_dependent)
print(loss)

# Calculate the gradients
loss.backward()

with torch.no_grad():
    # Take a gradient descent step
    weights.subtract_(weights.grad * LEARNING_RATE)

    # Reset gradients ready for the next step. Otherwise pytorch adds gradients on the next epoch.
    weights.grad.zero_()

    preds = calculate_predictions(t_independent, weights)
    loss = calculate_loss(preds, t_dependent)
    print(loss)

tensor(1.7557, grad_fn=<MeanBackward0>)
tensor(1.7403)


The loss reduced as we hoped.

In [22]:
def calculate_epoch(weights, t_independent, t_dependent, loss=None):
    """Run one epoch and return the weights and loss"""
    
    if loss is None:
        # Use the loss from the previous epoch if passed in, otherwise calculate the loss using the current weights
        preds = calculate_predictions(t_independent, weights)
        loss = calculate_loss(preds, t_dependent)

    # Calculate the gradients
    loss.backward()

    with torch.no_grad():
        # Take a gradient descent step
        weights.subtract_(weights.grad * LEARNING_RATE)

        # Reset gradients ready for the next step. Otherwise pytorch adds gradients on the next epoch.
        weights.grad.zero_()

    preds = calculate_predictions(t_independent, weights)
    loss = calculate_loss(preds, t_dependent)

    return weights, loss

In [29]:
weights = initialise_weights(num_layers, num_features)
weights

tensor([[-0.1442,  0.1342, -0.0528, -0.1240,  0.3205,  0.4128, -0.0689,  0.4535,
          0.1407, -0.2449,  0.2137, -0.4818,  0.1861,  0.1985, -0.2021,  0.2186,
         -0.4276, -0.1588, -0.0288]], requires_grad=True)

In [30]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=None)
print(float(loss))

0.6520402431488037


In [31]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.650128185749054


In [32]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.648271918296814


In [33]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.6465025544166565


In [34]:
weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
print(float(loss))

0.6447898149490356


### 2.4. Loop through multiple epochs
Run multiple epochs of gradient descent.

In [35]:
NUM_EPOCHS = 20
LEARNING_RATE = 0.05

In [37]:
weights = initialise_weights(num_layers, num_features)
loss = None

for epoch in range(NUM_EPOCHS):
    weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
    print(float(loss))

1.2274833917617798
0.6289820671081543
0.5323521494865417
0.5231783986091614
0.5187276005744934
0.5152965784072876
0.5122267007827759
0.5097238421440125
0.5074202418327332
0.5051789879798889
0.5029891729354858
0.5008192658424377
0.4987630844116211
0.49666815996170044
0.4947459101676941
0.49274128675460815
0.49112579226493835
0.4897114932537079
0.48834651708602905
0.48723021149635315


In [38]:
def train_linear_model(t_independent, t_dependent, num_layers, num_epochs):
    num_features = t_independent.shape[1]
    weights = initialise_weights(num_layers, num_features)
    loss = None

    for epoch in range(num_epochs):
        weights, loss = calculate_epoch(weights, t_independent, t_dependent, loss=loss)
        if epoch % 10 == 0: print(epoch, float(loss))

    return weights, loss

In [39]:
weights, loss = train_linear_model(t_independent, t_dependent, num_layers=1, num_epochs=100)

0 0.5080814361572266
10 0.4840608239173889
20 0.46877947449684143
30 0.45717185735702515
40 0.4491163492202759
50 0.44316354393959045
60 0.4385168254375458
70 0.43420925736427307
80 0.43203839659690857
90 0.4301871061325073


In [40]:
dict(zip(X_train.columns, *weights.requires_grad_(False)))

{'AgeNorm': tensor(-0.3547),
 'SibSp': tensor(-0.2115),
 'Parch': tensor(-0.1660),
 'Family': tensor(0.2110),
 'Alone': tensor(0.0404),
 'LogFare': tensor(0.0238),
 'TicketFreq': tensor(-0.0129),
 'Pclass_2': tensor(0.2006),
 'Pclass_3': tensor(0.1722),
 'Sex_male': tensor(-0.0697),
 'Embarked_Q': tensor(-0.3245),
 'Embarked_S': tensor(-0.2680),
 'Deck_DE': tensor(0.2622),
 'Deck_FG': tensor(-0.1807),
 'Deck_Other': tensor(0.2796),
 'Title_Miss': tensor(0.1851),
 'Title_Mr': tensor(0.2615),
 'Title_Mrs': tensor(0.2468),
 'Title_Other': tensor(0.4242)}

### 2.5. Measure accuracy

In [41]:
def calculate_accuracy(predictions, actuals):
    return (predictions == actuals).mean()

In [42]:
predictions_prob = calculate_predictions(t_independent, weights)
predictions_classifications = predictions_prob > 0.5
predictions_classifications = pd.Series(predictions_classifications.squeeze()).astype(int)

calculate_accuracy(predictions_classifications, y_train.astype(int))

0.6172839506172839

### 2.6. Add a sigmoid
The output layer of a binary classifier should be bounded between 0 and 1.
A sigmoid function handles this.

In [43]:
def calculate_predictions(features, weights):
    """Linear model to calculate predictions."""
    return torch.sigmoid(features @ weights.T)

### 2.7 Train the model on a training set and compare validation set accuracy

In [44]:
import random

In [45]:
def validation_set_split(data, validation_pct):
    """Given a collection of input data, return the indexes of the training and validation sets."""
    num_observations = data.shape[0]
    validation_size = int(num_observations * validation_pct)
    validation_idxs = sorted(random.sample(range(num_observations), validation_size))
    training_idxs = list(set(range(num_observations)).difference(set(validation_idxs)))
    return training_idxs, validation_idxs

In [46]:
training_idxs, validation_idxs = validation_set_split(t_independent, validation_pct=0.2)

def train_linear_model(t_independent, t_dependent, num_layers, num_epochs, validation_pct):
    # Split training/validation data
    X_train = t_independent[training_idxs]
    y_train = t_dependent[training_idxs]
    X_validation = t_independent[validation_idxs]
    y_validation = t_dependent[validation_idxs]

    # Initialise weights
    num_features = t_independent.shape[1]
    weights = initialise_weights(num_layers, num_features)
    loss = None

    # Gradient descent
    for epoch in range(num_epochs):
        weights, loss = calculate_epoch(weights, X_train, y_train, loss=loss)
        
        if epoch % 10 == 0:
            validation_predictions = calculate_predictions(X_validation, weights)
            validation_loss = calculate_loss(validation_predictions, y_validation)
            print(epoch, float(validation_loss))

    return weights, loss

In [53]:
weights, loss = train_linear_model(t_independent, t_dependent, num_layers=1, num_epochs=51, validation_pct=0.1)

0 0.4971848428249359
10 0.49094733595848083
20 0.4856831431388855
30 0.48164814710617065
40 0.4786932170391083
50 0.47654294967651367


In [60]:
def calculate_predicted_classifications(predictions_prob):
    predictions_classifications = predictions_prob > 0.5
    return pd.Series(predictions_classifications.squeeze()).astype(int)
    

In [61]:
predictions_prob = calculate_predictions(t_independent[validation_idxs], weights)
predictions_classifications = calculate_predicted_classifications(predictions_prob)

calculate_accuracy(predictions_classifications, y_train[validation_idxs].reset_index(drop=True).astype(int))

0.5337078651685393

### 2.8. Submit to Kaggle

In [62]:
test_feature_df = calculate_features(test_df)
test_feature_df['Deck_Other'] = 0.
test_feature_df = test_feature_df[X_train.columns]
test_feature_df

Unnamed: 0,AgeNorm,SibSp,Parch,Family,Alone,LogFare,TicketFreq,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,Deck_DE,Deck_FG,Deck_Other,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.453947,0.0,0.0,0.0,1.0,2.178064,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.618421,1.0,0.0,1.0,0.0,2.079442,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.815789,0.0,0.0,0.0,1.0,2.369075,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.355263,0.0,0.0,0.0,1.0,2.268252,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.289474,1.0,1.0,2.0,0.0,2.586824,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.398324,0.0,0.0,0.0,1.0,2.202765,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
414,0.513158,0.0,0.0,0.0,1.0,4.699571,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
415,0.506579,0.0,0.0,0.0,1.0,2.110213,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
416,0.398324,0.0,0.0,0.0,1.0,2.202765,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [63]:
test_tensor = torch.tensor(test_feature_df.values, dtype=torch.float)
test_tensor

tensor([[0.4539, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.6184, 1.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.8158, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        ...,
        [0.5066, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.3983, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.3983, 1.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000]])

In [65]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [74]:
test_predictions_prob = calculate_predictions(test_tensor, weights)
test_predictions_classifications = calculate_predicted_classifications(test_predictions_prob)

In [75]:
test_predictions_classifications.shape

(418,)

In [77]:
output_df = test_df[['PassengerId']]
output_df['Survived'] = test_predictions_classifications
output_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df['Survived'] = test_predictions_classifications


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [78]:
output_df.to_csv('linear_model_output.csv')

In [79]:
output_df['Survived'].value_counts()

Survived
0    418
Name: count, dtype: int64