### Forward and backward pass

In [2]:
import pickle, gzip, math, os, time, shutil, torch, matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor

import pandas as pd

torch.manual_seed(42)

mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

In [3]:
mnist_path = Path('data/mnist')
train_df = pd.read_csv(mnist_path/'train.csv')
test_df = pd.read_csv(mnist_path/'test.csv') # for inference

In [6]:
def get_xy(df):
    """Split a DataFrame into a feature array and a label array.

    The first column is treated as the labels and every remaining column as
    the features.

    Args:
        df: DataFrame whose column 0 holds the target and whose columns 1..
            hold the inputs.

    Returns:
        Tuple ``(x, y)`` where ``x`` is the ``.values`` of all columns after
        the first and ``y`` is the ``.values`` of the first column.
    """
    labels = df.iloc[:, 0]
    features = df.iloc[:, 1:]
    return features.values, labels.values

In [7]:
# Pull the feature matrix and label vector out of the training frame
X, y = get_xy(train_df)

# The first 32000 rows are used for training; everything after is held out
# for validation. Each slice is converted to a tensor as it is taken.
x_train, x_valid = tensor(X[:32000]), tensor(X[32000:])
y_train, y_valid = tensor(y[:32000]), tensor(y[32000:])

# Normalize x_train and x_valid: cast to float and divide by 255
# (presumably raw pixel intensities in 0..255 -- confirm against the CSV)
x_train = x_train.float() / 255.
x_valid = x_valid.float() / 255.

In [8]:
# shapes
# n = number of training rows, m = number of input columns per row
n,m = x_train.shape
# NOTE: c is a 0-d tensor (not a Python int) because .max() returns a tensor.
# Assumes labels are the consecutive integers 0..max -- TODO confirm.
c = y_train.max() + 1 # number of classes
n,m,c

(32000, 784, tensor(10))

In [9]:
# Create number of hidden neurons
nh = 50

In [10]:
# Our network: a hidden layer (m -> nh) followed by a single-output layer (nh -> 1).
# Weights are randomly initialized with torch.randn; biases start at zero.
# NOTE: the order of the randn calls (w1, then w2) determines the values drawn
# under the seed set at the top of the notebook, so keep it stable.
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [11]:
# Creating a linear layer
def lin(x, w, b):
    """Linear layer: matrix-multiply ``x`` by ``w``, then add the bias ``b``.

    Args:
        x: Input batch.
        w: Weight matrix whose first dimension matches the last of ``x``.
        b: Bias, broadcast across the rows of ``x @ w``.

    Returns:
        The result of ``x @ w + b``.
    """
    projected = x @ w
    return projected + b

# relu activation function
def relu(x):
    """Rectified linear unit: clamp every element of ``x`` to a minimum of 0.

    Returns a new tensor; ``x`` itself is not modified.
    """
    floor = 0.
    return x.clamp_min(floor)

In [12]:
# Feeding x_train through linear and grabbing shape
t = lin(x_train, w1, b1)
t.shape

torch.Size([32000, 50])

In [13]:
t

tensor([[  5.51,   1.20, -11.81,  ...,   3.12,  -0.65,  -3.11],
        [ 25.07,  -1.47,  -7.63,  ...,  -7.06, -15.77,   4.61],
        [  2.25,   9.32,  -6.63,  ...,   6.35,   4.57,  17.86],
        ...,
        [ -1.02,  -5.44,   5.62,  ...,   6.06,  -4.23,  15.03],
        [  8.19,  -0.80, -11.33,  ...,   5.66,   2.46,   4.89],
        [ 30.54,  11.79,  -5.01,  ...,  -2.40,  -4.23,  36.82]])

In [14]:
# Feed through the relu
t = relu(lin(x_train, w1, b1))
t.shape

torch.Size([32000, 50])

In [15]:
t

tensor([[ 5.51,  1.20,  0.00,  ...,  3.12,  0.00,  0.00],
        [25.07,  0.00,  0.00,  ...,  0.00,  0.00,  4.61],
        [ 2.25,  9.32,  0.00,  ...,  6.35,  4.57, 17.86],
        ...,
        [ 0.00,  0.00,  5.62,  ...,  6.06,  0.00, 15.03],
        [ 8.19,  0.00,  0.00,  ...,  5.66,  2.46,  4.89],
        [30.54, 11.79,  0.00,  ...,  0.00,  0.00, 36.82]])

In [16]:
# Create our first model
def model(xb):
    """Forward pass: linear (w1, b1) -> relu -> linear (w2, b2).

    Reads the notebook-level parameters ``w1``, ``b1``, ``w2`` and ``b2``.

    Args:
        xb: Batch of inputs fed to the first linear layer.

    Returns:
        Output of the second linear layer for every row in ``xb``.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [17]:
# Now we can feed into the whole dataset
res = model(x_train)
res.shape

torch.Size([32000, 1])

In [18]:
# Or batches of the dataset
res = model(x_train[:10])
res.shape

torch.Size([10, 1])

In [19]:
def batch_forward(X, Y, model, batch_size=64):
    """Run ``model`` over ``X`` in mini-batches and collect the outputs.

    The result buffer is sized from the first batch's output, so models that
    emit more than one value per row (e.g. one score per class) work as well
    as the single-column case.

    Args:
        X: Input tensor; batches are taken along its first dimension.
        Y: Target tensor. Only its length and device are used, to size and
            place the result buffer.
        model: Callable mapping a batch of inputs to a batch of outputs.
        batch_size: Number of rows fed to ``model`` per call.

    Returns:
        Float tensor of shape ``(len(Y), *per_row_output_shape)`` holding the
        model outputs in input order. If ``X`` has no rows, a zero tensor
        shaped like ``Y`` with a trailing unit axis is returned.
    """
    res = None
    for i in range(0, X.shape[0], batch_size):
        out = model(X[i:i+batch_size])
        if res is None:
            # Allocate lazily: the per-row output shape is only known once the
            # model has produced its first batch.
            res = torch.zeros((Y.shape[0], *out.shape[1:]), dtype=torch.float, device=Y.device)
        res[i:i+batch_size] = out
    if res is None:
        # No batches were run; keep the original empty-input result.
        res = torch.zeros_like(Y, dtype=torch.float)[:,None]
    return res

In [20]:
res = batch_forward(
    X=x_valid,
    Y=y_valid,
    model=model,
    batch_size=64
)

#### Loss Function: MSE
MSE is not a suitable loss function for multi-class classification; we'll use a better loss function soon. For now, we use it to keep things simple.

In [22]:
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

## IMPORTANT BROADCASTING RULE
If we were to run `res - y_valid` to grab a loss, broadcasting with the shapes above would return a matrix of 10000x10000. This is because broadcasting aligns dimensions from right to left.

The rightmost dimensions are matched up first; any dimension that is missing or of size 1 is then expanded to fit the other tensor.

In [24]:
(res-y_valid).shape

torch.Size([10000, 10000])

Broadcasting essentially treats the operation as a 10000x1 tensor minus a 1x10000 tensor, expanding both to 10000x10000.

In [25]:
# So to do this appropriately we need to add a dimension to y_valid
(res - y_valid[:,None]).shape

torch.Size([10000, 1])

In [26]:
# we could also just remove the dimension from res
# This is more in line with what we want
(res[:,0] - y_valid).shape

torch.Size([10000])

In [31]:
# turn ys into floats
y_train, y_valid = y_train.float(), y_valid.float()

In [32]:
# run predictions through the model
preds = model(x_train)
preds.shape

torch.Size([32000, 1])

In [33]:
# mse loss, mean squared error
def mse(output, target):
    """Mean squared error between the first column of ``output`` and ``target``.

    Args:
        output: 2-D predictions; only column 0 is used.
        target: Targets, one per row of ``output``.

    Returns:
        Scalar tensor holding the mean of the squared differences.
    """
    # Index out column 0 so both operands share one shape; subtracting an
    # (n, 1) tensor from an (n,) tensor would broadcast to (n, n).
    preds = output[:, 0]
    errors = preds - target
    return errors.pow(2).mean()

In [34]:
mse(preds, y_train)

tensor(4345.16)