In [1]:
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms
import numpy as np

Problem with Linear Model: assumes a linear boundary could be drawn and assumes monotonicity: that a relationship between 2 variables stays consistent ad infintium

We could define a linear layer on top of our current linear layer, but an affine function on top of an affine function is still affine, so we gain nothing. Our model is already currently capable of representing any affine function

Given activations, we end up with linear combinations of non-linearities, forming complex non-linear decision boundaries. These become **Universal Approximators** - they can learn any function

## Activations

Differentiable non-linear functions that "Activate" a certain amount in some set space based on the specific value the input takes, but distorts scaling (not all inputs scaled evenly to outputs). All have gradients.

In [104]:
# ReLu Activation - max(0, input) - scaled inputs in the 0 to + inf space (all - inputs mapped to same value => - weights distorted + becomes different function)
t1 = torch.arange(-8.0, 8.0, 0.5)
torch.relu(t1) # Derivative 0 up until 0, then 1, not differentiable at 0
# Amplifies positive activations, does not change negative so positive comparitively larger
# Well behaved - lets argument through if positive so positive weights do not vanish
# Problem: loses all negative information.
# Does not learn negative scaling, just scales weight to 0
# Adjusts positive weights towards optimum, negative weights to 0 (?)
# Continues to adjust extreme weights, boosting confidence if even more extreme, does not adjust low confidence


tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000,
        1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000, 4.0000, 4.5000, 5.0000,
        5.5000, 6.0000, 6.5000, 7.0000, 7.5000])

In [105]:
torch.sigmoid(t1) # Sigmoid activation squashees to [0,1] space
# Goal: break functional linearity
# Problem: linear around 0, may adjusts small weights in a linear way such that linear boundary is not broken
# Derivative close to 0 at extremes

tensor([3.3535e-04, 5.5278e-04, 9.1105e-04, 1.5012e-03, 2.4726e-03, 4.0701e-03,
        6.6929e-03, 1.0987e-02, 1.7986e-02, 2.9312e-02, 4.7426e-02, 7.5858e-02,
        1.1920e-01, 1.8243e-01, 2.6894e-01, 3.7754e-01, 5.0000e-01, 6.2246e-01,
        7.3106e-01, 8.1757e-01, 8.8080e-01, 9.2414e-01, 9.5257e-01, 9.7069e-01,
        9.8201e-01, 9.8901e-01, 9.9331e-01, 9.9593e-01, 9.9753e-01, 9.9850e-01,
        9.9909e-01, 9.9945e-01])

In [106]:
torch.tanh(t1) # Maps to the [-1,1] space, also squashes inputs so that 
# we obtain reasonable activations in linear combinations
# Derivative is 0 at extremes - does not adjust extreme weights (extremely high/low confidence = rule in / out)

tensor([-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -0.9999, -0.9998,
        -0.9993, -0.9982, -0.9951, -0.9866, -0.9640, -0.9051, -0.7616, -0.4621,
         0.0000,  0.4621,  0.7616,  0.9051,  0.9640,  0.9866,  0.9951,  0.9982,
         0.9993,  0.9998,  0.9999,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000])

## Multi Layer Neural Networks

In [107]:
# Function from previous notebook
# Converting to function for future use, default num_workers is 4 bc CPU threads
def load_fashion_mnist(batch_size: int = 512, num_workers: int = 4):
    data_transform = transforms.ToTensor() # Obtaining data to tensor converter
    
    # Downloading data
    mnist_train = torchvision.datasets.FashionMNIST(root = "../data", train = True, transform = data_transform, download= True)  # Defining fashion MNIST train from torch datasets
    mnist_test = torchvision.datasets.FashionMNIST(root = "../data", train = False, transform = data_transform, download = True)
    
    # Loading data onto an iterator
    train_data_loader = data.DataLoader(mnist_train, batch_size, shuffle = True, num_workers = 4)
    test_data_loader = data.DataLoader(mnist_test, batch_size, shuffle = True, num_workers = 4)
    
    # Returning iterator
    return train_data_loader, test_data_loader 
    

In [108]:
train_loader, test_loader = load_fashion_mnist(256, 4)

In [109]:
train_loader, test_loader # Verifying loaders

(<torch.utils.data.dataloader.DataLoader at 0x7f99e49b60d0>,
 <torch.utils.data.dataloader.DataLoader at 0x7f99e488a910>)

Typically choose layer widths in powers of 2, as these tend to be more hardware efficient (don't use uneccessary bit storage)

In [110]:
def relu(X):
    zero_tensor = torch.zeros_like(X) # Defines a 0 tensor of identical shape to a given input
    return torch.max(X, zero_tensor) # PyTorch distributes max comparisons elementwise across 2 tensors

In [111]:
relu(torch.Tensor([[-1, 1, 2, 3], [1,2,3,-4]]))

tensor([[0., 1., 2., 3.],
        [1., 2., 3., 0.]])

In [112]:
# We can use @ for matrix multiplication
t1 @ t1.T # Dot product using matrix multiplication

tensor(684.)

In [113]:
# Defining a multilayer model

# Architecture: flatten input, map to 256 dimensions, break linearity having learned 
# 256 intermediate representations, in breaking linearity map all values to the positive space, keeping only poisitive activations
# Lastly, use these features to reach 10 confidences which will be used to make a prediction.
                            

In [175]:
def init_weights(X):
    if isinstance(X, torch.nn.Linear): # should only initialize weights of linear layers
        torch.nn.init.normal_(X.weight, mean = 0, std = 0.) #using pytorch to randomly initialize weights of that layer

# Applying init to model to initialize all layer weights
def build_model(input_dim:int, learn_rate:int, layer_dim, weight_decay_param:int = 0.09):
    model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(784, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10), torch.nn.ReLU(), torch.nn.Softmax())
    model.apply(init_weights)
    trainer = torch.optim.Adam(model.parameters(), lr = learn_rate, weight_decay=weight_decay_param)
    loss = torch.nn.CrossEntropyLoss()
    return model, trainer, loss

In [176]:
hyperparams = {"learn_rate": 0.0005, "input_dim": 784, "layer_dim": [784, 256, 256, 10], "num_epochs": 20}
model, trainer,loss = build_model(hyperparams["learn_rate"], hyperparams["input_dim"], hyperparams["layer_dim"])

In [177]:
model, trainer, loss

(Sequential(
   (0): Flatten(start_dim=1, end_dim=-1)
   (1): Linear(in_features=784, out_features=64, bias=True)
   (2): ReLU()
   (3): Linear(in_features=64, out_features=10, bias=True)
   (4): ReLU()
   (5): Softmax(dim=None)
 ),
 Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     eps: 1e-08
     lr: 784
     weight_decay: 0.09
 ),
 CrossEntropyLoss())

In [178]:
for i in range(hyperparams["num_epochs"]):
    for data, label in train_loader:
        trainer.zero_grad() # resetting gradient to 0
        cost = loss(model(data), label) # Computing cost
        cost.backward() # Calculating gradients
        trainer.step() # Stepping (backprop application)
    print("cost = ", cost)

  input = module(input)


cost =  tensor(2.2632, grad_fn=<NllLossBackward>)
cost =  tensor(2.3487, grad_fn=<NllLossBackward>)
cost =  tensor(2.3153, grad_fn=<NllLossBackward>)
cost =  tensor(2.3466, grad_fn=<NllLossBackward>)
cost =  tensor(2.2522, grad_fn=<NllLossBackward>)
cost =  tensor(2.3362, grad_fn=<NllLossBackward>)
cost =  tensor(2.3466, grad_fn=<NllLossBackward>)
cost =  tensor(2.3570, grad_fn=<NllLossBackward>)
cost =  tensor(2.3987, grad_fn=<NllLossBackward>)
cost =  tensor(2.3674, grad_fn=<NllLossBackward>)
cost =  tensor(2.4091, grad_fn=<NllLossBackward>)
cost =  tensor(2.3466, grad_fn=<NllLossBackward>)
cost =  tensor(2.2841, grad_fn=<NllLossBackward>)
cost =  tensor(2.3674, grad_fn=<NllLossBackward>)
cost =  tensor(2.2632, grad_fn=<NllLossBackward>)
cost =  tensor(2.3778, grad_fn=<NllLossBackward>)
cost =  tensor(2.3362, grad_fn=<NllLossBackward>)
cost =  tensor(2.1799, grad_fn=<NllLossBackward>)
cost =  tensor(2.4091, grad_fn=<NllLossBackward>)
cost =  tensor(2.3362, grad_fn=<NllLossBackward>)


In [119]:
# Intuition for L2 regularization / weight decay :  distance of function from 0 == its complexity
# Add penalty term to loss function which represents complexity (larger weights for single features)
# Minimizing this will include a tendency to reduce weights such that function of weights minimized inside of loss
    # Beneficial to reduce as weight reduction -> smaller loss
# Regularization becomes L + lambda/2 * || w || ^2. 
# Reduces by derivation to lambda * sum(weights) derivative, which is a measure of the sum of complexity.
# Lambda controls the fractional importance of error which we assign to complexity 
# (weights small => more variables with smaller weights => learns robust patterns => 
# more complex decision boundaries => fits better (penalty on larger terms))
    # Gradient of L2 = mutlivar function with each variable scaled by lambda (reduce by lambda * var gradient = weight bc linear fn)
    # => constant reduction from this part of function, larger if complexity more important (lambda) to reduce
        # Lambda is sort of a weight decay parameter
# Largest weights have the largest gradients and reduce more - better and more equivalent use of all information


In [49]:
# Trying neural network on easier data
def init_weights(X):
    if isinstance(X, torch.nn.Linear): # should only initialize weights of linear layers
        torch.nn.init.normal_(X.weight, mean = 0, std = 0.1) #using pytorch to randomly initialize weights of that layer

# A generalized form
def build_model(input_dim:int, learn_rate:int, layer_dims:list, weight_decay_param:int = 0.002):
    
    # Model with dropout added to ensure simpler feature fit
    model = torch.nn.Sequential(torch.nn.Flatten(), 
                                torch.nn.Linear(layer_dims[0], layer_dims[1]), torch.nn.Sigmoid(),
                                torch.nn.Linear(layer_dims[1], layer_dims[2]), 
                                torch.nn.ReLU(), torch.nn.Linear(layer_dims[2], layer_dims[3]), torch.nn.ReLU(), torch.nn.Softmax())
    model.apply(init_weights)
    trainer = torch.optim.Adam(model.parameters(), lr = learn_rate, weight_decay=weight_decay_param)
    loss = torch.nn.CrossEntropyLoss()
    return model, trainer, loss

model, trainer,loss = build_model(hyperparams["learn_rate"], hyperparams["input_dim"], hyperparams["layer_dim"])
trainer.zero_grad()

for i in range(hyperparams["num_epochs"]):
    for data, label in train_loader:
        trainer.zero_grad() # resetting gradient to 0
        cost = loss(model(data), label) # Computing cost
        cost.backward() # Calculating gradients
        trainer.step() # Stepping (backprop application)
    print("cost = ", cost)


cost =  tensor(2.3043, grad_fn=<NllLossBackward>)
cost =  tensor(2.3100, grad_fn=<NllLossBackward>)
cost =  tensor(2.3135, grad_fn=<NllLossBackward>)
cost =  tensor(2.3089, grad_fn=<NllLossBackward>)
cost =  tensor(2.2970, grad_fn=<NllLossBackward>)
cost =  tensor(2.2898, grad_fn=<NllLossBackward>)
cost =  tensor(2.3012, grad_fn=<NllLossBackward>)
cost =  tensor(2.3095, grad_fn=<NllLossBackward>)
cost =  tensor(2.3088, grad_fn=<NllLossBackward>)
cost =  tensor(2.3056, grad_fn=<NllLossBackward>)
cost =  tensor(2.3061, grad_fn=<NllLossBackward>)
cost =  tensor(2.3049, grad_fn=<NllLossBackward>)
cost =  tensor(2.3033, grad_fn=<NllLossBackward>)
cost =  tensor(2.3107, grad_fn=<NllLossBackward>)
cost =  tensor(2.3070, grad_fn=<NllLossBackward>)
cost =  tensor(2.3011, grad_fn=<NllLossBackward>)
cost =  tensor(2.2998, grad_fn=<NllLossBackward>)
cost =  tensor(2.2981, grad_fn=<NllLossBackward>)
cost =  tensor(2.2930, grad_fn=<NllLossBackward>)
cost =  tensor(2.3061, grad_fn=<NllLossBackward>)


## Dropout and Bias Variance Tradeoff

Linear Models have **high bias** - "decide in advance" that they will represent only a specific type of relationship.

Also have **low variance** - decision boundary pretty consistent based on random fluctuations in data (overall slope of line may change, but will not learn a completely different function. Will only learn a certain function with some fluctuations

Neural Networks, on the other hand, have low bias as they can fit any function (no assumption) but high variance because they could fit a dataset so perfectly that a random fluctuation in data will completely change the decision boundary

In [38]:
# Dropout introduced to fix high variance and tendency to build non generalizable,
# extremely complex decision boundaries
# Point of dropout is to train NN while breaking co-adaptation: next layers rely on previous 
# Ideally we'd like to generalize decision boundary by training a few neural nets with different architectures/data
    # To account for fluctuation and then average them out
# Stochastic fluctuations from training and population data balance out if we do this: get close to true
    # Population distribution data-wise and a model that overfits in every different way ends up with the center of all overfits, which is likely a relatively smooth mid point
        # The act of averaging is inherently an act of losing extreme patterns as they become smoother and more general: populations behave smoothly.
# Units present with probability p at training time, always present at test time
# At train: unit present with probability p, adds to prediction w*p to total prediction.
# At test, always present with weight w*p, to account for expected contribution

### Methodology

In [39]:
# Drop with probability p, retain with probability 1-p and divide by 1-p so that E(h) = 1-p *h/1-p = h.
# Scales back to normal neural space where feature contributes equivalently to next activationa nd total prediction comprehensively
# Allows a non-neglection of features in the feature space, and yet every prediction is a simpler boundary as it is reliant on fewer variables
# AKA: we do not make any features less significant than others, but rather return to neural space by averaging a set of space-scaled simplistic neural predictions within it
# Randomize so different model
# More simplistic neural feature, but identical prediction space where otherwise it would be lesser

In [120]:
# What effect will backprop have?
    # Theory 1: forces weights to increase as 0 weights may be unreliable if high weights are dropped out
    # Theory 2: forces weights to become lower as learns simpler multi-variate patterns
    # Theory 3: evenly weighs features such that high confidence complexities are backtracked
# Conclusion - equivalent weight distributions, weights could be increased or decreased. Increased if need to use a remaining var to make prediction,
    # Decreased if that var turns out to be suboptimal in the next dropout. Overall seems to lead to likely homogenous confidence AKA simpler patterns.
    # Reduces complexity of patterns because balances out importance of variables, also reduces crucial feature search in training because feature needs to 
        # Be very crucial to persist in every dropout. However, trends pattern towards true crucial features.

In [182]:
# Experiment as to why things are failing

# Trying neural network on easier data
def init_weights(X):
    if isinstance(X, torch.nn.Linear): # should only initialize weights of linear layers
        torch.nn.init.normal_(X.weight, mean = 0, std = 0.1) #using pytorch to randomly initialize weights of that layer

# A generalized form
def build_model(input_dim:int, learn_rate:int, layer_dims:list, weight_decay_param:int = 0.002):
    
    # Model with dropout added to ensure simpler feature fit
    model = torch.nn.Sequential(torch.nn.Linear(layer_dims[0], layer_dims[1]), torch.nn.ReLU(), torch.nn.Dropout(0.2),
                                torch.nn.Linear(layer_dims[1], layer_dims[2]), 
                                torch.nn.ReLU(), torch.nn.Linear(layer_dims[2], layer_dims[3]), torch.nn.ReLU(), torch.nn.Softmax())
    model.apply(init_weights)
    trainer = torch.optim.Adam(model.parameters(), lr = learn_rate, weight_decay=weight_decay_param)
    loss = torch.nn.MSELoss()
    return model, trainer, loss

model = torch.nn.Sequential(torch.nn.Linear(784, 1), torch.nn.Linear(1,1))
trainer = torch.optim.Adam(model.parameters(), lr = 0.0005)
loss = torch.nn.MSELoss()

for i in range(hyperparams["num_epochs"]):
    ones = torch.ones(784) + torch.rand(784)
    two = torch.Tensor([2])
    trainer.zero_grad() # resetting gradient to 0
    prediction = model(ones)
    cost = loss(prediction, two) # Computing cost
    cost.backward() # Calculating gradients
    trainer.step() # Stepping (backprop application)
    print(cost.sum())
    

tensor(1.2395, grad_fn=<SumBackward0>)
tensor(1.1396, grad_fn=<SumBackward0>)
tensor(0.0730, grad_fn=<SumBackward0>)
tensor(0.0931, grad_fn=<SumBackward0>)
tensor(0.3864, grad_fn=<SumBackward0>)
tensor(0.2824, grad_fn=<SumBackward0>)
tensor(0.3197, grad_fn=<SumBackward0>)
tensor(0.1080, grad_fn=<SumBackward0>)
tensor(0.1953, grad_fn=<SumBackward0>)
tensor(0.0519, grad_fn=<SumBackward0>)
tensor(0.0050, grad_fn=<SumBackward0>)
tensor(0.0106, grad_fn=<SumBackward0>)
tensor(0.0037, grad_fn=<SumBackward0>)
tensor(0.0261, grad_fn=<SumBackward0>)
tensor(0.1318, grad_fn=<SumBackward0>)
tensor(0.0201, grad_fn=<SumBackward0>)
tensor(0.0916, grad_fn=<SumBackward0>)
tensor(0.2036, grad_fn=<SumBackward0>)
tensor(0.2473, grad_fn=<SumBackward0>)
tensor(0.1187, grad_fn=<SumBackward0>)
