In [1]:
%matplotlib inline

In [14]:
#export
from exp.nb_06 import *
from torch import nn

In [3]:
xTraining, yTraining, xValidation, yValidation = getMnistData()
xTrainingNormalized, xValidationNormalized = normalizeVectors(xTraining, xValidation)
(xTrainingNormalized.mean(), xTrainingNormalized.std, xValidationNormalized.mean(), xValidationNormalized.std())

(tensor(-7.6999e-06),
 <function Tensor.std>,
 tensor(-7.0751e-08),
 tensor(1.0000))

In [4]:
numberOfClasses = 10
hiddenLayerOutput = 50
batchSize = 64
lossFuction = Functional.cross_entropy

In [5]:
trainingDataSet, validationDataSet = Dataset(xTrainingNormalized, yTraining), Dataset(xValidationNormalized, yValidation)

In [6]:
trainingDataLoader, validationDataLoader = createDataLoaders(trainingDataSet, validationDataSet, batchSize)

In [7]:
imageDataBunch = DataBunch(trainingDataLoader, validationDataLoader, numberOfClasses)

In [8]:
layerSizes = [8, 16, 32, 64, 64]

In [9]:
convolutionalModelSR1 = createBetterConvolutionModel(numberOfClasses, layerSizes)
convolutionalModelSR1

Sequential(
  (0): LambdaLayer()
  (1): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): GeneralRectifiedLinearUnit()
  )
  (2): Sequential(
    (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
  )
  (3): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
  )
  (4): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
  )
  (5): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
  )
  (6): AdaptiveAvgPool2d(output_size=1)
  (7): LambdaLayer()
  (8): Linear(in_features=64, out_features=10, bias=True)
)

In [10]:
accuracy(convolutionalModelSR1(xTraining), yTraining)

tensor(0.0994)

In [11]:
phases = [0.3, 0.7]
weightsScheduler = aggregateSchedulers(phases, createCosineSchedulers(0.3, 0.6, 0.2)) 
biasScheduler = aggregateSchedulers(phases, createCosineSchedulers(0.9, 1.8, 0.6))

In [12]:
teacher = TeacherWithHooks(schedulingFunctions=[weightsScheduler, biasScheduler])

In [13]:
teacher.teachModel(convolutionalModelSR1, imageDataBunch, 2)

Epoch #0 Training: Loss 0.4548206627368927 Accuracy 0.8388347029685974
Epoch #0 Validation: Loss 0.08740277588367462 Accuracy 0.9732001423835754

Epoch #1 Training: Loss 0.1314106285572052 Accuracy 0.9611372947692871
Epoch #1 Validation: Loss 0.14704951643943787 Accuracy 0.9635087251663208



## Layer Normalization

*Internal Covariant Shift*- When the accuracy rate of your model starts to no longer improve at the current learning rate. Only improving when the learning rate becomes smaller, only allowing for marginal returns.

Batch Normalization may negate the need for **layer dropout**. Batch norm works on the **activations** and not the **parameters**

It is important to have parameters such that everytime something is imputing, there is always activations after the matrix multiplication. (I think this allows better gradient calculations?)

$\gamma$(gamma) and $\beta$ (beta) shift normalization of the layer to our pleasing. Which allows us to move our loss in a direction that is smaller (closer to the expected value) $\hat{x} = W\hat{x} + \vec{b}$ 

$\hat{x}$ is the activations. So we want to move the activations closest to y so $\hat{y} = \gamma\hat{x} + \beta$ <- we want that to be close to y ie $0 = y - \hat{y}$


### Linear Interpolation
Literally finding the slope intercept form and evaluating for x.

In this case the starting vector is the first point and the second point is the second vector.

The two points form a line which can be interpreted as $y = mx+b$ the third argument is the x you are passing (0 gives you the first vector, 1 give you the second vector, 0.5 gives you a vector that is halfway between the two that are on the line that the two points form)

In [43]:
a = torch.tensor([1, 2, 3]).float()
b = torch.tensor([3, 6, 13]).float()
torch.lerp(a, b, .5), torch.lerp(a, b, .0), torch.lerp(a, b, 1)

(tensor([2., 4., 8.]), tensor([1., 2., 3.]), tensor([ 3.,  6., 13.]))

In [44]:
class BatchNormalization(nn.Module):
    def __init__(self, layerSize, momentum=0.1, epsilon=1e-5):
        super().__init__()
        self.momentum, self.epsilon = momentum, epsilon
        self.gamma = nn.Parameter(torch.ones(layerSize, 1, 1)) # being a parameter allows it to be adjust in backprop
        self.beta = nn.Parameter(torch.zeros(layerSize, 1, 1)) # ie this is a part of the convolution model.
        self.register_buffer('variances', torch.ones(1, layerSize, 1, 1))
        self.register_buffer('means', torch.zeros(1, layerSize, 1, 1))
        
    def forward(self, activations):
        normalizedActivations = self._normalizeActivations(activations)
        return self.gamma * normalizedActivations + self.beta
    
    def _normalizeActivations(self, activations):
        batchMean, batchVariance = self._getMeanAndVariance(activations)
        return (activations - batchMean) / (batchVariance + self.epsilon).sqrt()
    
    def _getMeanAndVariance(self, activations):
        if self.training:
            with torch.no_grad():
                return self._updateStatistics(activations)
        else:
            return self.means, self.variances

    def _updateStatistics(self, activations):
        batchMeans = activations.mean((0,2,3), True)
        batchVariances = activations.var((0, 2, 3), True)
        self.means.lerp_(batchMeans, self.momentum) # we only want to step a little bit away from
        self.variances.lerp_(batchVariances, self.momentum) # the stored means eg momentum is tiny
        return batchMeans, batchVariances

In [45]:
def createBatchNormalizedConvolutionLayer(inputSize, 
                                   outputSize, 
                                   kernelSize=3, 
                                   stride=2,
                                   leaky=0.1,
                                   subtractValue=0.4,
                                   maxToClamp=6.0
                                  ):
    layers = [
        torch.nn.Conv2d(inputSize, 
                           outputSize, 
                           kernelSize, 
                           stride, 
                           kernelSize//2), 
       GeneralRectifiedLinearUnit(leaky, 
                                  subtractValue, 
                                  maxToClamp),
        BatchNormalization(outputSize)
    ]
    return torch.nn.Sequential(*layers)

def createBatchNormalizedConvolutionLayers(numberOfClasses, layerSizes):
    adjustedLayerSizes = [1]  + layerSizes # Make the first layer take a dimension of one
    convolutionLayers = [createBatchNormalizedConvolutionLayer(adjustedLayerSizes[i], 
                                               adjustedLayerSizes[i+1],
                                               5 if i == 0 else 3)
                        for i in range(len(adjustedLayerSizes) - 1)]
    finishingLayers = [
        torch.nn.AdaptiveAvgPool2d(1),
        LambdaLayer(flattenImage),
        torch.nn.Linear(layerSizes[-1], numberOfClasses)
    ]
    return [LambdaLayer(resizeImage)] + convolutionLayers + finishingLayers


def createBatchNormalizedConvolutionModel(numberOfClasses, layerSizes):
    return torch.nn.Sequential(*createBatchNormalizedConvolutionLayers(numberOfClasses, layerSizes))

In [46]:
convolutionModelSR2 = createBatchNormalizedConvolutionModel(numberOfClasses, layerSizes)
convolutionModelSR2

Sequential(
  (0): LambdaLayer()
  (1): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): GeneralRectifiedLinearUnit()
    (2): BatchNormalization()
  )
  (2): Sequential(
    (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
    (2): BatchNormalization()
  )
  (3): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
    (2): BatchNormalization()
  )
  (4): Sequential(
    (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralRectifiedLinearUnit()
    (2): BatchNormalization()
  )
  (5): AdaptiveAvgPool2d(output_size=1)
  (6): LambdaLayer()
  (7): Linear(in_features=32, out_features=10, bias=True)
)

In [47]:
accuracy(convolutionModelSR2(xTrainingNormalized), yTraining)

RuntimeError: output with shape [1, 8, 1, 1] doesn't match the broadcast shape [1, 8, 1, 8]

In [None]:
%debug

> [0;32m<ipython-input-44-7ab82ab3d130>[0m(29)[0;36m_updateStatistics[0;34m()[0m
[0;32m     26 [0;31m        [0mbatchMeans[0m [0;34m=[0m [0mactivations[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0;34m([0m[0;36m0[0m[0;34m,[0m[0;36m2[0m[0;34m,[0m[0;36m3[0m[0;34m)[0m[0;34m,[0m [0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     27 [0;31m        [0mbatchVariances[0m [0;34m=[0m [0mactivations[0m[0;34m.[0m[0mvar[0m[0;34m([0m[0;34m([0m[0;36m0[0m[0;34m,[0m [0;36m2[0m[0;34m,[0m [0;36m3[0m[0;34m)[0m[0;34m,[0m [0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     28 [0;31m        [0mself[0m[0;34m.[0m[0mmeans[0m[0;34m.[0m[0mlerp_[0m[0;34m([0m[0mbatchMeans[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mmomentum[0m[0;34m)[0m [0;31m# we only want to step a little bit away from[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 29 [0;31m        [0mself[0m[0;34m.[0m[0mvariances[0m[0;34m.[0m[0mler