# Implementing Adam Optimiser

In this notebook I implement the Adam optimiser, introduced in the paper [Adam: A Method for Stochastic Optimization](https://arxiv.org/pdf/1412.6980.pdf), and then test it by training a simple CNN on MNIST.

## Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

import time

## Get Data

In [None]:
# Fetch both MNIST splits into ./data (downloading them when missing); every
# image is converted to a tensor by ToTensor(). The training split is built
# first, then the test split.
training_data, test_data = (
    datasets.MNIST(root="data", train=is_train_split, download=True, transform=ToTensor())
    for is_train_split in (True, False)
)

# Wrap each split in a loader that yields shuffled mini-batches of 128 samples.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=128, shuffle=True)
    for split in (training_data, test_data)
)

## Create Adam Optimiser

In [None]:
class Adam():
  """Minimal Adam optimiser operating on a list of tensors.

  Keeps, for every parameter, an exponential moving average of its gradient
  (first moment ``m``) and of its squared gradient (second moment ``v``), and
  uses their bias-corrected values to scale each update.

  Args:
    params: iterable of tensors to optimise (e.g. ``model.parameters()``).
    lr: step size.
    beta_1: decay rate of the first-moment estimate.
    beta_2: decay rate of the second-moment estimate.
    epsilon: small constant added to the denominator for numerical stability.
  """

  def __init__(self,params,lr=0.001,beta_1=.9,beta_2=.999,epsilon=10**-8):
    #materialise the iterable so it can be traversed on every step
    self.params=list(params)
    self.lr=lr
    self.beta_1=beta_1
    self.beta_2=beta_2
    self.epsilon=epsilon
    #number of optimisation steps taken so far
    self.t = 0
    #moment estimates, one slot per parameter; they start as the scalar 0 and
    #become tensors after the first update of that parameter
    self.v = [0 for _ in self.params]
    self.m = [0 for _ in self.params]

  def step(self):
    """Apply one Adam update to every parameter that has a gradient."""
    #the timestep advances once per optimisation step (not once per
    #parameter), so every parameter shares the same bias correction
    self.t+=1
    bias_correction_1 = 1-(self.beta_1**self.t)
    bias_correction_2 = 1-(self.beta_2**self.t)
    with torch.no_grad():
      for i, param in enumerate(self.params):
        g_t = param.grad
        #parameters that received no gradient (unused or frozen) are left as is
        if g_t is None:
          continue
        #update moment estimates; the betas act as decay rates
        self.m[i]=self.beta_1*self.m[i]+(1-self.beta_1)*g_t
        self.v[i]=self.beta_2*self.v[i]+(1-self.beta_2)*(g_t**2)
        #correct the bias towards zero caused by the zero initialisation
        m_hat_t = self.m[i]/bias_correction_1
        v_hat_t = self.v[i]/bias_correction_2
        #epsilon is added outside the square root
        param-=self.lr*m_hat_t/(torch.sqrt(v_hat_t)+self.epsilon)

  def zero_grad(self):
    """Drop the gradients of all parameters by setting ``.grad`` to None."""
    for param in self.params:
      param.grad=None




## Create Model

In [None]:
class ConvNet(nn.Module):
    """Small CNN: two 5x5 convolutions followed by three fully connected layers.

    ``fc1`` expects 256 features per sample, which is what a 1x28x28 input
    produces (16 channels of 4x4 after the second pooling stage). The network
    returns 10 raw scores (logits) per sample.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1 -> 6 -> 16 channels, 5x5 kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head: 256 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=256, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Map a batch of images to a batch of 10 class scores."""
        # Each convolution is followed by ReLU and 2x2 max pooling.
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), kernel_size=2)
        # Collapse everything except the batch dimension.
        x = x.flatten(start_dim=1)
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        # The last layer has no activation.
        return self.fc3(x)

## Train

In [None]:
def train(model,gpu=True,learning_rate=0.01,num_epochs = 2,dataloader=None):
    """Train ``model`` in place with the custom Adam optimiser and cross-entropy loss.

    Args:
        model: the ``nn.Module`` to train; it is moved to the selected device.
        gpu: if True the model and every batch are moved to the CUDA device,
            otherwise to the CPU.
        learning_rate: step size handed to ``Adam``.
        num_epochs: number of full passes over the data.
        dataloader: iterable of ``(inputs, labels)`` batches; defaults to the
            notebook-level ``train_dataloader``.

    Returns:
        None. The parameters of ``model`` are updated in place.
    """
    if dataloader is None:
        dataloader = train_dataloader

    device = torch.device("cuda" if gpu else "cpu")
    model = model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for _ in range(num_epochs):
        for inputs, labels in dataloader:
            # Tensor.to() returns the moved tensor, so the result has to be
            # re-bound for the move to take effect.
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

## Test

In [None]:
def test(model):
    model = model.cpu()
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in test_dataloader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct/total

## It works!

In [None]:
model = ConvNet()

# Train on the GPU when CUDA is available and on the CPU otherwise, so the
# notebook runs end to end on any machine without editing this cell.
train(model,gpu=torch.cuda.is_available())
# Accuracy on the test split; as the last expression it is shown as the cell output.
test(model)

0.9773