In [1]:
import torch

In [2]:
class Module(object):
    """Base class for every layer in this mini framework.

    Subclasses override ``forward`` and ``backward``; layers that own
    trainable state also override ``param``.
    """

    def forward(self, *input):
        """Compute the layer output for ``input``. Must be overridden."""
        raise NotImplementedError()

    def backward(self, *grad_wrt_output):
        """Propagate the gradient of the loss back through the layer.

        Must be overridden.
        """
        raise NotImplementedError()

    def __call__(self, *input):
        """Make ``module(x)`` shorthand for ``module.forward(x)``.

        Containers invoke their layers by calling them, so providing this on
        the base class means a subclass only has to implement ``forward``.
        """
        return self.forward(*input)

    def param(self):
        """Return the list of trainable variables (empty by default)."""
        return []

In [3]:
class Variable:
    """Trainable tensor paired with a buffer for its accumulated gradient.

    Args:
        data: tensor holding the current value.
        requires_grad: accepted for interface compatibility; not used.
    """

    def __init__(self, data, requires_grad=False):
        self.data = data
        # Start from zeros rather than torch.empty_like: gradients are
        # accumulated into this buffer, so uninitialised memory would leak
        # into the first update whenever zero_grad() was not called first.
        self.grad = torch.zeros_like(data)  # accumulated gradient

    def zero_grad(self):
        """Reset the accumulated gradient to zero, in place."""
        self.grad.zero_()

    def update(self, eta):
        """Take one gradient-descent step of size ``eta``.

        ``self.data`` is rebound to a new tensor; the previous tensor object
        is left unmodified.
        """
        self.data = self.data - eta * self.grad

In [4]:
class Tanh(Module):
    """Element-wise hyperbolic-tangent activation; holds no trainable state."""

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        """Apply tanh to ``x`` and remember ``x`` for the backward pass."""
        self.x = x
        return torch.tanh(x)

    def _dtanh(self, x):
        """Derivative of tanh, evaluated as 4 / (e^x + e^-x)^2."""
        exp_sum = torch.exp(x) + torch.exp(x.mul(-1))
        return 4 * exp_sum.pow(-2)

    def backward(self, dl_dx):
        """Multiply the incoming gradient by tanh'(x) at the stored input.

        Reads ``self.x``, which is only set by ``forward``.
        """
        local_slope = self._dtanh(self.x)
        return dl_dx * local_slope

    def param(self):
        """An activation has nothing to train."""
        return []

In [5]:
class Linear(Module):
    """Fully connected layer computing ``x @ w`` plus an optional bias.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        bias: when False the layer has no additive bias and ``self.b`` is None.

    The weight is stored with shape ``(in_features, out_features)``.
    ``forward`` and ``backward`` accept either a single sample of shape
    ``(in_features,)`` or a batch whose last dimension is ``in_features``.
    """

    def __init__(self, in_features, out_features, bias=True):
        # The bias is drawn before the weight, which keeps the order of
        # random draws used when the bias is enabled. ``self.b`` is always
        # defined (None when disabled) so the other methods can test it.
        self.b = Variable(torch.empty(out_features).normal_(0, 1e-6)) if bias else None
        self.w = Variable(torch.empty(in_features, out_features).normal_(0, 1e-6))

    def reset_grad(self):
        """Zero the accumulated gradients of the weight and, if present, the bias."""
        self.w.zero_grad()
        if self.b is not None:
            self.b.zero_grad()

    def forward(self, x):
        """Return ``x @ w (+ b)`` and remember ``x`` for the backward pass."""
        self.x = x
        out = x @ self.w.data
        if self.b is not None:
            # ``out`` is a fresh tensor produced by the matmul, so the
            # in-place add cannot modify the caller's input.
            out += self.b.data
        return out

    def __call__(self, x):
        return self.forward(x)

    def backward(self, dl_ds):
        """Accumulate parameter gradients and return the gradient w.r.t. the input.

        Args:
            dl_ds: gradient of the loss w.r.t. this layer's output, with the
                same shape as the value returned by the matching ``forward``.

        Returns:
            Gradient of the loss w.r.t. the input passed to ``forward``.
        """
        in_features, out_features = self.w.data.shape
        # dL/dx = dL/ds @ w^T; written this way it covers one sample or a batch.
        dl_dx = dl_ds @ self.w.data.t()
        # Flatten to 2-D so a single sample is handled as a batch of one row.
        x_rows = self.x.reshape(-1, in_features)
        grad_rows = dl_ds.reshape(-1, out_features)
        # dL/dw = sum over samples of outer(x, dL/ds).
        self.w.grad.add_(x_rows.t() @ grad_rows)
        if self.b is not None:
            # dL/db = sum over samples of dL/ds.
            self.b.grad.add_(grad_rows.sum(0))
        return dl_dx

    def param(self):
        """Return the trainable variables: the weight, then the bias if present."""
        if self.b is None:
            return [self.w]
        return [self.w, self.b]

### Try it out: train the network on MNIST

In [6]:
import dlc_practical_prologue as prologue
from tqdm import tqdm

In [7]:
# Factor applied to the training targets. The network defined below ends in a
# Tanh, whose output never reaches +/-1, so this presumably keeps the targets
# reachable -- TODO confirm intent.
gamma = 0.9
# Load the data set through the course helper, asking for one-hot labels and
# normalisation.
train_input, train_target, test_input, test_target = prologue.load_data(
    one_hot_labels=True,
    normalize=True,
)
# Only the training targets are scaled; the test targets are left untouched.
train_target *= gamma

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


In [8]:
class Sequential:
    """Container that applies a list of layers one after another.

    Args:
        elems: ordered sequence of layers. Each layer must be callable and
            expose ``backward(grad)`` and ``param()``.
    """

    def __init__(self, elems):
        self.elems = elems

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        out = x
        for elem in self.elems:
            out = elem(out)
        return out

    def __call__(self, x):
        return self.forward(x)

    def backward(self, dl_dx):
        """Back-propagate ``dl_dx`` through the layers in reverse order.

        Returns:
            The gradient handed back by the first layer, i.e. the gradient
            with respect to the container's input. Returning it lets a
            Sequential be used wherever a single layer's ``backward`` is.
        """
        dl = dl_dx
        for elem in reversed(self.elems):
            dl = elem.backward(dl)
        return dl

    def param(self):
        """Return the concatenated parameter lists of all layers, in layer order."""
        p = []
        for elem in self.elems:
            p.extend(elem.param())
        return p

In [9]:
# Two-layer perceptron: 784 inputs -> 50 hidden units -> 10 outputs, with a
# Tanh after each linear layer. (784 presumably corresponds to flattened
# 28x28 MNIST images, and 10 to the one-hot classes -- confirm against the data.)
# Layers are built in the same order they appear in the network.
l1, t1 = Linear(784, 50), Tanh()
l2, t2 = Linear(50, 10), Tanh()
net = Sequential([l1, t1, l2, t2])

In [10]:
def loss(v, t):
    """Squared error between prediction ``v`` and target ``t``.

    Computed as the residual ``t - v`` multiplied with its own transpose;
    for 1-D tensors this is the sum of squared differences.
    """
    residual = t - v
    return torch.matmul(residual.t(), residual)
def dloss(v, t):
    """Gradient of the squared error with respect to the prediction ``v``."""
    difference = v - t
    return 2 * difference

In [11]:
# Full-batch gradient descent: every epoch sums the per-sample gradients over
# the whole training set and then applies a single update.
# The step size is divided by the number of samples, so the summed gradient
# acts as a mean. It does not change between epochs, so compute it once.
eta = 0.1 / train_input.shape[0]
# The parameter list holds the same Variable objects on every call.
params = net.param()
for i in tqdm(range(1000)):
    for var in params:
        var.zero_grad()
    for x, t in zip(train_input, train_target):
        out = net(x)
        dl = dloss(out, t)
        net.backward(dl)
    for var in params:
        var.update(eta)

100%|██████████| 1000/1000 [07:54<00:00,  1.82it/s]


In [13]:
# Count the test samples whose highest-scoring output lands on the same index
# as the largest entry of the target vector.
nb_correct = 0
for x, t in zip(test_input, test_target):
    pred = net(x)
    nb_correct += int(pred.argmax() == t.argmax())
print(f"{nb_correct} class predicted correctly")

840 class predicted correctly
