In [1]:
import torch
from dlc_practical_prologue import load_data
from torch import Tensor

Write the two functions <br>
def sigma(x) <br>
def dsigma(x) <br> 
that take as input a float tensor and return a tensor of the same size, obtained by applying component-wise, respectively, tanh and the first derivative of tanh. <br>
Hint: The functions should have no python loop, and use in particular torch.tanh , torch.exp , torch.mul , and torch.pow . My versions are 34 and 62 character long. <br>


In [2]:
def sigma(x):
    """Apply the hyperbolic tangent component-wise.

    Args:
        x: input tensor.

    Returns:
        A tensor of the same size as ``x`` holding tanh of each component.
    """
    return torch.tanh(x)

In [3]:
def dsigma(x):
    """Component-wise first derivative of tanh, i.e. ``1 - tanh(x)^2``.

    Args:
        x: input tensor.

    Returns:
        A tensor of the same size as ``x``.
    """
    activation = sigma(x)
    return 1 - activation.pow(2)

In [4]:
test_tensor = torch.ones(10)

In [5]:
dsigma(test_tensor)


 0.4200
 0.4200
 0.4200
 0.4200
 0.4200
 0.4200
 0.4200
 0.4200
 0.4200
 0.4200
[torch.cuda.FloatTensor of size 10 (GPU 0)]

---

# 2 Loss
Write the two functions <br>

def loss(v, t) <br>
def dloss(v, t) <br>

that take as input two float tensors of the same dimensions, with v the predicted tensor and t the target one, and return respectively $\|v - t\|_2^2$ (the sum of the squared component-wise differences), and a tensor of the same size equal to the gradient of that quantity as a function of v. <br>

Hint: The functions should have no python loop, and use in particular torch.sum , torch.pow . My versions are 48 and 40 character long.


In [6]:
v = torch.ones(10).fill_(10)
t = torch.ones(10).fill_(5)

In [7]:
def loss(v, t):
    """Sum of squared differences between prediction and target.

    Args:
        v: predicted tensor.
        t: target tensor, same dimensions as ``v``.

    Returns:
        The sum over all components of ``(v - t)^2``.
    """
    residual = v - t
    return residual.pow(2).sum()
    

In [8]:
def dloss(v, t):
    """Gradient of the squared-error loss with respect to the prediction.

    Args:
        v: predicted tensor.
        t: target tensor, same dimensions as ``v``.

    Returns:
        A tensor of the same size as ``v`` equal to ``2 * (v - t)``.
    """
    residual = v - t
    return 2 * residual

---

# 3 Forward/Back Pass

In [9]:
def forward_pass(w1, b1, w2, b2, x):
    """Run one sample through the two-layer tanh network.

    Each layer computes ``s = w.t() @ input + b`` followed by ``sigma``, so the
    weight matrices are laid out as (layer inputs, layer outputs).

    Args:
        w1, b1: weights and bias of the first layer.
        w2, b2: weights and bias of the second layer.
        x: input vector for a single sample.

    Returns:
        The tuple ``(x0, s1, x1, s2, x2)``: the input, then the pre-activation
        and activation of the first layer, then those of the second layer.
    """
    hidden_pre = w1.t().mv(x) + b1
    hidden_act = sigma(hidden_pre)
    output_pre = w2.t().mv(hidden_act) + b2
    output_act = sigma(output_pre)

    return x, hidden_pre, hidden_act, output_pre, output_act

In [10]:
def backward_pass(w1, b1, w2, b2,
                  t,
                  x, s1, x1, s2, x2,
                  dl_dw1, dl_db1, dl_dw2, dl_db2):
    """Accumulate the loss gradients for one sample into the ``dl_d*`` tensors.

    The gradients match ``forward_pass``, which computes
    ``s = w.t() @ input + b`` with each weight matrix laid out as
    (layer inputs, layer outputs).

    Args:
        w1, b1, w2, b2: network parameters (only ``w2`` is read here).
        t: target vector for the sample.
        x, s1, x1, s2, x2: the values returned by ``forward_pass``.
        dl_dw1, dl_db1, dl_dw2, dl_db2: gradient accumulators with the same
            shapes as ``w1``, ``b1``, ``w2``, ``b2``; updated in place.

    Returns:
        None. The accumulators are modified in place with ``add_``.
    """
    x0 = x
    dl_dx2 = dloss(x2, t)
    dl_ds2 = dsigma(s2) * dl_dx2
    # s2 = w2.t() @ x1 + b2, so the gradient flows back through w2 itself
    # (not its transpose): dl/dx1 = w2 @ dl/ds2.
    dl_dx1 = w2.mv(dl_ds2)
    dl_ds1 = dsigma(s1) * dl_dx1

    # With (inputs, outputs) weights, dl/dw[i, j] = input[i] * dl/ds[j],
    # i.e. the outer product input x dl_ds, which has the same shape as w.
    dl_dw2.add_(x1.view(-1, 1).mm(dl_ds2.view(1, -1)))
    dl_db2.add_(dl_ds2)
    dl_dw1.add_(x0.view(-1, 1).mm(dl_ds1.view(1, -1)))
    dl_db1.add_(dl_ds1)

In [11]:
# Load the data set through the course helper. With these arguments the helper
# reports using MNIST with 1000 train and 1000 test samples, and the inputs come
# back flattened to 784 features per sample (see the shape printed below).
train_input, train_target, test_input, test_target = load_data(cifar = None, one_hot_labels = True, normalize = True, flatten = True)

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


In [12]:
train_input.shape

torch.Size([1000, 784])

In [13]:
# Scale the target labels by 0.9 so that they lie strictly inside the output range of tanh

In [14]:
train_target = train_target * 0.9

In [39]:
# Scale the test *targets* by the same 0.9 factor as the train targets so both
# label sets stay inside the range of tanh. The test inputs are left untouched:
# rescaling them alone would make them inconsistent with the train inputs.
test_target = test_target * 0.9

In [15]:
# Create the w1 tensor

In [16]:
# First-layer weights, laid out as (784 inputs, 50 units) to match the
# w1.t().mv(x) product in forward_pass; filled in place with Gaussian noise
# of mean 0 and standard deviation 1e-6.
w1 = Tensor(784, 50).normal_(0, 1e-6)

In [17]:
# Second-layer weights, laid out as (50 inputs, 10 outputs) to match the
# w2.t().mv(x1) product in forward_pass; same Gaussian initialisation as w1.
w2 = Tensor(50, 10).normal_(0, 1e-6)

In [18]:
# Bias vectors: b1 is added to the 50 first-layer pre-activations and b2 to the
# 10 second-layer pre-activations; both use Gaussian noise (mean 0, std 1e-6).
b1 = Tensor(50).normal_(0, 1e-6)
b2 = Tensor(10).normal_(0, 1e-6)

In [19]:
# Gradient accumulator for w2. Tensor(size) returns uninitialised memory and
# backward_pass accumulates into it with add_, so zero it explicitly.
dl_dw2 = Tensor(w2.size()).zero_()

In [20]:
# Gradient accumulator for w1. Tensor(size) returns uninitialised memory and
# backward_pass accumulates into it with add_, so zero it explicitly.
dl_dw1 = Tensor(w1.size()).zero_()


In [21]:
# Gradient accumulator for b1, named dl_db1 to match the name used by
# backward_pass and the training loop. Zeroed because Tensor(size) is
# uninitialised memory.
dl_db1 = Tensor(b1.size()).zero_()

In [22]:
# Gradient accumulator for b2, named dl_db2 to match the name used by
# backward_pass and the training loop. Zeroed because Tensor(size) is
# uninitialised memory.
dl_db2 = Tensor(b2.size()).zero_()

---

In [23]:
x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, train_input[0])

In [24]:
x2.shape

torch.Size([10])

In [25]:
x2


1.00000e-06 *
 -0.6545
 -0.7528
 -1.1291
 -0.2537
 -1.2630
 -0.1830
  0.3645
 -0.3536
 -0.0948
 -2.1960
[torch.cuda.FloatTensor of size 10 (GPU 0)]

In [26]:
# index location of the maximum prediction value
x2.max(0)[1][0]

6

In [27]:
torch.max(x2, 0)

(
 1.00000e-07 *
   3.6450
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  6
 [torch.cuda.LongTensor of size 1 (GPU 0)])

In [28]:
x2.max(0)

(
 1.00000e-07 *
   3.6450
 [torch.cuda.FloatTensor of size 1 (GPU 0)], 
  6
 [torch.cuda.LongTensor of size 1 (GPU 0)])

In [29]:
train_target[0]


-0.9000
-0.9000
-0.9000
-0.9000
-0.9000
 0.9000
-0.9000
-0.9000
-0.9000
-0.9000
[torch.cuda.FloatTensor of size 10 (GPU 0)]

In [30]:
# the target entry at the predicted class index is positive only when the prediction is correct; here it is negative, so sample 0 is misclassified
train_target[0,6] 

-0.8999999761581421

---

# Training Step

In [None]:
# Train for 1000 epochs of full-batch gradient descent: each epoch sums the
# per-sample gradients over the whole training set, then takes a single step.
nb_train_samples = train_input.size(0)
# Step size. It is divided by the sample count because the accumulated gradient
# is a sum over samples rather than a mean. The 1e-1 factor is a tunable choice.
eta = 1e-1 / nb_train_samples

for k in range(0, 1000):

    acc_loss = 0
    nb_train_errors = 0

    # Reset the accumulators in place; the in-place method is zero_()
    # (tensors have no zero() method).
    dl_dw1.zero_()
    dl_db1.zero_()
    dl_dw2.zero_()
    dl_db2.zero_()

    for n in range(0, nb_train_samples):
        x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, train_input[n])

        # Index of the largest output, i.e. the predicted class. view(-1) makes
        # the indexing work whether max() returns a 1-element or a 0-dim tensor.
        pred = x2.max(0)[1].view(-1)[0]
        # The target is positive only at the true class, so a negative entry at
        # the predicted index means the sample was misclassified.
        if train_target[n, pred] < 0:
            nb_train_errors += 1
        acc_loss = acc_loss + loss(x2, train_target[n])

        # Adds this sample's gradients into dl_dw1 ... dl_db2.
        backward_pass(w1, b1, w2, b2, train_target[n], x0, s1, x1, s2, x2, dl_dw1, dl_db1, dl_dw2, dl_db2)

    # Gradient step, once per epoch. Doing this inside the sample loop would
    # re-apply the running gradient sum after every sample.
    w1 = w1 - eta * dl_dw1
    b1 = b1 - eta * dl_db1
    w2 = w2 - eta * dl_dw2
    b2 = b2 - eta * dl_db2

    # TODO: evaluate on the test set; only the counter is initialised for now.
    nb_test_errors = 0
        
        
         