In [19]:
from __future__ import print_function

import torch

class TwoLayerNet(object):
  """
  two-layer-perceptron.
  Input dimension : N
  Hidden layer dimension : H
  Output dimension : C


  input - linear layer - ReLU - linear layer - output
  """

  def __init__(self, input_size, hidden_size, output_size, std=1e-4):
    """
    initialise model with small random price of weight and bias initialised as 0.
    weight and bias will be saved in dictionary called self.params

    W1: weight of first layer; (D, H)
    b1: bias of first layer; (H,)
    W2: weight of second layer; (H, C)
    b2: bias of second layer; (C,)

    Inputs:
    - input_size: dimension of input layer.
    - hidden_size: number of neurons in hidden layer.
    - output_size: output dimesion.
    """
    self.params = {}
    self.params['W1'] = std * torch.randn(input_size, hidden_size)
    self.params['b1'] = torch.zeros(hidden_size)
    self.params['W2'] = std * torch.randn(hidden_size, output_size)
    self.params['b2'] = torch.zeros(output_size)

  def loss(self, X, y=None):
    """
    calculate the gradient and loss of neural network

    Inputs:
    - X: input data. shape (N, D). each x[i] is one training sample and total N number of sample given as input
    - y: vector of training label. y[i] is the integer type label of x[i]
      if y is given, return loss and gradient and if not return output

    Returns:
    if y is not given, return the score matrix which shape is (N,C)
    scores[i, c] is the score of class c of input X[i]

    if y is given, return the tuple: (loss, grads)
    loss: loss (scalar) of training batch
    grads: dictionary of {parameter name: gradient} (it should use same key with self.params)
    """
    # call weight and bias in dictionary
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    N, D = X.size()

    # calculate forward path
    scores = None
    #carry out forward path, save the value of output layer in the 'scores' (shape : (N, C))  #
    #input - linear layer - ReLU - linear layer - output             #
    
    scores=None
    hidden_layer1=torch.mm(X,W1)+b1
    hidden_layer1_output=torch.nn.functional.relu(hidden_layer1)
    output_layer=torch.mm(hidden_layer1_output,W2)+b2
    scores=output_layer
    
    # if answer (target) is not given, return 'scores' and finish it
    if y is None:
      return scores

    # calculate loss
    loss = None
    e = torch.exp(scores)
    softmax = e / torch.sum(e, dim=1, keepdim=True)
    
    #  calculate the value of loss by using Output and save in 'loss' (scalar)
    #
    #       loss function : negative log likelihood                    
    #       Use the value of softmax in the 'softmax' variable when calculating 
    #
    #  y' indicates the answer index and apply - log to the answer probability         
    
    index=0
    loss=0
    while index<len(y):
        for prob in softmax:
            loss-=torch.log(prob[y[index]])
            index+=1
    number_of_data=X.shape[0]
    loss=loss/number_of_data
    


    # Backward path(calculate Gradient) 
    grads = {}
    # calculate the gradient of weight and bias and save in 'grads' dictionary   
    # key of dictionary should be set same as self.params             
    # grads['W1'] have to have same shape with self.params['W1']                     
            
    count=0
    dsoftmax=softmax.clone()
    while count<len(y):
        for prob in dsoftmax:
            prob[y[count]]-=1
            count+=1
    dsoftmax/=number_of_data
    
    
    grads['W2'] = torch.mm(hidden_layer1_output.t(), dsoftmax)
    grads['b2'] = torch.sum(dsoftmax, dim=0)
        
    dhidden_layer1_output = torch.mm(dsoftmax,W2.t())*(hidden_layer1_output>0).float() #dhidden_layer1_output --> dhidden_layer1
    grads['W1'] = torch.mm(X.t(), dhidden_layer1_output)#dhidden_layer1_output=dhidden_layer1
    grads['b1'] = torch.sum(dhidden_layer1_output, dim=0)

    return loss, grads

  def train(self, X, y,
            learning_rate=1e-3, learning_rate_decay=0.95,
            num_iters=100,
            batch_size=200, verbose=False):
    """
    neural network training using SGD

    Inputs:
    - X: numpy array of shape (N, D) (training data)
    - y: numpy array of shape (N,)(training labels; y[i] = c
                                  c is the label of X[i], 0 <= c < C)
    - learning_rate: Scalar learning rate
    - num_iters: Number of steps
    - batch_size: Number of training examples in a mini-batch.
    - verbose: if true, print progress
    """
    num_train = X.shape[0]
    iterations_per_epoch = max(num_train / batch_size, 1)

    # optimization using SGD
    loss_history = []
    train_acc_history = []
    val_acc_history = []

    for it in range(num_iters):
      loss, grads = self.loss(X, y=y)
      loss_history.append(loss)

      
      # call gradient in the 'grads' dictionary and carry out SGD update 
      for key in self.params.keys():
            self.params[key]-=learning_rate*grads[key]

      if verbose and it % 100 == 0:
        print('iteration %d / %d: loss %f' % (it, num_iters, loss))


      if it % iterations_per_epoch == 0:
        # Accuracy
        train_acc = (self.predict(X) == y).float().mean()
        train_acc_history.append(train_acc)

        learning_rate *= learning_rate_decay

    return {
      'loss_history': loss_history,
      'train_acc_history': train_acc_history,
      'val_acc_history': val_acc_history,
    }

  def predict(self, X):
    return torch.argmax(self.loss(X),1)


In [20]:
input_size = 4
hidden_size = 10
num_classes = 3
num_inputs = 5

def init_toy_model():
    torch.manual_seed(0)
    return TwoLayerNet(input_size, hidden_size, num_classes, std=1e-1)

def init_toy_data():
    torch.manual_seed(1)
    X = 10 * torch.randn(num_inputs, input_size)
    y = torch.LongTensor([0, 1, 2, 2, 1])
    return X, y

net = init_toy_model()
X, y = init_toy_data()
scores = net.loss(X)
print('Your scores:')
print(scores)
print()
print('correct scores:')
correct_scores = torch.Tensor(
  [[ 0.24617445,  0.1261572,   1.1627575 ],
 [ 0.18364899, -0.0675799,  -0.21310908],
 [-0.2075074,  -0.12525336, -0.06508598],
 [ 0.08643292,  0.07172455,  0.2353122 ],
 [ 0.8219606,  -0.32560882, -0.77807254]]
)
print(correct_scores)
print()
print('Difference between your scores and correct scores:')
print(torch.sum(torch.abs(scores - correct_scores)))
loss, _ = net.loss(X, y)
correct_loss = 1.2444149

print('Difference between your loss and correct loss:')
print(torch.sum(torch.abs(loss - correct_loss)))
loss, grads = net.loss(X, y)

results = net.train(X, y, 0.05)
print(results)

print("Train acc: %f -> %f\nTrain loss: %f -> %f" % (results['train_acc_history'][0], results['train_acc_history'][-1]
                                                , results['loss_history'][0],results['loss_history'][-1]))


Your scores:
tensor([[ 0.2462,  0.1262,  1.1628],
        [ 0.1836, -0.0676, -0.2131],
        [-0.2075, -0.1253, -0.0651],
        [ 0.0864,  0.0717,  0.2353],
        [ 0.8220, -0.3256, -0.7781]])

correct scores:
tensor([[ 0.2462,  0.1262,  1.1628],
        [ 0.1836, -0.0676, -0.2131],
        [-0.2075, -0.1253, -0.0651],
        [ 0.0864,  0.0717,  0.2353],
        [ 0.8220, -0.3256, -0.7781]])

Difference between your scores and correct scores:
tensor(7.4506e-09)
Difference between your loss and correct loss:
tensor(0.)
{'loss_history': [tensor(1.2444), tensor(0.9947), tensor(0.8408), tensor(0.7414), tensor(0.6628), tensor(0.5965), tensor(0.5410), tensor(0.4949), tensor(0.4568), tensor(0.4252), tensor(0.3989), tensor(0.3769), tensor(0.3582), tensor(0.3423), tensor(0.3287), tensor(0.3168), tensor(0.3064), tensor(0.2971), tensor(0.2889), tensor(0.2815), tensor(0.2748), tensor(0.2687), tensor(0.2631), tensor(0.2580), tensor(0.2532), tensor(0.2489), tensor(0.2448), tensor(0.2411), ten