In [4]:
from __future__ import print_function
import torch
import math

In [13]:
lr = 0.01
mu = 0.0

In [5]:
grad_output = torch.randn(1, 5)
print('grad_output: ', grad_output)

grad_output:  
-0.1607 -0.2673  0.0717  0.9223 -2.7848
[torch.FloatTensor of size 1x5]



# Linear

In [2]:
# Forward propagation
input = torch.randn(1, 3)
weight = torch.randn(5, 3)
bias = torch.randn(5,)
output = input.mm(weight.t())
output += bias.unsqueeze(0).expand_as(output)

print('input: ', input)
print('weight: ', weight.size())
print('bias: ', bias.size())
print('output: ', output.size())

input:  
 0.5483 -1.5596  1.0649
[torch.FloatTensor of size 1x3]

weight:  torch.Size([5, 3])
bias:  torch.Size([5])
output:  torch.Size([1, 5])


## ESGD

In [55]:
grad_input = grad_weight = grad_bias = None
grad_input = grad_output.mm(weight)
grad_weight = grad_output.t().mm(input)
grad_bias = grad_output.sum(0).squeeze(0)

print(grad_input, grad_weight, grad_bias)


 1.7658  1.6261 -5.1807
[torch.FloatTensor of size 1x3]
 
-0.0881  0.2506 -0.1711
-0.1466  0.4169 -0.2846
 0.0393 -0.1119  0.0764
 0.5057 -1.4384  0.9821
-1.5268  4.3431 -2.9655
[torch.FloatTensor of size 5x3]
 
-0.1607
-0.2673
 0.0717
 0.9223
-2.7848
[torch.FloatTensor of size 5]



## ISGD

In [9]:
def alpha_linear(s,d,c):
    return -s.mul(d)

In [54]:
# Constants
s = torch.sign(grad_output)
z_norm = math.sqrt((torch.norm(input) ** 2 + 1.0))
d = z_norm * math.sqrt(lr/(1.0+lr*mu)) * torch.sqrt(torch.abs(grad_output))
alpha = alpha_linear(s,d,c)

new_weight = weight / (1.0 + lr * mu) + (alpha.mul(d)).t().mm(input) / z_norm **2
grad_weight = (weight - new_weight) / lr

new_bias = bias / (1.0 + lr * mu) + alpha.mul(d).squeeze() / z_norm **2
grad_bias = (bias - new_bias) / lr

print(grad_input, grad_weight, grad_bias)


 1.7658  1.6261 -5.1807
[torch.FloatTensor of size 1x3]
 
-0.0881  0.2506 -0.1711
-0.1466  0.4169 -0.2846
 0.0393 -0.1119  0.0764
 0.5057 -1.4384  0.9821
-1.5268  4.3431 -2.9655
[torch.FloatTensor of size 5x3]
 
-0.1607
-0.2673
 0.0717
 0.9223
-2.7848
[torch.FloatTensor of size 5]



In [43]:
v = alpha.mul(d)
print(weight)
print(v)
print(v.t() * weight)
print(weight * v.t())
print([-1.4060*0.0078, -0.7285*0.0078, -0.0530*0.0078])


-1.4060 -0.7285 -0.0530
 0.1984 -1.0676 -1.0577
 0.0716  0.4500  0.0120
-0.7115 -0.0378 -1.5769
-0.8058 -0.4403  1.4430
[torch.FloatTensor of size 5x3]


 0.0078  0.0130 -0.0035 -0.0449  0.1355
[torch.FloatTensor of size 1x5]


-0.0110 -0.0057 -0.0004
 0.0026 -0.0139 -0.0138
-0.0002 -0.0016 -0.0000
 0.0319  0.0017  0.0708
-0.1092 -0.0597  0.1956
[torch.FloatTensor of size 5x3]



RuntimeError: inconsistent tensor size, expected r_ [5 x 3], t [5 x 3] and src [1 x 5] to have the same number of elements, but got 15, 15 and 5 elements respectively at /Users/soumith/minicondabuild3/conda-bld/pytorch_1518385717421/work/torch/lib/TH/generic/THTensorMath.c:1036

In [18]:

# Note that torch.norm outputs a float instead of a tensor
z_norm = math.sqrt((torch.norm(input) ** 2 + 1.0))
d = z_norm * math.sqrt(lr/(1.0+lr*mu)) * torch.sqrt(abs_grad_output)
c = output/(1.0+lr*mu)
# print('s: ', s)
# print('delta: ', d) 
# print(c)

# Calculate alpha
alpha = alpha_linear(s,d,c)
# print(alpha)

# Calculate gradients
new_weight = weight / (1.0 + lr * mu) + alpha.mul(d).mm(weight) / z_norm**2
grad_weight = (weight - new_weight) / lr
# print(weight)
# print(new_weight)
# print(grad_weight)

new_bias = bias / (1.0 + lr * mu) #+ alpha.mul(d).squeeze().mul(bias) / z_norm**2
grad_bias = (bias - new_bias) / lr
# print(bias)
# print(new_bias)
# print(grad_bias)


sgn_output = (output >= 0).type(torch.FloatTensor)
grad_input = (grad_output.mul(sgn_output)).mm(weight)
# print(grad_input)

print(grad_input, grad_weight, grad_bias)


 0.8381
 0.2784
 0.0492
-0.8235
 0.2812
[torch.FloatTensor of size 5]


 0.8381
 0.2784
 0.0492
-0.8235
 0.2812
[torch.FloatTensor of size 5]


 0
 0
 0
 0
 0
[torch.FloatTensor of size 5]


 2.4169  1.6287 -3.7273
[torch.FloatTensor of size 1x3]
 
 1.7658  1.6261 -5.1807
 1.7659  1.6261 -5.1807
 1.7658  1.6261 -5.1807
 1.7658  1.6261 -5.1807
 1.7658  1.6261 -5.1807
[torch.FloatTensor of size 5x3]
 
 0
 0
 0
 0
 0
[torch.FloatTensor of size 5]



# Relu

## ESGD

## Forward propagation

In [None]:
input = torch.randn(1, 3)
weight = torch.randn(5, 3)
bias = torch.randn(5,)
output = input.mm(weight.t())
output += bias.unsqueeze(0).expand_as(output)
relu = torch.clamp(output, min=0.0)

print('input: ', input)
print('weight: ', weight.size())
print('bias: ', bias.size())
print('output: ', output.size())
print('relu: ', relu.size())

## Back-propagation

## ISGD

In [None]:
# Constants
s = torch.sign(grad_output)
abs_grad_output = torch.abs(grad_output)
# Note that torch.norm outputs a float instead of a tensor
z_norm = math.sqrt((torch.norm(input) ** 2 + 1.0))
d = z_norm * math.sqrt(lr/(1.0+lr*mu)) * torch.sqrt(abs_grad_output)
c = output/(1.0+lr*mu)
# print('s: ', s)
# print('delta: ', d) 
# print(c)

# Calculate alpha
alpha = alpha_relu(s,d,c)

# Calculate gradients
new_weight = weight / (1.0 + lr * mu) + alpha.mul(d).mm(weight) / z_norm**2
grad_weight = (weight - new_weight) / lr
# print(weight)
# print(new_weight)
# print(grad_weight)

new_bias = bias / (1.0 + lr * mu) + alpha.mul(d).squeeze().mul(bias) / z_norm**2
grad_bias = (bias - new_bias) / lr
# print(bias)
# print(new_bias)
# print(grad_bias)

sgn_output = (output >= 0).type(torch.FloatTensor)
grad_input = (grad_output.mul(sgn_output)).mm(weight)
print(grad_input)

In [None]:
def alpha_relu(s,d,c):
#     cond1 = (s == 1).mul(c <= 0).type(torch.FloatTensor)
    cond2 = (s == 1).mul(c > 0).mul(c <= d**2).type(torch.FloatTensor)
    cond3 = (s == 1).mul(c > d**2).type(torch.FloatTensor)
#     cond4 = (s == -1).mul(c <= -d**2/2.0).type(torch.FloatTensor)
    cond5 = (s == -1).mul(c > -d**2/2.0).type(torch.FloatTensor)
    # print(cond1, cond2, cond3, cond4, cond5)

    alpha = (0.0
#              + 0.0 * cond1
            - (c.div(d)).mul(cond2)
            - d.mul(cond3)
#             + 0.0 * cond4
            + d.mul(cond5)
            )

    return alpha

In [None]:
alpha_relu(s,d,c)

In [None]:
# Understand grad_output_pos_out.sum(0).squeeze(0)
print(grad_output_pos_out)
print(grad_output_pos_out.sum(0))
print(grad_output_pos_out.sum(0).squeeze(0))

## Standard RELU

In [None]:
pos_out = (output >= 0).type(torch.FloatTensor)
grad_output_pos_out = torch.mul(grad_output, pos_out)
grad_input = grad_output_pos_out.mm(weight)
grad_weight = grad_output_pos_out.t().mm(input)
grad_bias = grad_output_pos_out.sum(0).squeeze(0)

print('grad_output: ', grad_output.size())
print('output: ', output.size())
print('pos_out: ', pos_out.size())
print('grad_output_pos_out: ', grad_output_pos_out.size())
print('grad_input: ', grad_input.size())
print('grad_bias: ', grad_bias.size())
print('grad_weight: ', grad_weight.size())