In [1]:
from __future__ import print_function
import torch
import math

In [2]:
# Hyper-parameters shared by every update rule in this notebook.
#   lr: step size; the finite-difference gradients below divide by it.
#   mu: coefficient that enters the ISGD updates through the factor 1 + lr * mu
#       (presumably an L2 / weight-decay strength -- TODO confirm).
lr, mu = 0.01, 0.0

In [3]:
# Random 1 x 5 gradient with respect to the layer output; the backward
# computations in the cells below all read this tensor.
grad_output = torch.randn(1, 5)
print('grad_output: ', grad_output)

grad_output:  
 0.2842 -1.9836  0.9476 -0.0175 -0.9988
[torch.FloatTensor of size 1x5]



# Linear

In [4]:
# Forward pass of the linear layer: output = input . weight^T + bias
input = torch.randn(1, 3)     # a single sample with 3 features
weight = torch.randn(5, 3)    # 5 output units, 3 inputs each
bias = torch.randn(5,)        # one offset per output unit
output = torch.mm(input, weight.t())
# Broadcast the bias over the (single-row) batch and add it in place.
output.add_(bias.unsqueeze(0).expand_as(output))

# Show the sample itself, and only the shapes of everything else.
print('input: ', input)
print('weight: ', weight.size())
print('bias: ', bias.size())
print('output: ', output.size())

input:  
-0.5359  0.7310  0.1093
[torch.FloatTensor of size 1x3]

weight:  torch.Size([5, 3])
bias:  torch.Size([5])
output:  torch.Size([1, 5])


## ESGD

In [5]:
# Reference backward pass of output = input . weight^T + bias.
grad_input = torch.mm(grad_output, weight)        # 1 x 3, gradient w.r.t. input
grad_weight = torch.mm(grad_output.t(), input)    # 5 x 3, gradient w.r.t. weight
grad_bias = grad_output.sum(0).squeeze(0)         # 5, gradient w.r.t. bias

print(grad_input, grad_weight, grad_bias)


-3.9880 -4.9535 -3.0896
[torch.FloatTensor of size 1x3]
 
-0.1523  0.2077  0.0311
 1.0629 -1.4500 -0.2169
-0.5078  0.6927  0.1036
 0.0094 -0.0128 -0.0019
 0.5352 -0.7301 -0.1092
[torch.FloatTensor of size 5x3]
 
 0.2842
-1.9836
 0.9476
-0.0175
-0.9988
[torch.FloatTensor of size 5]



## ISGD

In [6]:
def alpha_linear(s,d,c):
    """Return the ISGD step coefficient for a purely linear layer.

    Args:
        s: tensor holding the sign of the output gradient.
        d: tensor of per-output step magnitudes, same shape as ``s``.
        c: unused here; accepted so the signature matches ``alpha_relu``.

    Returns:
        The element-wise product ``s * d`` with its sign flipped.
    """
    signed_step = s * d
    return -signed_step

In [8]:
# Quantities that parameterise the implicit update.
s = grad_output.sign()
# sqrt(||input||^2 + 1): math.sqrt turns the norm into a plain Python float.
z_norm = math.sqrt(torch.norm(input) ** 2 + 1.0)
d = grad_output.abs().sqrt() * (z_norm * math.sqrt(lr / (1.0 + lr * mu)))
c = output.div(1.0 + lr * mu)
alpha = alpha_linear(s, d, c)

# Updated weight: shrink by 1 + lr*mu, then add the (5x1)(1x3) outer-product step.
new_weight = weight.div(1.0 + lr * mu) + (alpha * d).t().mm(input).div(z_norm ** 2)
# Recover the equivalent gradient from the parameter change.
grad_weight = weight.sub(new_weight).div(lr)

# Same construction for the bias; squeeze drops the singleton batch dimension.
new_bias = bias.div(1.0 + lr * mu) + (alpha * d).squeeze().div(z_norm ** 2)
grad_bias = bias.sub(new_bias).div(lr)

# grad_input is not recomputed here; it still holds the value from the ESGD cell.
print(grad_input, grad_weight, grad_bias)


-3.9880 -4.9535 -3.0896
[torch.FloatTensor of size 1x3]
 
-0.1523  0.2077  0.0311
 1.0629 -1.4500 -0.2169
-0.5078  0.6927  0.1036
 0.0094 -0.0128 -0.0019
 0.5352 -0.7301 -0.1092
[torch.FloatTensor of size 5x3]
 
 0.2842
-1.9836
 0.9477
-0.0175
-0.9988
[torch.FloatTensor of size 5]



In [None]:
# Scratch cell: look at the ISGD step v = alpha * d next to the weights.
v = alpha * d
print(weight)
print(v)
# Both orders of the element-wise product between the column v^T and weight.
print(v.t() * weight)
print(weight * v.t())
# Hand-typed products for comparison; the literals look like values copied
# from an earlier run -- TODO confirm where they came from.
print([-1.4060*0.0078, -0.7285*0.0078, -0.0530*0.0078])

In [None]:

# Draft ISGD backward pass for the linear layer.
# The sign and magnitude of grad_output are derived here so the cell runs on
# a fresh kernel without relying on names defined in other cells.
s = torch.sign(grad_output)
abs_grad_output = torch.abs(grad_output)

# math.sqrt returns a plain Python float, so z_norm is a scalar, not a tensor.
z_norm = math.sqrt((torch.norm(input) ** 2 + 1.0))
d = z_norm * math.sqrt(lr/(1.0+lr*mu)) * torch.sqrt(abs_grad_output)
c = output/(1.0+lr*mu)

# Calculate alpha
alpha = alpha_linear(s,d,c)

# Calculate gradients
# The step must have weight's 5x3 shape: outer product of the 5x1 column
# (alpha*d)^T with the 1x3 input, as in the ISGD cell whose output matches ESGD.
new_weight = weight / (1.0 + lr * mu) + alpha.mul(d).t().mm(input) / z_norm**2
grad_weight = (weight - new_weight) / lr

# Bias step: the same alpha*d term without the input factor.
new_bias = bias / (1.0 + lr * mu) + alpha.mul(d).squeeze() / z_norm**2
grad_bias = (bias - new_bias) / lr

# NOTE(review): grad_input is masked by (output >= 0) although this cell uses
# alpha_linear; the ESGD linear cell applies no mask -- confirm which is intended.
sgn_output = (output >= 0).type(torch.FloatTensor)
grad_input = (grad_output.mul(sgn_output)).mm(weight)

print(grad_input, grad_weight, grad_bias)

# ReLU

## ESGD

## Forward propagation

In [None]:
# Fresh random layer: linear map followed by a ReLU.
input = torch.randn(1, 3)     # a single sample with 3 features
weight = torch.randn(5, 3)    # 5 output units, 3 inputs each
bias = torch.randn(5,)        # one offset per output unit
output = torch.mm(input, weight.t())
# Broadcast the bias over the (single-row) batch and add it in place.
output.add_(bias.unsqueeze(0).expand_as(output))
# ReLU: negative pre-activations are clipped to zero.
relu = output.clamp(min=0.0)

print('input: ', input)
print('weight: ', weight.size())
print('bias: ', bias.size())
print('output: ', output.size())
print('relu: ', relu.size())

## Back-propagation

## ISGD

In [None]:
# Constants
s = torch.sign(grad_output)
abs_grad_output = torch.abs(grad_output)
# math.sqrt returns a plain Python float, so z_norm is a scalar, not a tensor.
z_norm = math.sqrt((torch.norm(input) ** 2 + 1.0))
d = z_norm * math.sqrt(lr/(1.0+lr*mu)) * torch.sqrt(abs_grad_output)
c = output/(1.0+lr*mu)

# Calculate alpha
# NOTE: alpha_relu is defined in a later cell of this notebook; that cell has
# to be executed before this one.
alpha = alpha_relu(s,d,c)

# Calculate gradients
# The step must have weight's 5x3 shape: outer product of the 5x1 column
# (alpha*d)^T with the 1x3 input, the same form as the linear ISGD update.
new_weight = weight / (1.0 + lr * mu) + alpha.mul(d).t().mm(input) / z_norm**2
grad_weight = (weight - new_weight) / lr

# Bias step: the same alpha*d term without the input factor.
new_bias = bias / (1.0 + lr * mu) + alpha.mul(d).squeeze() / z_norm**2
grad_bias = (bias - new_bias) / lr

# Gradient w.r.t. the input: standard ReLU mask applied before the linear map.
sgn_output = (output >= 0).type(torch.FloatTensor)
grad_input = (grad_output.mul(sgn_output)).mm(weight)
print(grad_input)

In [None]:
def alpha_relu(s,d,c):
    """Return the piecewise ISGD step coefficient for a linear + ReLU layer.

    Element-wise, with ``s`` the sign of the output gradient, ``d`` the step
    magnitude and ``c`` the shrunk pre-activation:

        s ==  1 and c <= 0             ->  0
        s ==  1 and 0 < c <= d**2      -> -c / d
        s ==  1 and c > d**2           -> -d
        s == -1 and c <= -d**2 / 2     ->  0
        s == -1 and c > -d**2 / 2      ->  d
        anything else (e.g. s == 0)    ->  0

    Args:
        s: tensor of signs (-1, 0 or 1).
        d: tensor of step magnitudes, same shape as ``s``.
        c: tensor of shrunk pre-activations, same shape as ``s``.

    Returns:
        A float tensor of the same shape holding alpha.
    """
    positive = (s == 1)
    negative = (s == -1)
    d_squared = d ** 2

    # Only the three branches with a non-zero value need a mask.
    cond2 = positive.mul(c > 0).mul(c <= d_squared).type(torch.FloatTensor)
    cond3 = positive.mul(c > d_squared).type(torch.FloatTensor)
    cond5 = negative.mul(c > -d_squared / 2.0).type(torch.FloatTensor)

    # Where d == 0, c / d is inf or NaN, and multiplying that by a zero mask
    # still gives NaN. cond2 can never hold there (it would need 0 < c <= 0),
    # so substituting 1 for those divisors changes no selected value.
    safe_d = d + (d == 0).type(torch.FloatTensor)

    alpha = (0.0
             - (c.div(safe_d)).mul(cond2)
             - d.mul(cond3)
             + d.mul(cond5)
             )

    return alpha

In [None]:
# Display alpha for the s, d and c currently in the kernel (they are assigned
# in the ISGD cell earlier in this section).
alpha_relu(s,d,c)

In [None]:
# Understand grad_output_pos_out.sum(0).squeeze(0)
# Build the masked gradient here so the cell runs top-to-bottom on a fresh
# kernel: keep grad_output only where the pre-activation is non-negative.
pos_out = (output >= 0).type(torch.FloatTensor)
grad_output_pos_out = torch.mul(grad_output, pos_out)

print(grad_output_pos_out)                       # the masked 1 x 5 gradient
print(grad_output_pos_out.sum(0))                # summed over the batch dimension
print(grad_output_pos_out.sum(0).squeeze(0))     # with the leading singleton dimension removed

## Standard ReLU

In [None]:
# Ordinary back-propagation through relu(input . weight^T + bias).
# Mask that is 1 where the pre-activation is non-negative and 0 elsewhere.
pos_out = (output >= 0).type(torch.FloatTensor)
# Gradient that survives the ReLU.
grad_output_pos_out = grad_output.mul(pos_out)
grad_input = torch.mm(grad_output_pos_out, weight)
grad_weight = torch.mm(grad_output_pos_out.t(), input)
grad_bias = grad_output_pos_out.sum(0).squeeze(0)

# Shapes only, to check that every gradient matches its parameter.
print('grad_output: ', grad_output.size())
print('output: ', output.size())
print('pos_out: ', pos_out.size())
print('grad_output_pos_out: ', grad_output_pos_out.size())
print('grad_input: ', grad_input.size())
print('grad_bias: ', grad_bias.size())
print('grad_weight: ', grad_weight.size())