## 优化器优化数据

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.optim as optim
from matplotlib import pyplot as plt

In [2]:
class StudentMLP(nn.Module):
    """Single square linear layer used to demonstrate gradient flow.

    The model is exactly one ``nn.Linear(size, size)`` with no activation.

    Args:
        size: Number of input features; the output has the same width.
    """

    def __init__(self, size):
        super(StudentMLP, self).__init__()
        self.fc1 = nn.Linear(size, size)

    def forward(self, x):
        """Return ``fc1(x)``."""
        return self.fc1(x)

    def _fill_linear_layers(self, weight_value):
        """Set every Linear weight to ``weight_value`` and every bias to 0.

        Each affected layer, its weight and its bias are printed afterwards
        so the resulting state is visible in the notebook output.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # nn.init.* already runs without autograd tracking, so the
                # Parameter can be passed directly instead of going through
                # the legacy `.data` attribute.
                nn.init.constant_(m.weight, weight_value)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
                print("\nafter init:  ", m)
                print('weight is: ', m.weight)
                print(m.bias)

    def initialize_weights(self):
        """Initialise all weights to 1 and all biases to 0."""
        self._fill_linear_layers(1.0)

    def initialize_weights2(self):
        """Initialise all weights to 2 and all biases to 0."""
        self._fill_linear_layers(2.0)

In [3]:
# Create the shared ("common representation") data.
# requires_grad=True makes `data` a leaf tensor, so backward() fills data.grad.
data = torch.tensor([[1, 1], [0., 1]], requires_grad=True)

# No backward pass has run yet, so the gradient printed here is None.
print("-------data: ",data)
print("-------data's grad: ",data.grad)

# Create the student and transferBridge networks. Both use the shared
# representation data in their computation, but do not apply gradient
# updates to it.
net1 = StudentMLP(2)
net1.initialize_weights()

net2 = StudentMLP(2)
net2.initialize_weights2()

# Create the corresponding optimizers: one that owns only `data`, and one per
# network that owns only that network's parameters.
optimizer = optim.SGD([data], lr=1)
optimizer_net1 = optim.SGD(net1.parameters(), lr=1)
optimizer_net2 = optim.SGD(net2.parameters(), lr=1)

-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  None

after init:   Linear(in_features=2, out_features=2, bias=True)
weight is:  Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
Parameter containing:
tensor([0., 0.], requires_grad=True)

after init:   Linear(in_features=2, out_features=2, bias=True)
weight is:  Parameter containing:
tensor([[2., 2.],
        [2., 2.]], requires_grad=True)
Parameter containing:
tensor([0., 0.], requires_grad=True)


### net1使用data1运算,并更新net1的参数

In [4]:
print('\n\ncompute net1(data):')

# Forward the first row of `data` through net1 and reduce the output to a
# scalar so that backward() can be called without an explicit gradient.
out1 = sum(net1(data[0]))
print(out1)
# State before backward: no gradients have been computed for anything yet.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net1.fc1.weight.grad)
print(net2.fc1.weight.grad)
# Alternative: out1.backward(retain_graph=True) would keep the graph alive so
# that backward could be called on out1 again.
out1.backward(retain_graph=False)
print("after out1 backwrad")
# State after backward: data.grad and net1's weight grad are now populated;
# net2 took no part in out1, so its grad is still None.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net1.fc1.weight.grad)
print(net2.fc1.weight.grad)



compute net1(data):
tensor(4., grad_fn=<AddBackward0>)
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  None
None
None
after out1 backwrad
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[2., 2.],
        [0., 0.]])
tensor([[1., 1.],
        [1., 1.]])
None


In [5]:
# Deliberate failure: the first backward on out1 used retain_graph=False, so
# the graph buffers were freed and this second call raises RuntimeError.
out1.backward(retain_graph=False)

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [5]:
# State before the update: data.grad and net1's weight grad both hold the
# values produced by out1.backward().
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net1.fc1.weight)
print(net1.fc1.weight.grad)
# optimizer_net1 was built from net1.parameters() only, so step()/zero_grad()
# touch net1's parameters and gradients and leave `data` and data.grad alone.
optimizer_net1.step()
optimizer_net1.zero_grad()
print("===========after step, zero_grad")
# State after the update: net1's weight changed and its grad was reset,
# while data and data.grad are unchanged.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net1.fc1.weight)
print(net1.fc1.weight.grad)

-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[2., 2.],
        [0., 0.]])
Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[1., 1.],
        [1., 1.]])
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[2., 2.],
        [0., 0.]])
Parameter containing:
tensor([[0., 0.],
        [0., 0.]], requires_grad=True)
tensor([[0., 0.],
        [0., 0.]])


### net2使用data1运算,并更新net2的参数

In [6]:
print('\n\ncompute net2(data):')
# Forward the same first row of `data` through net2 and reduce to a scalar.
out2 = sum(net2(data[0]))
print(out2)

# State before backward: data.grad still holds the gradient left over from
# out1.backward(), because only optimizer_net1.zero_grad() has been called.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net1.fc1.weight.grad)
print(net2.fc1.weight.grad)
# Alternative: out2.backward(retain_graph=True) would keep the graph alive so
# that backward could be called on out2 again.
out2.backward(retain_graph=False)
print("after out2 backwrad")
# State after backward: the new gradient is accumulated (added) into
# data.grad, and net2's weight grad is now populated.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net1.fc1.weight.grad)
print(net2.fc1.weight.grad)



compute net2(data):
tensor(8., grad_fn=<AddBackward0>)
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[2., 2.],
        [0., 0.]])
tensor([[0., 0.],
        [0., 0.]])
None
after out2 backwrad
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[6., 6.],
        [0., 0.]])
tensor([[0., 0.],
        [0., 0.]])
tensor([[1., 1.],
        [1., 1.]])


In [7]:
# State before the update: data.grad holds the sum of both backward passes.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net2.fc1.weight)
print(net2.fc1.weight.grad)
# optimizer_net2 was built from net2.parameters() only, so step()/zero_grad()
# affect net2 and leave `data` and data.grad untouched.
optimizer_net2.step()
optimizer_net2.zero_grad()
print("===========after step, zero_grad")
# State after the update: net2's weight changed and its grad was reset,
# while data and data.grad are unchanged.
print("-------data: ",data)
print("-------data's grad: ",data.grad)
print(net2.fc1.weight)
print(net2.fc1.weight.grad)

-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[6., 6.],
        [0., 0.]])
Parameter containing:
tensor([[2., 2.],
        [2., 2.]], requires_grad=True)
tensor([[1., 1.],
        [1., 1.]])
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[6., 6.],
        [0., 0.]])
Parameter containing:
tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[0., 0.],
        [0., 0.]])


### 计算contrastive loss

In [8]:
# State before building the loss: data.grad still holds the gradients
# accumulated from out1 and out2.
print("-------data: ",data)
print("-------data's grad: ",data.grad)

# Build the loss incrementally, starting from a plain float32 zero:
#   loss = -sum(data[0] * data[0]) + sum(data[0] * data[1])
contrastive_loss = torch.tensor(0, dtype=torch.float32)
contrastive_loss -= torch.sum(data[0] * data[0])
print(contrastive_loss)
contrastive_loss += torch.sum(data[0] * data[1])
print(contrastive_loss)

# NOTE(review): the label printed below is a leftover from earlier cells;
# this cell calls neither step() nor zero_grad(). Only the forward pass ran,
# so data and data.grad are the same as at the top of the cell.
print("===========after step, zero_grad")
print("-------data: ",data)
print("-------data's grad: ",data.grad)

-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[6., 6.],
        [0., 0.]])
tensor(-2., grad_fn=<SubBackward0>)
tensor(-1., grad_fn=<AddBackward0>)
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[6., 6.],
        [0., 0.]])


In [9]:
# State before backward on the contrastive loss.
print("-------data: ",data)
print("-------data's grad: ",data.grad)

# retain_graph=True keeps the graph so contrastive_loss.backward() can be
# called again in a later cell.
contrastive_loss.backward(retain_graph=True)
# NOTE(review): the label below is a leftover from the net2 cell; the backward
# pass that just ran is contrastive_loss.backward(), not out2.backward().
print("after out2 backwrad")
# The loss gradient has been added to the existing data.grad; both rows of
# `data` now carry a gradient because the loss uses data[0] and data[1].
print("-------data: ",data)
print("-------data's grad: ",data.grad)




-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[6., 6.],
        [0., 0.]])
after out2 backwrad
-------data:  tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
-------data's grad:  tensor([[4., 5.],
        [1., 1.]])


## 只要grad没清零,可以连续backward,step

In [10]:
# Apply the accumulated gradient to `data` itself: `optimizer` is the SGD
# instance that owns [data] with lr=1, so data <- data - data.grad.
print("before step: data is", data)
print("before step: grad is", data.grad)
optimizer.step()
# step() does not clear the gradient, so data.grad is unchanged afterwards.
print("after step: data is", data)
print("after step: grad is", data.grad)

before step: data is tensor([[1., 1.],
        [0., 1.]], requires_grad=True)
before step: grad is tensor([[4., 5.],
        [1., 1.]])
after step: data is tensor([[-3., -4.],
        [-1.,  0.]], requires_grad=True)
after step: grad is tensor([[4., 5.],
        [1., 1.]])


In [11]:
# Second step with no backward or zero_grad in between: the same stale
# data.grad is applied to `data` again.
print("before step: data is", data)
print("before step: grad is", data.grad)
optimizer.step()
print("after step: data is", data)
print("after step: grad is", data.grad)

before step: data is tensor([[-3., -4.],
        [-1.,  0.]], requires_grad=True)
before step: grad is tensor([[4., 5.],
        [1., 1.]])
after step: data is tensor([[-7., -9.],
        [-2., -1.]], requires_grad=True)
after step: grad is tensor([[4., 5.],
        [1., 1.]])


In [13]:
# NOTE(review): the "before step"/"after step" labels are leftovers; the
# operation in this cell is a second backward pass, not an optimizer step.
print("before step: data is", data)
print("before step: grad is", data.grad)
# Alternative: contrastive_loss.backward(retain_graph=True) would keep the
# graph alive for yet another backward call.
# This reuses the graph built before `data` was changed by the two
# optimizer.step() calls; the earlier backward used retain_graph=True, which
# is why a second call is possible at all. The new gradient is added to
# data.grad. NOTE(review): backward through a graph whose leaf was modified
# in place may be rejected by other PyTorch versions - confirm before reuse.
contrastive_loss.backward(retain_graph=False)
print("after step: data is", data)
print("after step: grad is", data.grad)

before step: data is tensor([[-7., -9.],
        [-2., -1.]], requires_grad=True)
before step: grad is tensor([[4., 5.],
        [1., 1.]])
after step: data is tensor([[-7., -9.],
        [-2., -1.]], requires_grad=True)
after step: grad is tensor([[16., 22.],
        [-6., -8.]])


In [39]:
# Progress-line demo: each message begins with a carriage return and is
# written without a trailing newline. The printed number is the completed
# fraction i/1600, not an episode index.
for i in range(1600):
    fraction_done = i / 1600
    message = '\rEpisode {}, Reward'.format(fraction_done)
    print(message, end='')

Episode 0.0, RewardEpisode 0.000625, RewardEpisode 0.00125, RewardEpisode 0.001875, RewardEpisode 0.0025, RewardEpisode 0.003125, RewardEpisode 0.00375, RewardEpisode 0.004375, RewardEpisode 0.005, RewardEpisode 0.005625, RewardEpisode 0.00625, RewardEpisode 0.006875, RewardEpisode 0.0075, RewardEpisode 0.008125, RewardEpisode 0.00875, RewardEpisode 0.009375, RewardEpisode 0.01, RewardEpisode 0.010625, RewardEpisode 0.01125, RewardEpisode 0.011875, RewardEpisode 0.0125, RewardEpisode 0.013125, RewardEpisode 0.01375, RewardEpisode 0.014375, RewardEpisode 0.015, RewardEpisode 0.015625, RewardEpisode 0.01625, RewardEpisode 0.016875, RewardEpisode 0.0175, RewardEpisode 0.018125, RewardEpisode 0.01875, RewardEpisode 0.019375, RewardEpisode 0.02, RewardEpisode 0.020625, RewardEpisode 0.02125, RewardEpisode 0.021875, RewardEpisode 0.0225, RewardEpisode 0.023125, RewardEpisode 0.02375, RewardEpisode 0.024375, RewardEpisode 0.025, RewardEpisode 0.025625

In [None]:
# Experiment: optimise a shared representation with Adam so that several
# bridge networks map it onto the same target vector.
featureSet_list = [1, 2]

# NOTE: the target has requires_grad=True, so every backward() below also
# accumulates a gradient on `label`; no optimizer consumes or clears it.
label = torch.tensor([0., 1], requires_grad=True)

hidden_layers = list()
transferBridge_set = list()

# Three bridge/target pairs are created, but only the first
# len(featureSet_list) of them take part in the loss below.
for i in range(3):
    transferBridge_set.append(StudentMLP(2))
    hidden_layers.append(label)


test_common_representation = torch.tensor([0., 1], requires_grad=True)
# Only the shared representation is optimised; the bridge networks' own
# parameters are never stepped.
test_optimizer = optim.Adam([test_common_representation], lr=1, betas=(0.9, 0.999))

transferBridge_criterion = nn.MSELoss()


for converge in range(100):
    # One MSE term per feature set. zip() stops at the shortest input, so
    # exactly len(featureSet_list) bridges are used.
    tmp_loss = []
    for _, bridge, target in zip(featureSet_list, transferBridge_set, hidden_layers):
        outputs = bridge(test_common_representation)
        # MSELoss is documented as (input, target): prediction first.
        tmp_loss.append(transferBridge_criterion(outputs, target))

    # Stack the scalar terms instead of writing them in place into a
    # preallocated tensor, then sum them into the total loss.
    test_loss = torch.sum(torch.stack(tmp_loss))
    print("test loss: ", test_loss)

    test_loss.backward()
    test_optimizer.step()
    print(test_loss)
    # Clear the representation's gradient so the next iteration starts fresh.
    test_optimizer.zero_grad()
