Re-program the implicit differentiation optimization to check whether the program is corrected

In [71]:
# import torch.nn as nn
# import torch
#
#
# class NeuralTest(nn.Module):
#     def __init__(self):
#         super(NeuralTest, self).__init__()
#         self.layer = nn.Linear(3, 1)
#
#     def forward(self, x):
#         return self.layer(x)
#
#
# x = torch.rand(3)
# net = NeuralTest()
# opt = torch.optim.Adam(net.parameters())
# print(list(net.parameters()))


In [72]:
import torch
from torch.autograd import grad, Variable

In [None]:
def gather_flat_grad(loss_grad):
    #Helper function to flatten the grad
    return torch.cat([p.view(-1) for p in loss_grad])  #g_vector

In [73]:
# Set the seed for reproducibility
torch.manual_seed(1)

<torch._C.Generator at 0x7ff1160efad0>

Observations:
- Even I set the the weight to be very close to w, the loss still fuctuates.
    - Thought: Maybe the update is too stochastic? Learning rate too high? or rand in x? Too small dataset?, no bias term?
    - Answer: rand in x, Fix by randn in x, increases number of epochs
- Predicted loss < True loss means do we find a better line than the one we are generated?
    - Happens when x is generated with rand (uniform generation) and when regularization

Conclusions:
- The SGD solution is approximately equal to the closed-form solution

In [78]:
# THIS BLOCK SERVES AS AN EXACT SOLUTION
w_hat = torch.matmul(torch.matmul(torch.inverse(torch.matmul(x_train.T, x_train)), x_train.T), y_train)
print('w_hat: ', w_hat)
y_train_predicted = torch.matmul(x_train, w_hat)
print('y_predicted: ', y_train_predicted)
loss = torch.nn.functional.mse_loss(y_train_predicted, y_train)
print('loss: ', loss)
true_loss = torch.nn.functional.mse_loss(torch.matmul(x_train, true_w), y_train)
print('true_loss: ', true_loss)

w_hat:  tensor([[3.6566],
        [1.2004]])
y_predicted:  tensor([[-6.4791],
        [-4.3234],
        [ 3.4589],
        [-1.4549],
        [ 3.0117],
        [ 6.3483],
        [-0.5179],
        [-3.0762]])
loss:  tensor(2.3744)
true_loss:  tensor(2.9038)


In [79]:
# THIS BLOCK SERVES AS THE SANITY CHECK FOR THE MAIN TRAINING PROCESS
torch.manual_seed(1)
h_epoch = 10000  # Hyperparameter epoch
epoch = 1000  # Epoch for training

#Create underlying linear function
x = torch.randn((10, 2))
true_w = torch.tensor([[3.], [1.]])
y = torch.matmul(x, true_w) + torch.randn((10, 1))

# Split train_valid
x_train = x[:8, ]
y_train = y[:8, ]

x_valid = x[8:, ]
y_valid = y[8:, ]
#Parameters and hyperparameters
w = torch.tensor([[2.5], [1.3]], requires_grad=True)
lamb = torch.tensor([3.], requires_grad=True)  #Intentionally high value

#Define optimizer (Note: The choice of optimizer is similar to the problem setting)
optimizer = torch.optim.Adam([w], lr = 0.001)
h_optimizer = torch.optim.RMSprop([lamb])
for ep in range(epoch):
    total_train_loss = 0
    for i in range(len(x_train)):
        optimizer.zero_grad()
        y_predicted = torch.matmul(x_train[i], w)
        train_loss = torch.nn.functional.mse_loss(y_predicted, y_train[i])
        total_train_loss += train_loss
        train_loss.backward()
        optimizer.step()
    print('Train loss at ' + str(ep) + ': ' + str(total_train_loss / len(x_train)))
    print('w: ', w)

Train loss at 0: tensor(3.6336, grad_fn=<DivBackward0>)
w:  tensor([[2.5057],
        [1.3033]], requires_grad=True)
Train loss at 1: tensor(3.6201, grad_fn=<DivBackward0>)
w:  tensor([[2.5105],
        [1.3053]], requires_grad=True)
Train loss at 2: tensor(3.6084, grad_fn=<DivBackward0>)
w:  tensor([[2.5153],
        [1.3071]], requires_grad=True)
Train loss at 3: tensor(3.5970, grad_fn=<DivBackward0>)
w:  tensor([[2.5201],
        [1.3089]], requires_grad=True)
Train loss at 4: tensor(3.5856, grad_fn=<DivBackward0>)
w:  tensor([[2.5249],
        [1.3107]], requires_grad=True)
Train loss at 5: tensor(3.5744, grad_fn=<DivBackward0>)
w:  tensor([[2.5297],
        [1.3124]], requires_grad=True)
Train loss at 6: tensor(3.5632, grad_fn=<DivBackward0>)
w:  tensor([[2.5344],
        [1.3141]], requires_grad=True)
Train loss at 7: tensor(3.5521, grad_fn=<DivBackward0>)
w:  tensor([[2.5392],
        [1.3159]], requires_grad=True)
Train loss at 8: tensor(3.5411, grad_fn=<DivBackward0>)
w:  tens

This Section is using L2 regularization
Observations:
- The higher the lamb value, the higher the training loss and the more difference between closed-form weight solution and SGD weight solution.

In [91]:
w_hat = torch.matmul(torch.matmul(torch.inverse(torch.matmul(x_train.T, x_train) + lamb * torch.eye(2)), x_train.T), y_train)
print('w_hat: ', w_hat)
y_train_predicted = torch.matmul(x_train, w_hat)
print('y_predicted: ', y_train_predicted)
loss = torch.nn.functional.mse_loss(y_train_predicted, y_train)
print('loss: ', loss)
true_loss = torch.nn.functional.mse_loss(torch.matmul(x_train, true_w), y_train)
print('true_loss: ', true_loss)

w_hat:  tensor([[3.6521],
        [1.2001]], grad_fn=<MmBackward>)
y_predicted:  tensor([[-6.4720],
        [-4.3200],
        [ 3.4549],
        [-1.4521],
        [ 3.0094],
        [ 6.3403],
        [-0.5180],
        [-3.0726]], grad_fn=<MmBackward>)
loss:  tensor(2.3745, grad_fn=<MseLossBackward>)
true_loss:  tensor(2.9038)


In [90]:
# THIS BLOCK SERVES AS THE SANITY CHECK FOR THE MAIN TRAINING PROCESS
torch.manual_seed(1)
h_epoch = 10000  # Hyperparameter epoch
epoch = 1000  # Epoch for training

#Create underlying linear function
x = torch.randn((10, 2))
true_w = torch.tensor([[3.], [1.]])
y = torch.matmul(x, true_w) + torch.randn((10, 1))

# Split train_valid
x_train = x[:8, ]
y_train = y[:8, ]

x_valid = x[8:, ]
y_valid = y[8:, ]
#Parameters and hyperparameters
w = torch.tensor([[2.5], [1.3]], requires_grad=True)
lamb = torch.tensor([0.01], requires_grad=True)  #Change the value form 3 to 0.1 and 0.01 and observe the behavior

#Define optimizer (Note: The choice of optimizer is similar to the problem setting)
optimizer = torch.optim.Adam([w], lr = 0.001)
h_optimizer = torch.optim.RMSprop([lamb])
for ep in range(epoch):
    total_train_loss = 0
    for i in range(len(x_train)):
        optimizer.zero_grad()
        y_predicted = torch.matmul(x_train[i], w)
        train_loss = torch.nn.functional.mse_loss(y_predicted, y_train[i]) + lamb  * torch.sum(w ** 2)
        total_train_loss += train_loss
        train_loss.backward()
        optimizer.step()
    print('Train loss at ' + str(ep) + ': ' + str(total_train_loss / len(x_train)))
    print('w: ', w)

Train loss at 0: tensor([3.7132], grad_fn=<DivBackward0>)
w:  tensor([[2.5056],
        [1.3033]], requires_grad=True)
Train loss at 1: tensor([3.7002], grad_fn=<DivBackward0>)
w:  tensor([[2.5104],
        [1.3051]], requires_grad=True)
Train loss at 2: tensor([3.6889], grad_fn=<DivBackward0>)
w:  tensor([[2.5152],
        [1.3069]], requires_grad=True)
Train loss at 3: tensor([3.6779], grad_fn=<DivBackward0>)
w:  tensor([[2.5199],
        [1.3086]], requires_grad=True)
Train loss at 4: tensor([3.6671], grad_fn=<DivBackward0>)
w:  tensor([[2.5246],
        [1.3103]], requires_grad=True)
Train loss at 5: tensor([3.6563], grad_fn=<DivBackward0>)
w:  tensor([[2.5293],
        [1.3119]], requires_grad=True)
Train loss at 6: tensor([3.6456], grad_fn=<DivBackward0>)
w:  tensor([[2.5340],
        [1.3136]], requires_grad=True)
Train loss at 7: tensor([3.6350], grad_fn=<DivBackward0>)
w:  tensor([[2.5387],
        [1.3152]], requires_grad=True)
Train loss at 8: tensor([3.6244], grad_fn=<DivBa

The following block combines the optimization of weight and hyperparameter together
Observations:
    - The training loss still decreases even when the hyperparameter is wrong, but after hyperparameter adjusts, the training loss seems to decreases compared to previous hepoch, but not during epoch.
        - Question: Does this mean the loss already converge? Does this mean optimizing lambda is not important?
        - Thought: Which hyperparameter should we optimize?
    - After epoch 200, the hyperparameter becomes negative, w starts to increase?
        - Question:
    - Training epoch vs Hyperparam epoch
    - Note: that we have individual lambda for each weight


In [101]:
torch.manual_seed(1)
h_epoch = 10000  # Hyperparameter epoch
epoch = 1000  # Epoch for training

#Create underlying linear function
x = torch.randn((10, 2))
true_w = torch.tensor([[3.], [1.]])
y = torch.matmul(x, true_w) + torch.randn((10, 1))

# Split train_valid
x_train = x[:8, ]
y_train = y[:8, ]

x_valid = x[8:, ]
y_valid = y[8:, ]
#Parameters and hyperparameters
w = torch.tensor([[2.5], [1.3]], requires_grad=True)
lamb = torch.tensor([3.,3.], requires_grad=True)  #Intentionally high value

#Define optimizer (Note: The choice of optimizer is similar to the problem setting)
optimizer = torch.optim.Adam([w], lr = 0.001)
h_optimizer = torch.optim.RMSprop([lamb])

# Note the update is currently very noisy
# Define the loop
for hep in range(h_epoch):
    # Train (SGD)
    for ep in range(epoch):
        total_train_loss = 0
        for i in range(len(x_train)):
            optimizer.zero_grad()
            y_predicted = torch.matmul(x_train[i], w)
            train_loss = torch.nn.functional.mse_loss(y_predicted, y_train[i]) + lamb * torch.sum(w ** 2)
            print(train_loss)
            total_train_loss += train_loss
            train_loss.backward(create_graph=True)
            optimizer.step()
        # if ep % 100 == 0: # Only print every 100 epoch
        #     print('Train loss at ' + str(ep) + ': ' + str(total_train_loss / len(x_train)))

    # Train the hyperparameter
    total_d_val_loss_d_lamb = torch.zeros(lamb.size())
    d_valid_loss_d_w = torch.zeros(w.size())
    for i in range(len(x_valid)):
        w.grad.zero_()
        y_predicted = torch.matmul(x_valid[i], w)
        valid_loss = torch.nn.functional.mse_loss(y_predicted, y_valid[i])
        valid_loss_grad = grad(valid_loss, w)
        d_valid_loss_d_w += valid_loss_grad[0]
    d_valid_loss_d_w /= len(x_valid)

    for i in range(len(x_train)):
        y_predicted = torch.matmul(x_train[i], w)
        train_loss = torch.nn.functional.mse_loss(y_predicted, y_train[i]) + lamb * torch.sum(w ** 2)
        w.grad.zero_(), h_optimizer.zero_grad()
        d_train_loss_d_w = grad(train_loss, w, create_graph=True)

        w.grad.zero_(), h_optimizer.zero_grad()
        d_train_loss_d_w[0].backward(d_valid_loss_d_w)

        if lamb.grad is not None:
            total_d_val_loss_d_lamb -= lamb.grad
    total_d_val_loss_d_lamb /= len(x_train)

    lamb.grad = total_d_val_loss_d_lamb
    h_optimizer.step()

    w.grad.zero_(), h_optimizer.zero_grad()
    print('lamb after epoch '+ str(hep) + ': ' + str(lamb))
    print('w value: ', w)



tensor([28.2738], grad_fn=<AddBackward0>)
tensor([23.7981], grad_fn=<AddBackward0>)
tensor([39.3672], grad_fn=<AddBackward0>)
tensor([29.5345], grad_fn=<AddBackward0>)
tensor([25.9012], grad_fn=<AddBackward0>)
tensor([23.8358], grad_fn=<AddBackward0>)
tensor([23.9760], grad_fn=<AddBackward0>)
tensor([24.3781], grad_fn=<AddBackward0>)
tensor([28.1708], grad_fn=<AddBackward0>)
tensor([23.6195], grad_fn=<AddBackward0>)
tensor([39.2583], grad_fn=<AddBackward0>)
tensor([29.3523], grad_fn=<AddBackward0>)
tensor([25.7606], grad_fn=<AddBackward0>)
tensor([23.6493], grad_fn=<AddBackward0>)
tensor([23.7943], grad_fn=<AddBackward0>)
tensor([24.2133], grad_fn=<AddBackward0>)
tensor([28.0699], grad_fn=<AddBackward0>)
tensor([23.4425], grad_fn=<AddBackward0>)
tensor([39.1504], grad_fn=<AddBackward0>)
tensor([29.1711], grad_fn=<AddBackward0>)
tensor([25.6213], grad_fn=<AddBackward0>)
tensor([23.4642], grad_fn=<AddBackward0>)
tensor([23.6137], grad_fn=<AddBackward0>)
tensor([24.0499], grad_fn=<AddBack

KeyboardInterrupt: 