In [7]:
import sys
sys.path.append("source")
import torch
import numpy as np
from hsicbt.utils import misc
from hsicbt.model.mhlinear import ModelLinear

In [9]:
# Build the network under test and print its layer structure.
# NOTE(review): last_hidden_width=10 presumably sizes the final layer to match
# the 10 target classes sampled in the preparation cell — confirm against
# ModelLinear's constructor.
model = ModelLinear(last_hidden_width=10)
print(model)

ModelLinear(
  (input_layer): Sequential(
    (0): Linear(in_features=784, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
    (2): ReLU()
  )
  (sequence_layer): Sequential(
    (0): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
      (2): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
      (2): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
      (2): ReLU()
    )
    (3): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=False, track_run

In [10]:
# # # Preparation
# Seed torch's global RNG so the synthetic batch below is identical on every
# run. The model is constructed in an earlier cell, so its initial weights are
# not covered by this seed.
torch.manual_seed(0)
batch_size = 32
# Random stand-in inputs: 784 features per sample, matching in_features of the
# model's first Linear layer.
train_x = torch.randn(batch_size, 784)
# Class-index targets drawn from 0..9. torch.randint already returns int64,
# which is what CrossEntropyLoss expects for index targets, so no .long() cast
# is required.
train_y = torch.randint(0, 10, (batch_size,))
criterion = torch.nn.CrossEntropyLoss()
# The model returns a pair, unpacked as (output, hiddens); len(hiddens) is used
# by the next cell to enumerate layers.
output, hiddens = model(train_x)
# Populated by the next cell with one [weight_idx, bias_idx] pair per layer.
idx_range = []

In [12]:
print("========== Proposed approach ============")
# Zero-based index into idx_range: 3 selects the FOURTH [weight, bias] pair,
# i.e. positions 6 and 7 in model.parameters().
layer_idx = 3
# Parameter positions owned by each layer, as [weight_idx, bias_idx]. Each
# layer is taken to contribute exactly two entries to model.parameters()
# (Linear weight and bias; the printed BatchNorm layers are affine=False and
# so add none), hence layer i owns positions 2*i and 2*i + 1.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell cannot accumulate duplicate entries from a previous execution.
idx_range = [[2 * i, 2 * i + 1] for i in range(len(hiddens))]
# NOTE(review): presumably returns the parameters (and their names) located at
# the given positions — confirm against misc.get_layer_parameters.
params, param_names = misc.get_layer_parameters(model=model, idx_range=idx_range[layer_idx])
# Only the selected layer's parameters are handed to the optimizer, so a later
# optimizer.step() can update nothing else.
optimizer = torch.optim.SGD(params, lr=0.1, momentum=.9, weight_decay=0.001)
loss = criterion(output, train_y)
# backward() runs on the loss of the full forward pass; which parameters get
# updated is decided solely by what the optimizer was given.
loss.backward()




In [13]:
# # # Compare per-parameter norms around a single optimizer step
# Snapshot taken immediately before the update.
norm_before_step = [torch.norm(p).item() for p in model.parameters()]
optimizer.step()  # apply the update to the parameters the optimizer holds
# Same snapshot, taken right after the update.
norm_after_step = [torch.norm(p).item() for p in model.parameters()]
# # # Report (before - after) for every parameter; 0.0 means the norm did not move
print(f"Diff of the model weight and bias (Only layer:{layer_idx} are updated)")
print([before - after for before, after in zip(norm_before_step, norm_after_step)])


Diff of the model weight and bias (Only layer:3 are updated)
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.008398056030273438, 6.0439109802246094e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [14]:
print("========== Standard backprop ============")
# Fresh baseline network. It is built with the same last_hidden_width=10 as the
# model used in the proposed-approach run, so both experiments use identically
# configured networks instead of relying on ModelLinear's default here.
model = ModelLinear(last_hidden_width=10)
# Baseline: every parameter of the model is exposed to the optimizer.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=.9, weight_decay=0.001)
# Forward pass on the same synthetic batch, then backpropagate the loss.
output, hiddens = model(train_x)
loss = criterion(output, train_y)
loss.backward()



In [16]:
# Norm of every parameter tensor just before the baseline update.
norm_before_step = [torch.norm(p).item() for p in model.parameters()]
optimizer.step()  # the optimizer holds all model parameters in this run
# Norm of every parameter tensor just after the update.
norm_after_step = [torch.norm(p).item() for p in model.parameters()]

In [17]:
# Per-parameter (before - after) norm difference for the baseline run.
print("Diff of the model weight and bias in backprop (All weights should be changed)")
print([before - after for before, after in zip(norm_before_step, norm_after_step)])


Diff of the model weight and bias in backprop (All weights should be changed)
[-3.0573997497558594, 3.217160701751709e-05, -0.18912172317504883, 0.00010091066360473633, -0.12189292907714844, 0.00011330842971801758, -0.07697868347167969, 0.00011420249938964844, -0.04471778869628906, 0.00011616945266723633, -0.02904987335205078, 0.00010466575622558594, -0.018402099609375, 0.00011450052261352539]
