In [1]:
import torch 
import torch.nn as nn

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        in_features: Size of each input sample (default 100).
        hidden_features: Width of the hidden layer (default 50).
        out_features: Number of raw outputs per sample (default 1).
    """

    def __init__(self, in_features=100, hidden_features=50, out_features=1):
        super(MLP, self).__init__()
        self.first_layer = nn.Linear(in_features, hidden_features)
        self.second_layer = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Apply both layers and return the un-activated output of the second one."""
        x = nn.functional.relu(self.first_layer(x))
        x = self.second_layer(x)
        return x


# The default sizes reproduce the 100 -> 50 -> 1 network.
mlp = MLP()

In [3]:
from torch.utils.data import TensorDataset
# Synthetic data: 1000 samples with 100 standard-normal features each.
x = torch.randn(1000, 100).to(device=device)
# Random 0/1 labels, kept as floats because they are later passed to the loss as targets.
y = (torch.rand(1000) > 0.5).float().to(device=device)

In [4]:
from torch.utils.data import TensorDataset, DataLoader
# Pair each feature row with its label, then serve them in shuffled mini-batches of 100.
dataset = TensorDataset(x, y)
dataloader = DataLoader(dataset=dataset, batch_size=100, shuffle=True)

In [5]:
# Print the features and labels of every mini-batch the loader yields.
for features, labels in dataloader:
    print(features)
    print(labels)

tensor([[ 0.3349, -0.3357,  1.5577,  ...,  1.2535, -0.8741,  1.1964],
        [ 0.5997, -0.1040, -0.0421,  ...,  2.0984, -1.6125, -0.2931],
        [ 0.9497, -0.1843, -2.0102,  ..., -0.4697,  1.1201,  1.2877],
        ...,
        [ 0.1315, -0.4559, -1.6739,  ...,  1.5253, -1.2748, -1.0905],
        [ 0.4752,  0.9459, -0.5876,  ...,  2.5902, -0.0187, -1.1862],
        [-0.2458, -0.7106, -1.1863,  ...,  0.4584, -0.0682, -1.2806]],
       device='cuda:0')
tensor([1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
        1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
        0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1.,
        1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1.,
        1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1.,
        1., 0., 0., 1., 1., 1., 1., 1., 1., 0.], device='cuda:0')
tensor([[-1.1079, -2.1684,  0.1284,  ...,  1.2020,  0.9195,  0.1577],
      

## Set up per-layer learning rates

In [6]:
import torch.optim as optim
# Move the model to the target device before building the optimizer, so the
# optimizer is constructed over the parameters it will actually update.
mlp.to(device)

# Per-parameter-group options: each layer gets its own learning rate.
# The top-level lr=2e-2 only becomes the default for groups that do not set
# 'lr' themselves; both groups below override it.
optimizer = optim.Adam([{'params': mlp.first_layer.parameters(), 'lr': 1e-2},
                        {'params': mlp.second_layer.parameters(), 'lr': 1e-3}], lr=2e-2)
criterion = nn.BCEWithLogitsLoss()

criterion.to(device)

BCEWithLogitsLoss()

In [7]:
n_epoch = 5
for epoch in range(n_epoch):
    # Unpack into dedicated names so the notebook-level `x` / `y` dataset
    # tensors are not overwritten by the last mini-batch.
    for inputs, targets in dataloader:
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # squeeze(-1) removes only the trailing size-1 output dimension, so the
        # batch dimension survives even when a batch holds a single sample.
        predictions = mlp(inputs).squeeze(-1)

        loss = criterion(predictions, targets)

        loss.backward()

        optimizer.step()

In [8]:
# Report the learning rate currently set on each parameter group.
for group in optimizer.param_groups:
    print(group['lr'])



0.01
0.001


In [9]:
# Display the optimizer's instance attributes (e.g. its defaults and per-parameter state).
vars(optimizer)

{'defaults': {'lr': 0.02,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False},
 'state': defaultdict(dict, {Parameter containing:
              tensor([[-0.0803,  0.0802,  0.0560,  ..., -0.0338, -0.0241,  0.0831],
                      [-0.0092, -0.0657,  0.0860,  ...,  0.0911, -0.0204, -0.0413],
                      [ 0.2514, -0.1719, -0.0718,  ..., -0.1128, -0.0528,  0.1447],
                      ...,
                      [ 0.0942,  0.0703,  0.0420,  ...,  0.0204, -0.0685, -0.0527],
                      [-0.0645, -0.0242, -0.0934,  ...,  0.0526,  0.0162,  0.1657],
                      [-0.0109, -0.1492, -0.1180,  ...,  0.0320, -0.1101,  0.0688]],
                     device='cuda:0', requires_grad=True): {'step': 50,
               'exp_avg': tensor([[ 1.2066e-04, -3.2025e-04,  2.2104e-04,  ..., -1.4491e-04,
                         2.1351e-04,  1.9971e-04],
                       [-2.1194e-04, -2.2359e-04,  3.3994e-04,  ..., -5.9924e-04,
          

In [10]:
# Persist the optimizer's state dict to 'optimizer.pt'.
torch.save(obj=optimizer.state_dict(), f='optimizer.pt')

In [11]:
# Build a second optimizer with the same parameter-group layout, then restore
# the previously saved state into it.
optimizer2 = optim.Adam(
    [
        dict(params=mlp.first_layer.parameters(), lr=1e-2),
        dict(params=mlp.second_layer.parameters(), lr=1e-3),
    ],
    lr=2e-2,
)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
optimizer2.load_state_dict(torch.load('optimizer.pt'))

In [12]:
# Display the restored optimizer's instance attributes for comparison with the original.
vars(optimizer2)

{'defaults': {'lr': 0.02,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False},
 'state': defaultdict(dict, {Parameter containing:
              tensor([[-0.0803,  0.0802,  0.0560,  ..., -0.0338, -0.0241,  0.0831],
                      [-0.0092, -0.0657,  0.0860,  ...,  0.0911, -0.0204, -0.0413],
                      [ 0.2514, -0.1719, -0.0718,  ..., -0.1128, -0.0528,  0.1447],
                      ...,
                      [ 0.0942,  0.0703,  0.0420,  ...,  0.0204, -0.0685, -0.0527],
                      [-0.0645, -0.0242, -0.0934,  ...,  0.0526,  0.0162,  0.1657],
                      [-0.0109, -0.1492, -0.1180,  ...,  0.0320, -0.1101,  0.0688]],
                     device='cuda:0', requires_grad=True): {'step': 50,
               'exp_avg': tensor([[ 1.2066e-04, -3.2025e-04,  2.2104e-04,  ..., -1.4491e-04,
                         2.1351e-04,  1.9971e-04],
                       [-2.1194e-04, -2.2359e-04,  3.3994e-04,  ..., -5.9924e-04,
          

## Scheduler

In [13]:
import torch.optim as optim
# Make sure the model is on the target device before the optimizer is built
# over its parameters.
mlp.to(device)

# Start from a learning rate of 0; the training loop further down sets the
# actual rate on the optimizer's parameter group.
optimizer = optim.Adam(params=mlp.parameters(), lr=0)
criterion = nn.BCEWithLogitsLoss()

criterion.to(device)

BCEWithLogitsLoss()

In [14]:
# Learning rate of the first (and, for this optimizer, only) parameter group.
print(optimizer.param_groups[0]['lr'])

0


In [15]:
def _warmup_lr(step, max_lr, warmup_steps):
    """Return the linearly warmed-up learning rate for a 1-based step count.

    Below ``warmup_steps`` the rate is ``step * max_lr / warmup_steps``;
    from ``warmup_steps`` onwards it is ``max_lr``.
    """
    if step < warmup_steps:
        return step * max_lr / warmup_steps
    return max_lr


n_epoch = 3
global_steps = 0
warm_up_steps = 1000
max_learning_rate = 0.01
for epoch in range(n_epoch):
    # Unpack into dedicated names so the notebook-level `x` / `y` dataset
    # tensors are not overwritten by the last mini-batch.
    for inputs, targets in dataloader:
        global_steps += 1

        # Set the rate *before* stepping: the optimizer was created with lr=0,
        # so updating it afterwards would make the first step run at lr=0 and
        # leave every step one increment behind the schedule.
        current_lr = _warmup_lr(global_steps, max_learning_rate, warm_up_steps)
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_lr

        optimizer.zero_grad()

        # squeeze(-1) removes only the trailing size-1 output dimension, so the
        # batch dimension survives even when a batch holds a single sample.
        predictions = mlp(inputs).squeeze(-1)

        loss = criterion(predictions, targets)

        loss.backward()

        optimizer.step()

In [16]:
# Display the optimizer's instance attributes after the warm-up run.
vars(optimizer)

{'defaults': {'lr': 0,
  'betas': (0.9, 0.999),
  'eps': 1e-08,
  'weight_decay': 0,
  'amsgrad': False},
 'state': defaultdict(dict, {Parameter containing:
              tensor([[-0.0816,  0.0813,  0.0561,  ..., -0.0335, -0.0243,  0.0828],
                      [-0.0087, -0.0660,  0.0853,  ...,  0.0914, -0.0205, -0.0428],
                      [ 0.2527, -0.1736, -0.0723,  ..., -0.1129, -0.0533,  0.1445],
                      ...,
                      [ 0.0938,  0.0712,  0.0417,  ...,  0.0199, -0.0689, -0.0538],
                      [-0.0638, -0.0231, -0.0943,  ...,  0.0536,  0.0168,  0.1666],
                      [-0.0109, -0.1504, -0.1196,  ...,  0.0311, -0.1104,  0.0687]],
                     device='cuda:0', requires_grad=True): {'step': 30,
               'exp_avg': tensor([[ 1.0543e-04, -2.4661e-04,  9.2098e-05,  ..., -1.4196e-04,
                         1.4496e-05,  1.2534e-04],
                       [-5.0293e-04,  1.5700e-04,  5.9730e-04,  ..., -7.2543e-04,
             