In [1]:
import re
import json
from tqdm.notebook import tqdm
import torch
import pytorch_lightning as pl
from data.dataset import NERDataset
from models.utils import Namespace
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence, PackedSequence
from models.networks import GlobalContextualDeepTransition
from pytorch_lightning.callbacks import ModelCheckpoint

In [2]:
# File locations under data/conll03: the training source/target pair, the
# embedding file (gloveFile), the symbol file (symbFile) and the testb
# source/target pair.
sourceName = 'data/conll03/eng.train.src'
targetName = 'data/conll03/eng.train.trg'
gloveFile = 'data/conll03/trimmed.300d.Cased.txt'
symbFile = 'data/conll03/sym.glove'
testSrc = 'data/conll03/eng.testb.src'
testTrg = 'data/conll03/eng.testb.trg'

# Build the dataset from the training pair plus the embedding/symbol files,
# then hand it the test pair.
# NOTE(review): whether getLoader() then iterates the training or the test
# split is not visible from this notebook - confirm in data.dataset.
data = NERDataset(sourceName, targetName, gloveFile, symbFile)
data.readTestFile(testSrc, testTrg)
# shuffle=False keeps the batch order fixed between runs.
# NOTE(review): the meaning of 1024 (sequences vs. tokens per batch) is not
# visible here; a later cell reports 47 sequences in the first batch.
loader = data.getLoader(1024, shuffle=False)

In [3]:
# Checkpoint file the model is restored from.
prevCheckpointPath = 'lightning_logs/checkpoint-v0.ckpt'

# config.json supplies the keyword arguments forwarded to load_from_checkpoint.
with open('config.json', 'r') as file:
    kwargs = json.load(file)

# Restore the model from the checkpoint, then switch it to evaluation mode
# (the rest of the notebook only runs inference-style experiments on it).
model = GlobalContextualDeepTransition.load_from_checkpoint(prevCheckpointPath, **kwargs)
model = model.eval()

In [4]:
def logitsToLogProbs(logits):
    """Turn raw scores into log-probabilities along dim 1.

    The log-sum-exp over dim 1 is subtracted from every entry, so that
    exponentiating the result and summing over dim 1 gives 1 for each row.

    Args:
        logits: tensor with at least two dimensions; dim 1 is normalised.

    Returns:
        Tensor of the same shape as `logits`.
    """
    # keepdim=True leaves a size-1 axis so the subtraction broadcasts row-wise.
    logNormalizer = torch.logsumexp(logits, dim=1, keepdim=True)
    logProbs = logits - logNormalizer
    return logProbs

In [8]:
"""
At any point, we store
    index
    timeTillNow
    pathFollowedTillNow
    hValue
for the sole purpose of backtracking

We declare a matrix(batch_size, beam_size) of these to store info
"""
class Node:
    """One partial hypothesis kept for backtracking during beam search.

    Attributes:
        path:  list of the choices taken so far, in order.
        value: running sum of the log-probabilities of those choices.
    """

    def __init__(self, path=None, value=0):
        """Create a node.

        Args:
            path:  choices taken so far; defaults to a fresh empty list.
            value: accumulated score of `path`; defaults to 0.
        """
        # Use None as the sentinel: a literal `[]` default would be one list
        # object shared by every Node() created without an explicit path.
        self.path = [] if path is None else path
        self.value = value

    def __len__(self):
        """Return the number of choices on the path."""
        return len(self.path)

    def expandable(self, targetLen):
        """Return True while the path is still shorter than `targetLen`."""
        return len(self.path) < targetLen

    def expandWithChoices(self, choices, logProbs):
        """Create one child node per (choice, logProb) pair.

        Each child's path is this node's path with `choice` appended (as a new
        list, so this node is left untouched) and its value is this node's
        value plus `logProb`.

        Args:
            choices:  iterable of candidate next choices.
            logProbs: iterable of scores, paired element-wise with `choices`
                (zip semantics: extra items on the longer side are ignored).

        Returns:
            List of the newly created Node objects, in input order.
        """
        return [
            Node(self.path + [choice], self.value + logProb)
            for choice, logProb in zip(choices, logProbs)
        ]

In [9]:
# Take only the first batch from the loader; every experiment below runs on it.
batch = next(iter(loader))

In [10]:
# Run the encoder once on the sampled batch without tracking gradients.
with torch.no_grad():
    # A batch unpacks into four items; `targets` is not passed to encode().
    words, chars, charMask, targets = batch
    # encode() returns three values, kept as globals for the decoding
    # experiments in later cells.
    encoded, initHiddenState, initPrevTarget = model.encode(words, chars, charMask)

In [12]:
# The first entry of words.batch_sizes is taken as the number of sequences
# in the batch (presumably `words` is a PackedSequence - TODO confirm).
batchSize = words.batch_sizes[0].item()
# Number of hypotheses to keep per sequence.
beamSize  = 4
# Decoder width, read from the loaded model.
units = model.sequenceLabeller.decoderUnits
print(f"batchSize={batchSize}, beamSize={beamSize}, units={units}")

batchSize=47, beamSize=4, units=256


In [None]:
"""Init nodes in a matrix for each beam and """

# encoded pages
# Walk `encoded` in consecutive row slices whose lengths come from
# words.batch_sizes, and tile each slice beamSize times along dim 0.
# NOTE(review): assumes `encoded` is laid out in the same order as
# words.batch_sizes and that each slice is 2-D - TODO confirm against
# model.encode().
start = 0
encodedPages = []
for pageLen in words.batch_sizes:
    # repeat(beamSize, 1): beamSize copies stacked along dim 0, dim 1 unchanged.
    page = encoded[start:start+pageLen].repeat(beamSize, 1)
    encodedPages.append(page) # [e1, e2, e3, e1, e2, e3.. etc, repeated beamSize times]
    start += pageLen



# Unfinished sketch of the beam state (not valid Python as written: missing
# commas and an incomplete comprehension), kept for reference.
# state = Namespace(
#     nodes = [[Node() for _ in range(beamSize)] for _ in range(batchSize)] # only beamSize choices to backtrack
#     encodedPages = encodedPages
#     hiddenState = [torch.zeros(page.shape) for page in ]
# )

# print(*[page.shape for page in encodedPages], sep='\n')

In [32]:
# Scratch check of the repeat -> reshape -> max pattern on a small tensor.
# NOTE: no seed is set, so the random perturbation differs on every run.
print("Initial")
# 3x4 float matrix holding 0..11.
x = torch.arange(12, dtype=torch.float).reshape((3, 4))
print(x)

print("Repeated")
# Two copies stacked along dim 0 -> shape (6, 4).
x = x.repeat(2, 1)
print(x)

print("Reshaped")
# Split the stacked copies into a leading axis of size 2 -> shape (2, 3, 4).
x = x.reshape(2,3,4)
# Perturb only the first copy: randint(2) yields 0 or 1, so each entry moves
# by -0.1 or +0.1.
x[0] += 0.2 * torch.randint(2, size=(3, 4)) - 0.1
print(x)

print("Maximum")
# Element-wise maximum across the two copies; [0] picks the values and drops
# the accompanying indices.
print(x.max(axis=0)[0])

Initial
tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])
Repeated
tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]])
Reshaped
tensor([[[ 0.1000,  1.1000,  1.9000,  2.9000],
         [ 3.9000,  5.1000,  5.9000,  6.9000],
         [ 7.9000,  8.9000,  9.9000, 10.9000]],

        [[ 0.0000,  1.0000,  2.0000,  3.0000],
         [ 4.0000,  5.0000,  6.0000,  7.0000],
         [ 8.0000,  9.0000, 10.0000, 11.0000]]])
Maximum
tensor([[ 0.1000,  1.1000,  2.0000,  3.0000],
        [ 4.0000,  5.1000,  6.0000,  7.0000],
        [ 8.0000,  9.0000, 10.0000, 11.0000]])


In [89]:
def rnnPlusWarmupDecay(learningRate=8e-3, minValue=5e-6):
    """Build a step -> multiplicative learning-rate factor function.

    The factor is 1 up to step 1000, after which it halves every 3000 steps.
    It is clamped from below at minValue / learningRate, so that
    factor * learningRate never falls under minValue.

    Args:
        learningRate: base learning rate the factor will be multiplied with.
        minValue: smallest resulting learning rate that is allowed.

    Returns:
        A function taking the step number and returning the factor.
    """
    def subroutine(step):
        """Return the decay factor for `step`, bounded to [floor, 1]."""
        # Positive before step 1000, zero at 1000, -1 for each 3000 steps after.
        exponent = (1000 - step) / 3000
        cappedFactor = min(1, 2 ** exponent)
        # factor * learningRate >= minValue  <=>  factor >= minValue / learningRate
        lowerBound = minValue / learningRate
        return max(cappedFactor, lowerBound)
    return subroutine

In [95]:
# Two small throw-away parameters; they exist only so the optimizer and
# scheduler below have something to step on.
w1 = torch.nn.Parameter(torch.randn(3, 4))
w2 = torch.nn.Parameter(torch.randn(3, 4))

# LambdaLR multiplies the optimizer's base lr by the factor returned from the
# schedule function. The schedule's lower bound is minValue / learningRate, so
# it must be given the same base lr as the optimizer; otherwise the effective
# minimum lr is scaled by (optimizer lr / learningRate) instead of being minValue.
baseLearningRate = 0.1
optim = torch.optim.Adam([w1, w2], lr=baseLearningRate)
sched = torch.optim.lr_scheduler.LambdaLR(
    optim, rnnPlusWarmupDecay(learningRate=baseLearningRate)
)

In [97]:
# Drive the optimizer and scheduler for 9000 steps on a dummy objective and
# print the current learning rate every 500 steps to inspect the schedule.
for i in range(9000):
    # backward() adds into .grad, so clear the previous iteration's gradients
    # first; without this they accumulate over all 9000 steps.
    optim.zero_grad()
    loss = w1*w2
    loss.sum().backward()
    if i%500 == 0:
        # Read the lr before this iteration's optimizer/scheduler step.
        lr = optim.param_groups[0]['lr']
        print(i, lr)

    # Optimizer first, then scheduler.
    optim.step()
    sched.step()

0 0.015749013123685915
500 0.01403077560386716
1000 0.0125
1500 0.011136233976754242
2000 0.009921256574801246
2500 0.008838834764831846
3000 0.007874506561842957
3500 0.00701538780193358
4000 0.00625
4500 0.00556811698837712
5000 0.004960628287400625
5500 0.004419417382415923
6000 0.003937253280921478
6500 0.0035076939009667917
7000 0.003125
7500 0.00278405849418856
8000 0.0024803141437003125
8500 0.0022097086912079614


In [51]:
# Echo whatever `sched` is currently bound to.
# NOTE(review): the recorded output for this cell is a bare float and the
# cell's execution count (51) is lower than that of the cell that builds the
# LambdaLR scheduler (95), so the output presumably comes from an earlier
# binding of `sched` - re-run after Restart & Run All to refresh it.
sched

0.008

In [81]:
# Inspect the optimizer's first parameter group: its parameters together with
# the hyper-parameters stored alongside them (e.g. 'lr', 'initial_lr').
optim.param_groups[0]

{'params': [Parameter containing:
  tensor([[-1.4074, -2.0451, -1.9878, -0.7623],
          [-1.5438, -0.5121,  0.6965, -0.7930],
          [ 0.0273,  0.6387,  0.3439,  0.0112]], requires_grad=True),
  Parameter containing:
  tensor([[-1.3529,  0.6924,  0.5541,  0.8220],
          [ 0.5065, -1.0960, -0.2883, -0.5091],
          [ 1.3843,  1.2723, -0.3518, -1.1477]], requires_grad=True)],
 'lr': 9.090909090909092e-05,
 'betas': (0.9, 0.999),
 'eps': 1e-08,
 'weight_decay': 0,
 'amsgrad': False,
 'initial_lr': 0.001}