In [1]:
from reformer_pytorch import Reformer, ReformerLM
import torch
import numpy as np
import sys
sys.path.append("../..")
from sklearn.utils import shuffle

train_limit = 10

In [2]:
x = torch.randint(0, 10000, (1, 2048)).cuda()
print(x.shape)
print(x[0:20])

# this is used during pre-trainining 
# therefore output is in size of vocabulary
modelLM = ReformerLM(
    num_tokens=10000,
    dim=16,
    depth=12,
    max_seq_len=2048,
    heads=8,
    causal = False
).cuda()
y = modelLM(x).detach().cpu()
print(y.shape)
y

torch.Size([1, 2048])
tensor([[9663, 5815, 6604,  ..., 7963, 3734, 1388]], device='cuda:0')
torch.Size([1, 2048, 10000])


tensor([[[-0.2571,  0.0946,  1.2171,  ...,  1.0211,  0.6078, -1.5222],
         [-0.5475, -1.0385,  0.3527,  ..., -0.3380,  0.3230, -0.1097],
         [-0.1876,  0.2611, -0.2140,  ..., -0.1548, -0.2156,  1.3618],
         ...,
         [-0.0234, -0.1973, -0.3887,  ...,  0.0345,  0.2153,  0.5647],
         [-0.1371,  0.4535,  0.1623,  ...,  0.7661, -0.2427,  0.7677],
         [-0.7233, -0.1527,  0.0673,  ...,  1.0133,  0.8355, -0.0027]]])

# How to understand that output!?

```
([1, 2048, 10000])
```
... means that for every token in input sequence model outputs 10000 probabilities for each token in vocabulary. 


In [3]:
model = Reformer(
    dim=16,
    depth=12,
    heads=8,
    lsh_dropout=0.1,
    causal=False
).cuda()

x = torch.randn(1, 2048, 16).cuda()
y = model(x).detach().cpu() # (1, 8192, 512)
print(y.shape)
y

torch.Size([1, 2048, 16])


tensor([[[-0.3885,  1.6093,  0.0879,  ...,  0.4407,  1.6088, -0.1378],
         [ 0.5257,  0.3730, -0.8231,  ...,  0.4697, -0.0235, -0.7727],
         [-0.4090, -2.6898, -0.5801,  ...,  1.4676, -0.4837,  0.0421],
         ...,
         [-0.8758, -0.1590,  0.7603,  ...,  1.2296, -0.6198,  1.8286],
         [ 0.8860,  0.2445, -0.8100,  ...,  0.9405,  0.9864, -0.3628],
         [-1.1039,  2.7194, -1.2797,  ..., -0.0714,  1.0309,  2.3854]]])

How to update params from LM to actual new model:

In [4]:
model_dict = model.state_dict()

# filter out the output layer weights from the pre-trained weights
pretrained_dict = {k: v for k, v in modelLM.state_dict().items() if \
     k not in ['norm.weight', 'norm.bias', 'out.1.weight', 'out.1.bias', 'layer_pos_emb.inv_freq', 'token_emb.weight']}
# remove "reformer." from the keys
pretrained_dict = {k.replace("reformer.", ""): v for k, v in pretrained_dict.items()}

print(set(model_dict.keys()) - set(pretrained_dict.keys()))
print(set(pretrained_dict.keys()) - set(model_dict.keys()))
#model_dict.update(pretrained_dict)
#model.load_state_dict(model_dict)

set()
set()


# Trying on my data, buildin own model

In [5]:
x_train = r"C:\Users\dtrizna\Code\nebula\data\data_filtered\speakeasy_trainset\speakeasy_VocabSize_10000_maxLen_2048_x.npy"
x_train = np.load(x_train)
y_train = r"C:\Users\dtrizna\Code\nebula\data\data_filtered\speakeasy_trainset\speakeasy_y.npy"
y_train = np.load(y_train)

if train_limit:
    x_train, y_train = shuffle(x_train, y_train, random_state=42)
    x_train = x_train[:train_limit]
    y_train = y_train[:train_limit]

In [7]:
from nebula.attention import MyReformerLM

x = torch.randint(0, 10000, (1, 2048)).cuda()
print(x.shape)
print(x[0:20])

# this is used during pre-trainining 
# therefore output is in size of vocabulary
modelLM = MyReformerLM(
    num_tokens=10000,
    dim=16,
    depth=12,
    max_seq_len=2048,
    heads=8,
    causal = False
).cuda()
y = modelLM.classify(x).detach().cpu()
print(y.shape)
y

torch.Size([1, 2048])
tensor([[8918, 4020, 8907,  ..., 1570,  489, 5653]], device='cuda:0')
before token_emb torch.Size([1, 2048])
after token_emb torch.Size([1, 2048, 16])
after layer_pos_emb torch.Size([1, 2048, 16])
after to_model_dim, before reformer torch.Size([1, 2048, 16])
after reformer torch.Size([1, 2048, 16])
after core, before fcOutput torch.Size([1, 2048, 16])
torch.Size([1, 2048, 1])


tensor([[[-0.4142],
         [ 0.1246],
         [ 0.3318],
         ...,
         [-0.1138],
         [-0.6565],
         [ 0.1685]]])

In [13]:
# force realod of class from within notebook
import importlib
import nebula
importlib.reload(nebula.attention)
from nebula.attention import TransformerEncoderModel

model = TransformerEncoderModel(
    vocabSize=10000,
).cuda()

x = torch.randint(0, 10000, (1, 2048)).cuda()
y = model(x).detach().cpu()
print(y.shape)

shape before transformer torch.Size([1, 2048, 32])
shape after transformer torch.Size([1, 2048, 32])
shape after mean torch.Size([1, 32])
shape after ffnn torch.Size([1, 64])
torch.Size([1, 1])
