In [8]:
import os
# List the Python source files in the current working directory.
# Match on the actual '.py' suffix: a substring test would also pick up names
# such as 'x.pyc' or 'notes.py.bak'. Sort the result because os.listdir()
# returns entries in arbitrary order.
sorted(f for f in os.listdir() if f.endswith('.py'))

['data.py', 'pretrained_model.py', 'transformer.py', 'utils.py']

In [9]:
from data import *
from utils import *
from pretrained_model import *
from transformer import *

In [10]:
# Build the dataset from the preprocessed text file. CustomDataset is not
# defined in this notebook; it arrives through the star imports above.
dataset = CustomDataset('enwiki-latest-pages-articles_preprocessed.txt')

# Wrap the dataset in a DataLoader that yields batches of 2 items.
# NOTE(review): presumably torch.utils.data.DataLoader (star-imported) — confirm.
dataloader = DataLoader(dataset, batch_size = 2, num_workers = 0)

In [11]:
from torch.nn.functional import leaky_relu


In [12]:
# Stand-alone Transformer instance used by the shape probe in a later cell.
# Transformer is not defined in this notebook (star-imported, presumably from
# transformer.py); it accepts an 'lrelu' activation option.
trf =  Transformer(d_model = 100, nhead = 2, num_encoder_layers = 3, 
                   dim_feedforward = 100, dropout = .1, activation = 'lrelu')

In [143]:
# Shape probe: a random (10, 32, 100) input with a (32, 10) all-ones
# key-padding mask; the recorded output has the same shape as the input.
# NOTE(review): assumes the input layout is (seq, batch, d_model) and the mask
# is (batch, seq) — confirm against transformer.py.
trf(torch.rand((10, 32, 100)), src_key_padding_mask=torch.ones((32, 10))).shape

torch.Size([10, 32, 100])

In [13]:
# Instantiate PretrainedModel with its default arguments. The class is not
# defined in this notebook (star-imported, presumably from pretrained_model.py).
pretrained = PretrainedModel()

In [43]:
# Sanity check: torch.transpose(x, 1, 0) swaps the first two axes,
# turning a (10, 32, 100) tensor into a (32, 10, 100) one.
torch.transpose(torch.rand((10, 32, 100)), 1, 0).shape

torch.Size([32, 10, 100])

In [44]:
class Embedder(nn.Module):
    """Token-id to vector lookup: a thin wrapper around ``nn.Embedding``."""

    def __init__(self, vocab_size, d_model):
        """Create an embedding table with ``vocab_size`` rows of width ``d_model``."""
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the integer ids in ``x``."""
        vectors = self.embed(x)
        return vectors

In [57]:
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The encoding table is built once in ``__init__`` and stored as the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``.
    ``forward`` slices that table along dimension 0 of its input, so the
    input's first axis is treated as the position axis, i.e. the expected
    layout is ``(seq_len, batch, d_model)``.

    Args:
        d_model: size of the last (feature) dimension. Odd values are supported.
        dropout: accepted for signature compatibility but currently unused
            (no dropout layer is created).
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim div_term to the number of odd columns. For even d_model the
        # slice is the whole vector and the result is unchanged.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over batch.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        Args:
            x: tensor whose dimension 0 indexes positions and whose last
                dimension has size ``d_model``.
        """
        x = x + self.pe[:x.size(0), :]
        return x

In [60]:
emb = Embedder(5, 4)
# `PE` is not defined in any visible cell of this notebook, so on a fresh
# kernel the call below raised NameError. Build it explicitly; its d_model (4)
# must match the embedding width chosen above.
PE = PositionalEncoding(4)
# NOTE(review): PositionalEncoding slices its table by the input's first axis,
# so this (1, 3, 4) batch-first tensor is only a shape check, not a check of
# the per-token encodings.
PE(emb(torch.tensor([[1, 2, 3]]))).shape

torch.Size([1, 3, 4])

In [52]:
# Load the 'bert-base-uncased' tokenizer. BertTokenizer is not imported
# explicitly in this notebook; it arrives through the star imports above.
tok = BertTokenizer.from_pretrained('bert-base-uncased')

# Vocabulary size of that tokenizer; VOCAB_SIZE is the default `vocab_size`
# argument of TinyBert.
VOCAB_SIZE = vocab_size = tok.vocab_size

In [170]:
class TinyBert(nn.Module):
    """Small transformer encoder over token ids.

    Pipeline: token ids -> ``Embedder`` -> transpose to sequence-first ->
    ``PositionalEncoding`` -> ``Transformer``. Every intermediate tensor is
    also stored on the instance (``mask``, ``emb_raw``, ``emb``,
    ``emb_transposed``, ``trf_output``) so it can be inspected after a call.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_size: embedding width; also used as the transformer's ``d_model``
            and ``dim_feedforward``.
    """

    def __init__(self, vocab_size = VOCAB_SIZE, emb_size=10):
        super(TinyBert, self).__init__()
        self.emb_size = emb_size
        self.model = Transformer(
            d_model = emb_size, nhead = 2, num_encoder_layers = 3,
            dim_feedforward = emb_size, dropout = .1, activation = 'lrelu')
        self.embedder = Embedder(vocab_size, emb_size)
        self.PE = PositionalEncoding(emb_size)

    def forward(self, src, mask=None):
        """Encode a batch of token ids.

        Args:
            src: integer tensor of token ids; treated as batch-first, since the
                embedded tensor has its first two axes swapped before it is
                handed to the transformer.
            mask: passed through as ``src_key_padding_mask``. Defaults to an
                all-ones tensor shaped like ``src`` (dtype ``float``, i.e.
                torch.float64).
                NOTE(review): the callers in this notebook pass 1 for real
                tokens; confirm in transformer.py that the project Transformer
                reads the mask that way.

        Returns:
            The transformer output for the sequence-first input.
        """
        if mask is None:
            mask = torch.ones_like(src, dtype = float)
        self.mask = mask
        self.emb_raw = emb_raw = self.embedder(src)
        # The transformer is fed a sequence-first tensor, so swap the first
        # two axes of the batch-first embeddings.
        emb_raw_transposed = torch.transpose(emb_raw, 1, 0)
        # PositionalEncoding indexes its table by the input's first axis, so it
        # has to run on the sequence-first tensor. Applying it before the
        # transpose offset every token by its batch index, not its position.
        self.emb_transposed = emb_transposed = self.PE(emb_raw_transposed)
        # Keep `self.emb` in the same batch-first layout it had before the fix.
        self.emb = torch.transpose(emb_transposed, 1, 0)
        self.trf_output = trf_output = self.model(emb_transposed, src_key_padding_mask=mask)
        return trf_output

In [171]:
# Instantiate TinyBert with its defaults (vocab_size = VOCAB_SIZE, emb_size = 10).
tb = TinyBert()

In [173]:
# tb(torch.tensor([[1, 2, 3]]))

In [208]:
# from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import torch.optim as optim

class Model(nn.Module):
    """Runs a PretrainedModel and a TinyBert on the same masked-token batch.

    Both sub-models share the tokenizer exposed by the pretrained model.
    Intermediate results of the last call are kept on the instance
    (``y``, ``attention_mask``, ``tokenized_text``, ``pretrained_hidden``,
    ``tb_out``, ``tb_out_masked``) for inspection.
    """

    def __init__(self):
        super(Model, self).__init__()
        self.pretrained_model = PretrainedModel()
        self.tinybert = TinyBert()
        self.tokenizer = self.pretrained_model.tokenizer
        # Original tokens that were replaced by '[MASK]' in the last batch.
        self.y = []
        # Optimizer over every parameter registered on this module so far.
        self.optimizer = optim.RMSprop(self.parameters(), lr=0.01)

    def forward(self, text):
        """Dispatch on the element type of ``text``.

        Args:
            text: a non-empty sequence whose first element is either a ``str``
                (masked-LM path) or a ``list`` (sentence path).

        Raises:
            ValueError: if the first element is neither a ``list`` nor a ``str``.
        """
        if isinstance(text[0], list):
            # NOTE(review): forward_sentence is not defined in this class as
            # shown, so this branch currently fails with AttributeError.
            return self.forward_sentence(text)
        elif isinstance(text[0], str):
            return self.forward_maskLM(text)
        else:
            # Format the offending value: concatenating str + list + type
            # raised TypeError before the intended ValueError could surface.
            raise ValueError(f'wtf is this text? {text!r} {type(text[0])}')

    def preprocess_LM(self, text):
        """Tokenize ``text``, mask one random token per line and pad the batch.

        Side effects: resets ``self.y`` and fills it with the original token at
        each masked position; stores the returned tensors on ``self``.

        Args:
            text: iterable of strings, one sentence per element.

        Returns:
            ``(tokenized_text, attention_mask)``: the padded token ids and the
            padded mask (1 everywhere except 0 at the masked token and at
            padding positions).
        """
        self.y = []
        sentences = [build_sentence_list(
            'CLS', [self.tokenizer.tokenize(line)]) for line in text]

        # NOTE(review): assumes build_sentence_list adds exactly one leading
        # and one trailing special token, hence the -2 / +1 offsets — confirm.
        lengths = [len(sentence) - 2 for sentence in sentences]
        # NOTE(review): np.random.randint raises ValueError when length is 0
        # (a line that tokenizes to nothing).
        mask_idxes = [np.random.randint(0, length) for length in lengths]

        masks = [np.ones(length + 2) for length in lengths]
        for mask_idx, mask, sentence in zip(mask_idxes, masks, sentences):
            mask[mask_idx + 1] = 0
            self.y.append(sentence[mask_idx + 1])
            sentence[mask_idx + 1] = '[MASK]'
        self.attention_mask = attention_mask = to_cuda(torch.tensor(pad_sequences(masks, padding='post')))
        # Pad the ids on the same side as the mask. Relying on the
        # pad_sequences default here would misalign ids and mask for batches
        # of unequal length whenever that default is not 'post'.
        self.tokenized_text = tokenized_text = to_cuda(torch.tensor(pad_sequences(
            [self.tokenizer.convert_tokens_to_ids(sentence) for sentence in sentences],
            padding='post').tolist()))
        return tokenized_text, attention_mask

    def forward_maskLM(self, text):
        """Run both sub-models on a masked batch built from ``text``.

        The pretrained model's output is stored in ``self.pretrained_hidden``
        but not returned.

        Returns:
            The TinyBert output multiplied by the transposed attention mask.
        """
        tokenized_text, attention_mask = self.preprocess_LM(text)
        self.pretrained_hidden = pretrained_hidden = self.pretrained_model(
            tokenized_text = tokenized_text, attention_mask = attention_mask)
        self.tb_out = tb_out = self.tinybert(tokenized_text, mask=attention_mask)
        # Swap the mask's two axes and add a trailing axis so it broadcasts
        # over the last dimension of tb_out.
        self.tb_out_masked = tb_out_masked = tb_out * attention_mask.transpose(1, 0).unsqueeze(-1)
        return tb_out_masked
        

In [209]:

# Instantiate the combined model; Model.__init__ builds a PretrainedModel, a
# TinyBert and an RMSprop optimizer.
mdl = Model()

In [213]:
# Rebuild the dataset from the same file; this rebinds the `dataset` and
# `dataloader` names created in an earlier cell.
dataset = CustomDataset('enwiki-latest-pages-articles_preprocessed.txt')

# Wrap the dataset in a DataLoader, this time with batches of 8 and shuffling enabled.
dataloader = DataLoader(dataset, batch_size = 8, num_workers = 0, shuffle=True)


In [216]:
# Peek at the first three batches: print each batch's size and its contents.
itr = 0  # stays 0 if the dataloader yields nothing
for itr, text in enumerate(dataloader, start=1):
    print(len(text), text)
    if itr > 2:
        break

8 ['These interactions were the last time Cobain saw his daughter.', 'Cambridge is home to the following commercially licensed and student-run radio stations: ', "Elizabeth's work had a major influence on prominent writers of the day, including the American poets Edgar Allan Poe and Emily Dickinson.", 'Additionally, Aaron owns a chain of 30 restaurants around the country.', 'Gilbert Arthur Ã\xa0 Beckett ', 'The defense of Little Round Top with a bayonet charge by the 20th Maine, ordered by Col. Joshua L. Chamberlain but possibly led by Lt. Holman S. Melcher, was one of the most fabled episodes in the Civil War and propelled Col. Chamberlain into prominence after the war.', "Increase Mather was said to have publicly burned Calef's book in Harvard Yard around the time he was removed from the head of the college and replaced by Samuel Willard.", 'Districts may apply to the British Crown for the grant of borough status upon advice of the Privy Council of the United Kingdom.']
8 ['The Vikin

In [203]:

# Shape probe: a list of strings is dispatched by Model.forward to the
# masked-LM path (forward_maskLM).
mdl(['hi there', 'strawberries']).shape

torch.Size([4, 2, 10])