In [1]:
# =============================================================================
# Libs
# =============================================================================
from torch.utils.data import Dataset
import torch.nn.functional as F
from collections import Counter
from os.path import exists
import torch.optim as optim
import torch.nn as nn
import numpy as np
import random
import torch
import math
import re

# Fetch the repository that provides the `custom_models` package imported in the
# following cell (Transformer, SentencesDataset, create_sentences_and_vocab).
# NOTE(review): this clones the default branch with no pinned commit/tag, so the
# code that gets imported can change between runs. git also refuses to clone into
# an existing non-empty ./custom_models directory, so re-running this cell prints
# an error instead of refreshing the checkout.
!git clone https://github.com/deepanshudashora/custom_models.git

In [3]:
from custom_models.transformers.model import Transformer
from custom_models.transformers.datamodules.bert_datamodule import SentencesDataset,create_sentences_and_vocab

In [4]:

def get_batch(loader, loader_iter):
    """Return the next batch, restarting the iterator when it runs dry.

    Args:
        loader: Re-iterable source of batches (anything `iter()` accepts).
        loader_iter: The iterator currently being consumed from `loader`.

    Returns:
        A `(batch, iterator)` pair. The iterator is `loader_iter` itself, or a
        fresh `iter(loader)` if `loader_iter` was exhausted; callers must keep
        the returned iterator for the next call.

    Raises:
        StopIteration: if a fresh iterator over `loader` yields nothing.
    """
    # A private sentinel distinguishes "exhausted" from any real batch value.
    exhausted = object()
    item = next(loader_iter, exhausted)
    if item is exhausted:
        # Start a new pass over the loader and take its first batch.
        loader_iter = iter(loader)
        item = next(loader_iter)
    return item, loader_iter


In [6]:
# =============================================================================
# #Init
# =============================================================================
print('initializing..')

# Seed every RNG this notebook has imported so that data shuffling and weight
# initialisation are repeatable across Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyper-parameters forwarded to the Transformer constructor and the dataset.
batch_size = 1024               # samples per DataLoader batch
seq_len = 20                    # passed to both SentencesDataset and Transformer
embed_size = 128
inner_ff_size = embed_size * 4  # feed-forward width, tied to the embedding size
n_heads = 8
n_code = 8                      # presumably the number of encoder blocks; confirm in custom_models
n_vocab = 40000                 # not referenced in this cell; the model is sized from len(dataset.vocab)
dropout = 0.1

# Keyword arguments for the Adam optimizer.
optim_kwargs = {'lr':1e-4, 'weight_decay':1e-4, 'betas':(.9,.999)}

#1) load text
print('loading text...')
sentence_path = 'training.txt'
vocab_path = "vocab.txt"

sentences, vocab = create_sentences_and_vocab(sentence_path,vocab_path)
print('creating dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
# drop_last keeps every batch at exactly batch_size samples; num_workers is left
# at the DataLoader default.
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':True, 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)

initializing..
loading text...
tokenizing sentences...
creating/loading vocab...
creating dataset...


In [7]:
# =============================================================================
# Init model
# =============================================================================
print('initializing model...')
# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# does not crash on CUDA-less hosts. On a CUDA machine this matches `.cuda()`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(n_code=n_code, n_heads=n_heads, embed_size=embed_size, inner_ff_size=inner_ff_size, n_embeddings=len(dataset.vocab), seq_len=seq_len, dropout=dropout)
model = model.to(device)

# =============================================================================
# Optimizer
# =============================================================================
print('initializing optimizer and loss...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
# Target positions equal to dataset.IGNORE_IDX do not contribute to the loss.
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)

# =============================================================================
# Train
# =============================================================================
print('training...')
print_each = 10
model.train()
batch_iter = iter(data_loader)
n_iteration = 10000
for it in range(n_iteration):

    #get batch (get_batch restarts the iterator when the loader is exhausted)
    batch, batch_iter = get_batch(data_loader, batch_iter)

    #infer
    masked_input = batch['input'].to(device, non_blocking=True)
    masked_target = batch['target'].to(device, non_blocking=True)
    output = model(masked_input)

    #compute the cross entropy loss
    # Flatten to (num_tokens, last_dim) logits and (num_tokens,) targets.
    # view(-1) always stays 1-D, unlike view(-1, 1).squeeze(), which collapses
    # to a 0-d tensor when only one element is present.
    output_v = output.view(-1, output.shape[-1])
    target_v = masked_target.view(-1)
    loss = loss_model(output_v, target_v)

    #compute gradients
    loss.backward()

    #apply gradients
    optimizer.step()

    #print step
    # The value labelled 'Δw' is the sum of absolute gradients of the embedding
    # weights, so it must be read before the gradients are reset below.
    if it % print_each == 0:
        print('it:', it,
              ' | loss', np.round(loss.item(),2),
              ' | Δw:', round(model.embeddings.weight.grad.abs().sum().item(),3))

    #reset gradients
    optimizer.zero_grad()


# =============================================================================
# Results analysis
# =============================================================================
print('saving embeddings...')
# Export at most 3000 rows, capped at the vocabulary size so that values.tsv and
# names.tsv always have the same number of lines and rvocab is never indexed
# past its end.
N = min(3000, len(dataset.vocab))
np.savetxt('values.tsv', np.round(model.embeddings.weight.detach().cpu().numpy()[0:N], 2), delimiter='\t', fmt='%1.2f')
s = [dataset.rvocab[i] for i in range(N)]
# Context manager guarantees the file is flushed and closed; explicit UTF-8 keeps
# non-ASCII tokens portable across platforms.
with open('names.tsv', 'w', encoding='utf-8') as names_file:
    names_file.write('\n'.join(s))


print('end')


initializing model...
initializing optimizer and loss...
training...
it: 0  | loss 10.25  | Δw: 1.215
it: 10  | loss 9.6  | Δw: 0.552
it: 20  | loss 9.37  | Δw: 0.364
it: 30  | loss 9.21  | Δw: 0.277
it: 40  | loss 9.06  | Δw: 0.243
it: 50  | loss 8.87  | Δw: 0.217
it: 60  | loss 8.71  | Δw: 0.201
it: 70  | loss 8.59  | Δw: 0.195
it: 80  | loss 8.44  | Δw: 0.185
it: 90  | loss 8.3  | Δw: 0.172
it: 100  | loss 8.13  | Δw: 0.167
it: 110  | loss 8.02  | Δw: 0.156
it: 120  | loss 7.87  | Δw: 0.153
it: 130  | loss 7.72  | Δw: 0.152
it: 140  | loss 7.6  | Δw: 0.154
it: 150  | loss 7.45  | Δw: 0.148
it: 160  | loss 7.35  | Δw: 0.144
it: 170  | loss 7.27  | Δw: 0.139
it: 180  | loss 7.17  | Δw: 0.146
it: 190  | loss 7.06  | Δw: 0.142
it: 200  | loss 6.99  | Δw: 0.143
it: 210  | loss 6.88  | Δw: 0.149
it: 220  | loss 6.75  | Δw: 0.142
it: 230  | loss 6.75  | Δw: 0.142
it: 240  | loss 6.68  | Δw: 0.152
it: 250  | loss 6.61  | Δw: 0.143
it: 260  | loss 6.6  | Δw: 0.151
it: 270  | loss 6.59  | Δw: