In [1]:
import math
import os
import random
import time

import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext import data
from torchtext.data import Field, BucketIterator
from transformers import T5Tokenizer, T5Model

In [2]:
# Load the tokenizer published with the 't5-small' checkpoint; every later cell
# that converts between text, tokens and ids goes through this object.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [3]:
# Sanity check: split a sample sentence into the tokenizer's sub-word pieces.
tokens = tokenizer.tokenize('Hello world how are you?')

print(tokens)

['▁Hello', '▁world', '▁how', '▁are', '▁you', '?']


In [4]:
# Map the sample tokens from the previous cell to their vocabulary ids.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[8774, 296, 149, 33, 25, 58]


In [5]:
# Special tokens used when building sequences.
# The start-of-sequence marker deliberately reuses the pad token.
# NOTE(review): presumably because T5 has no dedicated start token - confirm
# against the T5 documentation.
init_token = tokenizer.pad_token
eos_token = tokenizer.eos_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

<pad> </s> <pad> <unk>


In [6]:
# Integer ids of the special tokens; these are what the torchtext fields, the
# loss function and the decoding loop work with.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

0 1 0 2


In [7]:
# Maximum input size the tokenizer records for 't5-small'; used below to
# truncate over-long token sequences.
max_input_length = tokenizer.max_model_input_sizes['t5-small']

print(max_input_length)

512


In [8]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and truncate the result to a bounded length.

    Uses the module-level ``tokenizer`` and ``max_input_length``. At most
    ``max_input_length - 2`` tokens are returned; the two spare positions
    presumably leave room for the start and end markers added by the
    torchtext fields that use this function.
    """
    limit = max_input_length - 2
    return tokenizer.tokenize(sentence)[:limit]

In [9]:
# Source and target columns are processed identically: text is tokenized by
# tokenize_and_cut, converted straight to T5 vocabulary ids (so torchtext
# builds no vocabulary of its own), and wrapped/padded with the T5 special ids.
_field_options = dict(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

# Two separate Field instances that share the same configuration.
SRC = data.Field(**_field_options)
TRG = data.Field(**_field_options)

In [10]:
# Column order of the CSV: first column is parsed with SRC, second with TRG.
fields = [('src', SRC), ('trg', TRG)]

In [11]:
# TabularDataset.splits returns a tuple with one dataset per file supplied;
# only `train` is given here, so unpack the single element instead of
# temporarily binding the tuple to the name `train_data`.
(full_data,) = data.TabularDataset.splits(
    path='',
    train='squad.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

# random.seed() returns None, so passing its result as `random_state` only
# produced a reproducible split through the seeding side effect. Seed first
# and hand the resulting generator state to split() explicitly; the resulting
# split is the same.
random.seed(4321)
train_data, valid_data = full_data.split(split_ratio=0.98,
                                         random_state=random.getstate())

In [12]:
# Number of examples in the training and validation splits.
print(len(train_data.examples))
print(len(valid_data.examples))

58743
1199


In [13]:
# Inspect one pre-processed example: both fields are already lists of token ids.
print(vars(train_data.examples[10000]))

{'src': [2625, 3, 10, 16, 8, 336, 192, 4160, 6, 1202, 10897, 9, 447, 7, 41, 102, 208, 201, 92, 801, 38, 3693, 3, 102, 208, 6, 65, 14021, 45, 3, 9, 4621, 11276, 512, 13, 422, 2643, 1564, 1587, 2852, 3, 9, 12946, 6373, 1391, 5, 3, 9, 3693, 2358, 19, 3, 9, 1407, 24, 5755, 7, 659, 1461, 139, 6373, 338, 8, 1202, 17470, 1504, 5, 8, 166, 3693, 2358, 47, 8520, 57, 3, 4059, 965, 3, 89, 18208, 7, 16, 8, 507, 2079, 7, 5, 11417, 3, 10, 16, 8, 507, 2079, 7, 6, 113, 8520, 8, 166, 3693, 2358, 58], 'trg': [3, 4059, 965, 3, 89, 18208, 7]}


In [14]:
# Turn the ids of one training example back into sub-word tokens, source first
# and target second, to check what the model is actually fed.
example_fields = vars(train_data.examples[10000])

for column in ('src', 'trg'):
    tokens = tokenizer.convert_ids_to_tokens(example_fields[column])
    print(tokens)

['▁context', '▁', ':', '▁in', '▁the', '▁last', '▁two', '▁decades', ',', '▁photo', 'volt', 'a', 'ic', 's', '▁(', 'p', 'v', '),', '▁also', '▁known', '▁as', '▁solar', '▁', 'p', 'v', ',', '▁has', '▁evolved', '▁from', '▁', 'a', '▁pure', '▁niche', '▁market', '▁of', '▁small', '▁scale', '▁applications', '▁towards', '▁becoming', '▁', 'a', '▁mainstream', '▁electricity', '▁source', '.', '▁', 'a', '▁solar', '▁cell', '▁is', '▁', 'a', '▁device', '▁that', '▁convert', 's', '▁light', '▁directly', '▁into', '▁electricity', '▁using', '▁the', '▁photo', 'electric', '▁effect', '.', '▁the', '▁first', '▁solar', '▁cell', '▁was', '▁constructed', '▁by', '▁', 'char', 'les', '▁', 'f', 'ritt', 's', '▁in', '▁the', '▁18', '80', 's', '.', '▁query', '▁', ':', '▁in', '▁the', '▁18', '80', 's', ',', '▁who', '▁constructed', '▁the', '▁first', '▁solar', '▁cell', '?']
['▁', 'char', 'les', '▁', 'f', 'ritt', 's']


In [15]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when batches are created.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 20

# Batches are created on `device`; examples are ordered by source length via
# `sort_key`.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda x: len(x.src))

In [16]:
class T5Network(nn.Module):
    """Pre-trained ``t5-small`` model followed by a linear projection to vocabulary scores.

    NOTE(review): ``forward`` passes only ``input_ids`` and ``decoder_input_ids``
    to the T5 model - no attention mask - so padded source positions are not
    explicitly masked here. Confirm this is intended.
    """

    def __init__(self):
        super().__init__()

        backbone = T5Model.from_pretrained('t5-small')
        cfg = backbone.config

        self.t5 = backbone
        # Projects each hidden state (d_model wide) to one score per vocabulary entry.
        self.out = nn.Linear(cfg.d_model, cfg.vocab_size)

    def forward(self, src, trg):
        """Return vocabulary scores for every position of ``trg``.

        Args:
            src: source token ids, fed to T5 as ``input_ids``.
            trg: target-side token ids, fed to T5 as ``decoder_input_ids``.

        Returns:
            The linear projection of the first element of the T5 output
            (presumably the decoder's final hidden states - see the
            transformers ``T5Model`` documentation).
        """
        hidden_states = self.t5(input_ids=src, decoder_input_ids=trg)[0]
        return self.out(hidden_states)

In [17]:
# Place the model on the same `device` the batch iterators use, rather than
# unconditionally calling .cuda(), so model and data can never end up on
# different devices.
model = T5Network().to(device)

Some weights of T5Model were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def count_parameters(model):
    """Return the number of scalar values held in ``model``'s trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 76,988,544 trainable parameters


In [19]:
# Step size for Adam.
LEARNING_RATE = 4e-4

# All model parameters (T5 body and output projection) are optimised together.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [20]:
# Cross-entropy over vocabulary scores; target positions equal to the pad id
# are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index = pad_token_idx)

In [21]:
# Number of passes over the training data.
N_EPOCHS = 4
# Maximum gradient norm passed to clip_grad_norm_ in the training loop.
CLIP = 1

# Starts at +inf so that any real validation loss compares as lower.
best_valid_loss = float('inf')

In [22]:
def _batch_loss(batch):
    """Return the teacher-forced cross-entropy loss for one batch.

    The decoder is fed the target without its last position and is scored
    against the target without its first position, so each step is trained to
    predict the following token. Uses the module-level ``model`` and
    ``criterion``.
    """
    src = batch.src
    trg = batch.trg

    logits = model(src, trg[:, :-1])

    # Collapse all leading dimensions so every row is one prediction, which is
    # the layout the loss is applied to.
    logits = logits.contiguous().view(-1, logits.shape[-1])
    labels = trg[:, 1:].contiguous().view(-1)

    return criterion(logits, labels)


# Create the checkpoint directory up front: torch.save() does not create
# missing directories, and failing here is far cheaper than failing after a
# full epoch of training.
os.makedirs('saved_models', exist_ok=True)

for epoch in range(N_EPOCHS):

    start = time.time()
    # TRAIN
    #########################################################################
    model.train()
    epoch_loss = 0

    for batch in train_iterator:

        optimizer.zero_grad()

        loss = _batch_loss(batch)
        loss.backward()

        # Bound the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)

        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses.
    train_loss = epoch_loss / len(train_iterator)
    #########################################################################

    # VALID
    #########################################################################
    model.eval()
    epoch_loss = 0

    with torch.no_grad():

        for batch in valid_iterator:
            epoch_loss += _batch_loss(batch).item()

    valid_loss = epoch_loss / len(valid_iterator)
    #########################################################################
    end = time.time()

    # Keep the running minimum so best_valid_loss reflects the best epoch seen
    # (it was previously initialised but never updated).
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f"EPOCH : {epoch+1}\tTRAIN LOSS : {train_loss:.2f}\tVALID LOSS : {valid_loss:.2f}\tTIME : {end-start:.2f}\n")
    # One checkpoint per epoch, numbered from 1.
    torch.save(model.state_dict(), f'saved_models/squad_model_{epoch+1}.pt')

EPOCH : 1	TRAIN LOSS : 2.56	VALID LOSS : 0.84	TIME : 1069.69

EPOCH : 2	TRAIN LOSS : 0.64	VALID LOSS : 0.59	TIME : 1107.69

EPOCH : 3	TRAIN LOSS : 0.38	VALID LOSS : 0.53	TIME : 1094.08

EPOCH : 4	TRAIN LOSS : 0.27	VALID LOSS : 0.52	TIME : 1101.33



In [23]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    """Greedily decode an output id sequence for one already-numericalised input.

    Args:
        sentence: list of source token ids without start/end markers; the
            module-level ``init_token_idx`` and ``eos_token_idx`` are added here.
        src_field: unused; kept so existing calls keep working.
        trg_field: unused; kept so existing calls keep working.
        model: callable as ``model(src_ids, trg_ids)`` returning scores whose
            last dimension ranges over candidate token ids.
        device: device the input tensors are moved to.
        max_len: maximum number of tokens to generate.

    Returns:
        The generated ids without the leading start marker. The end marker is
        included when it was produced before ``max_len`` was reached.
    """
    model.eval()

    # Batch of one: wrapping the id list in another list yields shape (1, length).
    encoder_ids = torch.LongTensor([[init_token_idx] + sentence + [eos_token_idx]]).to(device)
    generated = [init_token_idx]

    with torch.no_grad():
        for _ in range(max_len):
            decoder_ids = torch.LongTensor([generated]).to(device)
            scores = model(encoder_ids, decoder_ids)

            # Highest-scoring id at the last decoded position.
            next_id = scores.argmax(2)[:, -1].item()
            generated.append(next_id)

            if next_id == eos_token_idx:
                break

    return generated[1:]

In [24]:
# Draw 20 distinct validation examples and print source, reference answer and
# the model's greedy prediction as space-separated sub-word tokens.
idxs = random.sample(range(len(valid_data.examples)), 20)

for i in idxs:
    example_fields = vars(valid_data.examples[i])
    src = example_fields['src']
    trg = example_fields['trg']
    translation = translate_sentence(src, SRC, TRG, model, device)

    src_text = ' '.join(tokenizer.convert_ids_to_tokens(src))
    trg_text = ' '.join(tokenizer.convert_ids_to_tokens(trg))
    predicted_text = ' '.join(tokenizer.convert_ids_to_tokens(translation))

    print(f"SRC : {src_text}")
    print(f"TRG : {trg_text}")
    print(f"PREDICTED : {predicted_text}\n")

SRC : ▁context ▁ : ▁the ▁consolidation ▁of ▁the ▁" hot ▁ a c " ▁format ▁contrast e d ▁with ▁the ▁near - de mise ▁of ▁most ▁other ▁ a c ▁formats : ▁beginning ▁with ▁the ▁2005 - 2007 ▁economic ▁down turn ▁and ▁eventual ▁recession ▁most ▁stations ▁went ▁for ▁the ▁more ▁chart - based ▁ ch r , ▁along ▁with ▁the ▁top ▁40 , ▁urban ▁and ▁even ▁ latin o ▁formats . ▁diminish ing ▁physical ▁record ▁sales ▁also ▁proved ▁ a ▁major ▁blow ▁to ▁the ▁ a c ▁genre . ▁query ▁ : ▁along ▁with ▁ ch r , ▁top ▁40 ▁and ▁ latin o , ▁what ▁format ▁have ▁former ▁ a c ▁stations ▁transition e d ▁to ?
TRG : ▁urban
PREDICTED : ▁urban </s>

SRC : ▁context ▁ : ▁bird ▁migration ▁is ▁not ▁limited ▁to ▁birds ▁that ▁can ▁fly . ▁most ▁species ▁of ▁pen guin ▁( s phen is cid a e ) ▁migrate ▁by ▁swimming . ▁these ▁routes ▁can ▁cover ▁over ▁1,000 ▁km ▁( 620 ▁mi ). ▁du sky ▁ gro use ▁den d rag a pus ▁ obscur us ▁perform ▁alt it u d inal ▁migration ▁mostly ▁by ▁walking . ▁ e mus ▁ d rom a i us ▁ nova e hol l and i a e ▁in ▁austral