In [1]:
import torch
import torch.nn as nn
import torchtext
from torchtext import data
from transformers import T5Tokenizer, T5Model
import wikipedia

In [2]:
def filter_para(x):
    """Decide whether a paragraph is worth keeping.

    A paragraph is kept only when it has at least 20 characters and does not
    contain '==' (presumably wiki section headings such as '== History ==').

    Args:
        x: the paragraph text.

    Returns:
        True to keep the paragraph, False to drop it.
    """
    return len(x) >= 20 and '==' not in x

In [3]:
def wiki_results(query):
    """Fetch the top Wikipedia hit for ``query`` and return its usable paragraphs.

    The page content is split on newlines and every piece is passed through
    ``filter_para``; only the pieces it accepts are returned.

    Args:
        query: free-text search string handed to ``wikipedia.search``.

    Returns:
        A list of paragraph strings, in page order. The list is empty when the
        search returns no results at all.

    Raises:
        Whatever ``wikipedia.page`` raises for the chosen title (for example on
        ambiguous or missing pages) is propagated unchanged.
    """
    search_results = wikipedia.search(query, results=4)
    if not search_results:
        # No hit at all: indexing [0] would fail with a bare IndexError, so
        # report "no paragraphs" instead.
        return []

    # Only the best-ranked title is used; the other results are ignored.
    wiki_obj = wikipedia.page(search_results[0])
    paras = wiki_obj.content.split('\n')

    return [para for para in paras if filter_para(para)]

In [4]:
# Load the pretrained 't5-small' tokenizer; the cells below tokenize and
# convert ids through this single object.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [5]:
# Special token strings taken from the tokenizer. No dedicated start-of-sequence
# token is read here: the pad token is reused as the init token.
init_token = tokenizer.pad_token
eos_token = tokenizer.eos_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

# Print the four token strings as a quick sanity check.
print(init_token, eos_token, pad_token, unk_token)

<pad> </s> <pad> <unk>


In [6]:
# Vocabulary ids of the special tokens defined above. Because init_token is the
# pad token, init_token_idx and pad_token_idx are the same id.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

# Print the four ids as a quick sanity check.
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

0 1 0 2


In [7]:
# Maximum input size the tokenizer records for 't5-small'; used below to cap
# the number of tokens kept per sentence.
# NOTE(review): `max_model_input_sizes` may not be available in every
# transformers release — confirm against the installed version.
max_input_length = tokenizer.max_model_input_sizes['t5-small']

print(max_input_length)

512


In [8]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and cap the result at ``max_input_length - 2`` tokens.

    The two slots left free match the init and EOS ids that the torchtext
    fields below are configured to add around each sequence.

    Args:
        sentence: raw text to tokenize.

    Returns:
        The list of tokens, truncated from the right if it was too long.
    """
    return tokenizer.tokenize(sentence)[:max_input_length - 2]

In [9]:
# Source and target sequences are processed identically, so both fields are
# built from one shared set of options:
#   - no torchtext vocabulary (use_vocab=False); tokens are mapped to ids by
#     the T5 tokenizer in the preprocessing step,
#   - the special-token arguments are given as ids rather than strings.
_field_options = dict(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

SRC = data.Field(**_field_options)

TRG = data.Field(**_field_options)

In [10]:
class T5Network(nn.Module):
    """Pretrained 't5-small' backbone followed by a linear projection to the vocabulary.

    Attributes:
        t5: the ``T5Model`` loaded from the 't5-small' checkpoint.
        out: linear layer mapping ``d_model`` features to ``vocab_size`` scores.
    """

    def __init__(self):
        super().__init__()

        backbone = T5Model.from_pretrained('t5-small')
        self.t5 = backbone

        # Layer sizes come straight from the backbone's own configuration.
        cfg = backbone.config
        self.out = nn.Linear(cfg.d_model, cfg.vocab_size)

    def forward(self, src, trg):
        """Score every vocabulary entry at each target position.

        Args:
            src: source token ids, passed to the backbone as ``input_ids``.
            trg: target token ids, passed as ``decoder_input_ids``.

        Returns:
            The output of ``self.out`` applied to the first element of the
            backbone's output.
        """
        hidden_states = self.t5(input_ids=src, decoder_input_ids=trg)[0]
        return self.out(hidden_states)

In [11]:
# Build the ensemble: one T5Network per saved checkpoint
# (model_1.pt ... model_4.pt), each moved to the GPU before its weights are
# restored.
models = []

for i in range(4):
    member = T5Network().cuda()
    saved_weights = torch.load(f'model_{i+1}.pt')
    member.load_state_dict(saved_weights)
    models.append(member)

Some weights of T5Model were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5Model were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5Model were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5Model were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decod

In [12]:
def translate_sentence(sentence, src_field, trg_field, models, max_len = 50):
    """Greedily decode an output id sequence with an ensemble of models.

    At every step each model scores the current target prefix, the scores are
    summed across the ensemble, and the highest-scoring id at the last position
    is appended. Decoding stops when the EOS id is produced or after
    ``max_len`` steps.

    Args:
        sentence: list of source token ids without special tokens; the init
            and EOS ids are added here.
        src_field: unused; kept so existing call sites keep working.
        trg_field: unused; kept so existing call sites keep working.
        models: non-empty sequence of modules callable as ``model(src, trg)``.
            All of them contribute to the sum, however many there are.
        max_len: maximum number of decoding steps.

    Returns:
        The list of predicted ids, starting with ``init_token_idx``. It ends
        with ``eos_token_idx`` only if EOS was predicted within ``max_len``
        steps.

    Raises:
        ValueError: if ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("translate_sentence needs at least one model")

    for m in models:
        m.eval()

    # Build tensors on the device the ensemble already lives on instead of
    # assuming CUDA; fall back to CPU for parameter-less modules.
    first_param = next(models[0].parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    src_indexes = [init_token_idx] + list(sentence) + [eos_token_idx]
    src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(0)

    trg_indexes = [init_token_idx]

    for _ in range(max_len):

        trg_tensor = torch.tensor(trg_indexes, dtype=torch.long, device=device).unsqueeze(0)

        with torch.no_grad():
            # Sum the scores of every ensemble member, left to right.
            output = models[0](src_tensor, trg_tensor)
            for m in models[1:]:
                output = output + m(src_tensor, trg_tensor)

        # Greedy choice: best id at the most recent target position.
        pred_token = output.argmax(2)[:,-1].item()

        trg_indexes.append(pred_token)

        if pred_token == eos_token_idx:
            break

    return trg_indexes

In [13]:
def str_result(tokens, eos_token='</s>', sep_char='\u2581'):
    """Turn a list of predicted token pieces into a readable string.

    The first token (the init token) is always dropped. The last token is
    dropped only when it is the EOS token, so a prediction that stopped at the
    length limit without EOS keeps its final piece. The remaining pieces are
    concatenated and every ``sep_char`` becomes a single space.

    Args:
        tokens: token strings, starting with the init token.
        eos_token: the end-of-sequence token string to strip from the end.
        sep_char: the word-boundary marker used by the tokenizer
            (U+2581, the SentencePiece whitespace symbol, by default).

    Returns:
        The decoded text; an empty string when there is nothing to decode.
    """
    pieces = list(tokens[1:])
    if pieces and pieces[-1] == eos_token:
        pieces.pop()

    text = ''.join(pieces)
    words = text.split(sep_char)
    if text.startswith(sep_char):
        # A leading marker yields an empty first chunk; drop just that one.
        words = words[1:]

    return ' '.join(words)

In [14]:
# Fetch the filtered paragraphs of the top Wikipedia hit for "bill gates".
paras = wiki_results("bill gates")

In [15]:
# Display the second kept paragraph; it is used as the context in the next cell.
# NOTE(review): depends on the live page layout, so the text may change between runs.
paras[1]

"Born and raised in Seattle, Washington, Gates co-founded Microsoft with childhood friend Paul Allen in 1975, in Albuquerque, New Mexico; it went on to become the world's largest personal computer software company. Gates led the company as chairman and CEO until stepping down as CEO in January 2000, but he remained chairman and became chief software architect. During the late 1990s, Gates had been criticized for his business tactics, which have been considered anti-competitive. This opinion has been upheld by numerous court rulings. In June 2006, Gates announced that he would be transitioning to a part-time role at Microsoft and full-time work at the Bill & Melinda Gates Foundation, the private charitable foundation that he and his wife, Melinda Gates, established in 2000. He gradually transferred his duties to Ray Ozzie and Craig Mundie. He stepped down as chairman of Microsoft in February 2014 and assumed a new post as technology adviser to support the newly appointed CEO Satya Nadel

In [16]:
# The passage the model answers from, and the questions asked about it.
# Both are lower-cased before being sent to the model (see the loop below).
# NOTE(review): some queries (e.g. the Jeff Bezos one) look like they probe
# questions the passage may not answer — confirm intent.
CONTEXT = paras[1]
QUERIES = ["where was bill gates born ?",
           "when did gates step down as ceo ?",
           "why was gates criticized ?",
           "when did gates step down as chairman ?",
           "how wealthy was gates ?",
           "has gates donated to charity ?",
           "what is the giving pledge ?",
           "how wealthy is jeff bezos ?",
           "gates's wife",
           "who is the newly appointed ceo of microsoft ?"]

In [17]:
# Everything before the question is the same for every query, so build it once.
context_prefix = "context : " + CONTEXT.lower() + " query : "

for query in QUERIES:
    # Model input format: "context : <passage> query : <question>", all lower-case.
    text = context_prefix + query.lower()
    tokens = tokenizer.tokenize(text)

    print(f"INPUT TEXT\n{text}\n")
    print(f"INPUT TOKENS\n{tokens}\n")

    # tokens -> ids -> ensemble decoding -> token pieces -> readable string
    source_ids = tokenizer.convert_tokens_to_ids(tokens)
    predicted_ids = translate_sentence(source_ids, SRC, TRG, models)
    predicted_pieces = tokenizer.convert_ids_to_tokens(predicted_ids)

    final_result = str_result(predicted_pieces)
    print(f"PREDICTION\n{final_result}\n\n\n\n")

INPUT TEXT
context : born and raised in seattle, washington, gates co-founded microsoft with childhood friend paul allen in 1975, in albuquerque, new mexico; it went on to become the world's largest personal computer software company. gates led the company as chairman and ceo until stepping down as ceo in january 2000, but he remained chairman and became chief software architect. during the late 1990s, gates had been criticized for his business tactics, which have been considered anti-competitive. this opinion has been upheld by numerous court rulings. in june 2006, gates announced that he would be transitioning to a part-time role at microsoft and full-time work at the bill & melinda gates foundation, the private charitable foundation that he and his wife, melinda gates, established in 2000. he gradually transferred his duties to ray ozzie and craig mundie. he stepped down as chairman of microsoft in february 2014 and assumed a new post as technology adviser to support the newly appoi