In [1]:
# One-time setup: uncomment the next line to install the library imported below.
# !pip install pytorch-transformers

In [2]:
import torch
from pytorch_transformers import GPT2Tokenizer
from pytorch_transformers import GPT2LMHeadModel

In [3]:
%%time
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()
print("Done")

Done
CPU times: user 2.69 s, sys: 554 ms, total: 3.25 s
Wall time: 7.83 s


In [4]:
# Prompt whose continuation the model is asked to predict.
input_text = "We work on a vaccine that would solve the problem of"
# Token ids of the prompt.
indexed_tokens = tokenizer.encode(input_text)
# Add a leading batch dimension of size 1 (same result as wrapping the ids in a list).
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)

In [5]:
# Inference only: no_grad() skips building the autograd graph, which saves
# memory because no backward pass follows (nothing is trained here).
with torch.no_grad():
    outputs = model(tokens_tensor)
predictions = outputs[0]

# Scores over the vocabulary at the last position of the first (only) sequence.
results = predictions[0, -1]

# Greedy choice: the highest-scoring token id, decoded together with the prompt ids.
predicted_index = int(results.argmax())
print(tokenizer.decode([*indexed_tokens, predicted_index]))

 We work on a vaccine that would solve the problem of autism


In [6]:
%%time
t = sorted(list(results), reverse=True)[20]
print(', '.join(list(tokenizer.decode([i]) for i,r in enumerate(results) if r.item() >= t) ))

 a,  the,  what,  how,  low,  global,  childhood,  HIV,  vaccine,  Ebola,  autism,  immunity,  vaccines,  AIDS,  vaccination,  infectious,  influenza,  measles,  HPV,  cervical,  polio
CPU times: user 3.47 s, sys: 88.8 ms, total: 3.56 s
Wall time: 3.03 s


In [None]:
%%time
from pytorch_transformers import BertTokenizer, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()
print("Done")
input_text = "We work on a vaccine that would solve the problem of"
indexed_tokens = tokenizer.encode(input_text)
tokens_tensor = torch.tensor([indexed_tokens])
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

results = predictions[0, -1, :]

predicted_index = torch.argmax(results).item()
print(tokenizer.decode(indexed_tokens + [predicted_index]))

In [None]:
%%time
t = sorted(list(results), reverse=True)[20]
print(', '.join(list(tokenizer.decode([i]) for i,r in enumerate(results) if r.item() >= t) ))

In [None]:
# Same experiment with a different prompt, reusing the `tokenizer` and `model`
# already in the kernel. (The leftover "Done" print from the loading cell was
# removed: nothing is loaded here.)
input_text = "We work on a software package that would solve the problem of"

# The masked LM predicts the token behind a [MASK] placeholder, so the word to
# be predicted is appended as an explicit [MASK] and the sequence is framed
# with [CLS] / [SEP]; the logits are then read at the mask position rather
# than at the last input position.
prompt_tokens = tokenizer.tokenize(input_text)
masked_tokens = ['[CLS]'] + prompt_tokens + ['[MASK]', '[SEP]']
mask_position = masked_tokens.index('[MASK]')

# Ids of the prompt alone; used below to print prompt + predicted word.
indexed_tokens = tokenizer.convert_tokens_to_ids(prompt_tokens)
# Model input: the framed, masked sequence as a batch of one.
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(masked_tokens)])

with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0]

# Vocabulary scores at the masked position.
results = predictions[0, mask_position, :]

predicted_index = torch.argmax(results).item()
print(tokenizer.decode(indexed_tokens + [predicted_index]))

In [None]:
import torch
# Import from the same package the rest of the notebook installs and uses, so
# a fresh kernel with only that package available can run this cell.
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input: two sentences, framed and separated by the special tokens
text = "[CLS] We work on a vaccine. [SEP] The vaccine would solve the problem of Corona [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`.
# The position is derived from the token list (the token right before the
# closing [SEP]) instead of being hand-counted, so it stays correct if the
# text or its tokenisation changes.
masked_index = len(tokenized_text) - 2
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)
print(tokenized_text[masked_index])

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper):
# everything up to and including the first [SEP] is segment 0, the rest is
# segment 1. Computing this keeps the list the same length as the token list.
first_sep = tokenized_text.index('[SEP]')
segments_ids = [0] * (first_sep + 1) + [1] * (len(tokenized_text) - first_sep - 1)

# Convert inputs to PyTorch tensors (each as a batch of one)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])


In [None]:
%%time
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)
