In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import sys

# Add the parent directory (relative to the working directory) to the module search path.
sys.path.append("..")

In [None]:
import pdb
import re

# import dill as pickle

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# optional
# import logging
# logging.basicConfig(level=logging.INFO)

In [None]:
# Preferred GPU index for this notebook. The index is clamped to the devices
# that are actually present, and machines without CUDA fall back to CPU
# instead of raising.
GPU_ID = 1

if torch.cuda.is_available():
    torch.cuda.set_device(min(GPU_ID, torch.cuda.device_count() - 1))
    print(f"Using GPU #{torch.cuda.current_device()}")
else:
    print("CUDA is not available; using CPU")

Using GPU #1


## Utility methods

In [None]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Attribute access is routed to the underlying dict, so ``cfg.key`` and
    ``cfg["key"]`` always agree, whichever way the value was last written
    (item assignment, ``update``, attribute assignment or ``set``).
    """

    def __init__(self, **kwargs):
        """Create a config from keyword arguments, e.g. ``Config(lr=1e-3)``."""
        super().__init__(**kwargs)

    def __getattr__(self, name):
        """Resolve unknown attributes as dict keys.

        Only called when normal attribute lookup fails, so dict methods such
        as ``items`` or ``set`` are never shadowed by a key of the same name.

        Raises:
            AttributeError: if ``name`` is not a key of this config.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __setattr__(self, name, value):
        """Store attribute assignments as dict entries."""
        self[name] = value

    def __delattr__(self, name):
        """Delete the dict entry behind an attribute.

        Raises:
            AttributeError: if ``name`` is not a key of this config.
        """
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def set(self, key, val):
        """Set ``key`` to ``val``; visible both as an item and as an attribute."""
        self[key] = val

In [None]:
def convert_to_snakecase(name):
    """Convert a CamelCase / mixedCase identifier to snake_case.

    Args:
        name: identifier to convert, e.g. ``"BertForMaskedLM"``.

    Returns:
        The lower-cased name with ``_`` inserted at word boundaries, e.g.
        ``"bert_for_masked_lm"``. Runs of underscores are collapsed to one.
    """
    # split before a capitalised word: "BertModel" -> "Bert_Model"
    partially_split = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    # split a lowercase letter or digit from a following capital: "maskedLM" -> "masked_LM"
    fully_split = re.sub("([a-z0-9])([A-Z])", r"\1_\2", partially_split)
    # collapse every run of underscores; a single str.replace("__", "_") pass
    # would still leave "__" behind for runs of three or more
    return re.sub("_{2,}", "_", fully_split.lower())

## Quickstart

### Bert

Use `BertTokenizer` to tokenize/numericalize input text

In [None]:
# load the pre-trained tokenizer (vocab) of the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# tokenize input; the [CLS] / [SEP] special tokens are written into the text
# by hand here
text = (
    "[CLS] What is George Lucas famous for ? [SEP] George Lucas created Star Wars [SEP]"
)
tokenized_text = tokenizer.tokenize(text)

# show the resulting list of string tokens
print(tokenized_text)

['[CLS]', 'what', 'is', 'george', 'lucas', 'famous', 'for', '?', '[SEP]', 'george', 'lucas', 'created', 'star', 'wars', '[SEP]']


In [None]:
# replace one token with "[MASK]" so `BertForMaskedLM` has something to predict
masked_idx = 9
tokenized_text[masked_idx] = "[MASK]"

# sanity check: first sentence unchanged, second sentence starts with the mask
assert tokenized_text == (
    ["[CLS]", "what", "is", "george", "lucas", "famous", "for", "?", "[SEP]"]
    + ["[MASK]", "lucas", "created", "star", "wars", "[SEP]"]
)

In [None]:
# convert tokens to vocab idxs
tokenized_idxs = tokenizer.convert_tokens_to_ids(tokenized_text)

# define segment_ids: 0 for the first sentence up to and including its [SEP],
# 1 for every token after it. Derived from the tokens so it stays correct if
# `text` is edited.
n_first_segment = tokenized_text.index("[SEP]") + 1
segment_idxs = [0] * n_first_segment + [1] * (len(tokenized_text) - n_first_segment)

# the model needs exactly one segment id per token id
assert len(segment_idxs) == len(tokenized_idxs)

print(tokenized_idxs)

[101, 2054, 2003, 2577, 6326, 3297, 2005, 1029, 102, 103, 6326, 2580, 2732, 5233, 102]


In [None]:
# turn the python lists into pytorch tensors with a leading batch dimension of 1
T_tokenized_idxs = torch.tensor(tokenized_idxs).unsqueeze(0)
T_segment_idxs = torch.tensor(segment_idxs).unsqueeze(0)

Use `BertModel` to **encode our inputs into hidden states**:

In [None]:
# load the pre-trained weights of the "bert-base-uncased" checkpoint into a bare `BertModel`
model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# switch the model to evaluation mode: dropout is disabled (and any batch-norm
# layers would use their running statistics), so the forward pass is deterministic
model = model.eval()

In [None]:
# (optional) put everything on GPU; fall back to CPU when CUDA is unavailable
# so the following cells still run
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

T_tokenized_idxs = T_tokenized_idxs.to(device)
T_segment_idxs = T_segment_idxs.to(device)
model = model.to(device)

In [None]:
# forward pass without gradient tracking (inference only)
with torch.no_grad():
    outputs = model(T_tokenized_idxs, token_type_ids=T_segment_idxs)

# the model returns a tuple-like object; its first element holds the hidden
# states of the last layer
encoded_layers = outputs[0]

# the input sequence is now encoded as a FloatTensor (bsz, seq_len, model_hidden)
print(encoded_layers.shape)

assert encoded_layers.shape == (1, len(tokenized_idxs), model.config.hidden_size)

torch.Size([1, 15, 768])


Use `BertForMaskedLM` to **predict a masked token**:

In [None]:
# load pre-trained model (weights) with the masked-language-modelling head
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()

# (optional) put everything on GPU; fall back to CPU when CUDA is unavailable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
T_tokenized_idxs = T_tokenized_idxs.to(device)
T_segment_idxs = T_segment_idxs.to(device)
model = model.to(device)

# predict all tokens; the first element of the output holds the per-position
# scores over the vocabulary
with torch.no_grad():
    outputs = model(T_tokenized_idxs, token_type_ids=T_segment_idxs)
    predictions = outputs[0]

In [None]:
# highest-scoring vocab id at the masked position of the (only) batch item
predicted_idx = predictions[0, masked_idx].argmax().item()
# map the id back to its string token (returned as a one-element list)
predicted_token = tokenizer.convert_ids_to_tokens([predicted_idx])

assert predicted_token[0] == "george"

In [None]:
# display the predicted vocab id next to its token
predicted_idx, predicted_token[0]

(2577, 'george')

### OpenAI GPT-2

Use `GPT2Tokenizer` to tokenize text

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
# load the pre-trained GPT-2 tokenizer; this rebinds `tokenizer`, which held the BERT tokenizer above
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# encode the prompt straight to vocabulary ids (tokenize + numericalize in one call)
text = " What is George Lucas famous for ? George Lucas created Star"
indexed_tokens = tokenizer.encode(text)

# show the resulting list of ids
print(indexed_tokens)

[1867, 318, 4502, 15257, 5863, 329, 5633, 4502, 15257, 2727, 2907]


In [None]:
# wrap the token ids in a pytorch tensor with a leading batch dimension of 1
tokens_tensor = torch.tensor([indexed_tokens])

Use `GPT2LMHeadModel` to **predict the next token from a text prompt**

In [None]:
# load the pre-trained GPT-2 weights with the language-modelling head and
# switch to evaluation mode
model = GPT2LMHeadModel.from_pretrained("gpt2")
model = model.eval()

# (optional) put everything on GPU; fall back to CPU when CUDA is unavailable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokens_tensor = tokens_tensor.to(device)
model = model.to(device)

# forward pass without gradient tracking; the first element of the output
# holds the next-token scores for every position of the prompt
with torch.no_grad():
    outputs = model(tokens_tensor)
    preds = outputs[0]

In [None]:
# one score vector per prompt position: (bsz, seq_len, n_scores) — the last
# dimension is presumably the vocabulary size; verify against the tokenizer
preds.shape

torch.Size([1, 11, 50257])

In [None]:
# the scores at the last prompt position rank candidates for the next
# sub-word; keep the id of the best one
predicted_index = preds[0, -1].argmax().item()

In [None]:
# decode the prompt ids plus the predicted id back into text
tokenizer.decode([*indexed_tokens, predicted_index])

' What is George Lucas famous for? George Lucas created Star Wars'