In [19]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from datasets import load_dataset

In [20]:
# Checkpoint name used to load both the model and its tokenizer.
model_checkpoint = "distilbert-base-uncased"
# Commit ID of that checkpoint at the time this notebook was executed; passed
# as `revision=` when loading so the same files are fetched on later runs.
model_commit_id = "6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411"

In [21]:
# Load the checkpoint as a masked-language model, pinned to the recorded commit.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, revision=model_commit_id)

# Off-the-shelf model

In [22]:
# Parameter count of the loaded model (printed below in millions).
distilbert_num_params = model.num_parameters()

In [23]:
# Report the parameter count rounded to the nearest million.
print(f"{round(distilbert_num_params / 1e6)}M parameters")

67M parameters


In [24]:
# Load the tokenizer from the same checkpoint and revision as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, revision=model_commit_id)

In [25]:
# Probe sentence with one [MASK] placeholder for the model to fill in.
text = "This is a great [MASK]."

In [26]:
# Encode the probe sentence as PyTorch tensors, then run a single forward pass.
# Gradient tracking is switched off for the call since nothing is being trained here.
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    model_output = model(**inputs)
token_logits = model_output.logits

In [27]:
# Sanity-check the logits shape; the middle axis should equal the number of
# input tokens (special tokens included).
token_logits.shape

torch.Size([1, 8, 30522])

In [28]:
# Round-trip the encoded ids of the single input row back to text to see how
# the sentence was normalised and which special tokens were added.
tokenizer.decode(inputs["input_ids"][0].tolist())

'[CLS] this is a great [MASK]. [SEP]'

In [29]:
# Same ids as above, shown token by token so positions can be counted.
tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

['[CLS]', 'this', 'is', 'a', 'great', '[MASK]', '.', '[SEP]']

In [30]:
with torch.no_grad():
    # Column indices (sequence positions) where the input id is the [MASK] id.
    mask_token_idx = torch.nonzero(
        inputs.input_ids == tokenizer.mask_token_id, as_tuple=True
    )[1]
    # Drop the leading batch axis, then keep only the logits rows at those positions.
    mask_token_logits = token_logits.squeeze(0)[mask_token_idx]
# Ids of the five highest-scoring candidates for the first [MASK] position
top_5_tokens = mask_token_logits.topk(5, dim=1).indices[0].tolist()

In [31]:
# Show each candidate in context by substituting it for the mask placeholder
# in the original sentence.
for i, token in enumerate(top_5_tokens):
    candidate = tokenizer.decode([token])
    print(f"{i}: {text.replace(tokenizer.mask_token, candidate)}")

0: This is a great deal.
1: This is a great success.
2: This is a great adventure.
3: This is a great idea.
4: This is a great feat.


# Dataset

In [32]:
# Dataset identifier and the commit to pin it to when loading (passed as `revision=`).
dataset_checkpoint = "imdb"
dataset_commit_id = "9c6ede893febf99215a29cc7b72992bb1138b06b"

In [33]:
# Load the dataset at the pinned revision.
imdb_dataset = load_dataset(dataset_checkpoint, revision=dataset_commit_id)

In [34]:
# Display the loaded object to inspect its splits, features and row counts.
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [35]:
# Take the first three rows of a seeded shuffle of the training split, so the
# same examples are selected on every run.
sample = (
    imdb_dataset["train"]
    .shuffle(seed=42)
    .select(range(3))
)

In [37]:
# Print every sampled review followed by its label. A blank line separates
# consecutive reviews; nothing is emitted before the first one.
for i, row in enumerate(sample):
    gap = '\n' if i != 0 else ""
    print(f"{gap}>>> Review: {row['text']}")
    print(f">>> Label: {row['label']}")

>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...
>>> Label: 1

>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stub your