In [1]:
import numpy as np
import torch
import torch.nn.functional as F
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [2]:
# Model identifier shared by the pipeline, the tokenizer and the model loaded below.
checkpoint = "distilbert-base-cased-distilled-squad"

In [3]:
# Build a question-answering pipeline backed by the checkpoint chosen above.
question_answerer = pipeline("question-answering", model=checkpoint)

# Question answering examples

In [4]:
# Short passage used as the context for the first question-answering examples.
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

In [5]:
# Open-ended question whose answer appears verbatim in the context.
question1 = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question1, context=context)

{'score': 0.98026043176651,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [6]:
# Same context, but phrased as a yes/no question, to see how the pipeline responds.
question2 = "Do any deep learning libraries back 🤗 Transformers?"
question_answerer(question=question2, context=context)

{'score': 0.1688368320465088, 'start': 33, 'end': 38, 'answer': 'three'}

## Question answering from a long context

In [7]:
# Longer passage; the sentence that answers question1 is its final paragraph.
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

In [8]:
# Ask the first question against the long context and request the three best-scoring answers.
question_answerer(question=question1, context=long_context, top_k=3)

[{'score': 0.9714871048927307,
  'start': 1892,
  'end': 1919,
  'answer': 'Jax, PyTorch and TensorFlow'},
 {'score': 0.14949694275856018,
  'start': 17,
  'end': 37,
  'answer': 'State of the Art NLP'},
 {'score': 0.015565154142677784,
  'start': 1892,
  'end': 1921,
  'answer': 'Jax, PyTorch and TensorFlow —'}]

In [9]:
# Ask the yes/no question against the long context.
question_answerer(question=question2, context=long_context)

{'score': 0.06686097383499146,
 'start': 1815,
 'end': 1889,
 'answer': '🤗 Transformers is backed by the three most popular deep learning libraries'}

# Question answering with a model

In [10]:
# Load the tokenizer and the question-answering model from the same checkpoint,
# so the pipeline's steps can be reproduced by hand.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [11]:
# Encode the (question, context) pair as PyTorch tensors.
inputs = tokenizer(question1, context, return_tensors="pt")
# Inference only: run the forward pass without autograd tracking so the logits
# carry no graph and can be masked and post-processed freely.
with torch.no_grad():
    outputs = model(**inputs)

In [12]:
# Pull out the two logit tensors: one scoring answer starts, one scoring answer ends.
start_logits, end_logits = outputs.start_logits, outputs.end_logits

In [13]:
# Check that both logit tensors have the same shape.
print(start_logits.shape, end_logits.shape)

torch.Size([1, 67]) torch.Size([1, 67])


In [40]:
sequence_ids = inputs.sequence_ids()
# True marks a position to suppress: everything that is not a context token
# (sequence id 1), except position 0 ([CLS]), which is kept.
mask = torch.tensor(
    [position != 0 and sid != 1 for position, sid in enumerate(sequence_ids)]
).unsqueeze(0)

In [41]:
# Push masked positions to a very low logit so that softmax gives them ~0 probability.
for logits in (start_logits, end_logits):
    logits[mask] = -1e4

In [42]:
# Normalise each logit vector over the token axis and drop the batch dimension.
start_probs, end_probs = (
    F.softmax(logits, dim=-1).squeeze(0) for logits in (start_logits, end_logits)
)

## Find the answer
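We need to be careful not to just take the argmax of the start and end probabilities independently, since this could lead to start_idx > end_idx. Instead, compute the probability of every (start_idx, end_idx) pair and keep only the pairs where start_idx <= end_idx.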

In [43]:
# scores[i, j] = P(start = i) * P(end = j); triu zeroes every pair with i > j.
scores = torch.triu(start_probs[:, None] * end_probs[None, :])

In [44]:
# Scratch check: preview the (k, 2) integer array layout that topk_idx fills in below.
np.zeros((3, 2), dtype=int)

array([[0, 0],
       [0, 0],
       [0, 0]])

In [45]:
# Flat (row-major) indices of the three largest entries of the score matrix.
scores.ravel().topk(3).indices.numpy()

array([1576, 1577, 1107])

In [46]:
# argmax works on the flattened matrix; dividing by the row length turns the
# flat index back into (row, col) = (start token, end token).
max_idx = scores.argmax().item()
start_idx, end_idx = divmod(max_idx, scores.shape[1])

In [47]:
# Show the best (start, end) token indices and the score of that span.
print(start_idx, end_idx)
print(scores[start_idx, end_idx].item())

23 35
0.9802628755569458


In [48]:
def topk_idx(scores, k):
    """Return the (row, col) positions of the ``k`` largest entries of ``scores``.

    Args:
        scores: 2-D ``torch.Tensor``; here rows are start-token indices and
            columns are end-token indices.
        k: Number of positions to return. If ``k`` exceeds the number of
            entries in ``scores``, every entry is returned instead of raising.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(min(k, scores.numel()), 2)``.
        Row ``i`` holds ``[row, col]`` of the i-th largest entry, in
        descending score order. No validity filtering is done: positions are
        ranked purely by their value in ``scores``.
    """
    n_cols = scores.shape[1]
    # topk raises if asked for more elements than exist, so clamp k.
    k = min(k, scores.numel())
    # .cpu() keeps the NumPy conversion working when scores is on an accelerator.
    flat_idx = scores.ravel().topk(k).indices.cpu().numpy()
    start_end = np.zeros((k, 2), dtype=int)
    # A flat row-major index splits into (row, col) with one vectorised divmod.
    start_end[:, 0], start_end[:, 1] = divmod(flat_idx, n_cols)
    return start_end

In [49]:
# (start token, end token) index pairs of the three highest-scoring spans.
start_end_topk = topk_idx(scores, 3)

In [50]:
# Tokenize the same pair again, this time asking for the offset mapping, which is
# used below to turn token indices back into character positions.
inputs_with_offsets = tokenizer(question1, context, return_offsets_mapping=True)
offsets = inputs_with_offsets.offset_mapping

In [51]:
# Turn each (start token, end token) pair into an answer record: the first
# character of the start token and the last character of the end token
# delimit the answer text inside the context.
results_topk = []
for start_idx, end_idx in start_end_topk:
    start_char = offsets[start_idx][0]
    end_char = offsets[end_idx][1]
    results_topk.append(
        dict(
            answer=context[start_char:end_char],
            start=start_char,
            end=end_char,
            score=scores[start_idx, end_idx].item(),
        )
    )

In [53]:
# Close to, but not exactly, what the pipeline predicted
# (e.g. the top score differs slightly from the pipeline's).
results_topk

[{'answer': 'Jax, PyTorch, and TensorFlow',
  'start': 78,
  'end': 106,
  'score': 0.9802628755569458},
 {'answer': 'Jax, PyTorch, and TensorFlow —',
  'start': 78,
  'end': 108,
  'score': 0.008247802965342999},
 {'answer': 'three most popular deep learning libraries — Jax, PyTorch, and TensorFlow',
  'start': 33,
  'end': 106,
  'score': 0.006841438356786966}]

In [55]:
# Tokenize the question with the long context, without truncation, to see how long the pair is.
inputs = tokenizer(question1, long_context)

In [58]:
# Number of tokens in the encoded (question, long context) pair.
len(inputs.input_ids)

461

In [70]:
# Look up the model's configured maximum number of position embeddings.
# In the demo, this is 384
question_answerer.model.config.max_position_embeddings

512

# Truncate inputs

In [71]:
# Re-tokenize with a 384-token limit, truncating only the second sequence (the context).
inputs = tokenizer(question1, long_context, max_length=384, truncation="only_second")

In [72]:
# Decode the truncated ids back to text to see where the context was cut off.
tokenizer.decode(inputs.input_ids)

"[CLS] Which deep learning libraries back [UNK] Transformers? [SEP] [UNK] Transformers : State of the Art NLP [UNK] Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation and more in over 100 languages. Its aim is to make cutting - edge NLP easier to use for everyone. [UNK] Transformers provides APIs to quickly download and use those pretrained models on a given text, fine - tune them on your own datasets and then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments. Why should I use transformers? 1. Easy - to - use state - of - the - art models : - High performance on NLU and NLG tasks. - Low barrier to entry for educators and practitioners. - Few user - facing abstractions with just three classes to learn. - A unified 

## Place overlapping tokens in chunks

In [85]:
# Two toy sentences used to demonstrate splitting inputs into overlapping chunks.
sentences = [
    "This sentence is not too long but we are going to split it anyway.",
    "This sentence is shorter but will still get split.",
]

In [86]:
# Cap each encoded chunk at 6 tokens and return the overflow as additional chunks;
# stride=2 makes consecutive chunks overlap.
inputs = tokenizer(
    sentences, truncation=True, return_overflowing_tokens=True, max_length=6, stride=2
)

In [87]:
# Print every chunk as text to make the overlaps visible.
for chunk_ids in inputs["input_ids"]:
    print(tokenizer.decode(chunk_ids))

[CLS] This sentence is not [SEP]
[CLS] is not too long [SEP]
[CLS] too long but we [SEP]
[CLS] but we are going [SEP]
[CLS] are going to split [SEP]
[CLS] to split it anyway [SEP]
[CLS] it anyway. [SEP]
[CLS] This sentence is shorter [SEP]
[CLS] is shorter but will [SEP]
[CLS] but will still get [SEP]
[CLS] still get split. [SEP]


In [88]:
# Inspect which fields the tokenizer returned.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])

In [89]:
# For each chunk, the index of the input sentence it was split from.
inputs.overflow_to_sample_mapping

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]

# Redo tokenization using the new tools

In [90]:
# Settings for splitting the long context into overlapping chunks:
# only the context (second sequence) is truncated, overflow is returned as
# extra chunks, every chunk is padded to the longest one, and character
# offsets are requested for mapping tokens back to text.
chunking_options = dict(
    max_length=384,
    stride=128,
    truncation="only_second",
    padding="longest",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs = tokenizer(question1, long_context, **chunking_options)

In [92]:
# Inspect which fields the tokenizer returned for the chunked encoding.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

## Get rid of keys we don't need right now

In [93]:
# The sample mapping is not needed here; discard it.
_ = inputs.pop("overflow_to_sample_mapping")
# Set the character offsets aside; they are used later to map token spans back to text.
offsets = inputs.pop("offset_mapping")

In [95]:
# Convert the remaining fields to PyTorch tensors so they can be fed to the model.
inputs = inputs.convert_to_tensors("pt")

In [98]:
# Shape of the batched input ids: one row per chunk.
inputs.input_ids.shape

torch.Size([2, 384])

In [100]:
# Inference only: run the forward pass without autograd tracking so the logits
# carry no graph and can be masked and post-processed freely.
with torch.no_grad():
    outputs = model(**inputs)
# One row of start logits and one row of end logits per chunk.
start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [101]:
# Shape of the start logits for the chunked batch.
start_logits.shape

torch.Size([2, 384])

In [102]:
# Shape of the end logits for the chunked batch.
end_logits.shape

torch.Size([2, 384])

## Mask non-context and padding tokens

In [111]:
# Build the mask chunk by chunk. `inputs.sequence_ids()` with no argument only
# describes chunk 0; reusing it for every chunk leaves the trailing [SEP] of a
# shorter (padded) chunk unmasked, because chunk 0 has a context token there.
num_chunks = inputs.input_ids.shape[0]
mask = torch.tensor(
    [
        [sid != 1 for sid in inputs.sequence_ids(chunk_idx)]
        for chunk_idx in range(num_chunks)
    ]
)
# Keep the [CLS] token (position 0) of every chunk.
mask[:, 0] = False
# Also mask padding positions explicitly.
mask = torch.logical_or(mask, inputs.attention_mask == 0)

In [112]:
# Push masked positions to a very low logit so that softmax gives them ~0 probability.
for logits in (start_logits, end_logits):
    logits[mask] = -1e4

In [113]:
# Normalise each chunk's logits over the token axis (the batch dimension is kept).
start_probs, end_probs = (
    F.softmax(logits, dim=-1) for logits in (start_logits, end_logits)
)

## Find span with highest probability score

In [116]:
# For every chunk, find the (start, end) token pair with the highest joint
# probability among pairs where start <= end.
candidates = []
for start_p, end_p in zip(start_probs, end_probs):
    # scores[i, j] = P(start = i) * P(end = j) within this chunk.
    scores = start_p.unsqueeze(1) * end_p.unsqueeze(0)
    # Search only the upper triangle, then unflatten the winning index.
    idx = torch.triu(scores).argmax().item()
    start_idx, end_idx = divmod(idx, scores.shape[1])
    score = scores[start_idx, end_idx].item()
    candidates.append((start_idx, end_idx, score))

In [117]:
# One (start_token, end_token, score) tuple per chunk.
candidates

[(0, 18, 0.3386707901954651), (173, 184, 0.9714868664741516)]

In [123]:
# Map each chunk's best token span back to characters of long_context using
# that chunk's offsets, and print the resulting answer record.
for (start_token, end_token, score), offset in zip(candidates, offsets):
    start_char = offset[start_token][0]
    end_char = offset[end_token][1]
    result = dict(
        answer=long_context[start_char:end_char],
        start=start_char,
        end=end_char,
        score=score,
    )
    print(result)

{'answer': '\n🤗 Transformers: State of the Art NLP', 'start': 0, 'end': 37, 'score': 0.3386707901954651}
{'answer': 'Jax, PyTorch and TensorFlow', 'start': 1892, 'end': 1919, 'score': 0.9714868664741516}
