In [2]:
# !pip install ipykernel
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [4]:
## load the pre-trained QA model and tokenizer from Huggingface
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [5]:
## load an example. The input requires question and paragraph text.
question = "How many parameters does BERT-large have?"
answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."

In [6]:
### transfer the token to token id
input_ids = tokenizer.encode(question, answer_text)
print('The input has a total of {:} tokens.'.format(len(input_ids)))

The input has a total of 70 tokens.


In [7]:
## if you want, you can change back to token for doule checking.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

['[CLS]', 'how', 'many', 'parameters', 'does', 'bert', '-', 'large', 'have', '?', '[SEP]', 'bert', '-', 'large', 'is', 'really', 'big', '.', '.', '.', 'it', 'has', '24', '-', 'layers', 'and', 'an', 'em', '##bed', '##ding', 'size', 'of', '1', ',', '02', '##4', ',', 'for', 'a', 'total', 'of', '340', '##m', 'parameters', '!', 'altogether', 'it', 'is', '1', '.', '34', '##gb', ',', 'so', 'expect', 'it', 'to', 'take', 'a', 'couple', 'minutes', 'to', 'download', 'to', 'your', 'cola', '##b', 'instance', '.', '[SEP]']


In [8]:
### or more dircetly, print token<->id format
for token, id in zip(tokens, input_ids):
    if id == tokenizer.sep_token_id:
        print('')
    print('{:<12} {:>6,}'.format(token, id))
    if id == tokenizer.sep_token_id:
        print('')

[CLS]           101
how           2,129
many          2,116
parameters   11,709
does          2,515
bert         14,324
-             1,011
large         2,312
have          2,031
?             1,029

[SEP]           102

bert         14,324
-             1,011
large         2,312
is            2,003
really        2,428
big           2,502
.             1,012
.             1,012
.             1,012
it            2,009
has           2,038
24            2,484
-             1,011
layers        9,014
and           1,998
an            2,019
em            7,861
##bed         8,270
##ding        4,667
size          2,946
of            1,997
1             1,015
,             1,010
02            6,185
##4           2,549
,             1,010
for           2,005
a             1,037
total         2,561
of            1,997
340          16,029
##m           2,213
parameters   11,709
!               999
altogether   10,462
it            2,009
is            2,003
1             1,015
.             1,01

In [9]:
## create segment id (important) for question (all 0) and paragraph text (all 1)
sep_index = input_ids.index(tokenizer.sep_token_id)

num_seg_a = sep_index + 1 # The number of segment A tokens includes the [SEP] token istelf.

num_seg_b = len(input_ids) - num_seg_a # The remainder are segment B.

segment_ids = [0]*num_seg_a + [1]*num_seg_b # Construct the list of 0s and 1s.

assert len(segment_ids) == len(input_ids)

In [10]:
### most exciting code: run the BERT model!
# Run our example through the model.
outputs = model(torch.tensor([input_ids]),
                             token_type_ids=torch.tensor([segment_ids]),
                             return_dict=True) 
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [11]:
## let's see the output returning result.
print(outputs)

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-6.4849, -6.4358, -8.1077, -8.8489, -7.8751, -8.0522, -8.4684, -8.5295,
         -7.7074, -9.2464, -6.4849, -2.7303, -6.3473, -5.7299, -7.7780, -7.0391,
         -6.3331, -7.3153, -7.3048, -7.4121, -2.2534, -5.3971, -0.9424, -7.3584,
         -5.4575, -7.0769, -4.4887, -3.9272, -5.6967, -5.9505, -5.0059, -5.9812,
          0.0530, -5.5968, -4.7093, -4.5750, -6.1786, -2.2294, -0.1904, -0.2327,
         -2.7331,  6.4256, -2.6543, -4.5655, -4.9872, -4.9834, -5.9110, -7.8402,
         -1.8986, -7.2123, -4.1543, -6.2354, -8.0953, -7.2329, -6.4411, -6.8384,
         -8.1032, -7.0570, -7.7332, -6.8711, -7.1045, -8.2966, -6.1939, -8.0817,
         -7.5501, -5.9695, -8.1008, -6.8849, -8.2273, -6.4850]],
       grad_fn=<CopyBackwards>), end_logits=tensor([[-2.0629, -6.3878, -6.2450, -6.3605, -7.0722, -7.6281, -7.1160, -6.8674,
         -7.1313, -7.1495, -2.0628, -5.0858, -4.7276, -3.5955, -6.3050, -7.1109,
         -4.4975, -4.7221, -

In [12]:
### find the answer span through the logit scores and the argmax operation
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores)

# Combine the tokens in the answer and print it out.
answer = ' '.join(tokens[answer_start:answer_end+1])

print('Answer: "' + answer + '"')

Answer: "340 ##m"


Yeah!