In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 22.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 27.2 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 3.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 42.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [2]:
import torch

In [3]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [4]:
#### LOAD THE TOKENIZER
from transformers import BertTokenizer

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [6]:
#### TIME TO TEST THE MODEL, AND ASK BERT A QUESTION...

question= "How many parameters does BERT-large have?"

answer_text = "BERT-large is really big...it has 24-layers and an embedding size of 1,024, for a total of 340M parameters!"


In [7]:
## WE HAVE TO RUN BERT TOKENIZER AGAINST BOTH QUESTION AND answer_text.
# TO FEED THEM INTO BERT, WE ACTUALLY CONCATENATE THEM.
### APPLY THE TOKENIZER TO THE INPUT TEXT, TREATING THEM AS A TEXT-PAIR###

input_ids = tokenizer.encode(question, answer_text)

print("The input has a  total of {:} tokens.".format(len(input_ids)))

The input has a  total of 46 tokens.


In [8]:
## I WILL NOW PRINT OUT ALL THE TOKENS AND THEIR ID NUMBERS.

## BERT NEEDS ONLY THE TOKEN ID'S, BUT FOR THE PURPOSE OF INSPECTING THE
## TOKENIZER'S BEHAVIOR, LET'S ALSO GET THE TOKEN STRINGS AND DISPLAY THEM....

tokens = tokenizer.convert_ids_to_tokens(input_ids)

## For each token and its id....
for token, id in zip(tokens, input_ids):

   ## 
  if id == tokenizer.sep_token_id:
    print('')

  ##PRINT THE TOKEN STRING AND ITS ID IN TWO COLUMNS...

  print('{:<12} {:>6,}'.format(token, id))

  if id == tokenizer.sep_token_id:
    print('')





[CLS]           101
how           2,129
many          2,116
parameters   11,709
does          2,515
bert         14,324
-             1,011
large         2,312
have          2,031
?             1,029

[SEP]           102

bert         14,324
-             1,011
large         2,312
is            2,003
really        2,428
big           2,502
.             1,012
.             1,012
.             1,012
it            2,009
has           2,038
24            2,484
-             1,011
layers        9,014
and           1,998
an            2,019
em            7,861
##bed         8,270
##ding        4,667
size          2,946
of            1,997
1             1,015
,             1,010
02            6,185
##4           2,549
,             1,010
for           2,005
a             1,037
total         2,561
of            1,997
340          16,029
##m           2,213
parameters   11,709
!               999

[SEP]           102



In [9]:
### search the iput ids for the first instance of the '[SEP]' token...


sep_index = input_ids.index(tokenizer.sep_token_id)


# the number of segment 'A' tokens includes the [SEP] token itself...

num_seg_A = sep_index +1

## the remainder are the segment 'B'...

num_seg_B = len(input_ids) - num_seg_A

##  Construct the list of 0s and 1s...
segment_ids = [0]*num_seg_A+ [1]*num_seg_B


## there should be a segment_id for every input token....


assert len(segment_ids) == len(input_ids)

In [11]:
### WE ARE NOT USING ANY PADDING. ITS FOR BATCH PROCESSING SETENCES ALL AT ONCE....


# run our example thorugh the model:


start_scores, end_scores = model(torch.tensor([input_ids]),  #the tokens representing our input text
                                 token_type_ids = torch.tensor([segment_ids]))  # the segment ids to differenetiate the questions

In [13]:
### NOW WE CAN HIGHLIGHT THE ANSWER JUST BY LOOKIG AT THE MOST PROBABLE START AND END WORDS

### Find the tokens with the highest 'start' and 'end' scores...

answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores)

## Combine the tokens in the answer and print it out...

answer = ' '.join(tokens[answer_start:answer_end+1])

print(('Answer: "' + answer + '"'))






TypeError: ignored

In [14]:
## RECONSTRUCT ANY WORD THAT GETS BROKEN DOWN INTO SUB WORDS


# start with the first token:

answer = tokens[answer_start]

## Select the remaining answer tokens and join them with whitespace:

for i in range(answer_start +1, answer_end +1):

  ## If it's a subword token, then recombine it with the previous token:

  if tokens[i][0:2] == '##':
    answer += tokens[i][2:]

  else:
    answer += ' ' + tokens[i]

print('Answer: "' + answer + '"' )







NameError: ignored

In [None]:

## WE CAN VISUALIZE THE SCORES TO SEE WHAT THE MODEL IS PRODUCING>>>

import matplotlib.pyplot as plt
import seaborn as sns

# Use plot styling from seaborn

sns.set(style= 'darkgrid')

# Increase the plot size and the font size

# sns set(font_scale = 1.5)

plt.rcParams['figure.figsize'] = (16,8)


In [None]:
# pull the scores out of Pytorch Tesors and convert them to 1D arrays...

s_scores= start_scores.detach().numpy().flatten()

e_scores= end_scores.detach().numpy().flatten()

# We'll use the tokens so the x-axis labels. In order to do that,
# they all need to be unique, so we'll add the token index to the end of each one

token_labels = []

for (i,token) in enumerate(tokens):
  token_labels.append('{:} - {:>2}'.format(token,i))

  

In [15]:
## CREATE A BAR PLOT SHOWING THE SCORE OF EVERY INPUT WORD BEING THE START WORD:


ax = sns.barplot(x=token_labels, y = s_scores, ci = None)

#Turn the xlabels vertical
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha = "center")

# Turn on the vertical grid to help align words to scores...
ax.grid(True)

plt.title('Start Word Scores')

plt.show()



NameError: ignored

In [None]:
### CREATE A BARPLOT SHOWING THE END WORD SCORES FOR ALL OF THE TOKENS...


ax = sns.barplot(x=token_labels, y = e_scores, ci = None)

# Turn the xlabels vertical....
ax.set_xticklabels(ax.get_xticklabels(), rotation= 90, ha= "center")

#Turn on the vertical grid to help words to scores....

ax.grid(True)

plt.title


In [None]:
import pandas as pd


# store the tokens in a dataframe....


scores = []

for (i, token_label) in enumerate(token_labels):
## add the tokens start scores as one row
  scores.append({'token_label': token_label,
                 'score': s_scores[i],
                 'marker': 'start'})
  
  # Add the tokes end ecore as another row:

  scores.append({'token_label': token_label},
                'score': e_scores[i],
                'marker': 'end'})
  
df = pd.DataFrame(scores)

In [None]:
## Draw a groped barplot to show start and end scores for each
#word. The hue parameter is where we tell it which datapoints belong
# to which of the two series...



g = sns.catplot(x="token_label", y = "score", hue = "marker", data= df,
                kind = "bar", height = 6, aspect = 4)


# Turn the xlabels vertical...

g.set_xticklabels(g.ax.get_xticklabels(), rotation = 90, ha = "center")

# the on the vertical grid to help align the words to scores...

g.ax.grid(True)

In [None]:
### Function that does everything....

def answer_question(question, answer_text):

  '''Takes a question string, and an answer text string (which contains the answer
  ), and identifies the words within the answer_text that are the answer. 
  Prints them out).'''


  ##=======TOKENIZE ======

    ## apply the tokenizer to the input text, treating them as a text-pair.

    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is:

    print('Query has {:,} tokens.\n'.format(len(input_ids)))


    # =======Set Segment IDs ======
    # Search the iput_ids for first instance of the [SEP] token...

    sep_index = input_ids.index(tokenizer.sep_token_id)


    # The number of segment A tokens includes the [SEP] token itself
    num_seg_A = sep_index + 1


    #  The remainder are the segment B...
    num_seg_B = len(input_ids) - num_seg_A

    ## Construct the list of 0's and 1's


    segment_ids = [0]*num_seg_A + [1]*num_seg_B

    # There should be a segment_id for every input token...

    assert len(segment_ids) == len(input_ids)



    # =====EVALUATE =====

    #Run the example question through the model...

    start_scores, end_scores = model(torch.tensor([input_ids],
                                                  token_type_ids = torch.tensor([segment_ids]))
    

    ### ========RECONSTRUCT ANSWER ======

    # Find the tokens with the highest start and end scores...

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens...

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token
    answer = tokens[answer_start]

    ## Select the remainign answer tokens and join them with whitespace....
    for i in range(answer_start +1, answer_end +1):

      # If its a subword, then recombine it with the previous token.

      if tokens[i][0:2] == '##':
        answer += tokens[i][2:]

      # Otherwise add a space then the token...
      else:
        answer += ' ' + token[i]

    print('Answer: "' + answer + '"')















In [None]:
import textwrap
# Wrap text to 80 characters...

wrapper = textwrap.Textwrapper(width=80)

bert_abstract = "We introduce a new language representaion model called BERT, which stands for Bidirectional Encoder Representation Transformer"

print(wrapper.fill(bert_abstract))

In [None]:
#   QUESTION TIME ####

question = "What does the 'B' is BERT stand for?"

answer_question(question, bert_abstract)




In [None]:
question = "What are some example applications of BERT??"


answer_question(question, bert_abstract)