Reference: https://mccormickml.com/2020/03/10/question-answering-with-a-fine-tuned-BERT/#part-2-example-code

In [None]:
!pip install transform

In [1]:
import torch
from transformers import BertForQuestionAnswering
# Load the question-answering model from the named pretrained checkpoint
# (per its name: BERT-large, uncased, whole-word masking, fine-tuned on SQuAD).
squad_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(squad_checkpoint)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from transformers import BertTokenizer
# Load the tokenizer from the same checkpoint name used for the model, so the
# token ids it produces match the vocabulary the model was loaded with.
squad_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(squad_checkpoint)

In [3]:
def answer_question(question, answer_text):
    '''
    Find and print the span of `answer_text` that answers `question`.

    The two strings are encoded together as one text pair, run through the
    module-level `model`, and the tokens between the highest-scoring start
    and end positions are stitched back into a single string.

    Args:
        question: The question, as a string.
        answer_text: The passage that is expected to contain the answer.

    Returns:
        None. The token count and the reconstructed answer are printed.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token itself.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example question through the model. This is inference only, so
    # gradient tracking is disabled to avoid building an autograd graph.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                        token_type_ids=torch.tensor([segment_ids])) # The segment IDs to differentiate question from answer_text

    # Index positionally instead of tuple-unpacking: this works both when the
    # model returns a plain tuple and when it returns a dict-like output
    # object (whose iteration would yield key names rather than tensors).
    start_scores, end_scores = outputs[0], outputs[1]

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores, as plain
    # Python ints so they can be used directly as list indices.
    answer_start = torch.argmax(start_scores).item()
    answer_end = torch.argmax(end_scores).item()

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    # If the predicted end precedes the start, this loop is empty and the
    # answer is just the start token.
    for i in range(answer_start + 1, answer_end + 1):

        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]

        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]
    print('Answer: "' + answer + '"')

In [4]:
import textwrap

# Passage to search for answers; any text can be placed here.
bert_abstract = 'Leo lived in Naples'

# Wrap text to 100 characters per line.
wrapper = textwrap.TextWrapper(width=100)

# Joining the wrapped lines with newlines is what `fill` produces.
print('\n'.join(wrapper.wrap(bert_abstract)))

Leo lived in Naples


In [5]:
# Ask a question about the current `bert_abstract` passage; the answer is printed.
question = 'where did Leo live?'
answer_question(question=question, answer_text=bert_abstract)

Query has 12 tokens.

Answer: "naples"


In [8]:
import pandas as pd
# Load the CSV into a DataFrame; later cells filter it on its 'actor' column
# and read its 'quote' column.
csv_path = 'WebScraping/data.csv'
data = pd.read_csv(csv_path)

In [14]:
# Keep only the rows whose 'actor' column equals 'Joey', then pull that
# subset's 'quote' column out as a plain Python list.
Joey = data.loc[data['actor'] == 'Joey']
Joey_script = Joey['quote'].tolist()

In [38]:
# Concatenate every quote (no separator) and keep the first 1000 characters.
# NOTE(review): the 1000-character cap presumably keeps the tokenized input
# within the model's length limit — confirm.
bert_abstract = ''.join(Joey_script)[:1000]
bert_abstract

" C'mon, you're going out with the guy! There's gotta be something wrong with him! Instead of...? Never had that dream. This guy says hello, I wanna kill myself. Ohh. And you never knew she was a lesbian... Alright Ross, look. You're feeling a lot of pain right now. You're angry. You're hurting. Can I tell you what the answer is? Strip joint! C'mon, you're single! Have some hormones! I say push her down the stairs.    And hey, you need anything, you can always come to Joey. Me and Chandler live across the hall. And he's away a lot. What, like there's a rule or something? Who's Paul?\xa0 Hey, Paul! Here's a little tip, she really likes it when you rub her neck in the same spot over and over and over again until it starts to get a little red. Hey Pheebs, you wanna help? I'm thinking we've got a bookcase here.   What's this? Which goes where? Done with the bookcase! Hey-hey-hey-hey, if you're gonna start with that stuff we're outta here. Ross, let me ask you a question. She got the furnit

In [39]:
# Ask a question about the current `bert_abstract` passage; the answer is printed.
question = 'where do joey and chandler live?'
answer_question(question=question, answer_text=bert_abstract)

Query has 286 tokens.

Answer: "across the hall"
