In [6]:
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from transformers import BertForQuestionAnswering, BertTokenizer

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import orjson as json

# Download and clean data

In [31]:
coqa = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [37]:
cols = ["text","question","answer"]#list of lists to create our dataframe
comp_list = []
for row in coqa.itertuples():
    for i in range(len(row.data["questions"])):
        temp_list = []
        temp_list.append(row.data["story"])
        temp_list.append(row.data["questions"][i]["input_text"])
        temp_list.append(row.data["answers"][i]["input_text"])
        comp_list.append(temp_list)

optimized_df = pd.DataFrame(comp_list, columns=cols)

# Load model and tokenizer

In [41]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [84]:
sample_df = optimized_df.sample(1)
question = sample_df.iloc[0]['question']
context = sample_df.iloc[0]['text']
answer = sample_df.iloc[0]['answer']

In [85]:
answer, question, context

('7 January 1943',
 'And died?',
 'Nikola Tesla (Serbian Cyrillic: Никола Тесла; 10 July 1856 – 7 January 1943) was a Serbian American inventor, electrical engineer, mechanical engineer, physicist, and futurist best known for his contributions to the design of the modern alternating current (AC) electricity supply system. \n\nTesla gained experience in telephony and electrical engineering before emigrating to the United States in 1884 to work for Thomas Edison in New York City. He soon struck out on his own with financial backers, setting up laboratories and companies to develop a range of electrical devices. His patented AC induction motor and transformer were licensed by George Westinghouse, who also hired Tesla for a short time as a consultant. His work in the formative years of electric power development was involved in a corporate alternating current/direct current "War of Currents" as well as various patent battles. \n\nTesla went on to pursue his ideas of wireless lighting and e

In [86]:
tokenized_input = tokenizer.encode_plus(question, context, return_tensors='pt')
outputs = model(**tokenized_input)
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [87]:
context[start_scores.argmax(axis=1):end_scores.argmax(axis=1)]

'7 January 1943) was a Serbian American'