In [1]:
# The code below follows the Simple Transformers documentation: https://simpletransformers.ai/

## Installing the Necessary Library




In [2]:
#Supressing cell output
%%capture 
!pip install simpletransformers

## Downloading and Processing Dataset

In [3]:
#Supressing cell output
%%capture
import json

#Downloading the train dataset
!wget https://github.com/rajpurkar/SQuAD-explorer/raw/master/dataset/train-v2.0.json
!mkdir data
!mv 'train-v2.0.json' './data/train-v2.0.json'

with open('./data/train-v2.0.json', 'r') as f:
    train_data = json.load(f)

#Converting the train dataset into input form
train_data = [item for topic in train_data['data'] for item in topic['paragraphs'] ]
train_data = train_data[:400] #training on 400 samples

#Downloading the train dataset
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
!mv 'dev-v2.0.json' './data/dev-v2.0.json'

with open('./data/dev-v2.0.json', 'r') as f:
    dev_data = json.load(f)

#Downloading the train dataset
dev_data = [item for topic in dev_data['data'] for item in topic['paragraphs'] ]
dev_data = dev_data[200:300]

In [4]:
#Supressing cell output
%%capture

from simpletransformers.question_answering import QuestionAnsweringModel

train_args = {
    'fp16':False,
    'learning_rate': 3e-5,
    'num_train_epochs': 4,
    'max_seq_length': 384,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'gradient_accumulation_steps': 8,
    'use_early_stopping':True,
}

model = QuestionAnsweringModel('bert', 'bert-base-cased', args=train_args)

In [5]:
# Fine-tune the question-answering model on the training paragraphs loaded
# earlier, using the settings in `train_args`.
model.train_model(train_data)

convert squad examples to features: 100%|██████████| 3226/3226 [00:28<00:00, 113.27it/s]
add example index and unique id: 100%|██████████| 3226/3226 [00:00<00:00, 680693.47it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0', max=1670.0, style=ProgressStyle(descrip…

Running loss: 4.745566



Running loss: 0.577287




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1', max=1670.0, style=ProgressStyle(descrip…

Running loss: 2.560131


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2', max=1670.0, style=ProgressStyle(descrip…

Running loss: 0.152112


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3', max=1670.0, style=ProgressStyle(descrip…

Running loss: 0.224689



## Sample Output on the Dev Dataset

In [6]:
#Supressing cell output
%%capture
import pandas as pd
df = pd.DataFrame(columns=('Context', 'Question', 'Answer','GroundTruth'))

for idx,data in enumerate(dev_data):
  question_list = []
  answer_list = []
  ground_truth = []

  for ques in data['qas']:
    question_list.append(ques['question'])
    if len(ques['answers'])>0:
      ground_truth.append(ques['answers'][0]['text'])
       
  answers = model.predict([data]);

  for ans in answers[0]:
    answer_list.append(ans['answer'][0])
  
  for q,a,g in zip(question_list,answer_list,ground_truth):
    df = df.append({'Context':data['context'],'Question':q,'Answer':a,'GroundTruth':g},ignore_index=True)



### Converting the Output to a CSV file

In [7]:
# Write the predictions table to predictions.csv in the working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('./predictions.csv', index=False)


## Format of prediction input

In [8]:
# predict([
#         {
#             'context': "Some context as a demo",
#             'qas': [
#                 {'id': '0', 'question': 'What is the context here?'},
#                 {'id': '1', 'question': 'What is this for?'}
#             ]
#         }
#     ])

In [15]:
%%capture 
# Demo passage about Bonn that the model is asked questions about.
context = """The Federal city of Bonn (German pronunciation: [bɔn] (About this soundlisten) Latin: Bonna) is a city on 
the banks of the Rhine in the German state of North Rhine-Westphalia, with a population of over 300,000. 
About 24 km (15 mi) south-southeast of Cologne, Bonn is in the southernmost part of the Rhine-Ruhr region, 
Germany's largest metropolitan area, with over 11 million inhabitants.
It is famously known as the birthplace of Ludwig Van Beethoven in 1770. He spent his childhood and teenage years in Bonn.
Founded in the 1st century BC as a Roman settlement, Bonn is one of Germany's oldest cities.."""

# Questions about the passage: each is a sentence from the context with one
# phrase swapped for a question word ("where", "how many", "when").
ques=["""The Federal city of Bonn is a city on the banks of the Rhine in the German state of where, with a population of over 300,000? """,
      """The Federal city of Bonn is a city on the banks of the Rhine in the German state of North Rhine-Westphalia, with a population of how many?""",
      """where is in the southernmost part of the Rhine-Ruhr region, Germany's largest metropolitan area, with over 11 million inhabitants?""",
      """Bonn is in the southernmost part of the where region, Germany's largest metropolitan area, with over 11 million inhabitants?""",
      """Bonn is in the southernmost part of the Rhine-Ruhr region, where's largest metropolitan area, with over 11 million inhabitants?""",
      """Bonn is in the southernmost part of the Rhine-Ruhr region, Germany's largest metropolitan area, with how many inhabitants?""",
      """Founded in when BC as a Roman settlement, Bonn is one of Germany's oldest cities?""",
      """Founded in the 1st century BC as a where settlement, Bonn is one of Germany's oldest cities?""",
      """Founded in the 1st century BC as a Roman settlement, where is one of Germany's oldest cities?""",
      """Founded in the 1st century BC as a Roman settlement, Bonn is how many of Germany's oldest cities?""",
      """Founded in the 1st century BC as a Roman settlement, Bonn is one of where's oldest cities?"""]

# Ask the model each question on its own and keep the first candidate answer.
answer = []

for question in ques:
  payload = [{'context': context, 'qas': [{'question': question, 'id': '0'}]}]
  result = model.predict(payload)
  # result[0] holds the per-question entries; only one question was sent.
  first_entry = result[0][0]
  answer.append(first_entry['answer'][0])

## Sample Prediction

In [16]:
# Show every demo question next to the answer predicted for it.
for question, predicted in zip(ques, answer):
  print("Question: ", question)
  print("Answer: ", predicted, '\n\n')

Question:  The Federal city of Bonn is a city on the banks of the Rhine in the German state of where, with a population of over 300,000? 
Answer:  North Rhine-Westphalia 


Question:  The Federal city of Bonn is a city on the banks of the Rhine in the German state of North Rhine-Westphalia, with a population of how many?
Answer:  300,000 


Question:  where is in the southernmost part of the Rhine-Ruhr region, Germany's largest metropolitan area, with over 11 million inhabitants?
Answer:  Bonn 


Question:  Bonn is in the southernmost part of the where region, Germany's largest metropolitan area, with over 11 million inhabitants?
Answer:  Rhine-Ruhr 


Question:  Bonn is in the southernmost part of the Rhine-Ruhr region, where's largest metropolitan area, with over 11 million inhabitants?
Answer:  300,000 


Question:  Bonn is in the southernmost part of the Rhine-Ruhr region, Germany's largest metropolitan area, with how many inhabitants?
Answer:  11 million 


Question:  Founded in w