<a href="https://colab.research.google.com/github/developeravsk/Question-Answer-Generation-from-unsupervised-text-using-Transformers/blob/main/Question_generation_and_answer_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
text= """
Amanda Crew is a Canadian actress, writer, and director who is known for playing several roles in a variety of shows and movies which include final destination 3,15 love, charlie st. cloud,silicon valley,whistler,a crooked somebody,crazy kind of love, the haunting in connecticut, poor boy, and ferocious .
The film had its world premiere in the midnight passion section at the 23 rd busan international film festival on october 6, 2018 . in spite of the film being panned by critics, crew was nominated for the best actress award at several film festivals for her performance .
Amanda Catherine Crew is a Canadian actress . She is set to star in new film with Blake Lively The Age Of Adaline . She started acting in fifth grade elementary school . Amanda attended Brookswood Secondary School before studying at the American Academy of Dramatic Arts in New York .
Her mother is a legal secretary and her father is a telecom worker . She later co starred with ed harris in the drama thriller a crooked somebody . She played bette mack, a woman who becomes intimately involved with the youngest son of a family nearly torn apart by the fathers infidelity .
The canadian movie actress was born in canada on june 5, 1986 . She has been in a long term relationship with actor dustin milligan since october 2010 . The couple have been together for around 11 years, 1 month, and 5 days . Amanda Crew has not been previously engaged .
"""


# Question extraction from text

In [2]:
# Package downloads

!pip install -U transformers==3.0.0
!pip install sentencepiece
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Cloning the github repo for question generation

import os.path
from os import path

if path.exists("question_generation")==False:
  !git clone https://github.com/patil-suraj/question_generation.git
  
%cd question_generation

/content/question_generation


In [4]:
# Import pipeline from the cloned repository
from  pipelines import pipeline

# function for generation qa pairs
def generate_qa_pairs(model,text):
  try:
    question_answer = model(text)
    if len(question_answer)==0:
      print("No question-answer pair generated :(")
    i = 1
  except:
    question_answer=[]
  return question_answer

In [5]:
# loading the model
nlp_qa_qg = pipeline("multitask-qa-qg")


Downloading:   0%|          | 0.00/656 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [6]:
# Extracting question answers from text
# There is no limit on how much text the model can process
data=generate_qa_pairs(nlp_qa_qg,text)

  beam_id = beam_token_id // vocab_size


In [7]:
# Diplaying the top 5 question answer pair
data[0:5]

[{'answer': 'midnight passion',
  'question': 'What section of the busan international film festival did Amanda Crew premiere?'},
 {'answer': '23 rd busan international film festival',
  'question': 'At what festival did Amanda Crew premiere?'},
 {'answer': 'best actress award',
  'question': 'What award did Amanda Crew receive at several film festivals?'},
 {'answer': 'Amanda Catherine Crew', 'question': 'Who is a Canadian actress?'},
 {'answer': 'Blake Lively The Age Of Adaline',
  'question': 'What new film is Amanda Catherine Crew set to star with?'}]

In [8]:
# Extracting only the questions from the above qa pair
question=[]
for i in data:
  question.append(i['question'])

In [9]:
question

['What section of the busan international film festival did Amanda Crew premiere?',
 'At what festival did Amanda Crew premiere?',
 'What award did Amanda Crew receive at several film festivals?',
 'Who is a Canadian actress?',
 'What new film is Amanda Catherine Crew set to star with?',
 'Where did Amanda begin acting?',
 'What school did Amanda attend before studying at the American Academy of Dramatic Arts in New York?',
 "What is Amanda's mother's job?",
 'What drama thriller did Amanda co starred with ed harris?',
 'What is the name of the woman who becomes intimately involved with the youngest son of a family nearly torn apart by the fathers infidelity',
 'When was Amanda Crew born?',
 'Which actor has Amanda Crew been in a long term relationship with since october 2010?',
 'How long have Amanda Crew been together?',
 'Who has not been previously engaged?']

# Answer extraction using BERT Model


In [12]:
# Necessary model including BertQuestionAnswering and BertTokenizer
!pip install --upgrade transformers
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import json

Collecting transformers
  Using cached transformers-4.12.5-py3-none-any.whl (3.1 MB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.8.0rc4
    Uninstalling tokenizers-0.8.0rc4:
      Successfully uninstalled tokenizers-0.8.0rc4
  Attempting uninstall: transformers
    Found existing installation: transformers 3.0.0
    Uninstalling transformers-3.0.0:
      Successfully uninstalled transformers-3.0.0
Successfully installed tokenizers-0.10.3 transformers-4.12.5


In [13]:
# Loading the model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [14]:
# Function for answer extraction
def question_answer(question, text):
    
    #tokenize question and text as a pair
    input_ids = tokenizer.encode(question, text)
    
    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    #segment IDs
    #first occurence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)    
    
    #number of tokens in segment A (question)
    num_seg_a = sep_idx+1    
    
    #number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    #list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b    
    
    assert len(segment_ids) == len(input_ids)
    
    #model output using input_ids and segment_ids
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    
    #reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)    
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    else:
        answer = "Unable to find the answer to your question."
    
    return answer.capitalize()

In [15]:
# extracting answers for each of the extracted question
# Note that the BERT model only accepts 512 tokens at a time

qa_sets=question_answer(question[0], text)

AttributeError: ignored