# Generate Multiple Choice Questions
* Notebook: [Generate_Multiple_Choice_Questions.ipynb](Generate_Multiple_Choice_Questions.ipynb)
* Purpose: QA-GNN requires multiple choice answers to perform question answering. This notebook generates noun phrases that are semantically related (if they can be grounded by WordNet or ConceptNet) and creates the jsonl files required for QA-GNN to run.
* Source Explanation: [Ramsri Goutham's Practical AI](https://towardsdatascience.com/practical-ai-automatically-generate-multiple-choice-questions-mcqs-from-any-content-with-bert-2140d53a9bf5)
* Adapted code: [Generate_MCQ_BERT_Wordnet_Conceptnet Repo](https://github.com/ramsrigouthamg/Generate_MCQ_BERT_Wordnet_Conceptnet.git)

### Install Dependencies and Download NLTK Collections

In [None]:
!pip install gensim
!pip install git+https://github.com/boudinfl/pke.git
!python -m spacy download en
!pip install -U nltk
!pip install -U pywsd
!pip install -U flashtext
!pip install datasets

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('popular')

## Helper Functions for Generating Noun Phrases

In [None]:
import pprint
import itertools
import re
import pke
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def get_nouns_multipartite(text):
    """Return the top-ranked proper-noun keyphrases found in ``text``.

    Candidates are restricted to the ``PROPN`` part-of-speech tag and ranked
    with pke's unsupervised MultipartiteRank extractor. At most 20 phrases
    are returned, in the order given by ``get_n_best``.
    """
    # Tokens that must never be part of a candidate: punctuation, the
    # bracket placeholders, and the NLTK English stopwords.
    ignored_tokens = list(string.punctuation)
    ignored_tokens.extend(['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-'])
    ignored_tokens.extend(stopwords.words('english'))

    ranker = pke.unsupervised.MultipartiteRank()
    # The stoplist is supplied when loading the document (the adapted code
    # originally passed it to candidate_selection instead).
    ranker.load_document(input=text, stoplist=ignored_tokens)
    ranker.candidate_selection(pos={'PROPN'})

    # Build the Multipartite graph and rank candidates using random walk;
    # alpha controls the weight adjustment mechanism, see TopicRank for the
    # threshold/method parameters.
    ranker.candidate_weighting(alpha=1.1, threshold=0.75, method='average')

    # Each ranked entry carries the phrase in position 0; keep only that.
    return [ranked[0] for ranked in ranker.get_n_best(n=20)]

def tokenize_sentences(text):
    """Split ``text`` into sentences and drop the short ones.

    Only sentences longer than 20 characters (measured before stripping)
    are kept; the kept sentences are returned with surrounding whitespace
    removed.
    """
    kept = []
    for raw_sentence in sent_tokenize(text):
        if len(raw_sentence) > 20:
            kept.append(raw_sentence.strip())
    return kept

def get_sentences_for_keyword(keywords, sentences):
    """Map every keyword to the sentences it occurs in, longest first.

    Every keyword gets an entry, even if no sentence mentions it (the value
    is then an empty list). Matching is delegated to flashtext's
    ``KeywordProcessor``.
    """
    matcher = KeywordProcessor()
    occurrences = {}
    for keyword in keywords:
        occurrences[keyword] = []
        matcher.add_keyword(keyword)

    for sentence in sentences:
        for found in matcher.extract_keywords(sentence):
            occurrences[found].append(sentence)

    # Longest sentences first, so index 0 is the sentence with most context.
    return {
        keyword: sorted(matched, key=len, reverse=True)
        for keyword, matched in occurrences.items()
    }

## Setup SQuAD data

In [None]:
from datasets import load_dataset
from collections import namedtuple

# Load the "squad" dataset through the `datasets` library.
squad = load_dataset("squad")
# Lightweight record for one SQuAD example; get_squad_split fills the fields
# positionally in the order id, title, context, question, answers.
SquadExample = namedtuple("SquadExample", "id title context question answers")

In [None]:
def get_squad_split(squad, split="validation"):
    """Convert one split of the SQuAD dataset into SquadExample tuples.

    Use ``split='train'`` for the train split.

    Returns
    -------
    list of SquadExample named tuples with attributes
    id, title, context, question, answers
    (``answers`` holds only the ``"text"`` entry of the raw answers field).
    """
    subset = squad[split]
    # One column per feature, in feature order, then re-zipped into rows.
    columns = [subset[name] for name in subset.features]

    examples = []
    for eid, title, context, question, answers in zip(*columns):
        examples.append(
            SquadExample(eid, title, context, question, answers["text"]))
    return examples

In [None]:
# Materialise both splits as lists of SquadExample tuples
# ("validation" is get_squad_split's default split).
squad_dev = get_squad_split(squad)
squad_train = get_squad_split(squad, "train")

## Generate Multiple Choice Questions

In [None]:
import requests
import json
import re
import random
from pywsd.similarity import max_similarity
from pywsd.lesk import adapted_lesk
from pywsd.lesk import simple_lesk
from pywsd.lesk import cosine_lesk
from nltk.corpus import wordnet as wn

# Distractors from Wordnet
def get_distractors_wordnet(syn, word):
    """Collect WordNet "sibling" terms of ``word`` to use as distractors.

    The siblings are the hyponyms of the first hypernym of ``syn``. The
    first lemma name of each sibling is title-cased, with underscores
    turned into spaces.

    Parameters
    ----------
    syn : synset-like object
        Must provide ``hypernyms()``; the hypernym must provide
        ``hyponyms()`` and each hyponym ``lemmas()[0].name()``.
    word : str
        The correct answer; it is excluded from the result.

    Returns
    -------
    list of str
        Unique distractors in hyponym order; empty if ``syn`` has no
        hypernym.
    """
    # Lemma names use underscores between words and may be capitalised, so
    # normalise the answer the same way and compare case-insensitively.
    # Comparing the raw lemma name against the space-separated lowercase
    # word let multi-word or capitalised answers through as distractors.
    target = word.lower().replace(" ", "_")

    hypernyms = syn.hypernyms()
    if not hypernyms:
        return []

    distractors = []
    for sibling in hypernyms[0].hyponyms():
        lemma_name = sibling.lemmas()[0].name()
        if lemma_name.lower() == target:
            continue
        display = " ".join(
            part.capitalize() for part in lemma_name.replace("_", " ").split())
        if display not in distractors:
            distractors.append(display)
    return distractors

def get_wordsense(sent,word):
    """Pick the noun synset of ``word`` that best fits the sentence ``sent``.

    Two disambiguators are consulted (Wu-Palmer max similarity and adapted
    Lesk); of their two answers, the one that appears earlier in WordNet's
    synset list for the word is returned. Returns ``None`` when WordNet has
    no noun synset for the word. ``list.index`` raises ``ValueError`` if a
    disambiguator's answer is not among the word's noun synsets.
    """
    word = word.lower()
    if word.split():
        # WordNet joins multi-word entries with underscores.
        word = word.replace(" ", "_")

    synsets = wn.synsets(word, 'n')
    if not synsets:
        return None

    wup_choice = max_similarity(sent, word, 'wup', pos='n')
    lesk_choice = adapted_lesk(sent, word, pos='n')
    best_rank = min(synsets.index(wup_choice), synsets.index(lesk_choice))
    return synsets[best_rank]

# Distractors from http://conceptnet.io/
def get_distractors_conceptnet(word):
    """Collect distractors for ``word`` from the ConceptNet web API.

    First the things ``word`` is a PartOf are queried (up to 5); then, for
    each of them, the other things that are PartOf it (up to 10 each). The
    labels of those are returned, without duplicates and without any label
    that contains ``word`` itself. Performs HTTP requests.
    """
    word = word.lower()
    original_word = word
    if word.split():
        # ConceptNet URIs use underscores between words.
        word = word.replace(" ", "_")

    distractor_list = []
    needle = original_word.lower()

    wholes_url = "http://api.conceptnet.io/query?node=/c/en/%s/n&rel=/r/PartOf&start=/c/en/%s&limit=5"%(word,word)
    wholes = requests.get(wholes_url).json()

    for whole_edge in wholes['edges']:
        link = whole_edge['end']['term']

        parts_url = "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10"%(link,link)
        parts = requests.get(parts_url).json()

        for part_edge in parts['edges']:
            label = part_edge['start']['label']
            if label in distractor_list or needle in label.lower():
                continue
            distractor_list.append(label)

    return distractor_list

def get_choices(answer, context):
    """Build the distractor list for ``answer`` from its ``context``.

    Parameters
    ----------
    answer : str
        The correct answer text.
    context : str
        The passage the answer was taken from.

    Returns
    -------
    dict
        ``{answer.lower(): [distractor, ...]}`` when the lowercased answer
        is one of the keyphrases extracted from ``context``; otherwise an
        empty dict. Distractors come from WordNet, then ConceptNet, and
        finally fall back to the other keyphrases of the context.
    """
    key_distractor_list = {}

    keywords = get_nouns_multipartite(context)
    keyword = answer.lower()

    # Other keyphrases of the passage: the last-resort distractors.
    wrong_answers = [candidate for candidate in keywords
                     if candidate.lower() != keyword]

    sentences = tokenize_sentences(context)
    keyword_sentence_mapping = get_sentences_for_keyword(keywords, sentences)

    if keyword not in keyword_sentence_mapping:
        return key_distractor_list

    # A keyword can have no sentence at all (every keyword gets an entry,
    # but short sentences are dropped by tokenize_sentences). Indexing [0]
    # unconditionally raised IndexError; skip disambiguation instead.
    matching_sentences = keyword_sentence_mapping[keyword]
    wordsense = None
    if matching_sentences:
        wordsense = get_wordsense(matching_sentences[0], keyword)

    distractors = []
    if wordsense:
        distractors = get_distractors_wordnet(wordsense, keyword)
    if not distractors:
        distractors = get_distractors_conceptnet(keyword)

    key_distractor_list[keyword] = distractors if distractors else wrong_answers
    return key_distractor_list

# Sample question line in jsonl format:
#{"id": "90b30172e645ff91f7171a048582eb8b",
# "question": {"question_concept": "townhouse", 
# "choices": [{"label": "A", "text": "suburban development"}, {"label": "B", "text": "apartment building"}, {"label": "C", "text": "bus stop"}, {"label": "D", "text": "michigan"}, {"label": "E", "text": "suburbs"}], 
#"stem": "The townhouse was a hard sell for the realtor, it was right next to a high rise what?"}, 
#"statements": [{"label": true, "statement": "The townhouse was a hard sell for the realtor, it was right next to a high rise suburban development."}, 
#{"label": false, "statement": "The townhouse was a hard sell for the realtor, it was right next to a high rise apartment building."}, 
#{"label": false, "statement": "The townhouse was a hard sell for the realtor, it was right next to a high rise bus stop."}, 
#{"label": false, "statement": "The townhouse was a hard sell for the realtor, it was right next to a high rise michigan."}, 
#{"label": false, "statement": "The townhouse was a hard sell for the realtor, it was right next to a high rise suburbs."}]}
def create_question_line(right_answer, key_distractor_list, ex=None):
    """Serialise one multiple-choice question as a JSONL line.

    Parameters
    ----------
    right_answer : str
        The correct answer; its lowercase form is the key into
        ``key_distractor_list`` and is always one of the choices.
    key_distractor_list : dict
        Maps the lowercased answer to its list of distractors.
    ex : object with ``id``, ``title`` and ``question`` attributes, optional
        The SQuAD example the question belongs to. When omitted, the
        module-level variable ``ex`` is used, which is what this function
        relied on before the parameter existed.

    Returns
    -------
    str
        A JSON object (no trailing newline) with the keys ``id``,
        ``question`` (``question_concept``, ``choices``, ``stem``) and
        ``statements``. At most five choices are emitted, in random order,
        labelled ``a``..``e``.

    Raises
    ------
    NameError
        If ``ex`` is neither passed nor defined at module level.
    """
    if ex is None:
        ex = globals().get("ex")
        if ex is None:
            raise NameError(
                "create_question_line needs a SQuAD example: pass `ex` or "
                "define a module-level `ex`")

    answer_key = right_answer.lower()
    top5choices = ([answer_key] + key_distractor_list[answer_key])[:5]
    random.shuffle(top5choices)
    optionchoices = ['a', 'b', 'c', 'd', 'e']

    question_mod = ex.question.replace("\"", "")
    # Drop the trailing "?" (or other non-alphanumeric character) to form
    # the statement stem, but never chop a letter or digit off a question
    # that has no closing punctuation.
    if question_mod and not question_mod[-1].isalnum():
        statement_stem = question_mod[:-1]
    else:
        statement_stem = question_mod

    record = {
        "id": ex.id,
        "question": {
            "question_concept": ex.title,
            "choices": [
                {"label": label, "text": choice}
                for label, choice in zip(optionchoices, top5choices)
            ],
            "stem": question_mod,
        },
        "statements": [
            {
                "label": choice == answer_key,
                "statement": statement_stem + " " + choice + ".",
            }
            for choice in top5choices
        ],
    }
    # json.dumps escapes quotes/backslashes that the hand-built string left
    # raw (producing invalid JSON). ensure_ascii=False keeps non-ASCII text
    # verbatim, matching the previous output for already-valid lines.
    return json.dumps(record, ensure_ascii=False)

## Generate JSONL Statement File From SQuAD Data

In [None]:
# Directory the generated statement files are written to (absolute, machine-specific path).
base_dir = "/home/scpdxcs/"

def generate_statement_file(squad_data, statement_file):
    """Write one multiple-choice JSONL line per usable SQuAD example.

    Parameters
    ----------
    squad_data : sequence of SquadExample
        Examples to convert; only the first answer of each is used.
    statement_file : str
        Path of the JSONL file to create (overwritten if it exists).

    For every example, hyphens are replaced by spaces and punctuation is
    removed from the answer and context before choices are generated.
    Examples for which no choices can be produced, or whose processing
    raises ``ValueError``, are skipped. Progress is printed every 100
    records.
    """
    # create_question_line() reads the current example from the module-level
    # name `ex`; with a plain local loop variable it would see no `ex` (or a
    # stale one), so the loop variable is deliberately made global.
    global ex

    squad_len = len(squad_data)
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # `with` guarantees the file is closed even if an example raises.
    with open(statement_file, 'w') as statement_write_file:
        for current_rec, ex in enumerate(squad_data, start=1):
            right_answer = ex.answers[0].replace("-", " ").translate(strip_punctuation)
            context = ex.context.replace("-", " ").translate(strip_punctuation)
            try:
                key_distractor_list = get_choices(right_answer, context)
            except ValueError:
                print("error processing line " + str(current_rec))
                # Reset so the previous example's distractors are not
                # reused for this one.
                key_distractor_list = {}

            if key_distractor_list:
                ex_line = create_question_line(right_answer, key_distractor_list)
                statement_write_file.write(ex_line + "\n")

            if current_rec % 100 == 0:
                print("Percent done: " + str((current_rec / squad_len) * 100) + "% (" + str(current_rec) + "/" + str(squad_len) + ")")

# Build the statement file for the dev split; the train-split call below is left disabled.
generate_statement_file(squad_dev, base_dir + "dev.statement.jsonl")
# generate_statement_file(squad_train, base_dir + "train.statement.jsonl")

# Process JSONL Statements for use in QA-GNN

In [None]:
!pip install torch==1.8.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install transformers==3.4.0
!pip install nltk spacy==2.1.6
!python -m spacy download en

# for torch-geometric
!pip install torch-scatter==2.0.7 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install torch-sparse==0.6.9 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install torch-geometric==1.7.0 -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html

# file utilities
!pip install wget

In [None]:
import os.path
import sys

# Name of the QA-GNN checkout directory; joined onto base_dir below.
qa_gnn_repo_path = "qagnn/"
base_dir = "/home/scpdxcs/"

# NOTE(review): this existence check is relative to the current working
# directory, while sys.path below uses base_dir + qa_gnn_repo_path —
# presumably the notebook is run from base_dir; confirm.
if not os.path.exists(qa_gnn_repo_path):
    !git clone https://github.com/michiyasunaga/qagnn.git
        
# from https://stackoverflow.com/questions/4383571/importing-files-from-different-folder
sys.path.append(base_dir + qa_gnn_repo_path)

In [None]:
os.chdir(base_dir + qa_gnn_repo_path)
!chmod 755 ./download_preprocessed_data.sh
!./download_preprocessed_data.sh

### Update preprocess.py to Process SQuAD data

**Add to input_paths**:<br/>
     'squad': {<br/>
         'test': './data/squad/test_squad_does_not_exist.jsonl',<br/>
     },<br/>
<br/>
**Add to output_paths**:<br/>
     'squad': {<br/>
         'statement': {<br/>
             'test': './data/squad/statement/test.statement.jsonl',<br/>
         },<br/>
         'grounded': {<br/>
             'test': './data/squad/grounded/test.grounded.jsonl',<br/>
         },<br/>
         'graph': {<br/>
             'adj-test': './data/squad/graph/test.graph.adj.pk',<br/>
         },<br/>
     },<br/>
<br/>
**In main()**:<br/>
**Add squad to choices for argument "--run"**<br/>
     parser.add_argument('--run', default=['common'], choices=['common', 'csqa', 'hswag', 'anli', 'exp', 'scitail', 'phys', 'socialiqa', 'obqa', 'obqa-fact', 'make_word_vocab', 'squad'], nargs='+')<br/>
<br/>
**Add to routines**:<br/>
         'squad': [<br/>
             {'func': ground, 'args': (output_paths['squad']['statement']['test'], output_paths['cpnet']['vocab'],<br/>
                                       output_paths['cpnet']['patterns'], output_paths['squad']['grounded']['test'], args.nprocs)},<br/>
             {'func': generate_adj_data_from_grounded_concepts__use_LM, 'args': (output_paths['squad']['grounded']['test'], output_paths['cpnet']['pruned-graph'], output_paths['cpnet']['vocab'], output_paths['squad']['graph']['adj-test'], args.nprocs)},<br/>
         ],<br/>

In [None]:
!cp utils/preprocess-squad.py qagnn/preprocess.py

### Create Data Directories for SQuAD in QA-GNN and Run preprocess.py on Statement Data

In [None]:
os.chdir(base_dir)
!mkdir -p qagnn/data/squad/statement
!mkdir -p qagnn/data/squad/grounded
!mkdir -p qagnn/data/squad/graph
!cp dev.statement.jsonl qagnn/data/squad/statement/test.statement.jsonl

In [None]:
os.chdir(base_dir + qa_gnn_repo_path)
!python3 preprocess.py --run squad

### Remove Records That preprocess.py Can't Process

When the preprocess script builds the grounded dataset, it does not generate lines for every possible answer of some statements. The code block below identifies the IDs of the SQuAD examples that did not produce all five grounded answers and removes them from the statement set.

In [None]:
import json
import csv

os.chdir(base_dir)

grounded_file = base_dir + qa_gnn_repo_path + 'data/squad/grounded/test.grounded.jsonl'
statement_file = base_dir + qa_gnn_repo_path + 'data/squad/statement/test.statement.jsonl'

# Load both JSONL files; `with` closes each handle as soon as it is read
# (the bare open() inside a comprehension never closed them).
with open(grounded_file, 'r') as grounded_handle:
    grounded_data = [json.loads(line) for line in grounded_handle]
with open(statement_file, 'r') as statement_handle:
    statement_data = [json.loads(line) for line in statement_handle]

# Number of grounded lines attributed to each statement id.
grounded_counts = {statement['id']: 0 for statement in statement_data}

# Walk both lists in lockstep: consecutive grounded lines are credited to
# the current statement for as long as their sentence occurs in one of its
# answer statements; the first mismatch moves on to the next statement
# without consuming the grounded line.
j = 0
for statement in statement_data:
    statement_list = statement['statements']
    while j < len(grounded_data):
        grounded_answer = grounded_data[j]['sent']
        if not any(grounded_answer in candidate["statement"]
                   for candidate in statement_list):
            break
        grounded_counts[statement['id']] += 1
        j += 1

# Only statements for which all five answers were grounded are kept.
complete_statements = [g_id for g_id, count in grounded_counts.items()
                       if count == 5]
# Set copy for O(1) membership tests while filtering.
complete_ids = set(complete_statements)

# writing to file
dev_statement_updated_file = base_dir + "dev-updated.statement.jsonl"
with open(dev_statement_updated_file, 'w') as statement_write_file:
    for statement in statement_data:
        if statement["id"] in complete_ids:
            statement_write_file.write(json.dumps(statement) + "\n")

### Stage New Statements File and Re-Run preprocess.py

In [None]:
os.chdir(base_dir)
!cp dev-updated.statement.jsonl qagnn/data/squad/statement/test.statement.jsonl
!rm qagnn/data/squad/grounded/test.grounded.jsonl

In [None]:
os.chdir(base_dir + qa_gnn_repo_path)
!python3 preprocess.py --run squad