In [1]:
#start from a .json file in the "data" folder
#first create a "processed" .json file that contains multiple "elementary documents"
#an "elementary document" has a unique id, formed from the name of the source .json file and an internal id
#in short, the input file is the .json file from the "data" folder and the output file is a .json file containing "elementary documents"
#we will split the documents into chunks of 250 words and of 500 words

#we then use a function that takes as input a .json file containing "elementary documents" and returns a JSON object containing the id of each "elementary document" and a list of (q, a) pairs

In [2]:
# Test parameters -- this is the only cell where user input is required.

# Source files to process (file names, resolved against the data folder).
input_file_names = ["10.json", "24.json"]

# How the texts are split into elementary documents.
split_by = "word"
split_length = 250

# Number of questions requested per elementary document.
num_questions = 3

# NOTE(review): the two settings below are not referenced by the cells
# visible in this notebook -- confirm whether they are still needed.
tokenizer_model = None
use_gpu = False

In [3]:
import os
import json
import time

from app.api.questiongeneration.service import QuestionGenerationModelB

from transformers import AutoTokenizer

from haystack.schema import Document
from haystack.nodes import PreProcessor

INFO - haystack.document_stores.base -  Numba not found, replacing njit() with no-op implementation. Enable it with 'pip install numba'.
2022-05-16 10:35:59.225010: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-16 10:35:59.225027: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/


In [4]:
# Tokenizer of the question-generation checkpoint.
# It is used later by convert_to_squad to normalise the context text.
QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
qg_tokenizer = AutoTokenizer.from_pretrained(
    QG_PRETRAINED,
    use_fast=False,
)

In [5]:
# Shared PreProcessor instance used to turn raw texts into elementary documents.
preprocessor = PreProcessor(
    # Splitting: unit and chunk size come from the parameters cell.
    split_by=split_by,
    split_length=split_length,
    split_respect_sentence_boundary=True,
    # Cleaning options.
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
)

In [6]:
def get_full_path(filename):
    """Return the relative path of `filename` inside the ``./data`` folder."""
    return "./data/{}".format(filename)

In [7]:
def read_file(path):
    """Load and return the parsed JSON content of the file at `path`.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read
    the same way on every platform, instead of relying on the locale's
    default encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [8]:
def file_to_doc(path, preprocessor):
    """Read a source JSON file and run its texts through `preprocessor`.

    Every entry of the file's "texts" list is wrapped in a ``Document`` and
    converted to a dict; the resulting list is handed to
    ``preprocessor.process`` and its result is returned unchanged.
    """
    texts = read_file(path)["texts"]

    doc_dicts = []
    for text in texts:
        doc_dicts.append(Document(content=text).to_dict())

    return preprocessor.process(doc_dicts)

In [9]:
def file_to_elementary_docs_json(filename, preprocessor):
    """Split one source file into elementary documents and return them as JSON.

    Args:
        filename: Name of the source file inside the data folder.
        preprocessor: Object whose ``process`` method splits the texts
            (passed through to ``file_to_doc``).

    Returns:
        A JSON string encoding an object that maps each elementary-document
        id, ``"<filename>_<n>"`` with ``n`` starting at 1, to the content of
        that document.
    """
    path = get_full_path(filename)

    # The id prefix is the `filename` argument, so the function does not
    # depend on any notebook-level variable being defined.
    json_data = {
        f"{filename}_{counter}": d.content
        for counter, d in enumerate(file_to_doc(path, preprocessor), start=1)
    }

    return json.dumps(json_data)

In [10]:
def file_list_to_elementary_docs_json(input_file_names, preprocessor):
    """Split several source files and merge their elementary documents.

    Returns a dict mapping ``"<filename>_<n>"`` (``n`` starts at 1 for every
    file) to the content of the corresponding elementary document. Files are
    processed in the order given.
    """
    merged = {}

    for name in input_file_names:
        chunks = file_to_doc(get_full_path(name), preprocessor)

        # Collect this file's documents first, then fold them into the result.
        per_file = {}
        for index, chunk in enumerate(chunks, start=1):
            per_file[name + "_" + str(index)] = chunk.content

        merged.update(per_file)

    return merged

In [11]:
class Payload:
    """Plain container for the arguments of one question-generation request.

    In this notebook an instance is built per elementary document and passed
    to ``QuestionGenerationModelB.predict``.
    """

    def __init__(self, text, num_questions, answer_style=None):
        """Store the request arguments as attributes, without validation."""
        # Text to generate questions from.
        self.text = text
        # Number of questions requested for this text.
        self.num_questions = num_questions
        # Optional answer style; the notebook passes "sentences".
        # NOTE(review): accepted values are defined by the model service -- confirm there.
        self.answer_style = answer_style

In [12]:
def generate_questions(input_file_names, preprocessor):
    """Generate question/answer pairs for every elementary document.

    The given files are split into elementary documents, and each document
    is sent to the question-generation model. The number of questions
    requested per document is read from the module-level ``num_questions``
    defined in the parameters cell.

    Args:
        input_file_names: Names of the source files inside the data folder.
        preprocessor: Object used to split the texts into elementary documents.

    Returns:
        A dict mapping each elementary-document id to the value returned by
        the model's ``predict`` for that document.

    Side effects:
        Prints the total duration in seconds and in whole minutes.
    """
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments during this long-running loop.
    start = time.perf_counter()

    json_data = file_list_to_elementary_docs_json(input_file_names, preprocessor)

    # Build the model once and reuse it for every document instead of
    # re-creating it on each iteration; skipped when there is nothing to do.
    # NOTE(review): assumes predict() keeps no per-document state -- confirm.
    qg = QuestionGenerationModelB() if json_data else None

    results_data = {}

    for key, doc in json_data.items():
        payload = Payload(doc, num_questions=num_questions, answer_style="sentences")
        results_data[key] = qg.predict(payload)

    duration_seconds = time.perf_counter() - start
    duration_minutes = int(duration_seconds / 60)

    print(f"Done the questions generation in {duration_seconds} seconds, which is {duration_minutes} minutes.")

    return results_data

In [13]:
# Run question generation for the configured files.
# Long-running: the recorded run below reports about 757 seconds.
res = generate_questions(
    input_file_names=input_file_names,
    preprocessor=preprocessor,
)

100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 84.49docs/s]
100%|████████████████████████████████████████| 10/10 [00:00<00:00, 957.84docs/s]


Generating questions...

Evaluating QA pairs...



INFO - backoff -  Backing off send_request(...) for 0.3s (requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='tm.hs.deepset.ai', port=443): Max retries exceeded with url: /batch/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fd3611fc8b0>, 'Connection to tm.hs.deepset.ai timed out. (connect timeout=15)')))


Generating questions...

Evaluating QA pairs...



INFO - backoff -  Backing off send_request(...) for 1.6s (requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='tm.hs.deepset.ai', port=443): Max retries exceeded with url: /batch/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fd359424130>, 'Connection to tm.hs.deepset.ai timed out. (connect timeout=15)')))


Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...


Was only able to generate 2 questions. For more questions, please input a longer text.


INFO - backoff -  Backing off send_request(...) for 0.5s (requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='tm.hs.deepset.ai', port=443): Max retries exceeded with url: /batch/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fd35be3e3d0>, 'Connection to tm.hs.deepset.ai timed out. (connect timeout=15)')))


Generating questions...

Evaluating QA pairs...



ERROR - backoff -  Giving up send_request(...) after 4 tries (requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='tm.hs.deepset.ai', port=443): Max retries exceeded with url: /batch/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fd3600357c0>, 'Connection to tm.hs.deepset.ai timed out. (connect timeout=15)')))
ERROR - posthog -  error uploading: HTTPSConnectionPool(host='tm.hs.deepset.ai', port=443): Max retries exceeded with url: /batch/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fd3600357c0>, 'Connection to tm.hs.deepset.ai timed out. (connect timeout=15)'))


Generating questions...

Evaluating QA pairs...


Was only able to generate 0 questions. For more questions, please input a longer text.
Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...





Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...


Was only able to generate 1 questions. For more questions, please input a longer text.
Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Generating questions...

Evaluating QA pairs...

Done the questions generation in 757.1004436016083 seconds, which is 12 minutes.


In [14]:
# Save the results into a JSON file.
# The file name is qa_pairs.json and it is saved under "data/qa/".
# The "qa" folder is created if it does not exist yet.

output_path = "data/qa"

# exist_ok=True creates the folder only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(output_path, exist_ok=True)

output_full_path = output_path + "/qa_pairs.json"

with open(output_full_path, "w") as f:
    # Stream the JSON straight to the file instead of building the full string first.
    json.dump(res, f)

In [15]:
# Display the generated results, keyed by elementary-document id.
res

{'10.json_1': {'text': 'BISE Specific Privacy Statement\nIntroduction\nBISE, The Biodiversity Information System for Europe is a joint initiative of the European Commission- DG Environment, and the European Environment Agency (EEA), to offer a single entry point for information and data to support the implementation of the EU biodiversity policy in support of the implementation of its actions and track progress towards both the EU and global biodiversity targets. An introduction about BISE is provided here: https://biodiversity.europa.eu/info\nPersonal data collected by the European BISE portal are processed in accordance with Regulation (EU) 2018/1725 of the European Parliament and of the Council of 23 October 2018 on the protection of natural persons with regard to the processing of personal data by the Union institutions, bodies, offices and agencies and on the free movement of such data. Collection and processing of data are under the responsibility of the European Environment Agen

In [16]:
def convert_to_squad(input_path, output_path):
    """Convert a QA-pairs JSON file into a SQuAD-style JSON file.

    The input file maps a document key to an object holding a "text" string
    and a "questions" list whose entries have "question" and "answer"
    strings. Every key becomes one entry of the output "data" list, titled
    with the key and containing a single paragraph.

    The paragraph context is the document text encoded and decoded again
    with the module-level ``qg_tokenizer``, with "</s>" markers removed.
    Answer offsets are looked up in that transformed context; a pair whose
    answer is not found gets ``answer_start`` -1 and ``is_impossible`` True.
    NOTE(review): the round-trip presumably makes the context match the
    model-generated answer text -- confirm.

    The whole structure is built in memory before the output file is opened,
    so a failure during conversion does not leave a truncated output file.

    Args:
        input_path: Path of the QA-pairs JSON file to read.
        output_path: Path of the SQuAD-style JSON file to write (overwritten).

    Raises:
        OSError: If a file cannot be opened.
        KeyError: If an entry lacks "text", "questions", "question" or "answer".
    """
    with open(input_path, encoding="utf-8") as f:
        dataset = json.load(f)

    json_squad = {"data": []}

    for key, curr_json in dataset.items():
        token_ids = qg_tokenizer(curr_json["text"]).input_ids
        context = qg_tokenizer.decode(token_ids).replace("</s>", "")

        qas = []
        # Question ids are "<document key>_<n>", with n starting at 1 per document.
        for question_counter, qa_pair in enumerate(curr_json["questions"], start=1):
            answer = qa_pair["answer"].replace("</s>", "")
            # str.find returns -1 when the answer does not occur in the context.
            answer_start = context.find(answer)

            qas.append(
                {
                    "answers": [{"text": answer, "answer_start": answer_start}],
                    "id": key + "_" + str(question_counter),
                    "question": qa_pair["question"].replace("</s>", ""),
                    "is_impossible": answer_start == -1,
                }
            )

        json_squad["data"].append(
            {
                "title": key,
                "paragraphs": [{"context": context, "qas": qas}],
            }
        )

    # Open the output only now that the conversion has fully succeeded.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_squad, f, indent=4)

In [17]:
# Convert the saved QA pairs into a second file that holds them in SQuAD format.
input_full_path = "data/qa/qa_pairs.json"
converted_output_full_path = "data/qa/qa_pairs_squad.json"

convert_to_squad(
    input_path=input_full_path,
    output_path=converted_output_full_path,
)