# Query generation

In [None]:
%pip install chunkipy 
%pip install python-terrier
%pip install pyterrier_pisa
%pip install git+https://github.com/terrierteam/pyterrier_dr.git
%pip install git+https://github.com/terrierteam/pyterrier_t5.git

In [4]:
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# create pandas df
def append_queries(input_data):
    for row in input_data:
        row["querygen"] = row["querygen"].split("\n")




### No ranking

In [21]:
doc2query = Doc2Query(num_samples=5)

queries = doc2query([
  {'docno': '0', 'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'},
  {'docno': '100', 'text': "Antonín Dvorák (1841–1904) Antonin Dvorak was a son of butcher, but he did not follow his father's trade. While assisting his father part-time, he studied music, and graduated from the Prague Organ School in 1859."},
  {'docno': '1000', 'text': 'QuickFacts Matanuska-Susitna Borough, Alaska; UNITED STATES QuickFacts provides statistics for all states and counties, and for cities and towns with a population of 5,000 or more.'},
])

append_queries(queries)
queries

  warn('consider setting fast_tokenizer=True; it speeds up inference considerably')


[{'docno': '0',
  'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
  'querygen': ['what was the success of the manhattan project',
   'why was the manhattan project important',
   'what was the success of the manhattan project?',
   'why was the manhattan project so successful',
   'why was the manhattan project so successful']},
 {'docno': '100',
  'text': "Antonín Dvorák (1841–1904) Antonin Dvorak was a son of butcher, but he did not follow his father's trade. While assisting his father part-time, he studied music, and graduated from the Prague Organ School in 1859.",
  'querygen': ['what nationality was dvorak',
   'where is the organ school in prague?',
   'who was antony dvorak',
   'who was ann dv

### Ranking

In [21]:
doc2query = Doc2Query(num_samples=5)
scorer = ElectraScorer()

# inspection
pipeline = doc2query >> QueryScorer(scorer) #>> QueryFilter(append=False, t=3.21484375) # 30% electra filter

queries = pipeline([
  {'doc_id': '0', 'p':"e", 'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'},
  {'doc_id': '100', 'text': "Antonín Dvorák (1841–1904) Antonin Dvorak was a son of butcher, but he did not follow his father's trade. While assisting his father part-time, he studied music, and graduated from the Prague Organ School in 1859."},
  {'doc_id': '1000', 'text': 'QuickFacts Matanuska-Susitna Borough, Alaska; UNITED STATES QuickFacts provides statistics for all states and counties, and for cities and towns with a population of 5,000 or more.'},
])
append_queries(queries)
queries

  warn('consider setting fast_tokenizer=True; it speeds up inference considerably')
ELECTRA scoring: 100%|██████████| 15/15 [00:00<00:00, 134432.82record/s]


[{'doc_id': '0',
  'p': 'e',
  'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
  'querygen': ['why was communication important in the manhattan project',
   'why is the success of the manhattan project so important',
   'what was an important accomplishment in the manhattan project',
   'what is the most significant achievement of the manhattan project?',
   'why was the manhattan project so successful'],
  'querygen_score': array([2.4463906, 3.1714365, 2.2407935, 1.1016814, 2.8727548],
        dtype=float32)},
 {'doc_id': '100',
  'p': nan,
  'text': "Antonín Dvorák (1841–1904) Antonin Dvorak was a son of butcher, but he did not follow his father's trade. While assisting his father part-time, he studi

In [24]:
[num for num in queries[0]["querygen_score"]]

[2.4463906, 3.1714365, 2.2407935, 1.1016814, 2.8727548]

## Process to batches

In [None]:
import json
import os

def list_all_files(directory):
    file_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_list.append(os.path.join(root, file))
    return file_list

def batch_processing(batch, pipeline):

    # generate queries
    queries = pipeline(batch)

    # modify data structure
    for query in queries:
        if "querygen_score" in query:
            query["querygen_score"] = [str(num) for num in query["querygen_score"]]
    append_queries(queries)       

    return queries

def process(input_data_path, output_data_path, pipeline, batch_size):

    # list all docs
    docs = list_all_files(input_data_path)
    
    quotient, remainder = divmod(len(docs), batch_size)
    n_batches = quotient + min(1, remainder)

    print(f"Total number of docs: {len(docs)}")

    for i in range(0, len(docs), batch_size):

        # check if batch already processed
        if f"{output_data_path}/batch_{i+1}.json" in list_all_files(output_data_path):
            continue

        ids_batch = range(i, i+batch_size)

        files_batch = []

        for file in docs[i:i+batch_size]:
            # Open and read the JSON file
            with open(file, 'r') as file_n:
                files_batch.append(json.load(file_n))

        docs_batch = []

        for j, file in enumerate(files_batch):
            doc = {
                "doc_id": f"{ids_batch[j+1]}__{file['chunk_index']}",
                "document_category": file["document_category"],
                "title": file["document_name"],
                "text": file["chunk_text"]
            }
            docs_batch.append(doc)
        
        # process batch
        batch_queries = batch_processing(docs_batch, pipeline)

        # save batch
        with open(f"{output_data_path}/batch_{i+1}.json", "w") as outfile:
            json.dump(batch_queries, outfile, indent=2)
        
        print(f"Batch processed : {i+1} / {n_batches}")

    

In [41]:
INPUT_DATA_PATH = "./chunks"
OUTPUT_DATA_PATH = "./queries"
BATCH_SIZE = 8
N_QUERIES = 3

# pipeline
doc2query = Doc2Query(num_samples=N_QUERIES)
scorer = ElectraScorer()

# inspection
pipeline = doc2query >> QueryScorer(scorer)

process(INPUT_DATA_PATH, OUTPUT_DATA_PATH, pipeline, BATCH_SIZE)

  warn('consider setting fast_tokenizer=True; it speeds up inference considerably')


Total number of docs: 3652


ELECTRA scoring: 100%|██████████| 24/24 [00:11<00:00,  2.09record/s]


Batch processed : 1 / 457


ELECTRA scoring: 100%|██████████| 24/24 [00:14<00:00,  1.67record/s]


Batch processed : 9 / 457


ELECTRA scoring: 100%|██████████| 24/24 [00:12<00:00,  1.98record/s]


Batch processed : 17 / 457


ELECTRA scoring:  62%|██████▎   | 15/24 [00:00<00:00, 116.25record/s]


KeyboardInterrupt: 