# Query generation

In [None]:
%pip install chunkipy 
%pip install python-terrier
%pip install pyterrier_pisa
%pip install git+https://github.com/terrierteam/pyterrier_dr.git
%pip install git+https://github.com/terrierteam/pyterrier_t5.git

In [None]:
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer

In [None]:
def append_queries(input_data):
    """Split each row's generated queries into a list, in place.

    Every row's ``"querygen"`` value is expected to be one string holding
    newline-separated queries; it is replaced by the list of individual
    queries. Rows whose ``"querygen"`` is not a string (for example because
    an earlier call already split it) are left untouched, so calling this
    twice on the same rows is safe.

    Args:
        input_data: iterable of dict-like rows, each with a ``"querygen"`` key.

    Returns:
        None. The rows of ``input_data`` are modified in place.
    """
    for row in input_data:
        generated = row["querygen"]
        # Only raw strings are split: a list has no ``.split`` and a second
        # call would otherwise raise AttributeError.
        if isinstance(generated, str):
            row["querygen"] = generated.split("\n")


### No ranking

In [None]:
# Plain Doc2Query (num_samples=5) with no scoring stage.
doc2query = Doc2Query(num_samples=5)

# Three small example documents used to inspect the generated queries.
sample_docs = [
    {
        'docno': '0',
        'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
    },
    {
        'docno': '100',
        'text': "Antonín Dvorák (1841–1904) Antonin Dvorak was a son of butcher, but he did not follow his father's trade. While assisting his father part-time, he studied music, and graduated from the Prague Organ School in 1859.",
    },
    {
        'docno': '1000',
        'text': 'QuickFacts Matanuska-Susitna Borough, Alaska; UNITED STATES QuickFacts provides statistics for all states and counties, and for cities and towns with a population of 5,000 or more.',
    },
]

queries = doc2query(sample_docs)

# Turn each newline-separated "querygen" string into a list before display.
append_queries(queries)
queries

### Ranking

In [None]:
# Doc2Query (num_samples=5) followed by scoring of the generated queries
# with ElectraScorer.
doc2query = Doc2Query(num_samples=5)
scorer = ElectraScorer()

# Inspection pipeline. A filtering stage can be chained after the scorer, e.g.
#   >> QueryFilter(append=False, t=3.21484375)  # 30% electra filter
pipeline = doc2query >> QueryScorer(scorer)

# Same example texts as the unscored demo; note the first row carries an
# extra 'p' field.
sample_docs = [
    {
        'doc_id': '0',
        'p': "e",
        'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
    },
    {
        'doc_id': '100',
        'text': "Antonín Dvorák (1841–1904) Antonin Dvorak was a son of butcher, but he did not follow his father's trade. While assisting his father part-time, he studied music, and graduated from the Prague Organ School in 1859.",
    },
    {
        'doc_id': '1000',
        'text': 'QuickFacts Matanuska-Susitna Borough, Alaska; UNITED STATES QuickFacts provides statistics for all states and counties, and for cities and towns with a population of 5,000 or more.',
    },
]

queries = pipeline(sample_docs)
append_queries(queries)
queries

## Process in batches

In [None]:
import json
import os

def list_all_files(directory):
    """Return the paths of all files below ``directory``, recursively, sorted.

    Args:
        directory: root directory to walk. A directory that does not exist
            yields an empty list, because ``os.walk`` ignores the error.

    Returns:
        A sorted list of paths, each built as ``os.path.join(root, filename)``.
    """
    file_list = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]
    # os.walk reports entries in arbitrary, filesystem-dependent order.
    # Sorting gives callers that slice the list into numbered batches the
    # same file-to-batch assignment on every run.
    file_list.sort()
    return file_list

def batch_processing(batch, pipeline):
    """Run ``pipeline`` on one batch of documents and post-process the rows.

    Post-processing consists of two steps:
    * when a row has a ``"querygen_score"`` entry, each score is converted
      to its ``str`` form;
    * ``append_queries`` is applied to the rows.

    Args:
        batch: the documents handed to ``pipeline`` as a single argument.
        pipeline: callable producing an iterable of dict-like rows.

    Returns:
        The object returned by ``pipeline``, with its rows modified in place.
    """
    generated = pipeline(batch)

    for entry in generated:
        if "querygen_score" not in entry:
            continue
        entry["querygen_score"] = list(map(str, entry["querygen_score"]))

    append_queries(generated)
    return generated

def process(input_data_path, output_data_path, pipeline, batch_size):
    """Generate queries for every chunk file under ``input_data_path``, in batches.

    Chunk files are read as JSON, grouped into batches of ``batch_size``,
    passed through ``batch_processing`` and written to
    ``<output_data_path>/batch_<n>.json`` (``n`` starts at 1). A batch whose
    output file already exists is skipped, so an interrupted run can be
    resumed by calling this function again with the same arguments.

    Args:
        input_data_path: directory searched recursively for chunk files. Each
            file must be JSON with the keys ``docId``, ``chunk_index``,
            ``document_category`` and ``chunk_text``.
        output_data_path: directory receiving the batch files; created when
            it does not exist yet.
        pipeline: callable forwarded to ``batch_processing``.
        batch_size: number of chunk files per batch. Batch numbers are derived
            from it, so it must not change between resumed runs.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Sorted so that batch N always covers the same files on every run; the
    # resume check below relies on that.
    # NOTE: batch files produced by an earlier run that used unsorted order may
    # not line up with this numbering - remove them before resuming.
    docs = sorted(list_all_files(input_data_path))

    # Ceiling division: a final, partially filled batch still counts.
    n_batches = (len(docs) + batch_size - 1) // batch_size

    print(f"Total number of docs: {len(docs)}")

    os.makedirs(output_data_path, exist_ok=True)

    # enumerate keeps the batch number tied to the slice position, including
    # for batches that are skipped.
    for batch_num, start in enumerate(range(0, len(docs), batch_size), start=1):
        batch_path = f"{output_data_path}/batch_{batch_num}.json"

        # Batch already written by a previous run.
        if os.path.isfile(batch_path):
            continue

        docs_batch = []
        for chunk_path in docs[start:start + batch_size]:
            with open(chunk_path, "r", encoding="utf-8") as chunk_file:
                chunk = json.load(chunk_file)
            docs_batch.append({
                "id": f"{chunk['docId']}__{chunk['chunk_index']}",
                "doc_id": chunk["docId"],
                "chunk_id": chunk["chunk_index"],
                "document_category": chunk["document_category"],
                "text": chunk["chunk_text"],
            })

        batch_queries = batch_processing(docs_batch, pipeline)

        # Write under a temporary name and rename afterwards, so an interrupted
        # write never leaves a partial batch file that a later run would skip.
        tmp_path = f"{batch_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as outfile:
            json.dump(batch_queries, outfile, indent=2)
        os.replace(tmp_path, batch_path)

        print(f"Batch processed : {batch_num} / {n_batches}")

    

In [None]:
# Run configuration
INPUT_DATA_PATH = "./chunks"
OUTPUT_DATA_PATH = "./queries"
# Batch output files are numbered by position, so a different batch size would
# no longer match the files already written.
BATCH_SIZE = 32  # DO NOT CHANGE AFTER FIRST RUN
N_QUERIES = 3

# Pipeline: query generation followed by scoring of the generated queries
doc2query = Doc2Query(num_samples=N_QUERIES)
scorer = ElectraScorer()
pipeline = doc2query >> QueryScorer(scorer)

process(
    input_data_path=INPUT_DATA_PATH,
    output_data_path=OUTPUT_DATA_PATH,
    pipeline=pipeline,
    batch_size=BATCH_SIZE,
)