In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from itertools import combinations
import numpy as np
import json
from tqdm import tqdm

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# Loading Dataset: SciFact

In [None]:
from datasets import load_dataset

In [None]:
from tqdm import tqdm

In [None]:
dataset = load_dataset("BeIR/scifact", "corpus")

In [None]:
dataset

In [None]:
dataset_q = load_dataset("BeIR/scifact", "queries")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

scifact.py:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

queries/queries/0000.parquet:   0%|          | 0.00/67.5k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/1109 [00:00<?, ? examples/s]

In [None]:
dataset_q

DatasetDict({
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 1109
    })
})

In [None]:
dataset_qrel = load_dataset("BeIR/scifact-qrels", split="test")

train.tsv:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/919 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/339 [00:00<?, ? examples/s]

In [None]:
dataset_qrel

Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 339
})

In [None]:
dataset_qrel[0]

{'query-id': 1, 'corpus-id': 31715818, 'score': 1}

In [None]:
from collections import defaultdict

# Relevance judgments as a nested mapping {query_id: {doc_id: score}},
# with both ids stringified.
_judgments_by_query = defaultdict(dict)
for qrel in dataset_qrel:
    query_key, doc_key = str(qrel["query-id"]), str(qrel["corpus-id"])
    _judgments_by_query[query_key][doc_key] = qrel["score"]

# Freeze into a plain dict so looking up an unknown query id raises KeyError
# instead of silently creating an empty entry.
qrels = dict(_judgments_by_query)

In [None]:
qrels

{'1': {'31715818': 1},
 '3': {'14717500': 1},
 '5': {'13734012': 1},
 '13': {'1606628': 1},
 '36': {'5152028': 1, '11705328': 1},
 '42': {'18174210': 1},
 '48': {'13734012': 1},
 '49': {'5953485': 1},
 '50': {'12580014': 1},
 '51': {'45638119': 1},
 '53': {'45638119': 1},
 '54': {'49556906': 1},
 '56': {'4709641': 1},
 '57': {'4709641': 1},
 '70': {'5956380': 1, '4414547': 1},
 '72': {'6076903': 1},
 '75': {'4387784': 1},
 '94': {'1215116': 1},
 '99': {'18810195': 1},
 '100': {'4381486': 1},
 '113': {'6157837': 1},
 '115': {'33872649': 1},
 '118': {'6372244': 1},
 '124': {'4883040': 1},
 '127': {'21598000': 1},
 '128': {'8290953': 1},
 '129': {'27768226': 1},
 '130': {'27768226': 1},
 '132': {'7975937': 1},
 '133': {'38485364': 1,
  '6969753': 1,
  '17934082': 1,
  '16280642': 1,
  '12640810': 1},
 '137': {'26016929': 1},
 '141': {'6955746': 1, '14437255': 1},
 '142': {'10582939': 1},
 '143': {'10582939': 1},
 '146': {'10582939': 1},
 '148': {'1084345': 1},
 '163': {'18872233': 1},
 '1

In [None]:
# len(qrels) counts judged queries; the (query, document) pairs live one level deeper.
print(f"Number of judged queries: {len(qrels)}")
print(f"Number of query-document pairs: {sum(len(docs) for docs in qrels.values())}")

Number of query-document pairs: 300


In [None]:
queries = {query["_id"]: query["text"] for query in dataset_q["queries"]}

In [None]:
print(f"Number of queries: {len(queries)}")


Number of queries: 1109


In [None]:
corpus = {doc["_id"]: doc['text'] for doc in dataset["corpus"]}

In [None]:
print(f"Corpus size: {len(corpus)}")
print(f"Number of queries: {len(queries)}")
# len(qrels) is the number of judged queries, not the number of pairs.
print(f"Number of judged queries: {len(qrels)}")
print(f"Number of query-document pairs: {sum(len(docs) for docs in qrels.values())}")

Corpus size: 3633
Number of queries: 3237
Number of query-document pairs: 323


In [None]:
len(corpus)

3633

# Part 1

In [None]:
# Load the seq2seq generative model (FLAN-T5 by default)
def load_generative_model(model_name="google/flan-t5-large"):
    """Load a checkpoint and its tokenizer by name.

    The model is loaded through ``AutoModelForSeq2SeqLM``, so ``model_name``
    has to name a checkpoint that class can load.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )

In [None]:
# Segment document into chunks
def segment_document(document, chunk_size, overlap):
    """Split a document into chunks of whitespace-separated tokens.

    Consecutive chunks share ``overlap`` tokens. The last chunk may be shorter
    than ``chunk_size``. Once a window reaches the end of the document no
    further chunk is produced, so the result never ends with a chunk that is
    entirely contained in the previous one.

    Parameters:
    - document: Text to split (tokens are obtained with ``str.split()``).
    - chunk_size: Maximum number of tokens per chunk; must be positive.
    - overlap: Number of tokens shared by neighbouring chunks; must satisfy
      ``0 <= overlap < chunk_size``.

    Returns:
    - chunks: List of chunk strings (empty for an empty/whitespace-only document).

    Raises:
    - ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # A non-positive stride would either crash range() or silently yield nothing.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = document.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        if start + chunk_size >= len(tokens):
            # This window already covers the tail; later windows would only repeat it.
            break
    return chunks

In [None]:
def generate_pseudo_queries(chunks, tokenizer, model, top_k_ques=5):
    """
    Sample pseudo-queries (questions) for each text chunk with a generative model.

    Parameters:
    - chunks: List of chunk strings; one prompt is built per chunk.
    - tokenizer: Tokenizer matching `model` (called on the prompts and used for decoding).
    - model: Generative model exposing `.to(device)` and `.generate(...)`.
    - top_k_ques: Number of sequences sampled per chunk.

    Returns:
    - queries: List with one entry per chunk, each entry being the list of
      `top_k_ques` decoded strings for that chunk. Empty when `chunks` is empty.
    """
    if not chunks:
        # Nothing to prompt with; tokenizing an empty batch would fail.
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move the model to the same device (GPU or CPU)

    # Prepare the prompts
    prompts = [
        f"Generate a detailed and nuanced question focusing on the most significant aspects of the passage:\n\n{chunk}\n\n"
        for chunk in chunks
    ]

    # Tokenize the input (prompts longer than 512 tokens are truncated)
    inputs = tokenizer(prompts, truncation=True, padding=True, return_tensors="pt", max_length=512)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate pseudo queries without gradients
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=20,
            num_return_sequences=top_k_ques,  # Number of queries to generate per chunk
            do_sample=True,  # Use sampling to generate diverse queries
            top_k=5,  # Sample only from the 5 most probable next tokens
        )

    # Decode the generated sequences into a flat list of strings
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Regroup the flat list into consecutive runs of `top_k_ques`, one run per chunk
    return [
        decoded[i:i + top_k_ques]
        for i in range(0, len(decoded), top_k_ques)
    ]


In [None]:

# Diversity filtering using semantic similarity
def filter_diverse_queries(queries, similarity_model, threshold=0.8):
    """
    Greedily drop queries that are near-duplicates of an already kept query.

    Queries are visited in order; a query is kept only if its cosine similarity
    to every previously *kept* query is at most `threshold`.

    Parameters:
    - queries: List of query strings.
    - similarity_model: Model whose `encode(queries, convert_to_tensor=True)`
      returns one embedding row per query.
    - threshold: Similarity above which two queries count as redundant.

    Returns:
    - filtered_queries: The kept queries, in their original order.
    """
    if not queries:
        return []

    embeddings = similarity_model.encode(queries, convert_to_tensor=True)

    # Pairwise cosine similarities, computed once: L2-normalise rows, then dot products.
    normalized = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    similarity = normalized @ normalized.T

    # Track indices of kept queries so comparisons use the embeddings of the
    # queries actually retained, not simply the first few inputs.
    kept_indices = []
    for i in range(len(queries)):
        is_redundant = any(similarity[i, j].item() > threshold for j in kept_indices)
        if not is_redundant:
            kept_indices.append(i)
    return [queries[i] for i in kept_indices]


In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Return the best cosine similarity between a query and one document's pseudo-queries.

    Parameters:
    - user_pseudo_query: Query text (string) to embed with `similarity_model`.
    - document_pseudo_queries: Either pre-computed pseudo-query embeddings for
      the document (tensor, one row per pseudo-query) or the pseudo-query
      strings themselves, which are then embedded here.
    - similarity_model: Preloaded Sentence Transformer model (must support
      `.to(device)` and `.encode(..., convert_to_tensor=True, device=...)`).

    Returns:
    - max_similarity: Maximum cosine similarity (float) between the query and
      any of the document's pseudo-queries.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    similarity_model_gpu = similarity_model.to(device)

    # Compute embedding for the query on the specified device
    query_embedding = similarity_model_gpu.encode(
        user_pseudo_query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Accept raw pseudo-query text as well as pre-computed embeddings
    if not torch.is_tensor(document_pseudo_queries):
        if isinstance(document_pseudo_queries, str):
            document_pseudo_queries = [document_pseudo_queries]
        document_pseudo_queries = similarity_model_gpu.encode(
            list(document_pseudo_queries),
            convert_to_tensor=True,
            device=device
        )

    # Ensure pseudo-query embeddings are on the same device as the query embedding
    document_pseudo_queries = document_pseudo_queries.to(device)
    if document_pseudo_queries.dim() == 1:
        document_pseudo_queries = document_pseudo_queries.unsqueeze(0)

    # Cosine similarity matrix: L2-normalise both sides, then take dot products
    query_unit = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
    document_unit = torch.nn.functional.normalize(document_pseudo_queries, p=2, dim=1)
    similarity_scores = query_unit @ document_unit.T

    # Find the maximum similarity score
    max_similarity = torch.max(similarity_scores).item()
    return max_similarity



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents based on the similarity of their pseudo-queries to the query.

    The query is embedded a single time and then compared against every
    document's pseudo-query embeddings; a document's score is the highest
    cosine similarity among its pseudo-queries.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document IDs
      and values are tensors of pseudo-query embeddings (one row per pseudo-query).
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending scores.
    """
    if not document_pseudo_queries_embeddings:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode the query once up front: it is the same for every document.
    query_embedding = similarity_model.to(device).encode(
        query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_unit = torch.nn.functional.normalize(query_embedding, p=2, dim=1)

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        document_embeddings = pseudo_queries_embeddings.to(device)
        if document_embeddings.dim() == 1:
            document_embeddings = document_embeddings.unsqueeze(0)
        # Cosine similarity = dot product of L2-normalised vectors
        document_unit = torch.nn.functional.normalize(document_embeddings, p=2, dim=1)
        document_scores[doc_id] = torch.max(query_unit @ document_unit.T).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_scores

In [None]:
# Checkpoints: the generative model and the sentence-similarity model.
generative_model_name, similarity_model_name = "google/flan-t5-large", "all-mpnet-base-v2"

# Chunking (in whitespace tokens) and the diversity-filter similarity threshold.
chunk_size, overlap = 500, 0
threshold = 0.8

In [None]:
similarity_model = SentenceTransformer(similarity_model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Main function to process the corpus
def process_corpus(corpus, generative_model_name, similarity_model, chunk_size, overlap, threshold):
    """
    Generate diversity-filtered pseudo-queries for every document in the corpus.

    Each document is split with `segment_document`, pseudo-queries are sampled
    for all of its chunks with `generate_pseudo_queries`, and each chunk's
    queries are de-duplicated with `filter_diverse_queries`.

    Parameters:
    - corpus: Dictionary mapping document IDs to document text.
    - generative_model_name: Checkpoint name passed to `load_generative_model`.
    - similarity_model: Sentence-embedding model used by the diversity filter.
    - chunk_size, overlap: Chunking parameters forwarded to `segment_document`.
    - threshold: Similarity threshold forwarded to `filter_diverse_queries`.

    Returns:
    - document_generated_queries: Dictionary mapping each document ID to the
      list of kept pseudo-queries from all of its chunks (empty list for a
      document with no text).
    """
    tokenizer, generative_model = load_generative_model(generative_model_name)

    document_generated_queries = {}

    # Loop over each document in the corpus with tqdm progress bar
    for doc_id, document in tqdm(corpus.items(), desc="Processing Documents"):
        chunks = segment_document(document, chunk_size, overlap)
        if not chunks:
            # Empty or whitespace-only document: there is nothing to prompt with.
            document_generated_queries[doc_id] = []
            continue

        # One list of sampled queries per chunk
        generated_queries = generate_pseudo_queries(chunks, tokenizer, generative_model)

        # Keep the diverse queries of every chunk, not just the first one
        chunk_queries = []
        for queries_for_chunk in generated_queries:
            chunk_queries.extend(filter_diverse_queries(queries_for_chunk, similarity_model, threshold))

        document_generated_queries[doc_id] = chunk_queries
    return document_generated_queries


# NOTE
Below is the code that calls the functions for generating the pseudo-queries.
You do not need to generate the pseudo-queries yourself. Use the following link to find the generated pseudo-queries, which you can use to train the autoregressive model.

Link: https://drive.google.com/drive/folders/191D9QMsCVku2V1aCE0ZlkWvDqCzXlWQ3?usp=sharing

Check the files for their suffix to know which dataset they contain.

In [None]:
results = process_corpus(corpus, generative_model_name, similarity_model,chunk_size, overlap, threshold)

Processing Documents: 100%|██████████| 3633/3633 [57:48<00:00,  1.05it/s]


In [None]:
# with open("/content/drive/MyDrive/646Project/646Project/test/document_generated_queries_withdiversequeries_flan-t5-large_nfcorpus_DONOTRERUN.json", "w") as file:
#     json.dump(results, file, indent=4)
# print("Data saved as generated_queries.json")

Data saved as generated_queries.json


In [None]:
# type(results)

# Part 3

Read the file from the link above and load its contents into the `data` object, then use it to compute the evaluation metrics.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
file_path = "/content/drive/MyDrive/646Project/sharedfiles/document_generated_queries_withdiversequeries_flan-t5-large_scifact.json"

with open(file_path, "r") as file:
  data = json.load(file)


In [None]:
document_pseudo_queries = data

#NOTE

For now, we use the first 100 queries from qrels to test Part 3. Once the model is trained, the pseudo-queries it generates for those 100 queries should be used instead.

In [None]:
queries_dict = list(qrels.items())[:100]

In [None]:
type(queries_dict)

list

In [None]:
result = [item[0] for item in queries_dict]

In [None]:
queries_dict

[('1', {'31715818': 1}),
 ('3', {'14717500': 1}),
 ('5', {'13734012': 1}),
 ('13', {'1606628': 1}),
 ('36', {'5152028': 1, '11705328': 1}),
 ('42', {'18174210': 1}),
 ('48', {'13734012': 1}),
 ('49', {'5953485': 1}),
 ('50', {'12580014': 1}),
 ('51', {'45638119': 1}),
 ('53', {'45638119': 1}),
 ('54', {'49556906': 1}),
 ('56', {'4709641': 1}),
 ('57', {'4709641': 1}),
 ('70', {'5956380': 1, '4414547': 1}),
 ('72', {'6076903': 1}),
 ('75', {'4387784': 1}),
 ('94', {'1215116': 1}),
 ('99', {'18810195': 1}),
 ('100', {'4381486': 1}),
 ('113', {'6157837': 1}),
 ('115', {'33872649': 1}),
 ('118', {'6372244': 1}),
 ('124', {'4883040': 1}),
 ('127', {'21598000': 1}),
 ('128', {'8290953': 1}),
 ('129', {'27768226': 1}),
 ('130', {'27768226': 1}),
 ('132', {'7975937': 1}),
 ('133',
  {'38485364': 1, '6969753': 1, '17934082': 1, '16280642': 1, '12640810': 1}),
 ('137', {'26016929': 1}),
 ('141', {'6955746': 1, '14437255': 1}),
 ('142', {'10582939': 1}),
 ('143', {'10582939': 1}),
 ('146', {'1058

In [None]:
# type(result_2)

In [None]:
# with open("result.tsv", "w") as file:
#     for item in result:
#         file.write(item + "\n")

In [None]:

# Load the generated user pseudo-queries here: this comparison runs before the
# later cell that reads the same file, so on a fresh kernel the name would
# otherwise be undefined.
with open("/content/drive/MyDrive/646Project/sharedfiles/scifact_gpt2_id_response_mapping_results.json", "r") as file:
    user_pseudo_queries = json.load(file)

result_2 = list(user_pseudo_queries.keys())


In [None]:
a_minus_b = list(set(result) - set(result_2))
b_minus_a = list(set(result_2) - set(result))

# Output the results
print("A - B:", a_minus_b)
print("B - A:", b_minus_a)

A - B: ['491', '501']
B - A: ['1291', '871']


In [None]:
qrels['871']

KeyError: '871'

In [None]:
qrels['1291']

KeyError: '1291'

In [None]:
queries['440']

'Fz/PCP-dependent Pk localizes to the anterior membrane of notochord cells during zebrafish neuralation.'

In [None]:
user_pseudo_queries['440']

'Fz/PCP-dependent Pk localizes to the anterior membrane of notochord cells during zebrafish neuralation.  Does PKC1 play an important role in neurogenesis? Are there distinct types of p53 kinases associated with this phenotype and is it determined by environmental factors such as diet or genetic variation within species that are related to protein synthesis, metabolism, growth factor signalling etc.? The present study investigated whether endogenous proteins from different populations have specific roles on synaptic plasticity resulting from microRNAs involved in axonal localization via RNAi ligand (ARGs). Using mouse embryonic stem cell lineage data we found two genes encoding proteins ZnA2+ , which were observed at both sites after translation into neurons through ERK phosphorylation using ARGPRII cleavage complexes [ 14 ]. Both these transcriptional profiles can be inhibited without binding AKT6Y3 . This suggests they may contribute directly to'

In [None]:
queries['443']

'GATA-3 is important for hematopoietic stem cell (HSC) function.'

In [None]:
user_pseudo_queries['443']

'GATA-3 is important for hematopoietic stem cell (HSC) function.  Does the presence of an HCS inhibitor in mice reduce tumorigenesis? The role that myosinophil expression plays during differentiation and progression to cancerous cells has not been studied yet, but it appears possible TGFβ promotes growth by activating cytotoxic stress pathways through activation of NFARγ signaling pathway . PLoS One 9 : e91786 ?\n "The ability on chromosome 6q4t2/6f7n1a synapses are activated as well when DATP binds with caspase 2 at DNA binding sites such genes targeting Nrf8 or miRNAs." - How do CCDs activate transcriptional regulation via RNA interference from nucleotide substitutions within chromosomes 3c , 4d & 5e?? What mechanisms were found whereby gene dosage induced apoptosis after induction of proinflammatory cytokines'

In [None]:
queries['501']

'Headaches are not correlated with cognitive impairment.'

In [None]:
queries['491']

'HNF4A mutations can cause diabetes in mutant carriers by the age of 14 years'

In [None]:
def calculate_similarity_score(user_pseudo_query, document_pseudo_queries, similarity_model):
    """
    Return the best cosine similarity between a query and one document's pseudo-queries.

    Parameters:
    - user_pseudo_query: Query text (string) to embed with `similarity_model`.
    - document_pseudo_queries: Either pre-computed pseudo-query embeddings for
      the document (tensor, one row per pseudo-query) or the pseudo-query
      strings themselves, which are then embedded here.
    - similarity_model: Preloaded Sentence Transformer model (must support
      `.to(device)` and `.encode(..., convert_to_tensor=True, device=...)`).

    Returns:
    - max_similarity: Maximum cosine similarity (float) between the query and
      any of the document's pseudo-queries.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    similarity_model_gpu = similarity_model.to(device)

    # Compute embedding for the query on the specified device
    query_embedding = similarity_model_gpu.encode(
        user_pseudo_query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)

    # Accept raw pseudo-query text as well as pre-computed embeddings
    if not torch.is_tensor(document_pseudo_queries):
        if isinstance(document_pseudo_queries, str):
            document_pseudo_queries = [document_pseudo_queries]
        document_pseudo_queries = similarity_model_gpu.encode(
            list(document_pseudo_queries),
            convert_to_tensor=True,
            device=device
        )

    # Ensure pseudo-query embeddings are on the same device as the query embedding
    document_pseudo_queries = document_pseudo_queries.to(device)
    if document_pseudo_queries.dim() == 1:
        document_pseudo_queries = document_pseudo_queries.unsqueeze(0)

    # Cosine similarity matrix: L2-normalise both sides, then take dot products
    query_unit = torch.nn.functional.normalize(query_embedding, p=2, dim=1)
    document_unit = torch.nn.functional.normalize(document_pseudo_queries, p=2, dim=1)
    similarity_scores = query_unit @ document_unit.T

    # Find the maximum similarity score
    max_similarity = torch.max(similarity_scores).item()
    return max_similarity



In [None]:
def rank_documents_by_query(query, document_pseudo_queries_embeddings, similarity_model):
    """
    Rank documents based on the similarity of their pseudo-queries to the query.

    The query is embedded a single time and then compared against every
    document's pseudo-query embeddings; a document's score is the highest
    cosine similarity among its pseudo-queries.

    Parameters:
    - query: The user query (string).
    - document_pseudo_queries_embeddings: Dictionary where keys are document IDs
      and values are tensors of pseudo-query embeddings (one row per pseudo-query).
    - similarity_model: Preloaded Sentence Transformer model.

    Returns:
    - ranked_scores: List of tuples (document_id, score) sorted by descending scores.
    """
    if not document_pseudo_queries_embeddings:
        return []

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode the query once up front: it is the same for every document.
    query_embedding = similarity_model.to(device).encode(
        query,
        convert_to_tensor=True,
        device=device
    )
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    query_unit = torch.nn.functional.normalize(query_embedding, p=2, dim=1)

    document_scores = {}
    for doc_id, pseudo_queries_embeddings in document_pseudo_queries_embeddings.items():
        document_embeddings = pseudo_queries_embeddings.to(device)
        if document_embeddings.dim() == 1:
            document_embeddings = document_embeddings.unsqueeze(0)
        # Cosine similarity = dot product of L2-normalised vectors
        document_unit = torch.nn.functional.normalize(document_embeddings, p=2, dim=1)
        document_scores[doc_id] = torch.max(query_unit @ document_unit.T).item()

    # Sort documents by scores in descending order
    ranked_scores = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked_scores

In [None]:
# Run container filled by the ranking loop further down.
test_dict = dict()

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
similarity_model_gpu = similarity_model.to(device)

# Embed every document's pseudo-queries once up front and keep the tensors by document id.
document_pseudo_queries_embeddings = dict()
for doc_id, pseudo_queries in tqdm(document_pseudo_queries.items()):
    pseudo_query_embeddings = similarity_model_gpu.encode(
        pseudo_queries,
        convert_to_tensor=True,
        device=device,
    )
    document_pseudo_queries_embeddings[doc_id] = pseudo_query_embeddings



100%|██████████| 5183/5183 [01:54<00:00, 45.34it/s]


  # NOTE
  Replace the commented-out line in the code below to get the pseudo-query generated by the model for the first 100 queries in qrels.

  *user_pseudo_query = generate_pseudo_query(queries[query_id])*


  Suggestion: instead of generating inside the loop, it would be better to generate the pseudo-queries for all 100 queries at once and then use them as needed.


In [None]:
import json
with open("/content/drive/MyDrive/646Project/sharedfiles/scifact_gpt2_id_response_mapping_results.json", "r") as file:
    user_pseudo_queries = json.load(file)

In [None]:
user_pseudo_queries

{'871': 'Obesity decreases life quality.  What is the role of adipose tissue in maintaining health? Does it affect disease progression and survival to adulthood, or does this mechanism depend on obesity status during childhood exposure? Are there differences between adult adults with higher body mass index (BMI) values compared to those without BMI at adolescence years old who are more susceptible to complications associated by diabetes mellitus than do older individuals whose bodies have less fat around them before puberty onset? How did weight gain occur over time within middle age versus late twenties when overweight persons were not exposed to adverse events such as low blood pressure prior research has suggested that high waist circumference affects early development following pregnancy among nonobese children ? Do obese patients be recruited into treatment programs after they reach their preterm birth date under conditions which increase risk factors including cardiovascular dise

In [None]:
# Start from an empty run so re-running this cell never mixes in stale results.
test_dict = {}
missing_query_ids = []

for query_id, doc_score in tqdm(queries_dict):
    # user_pseudo_query = generate_pseudo_query(queries[query_id])
    user_pseudo_query = user_pseudo_queries.get(query_id, "")
    if not user_pseudo_query:
        # No (or an empty) pseudo-query for this qrels query id: skip it and
        # keep going instead of aborting the whole run part-way through.
        missing_query_ids.append(query_id)
        continue

    doc_rank_for_query = rank_documents_by_query(user_pseudo_query, document_pseudo_queries_embeddings, similarity_model)
    # Ranked list -> {doc_id: score}; insertion order keeps the best-first ordering.
    test_dict[str(query_id)] = {str(doc_id): float(score) for doc_id, score in doc_rank_for_query}

if missing_query_ids:
    print(f"Skipped {len(missing_query_ids)} queries without a pseudo-query: {missing_query_ids}")


 98%|█████████▊| 98/100 [3:01:33<03:42, 111.16s/it]


KeyError: '491'

In [None]:
test_dict

{'1': {'31715818': 0.4951883554458618,
  '82665667': 0.4868339002132416,
  '17388232': 0.4576263427734375,
  '3770726': 0.44607916474342346,
  '17123657': 0.4392583966255188,
  '28071965': 0.4381023645401001,
  '16057926': 0.43264660239219666,
  '9580772': 0.4294033646583557,
  '3874000': 0.427815705537796,
  '21746539': 0.42589065432548523,
  '1275505': 0.4242013096809387,
  '10982689': 0.4112655222415924,
  '5764562': 0.4101751148700714,
  '6219790': 0.40856897830963135,
  '10607877': 0.40799811482429504,
  '13513790': 0.4022824168205261,
  '1546650': 0.4017248749732971,
  '11172205': 0.3999215364456177,
  '2727303': 0.399031400680542,
  '22867765': 0.3989674150943756,
  '15327601': 0.3962341547012329,
  '20758340': 0.39367416501045227,
  '4928057': 0.387709379196167,
  '39326723': 0.38762274384498596,
  '18758057': 0.38647305965423584,
  '11784947': 0.3853013515472412,
  '33986200': 0.3848946988582611,
  '7840442': 0.3848038613796234,
  '2121272': 0.38180121779441833,
  '37437064': 

In [None]:
len(test_dict)

98

In [None]:
user_pseudo_queries

{'871': 'Obesity decreases life quality.  What is the role of adipose tissue in maintaining health? Does it affect disease progression and survival to adulthood, or does this mechanism depend on obesity status during childhood exposure? Are there differences between adult adults with higher body mass index (BMI) values compared to those without BMI at adolescence years old who are more susceptible to complications associated by diabetes mellitus than do older individuals whose bodies have less fat around them before puberty onset? How did weight gain occur over time within middle age versus late twenties when overweight persons were not exposed to adverse events such as low blood pressure prior research has suggested that high waist circumference affects early development following pregnancy among nonobese children ? Do obese patients be recruited into treatment programs after they reach their preterm birth date under conditions which increase risk factors including cardiovascular dise

In [None]:
queries_dict

[('1', {'31715818': 1}),
 ('3', {'14717500': 1}),
 ('5', {'13734012': 1}),
 ('13', {'1606628': 1}),
 ('36', {'5152028': 1, '11705328': 1}),
 ('42', {'18174210': 1}),
 ('48', {'13734012': 1}),
 ('49', {'5953485': 1}),
 ('50', {'12580014': 1}),
 ('51', {'45638119': 1}),
 ('53', {'45638119': 1}),
 ('54', {'49556906': 1}),
 ('56', {'4709641': 1}),
 ('57', {'4709641': 1}),
 ('70', {'5956380': 1, '4414547': 1}),
 ('72', {'6076903': 1}),
 ('75', {'4387784': 1}),
 ('94', {'1215116': 1}),
 ('99', {'18810195': 1}),
 ('100', {'4381486': 1}),
 ('113', {'6157837': 1}),
 ('115', {'33872649': 1}),
 ('118', {'6372244': 1}),
 ('124', {'4883040': 1}),
 ('127', {'21598000': 1}),
 ('128', {'8290953': 1}),
 ('129', {'27768226': 1}),
 ('130', {'27768226': 1}),
 ('132', {'7975937': 1}),
 ('133',
  {'38485364': 1, '6969753': 1, '17934082': 1, '16280642': 1, '12640810': 1}),
 ('137', {'26016929': 1}),
 ('141', {'6955746': 1, '14437255': 1}),
 ('142', {'10582939': 1}),
 ('143', {'10582939': 1}),
 ('146', {'1058

In [None]:
len(user_pseudo_queries)

100

In [None]:
len(queries_dict)

100

In [None]:
test_dict

In [None]:
top_n = 10

In [None]:
# Keep the first top_n entries of each query's {doc_id: score} mapping
# (the mappings are expected to be ordered best-first already).
top_documents = {
    str(query_id): dict(list(doc_scores.items())[:top_n])
    for query_id, doc_scores in test_dict.items()
}

In [None]:
top_documents

{'1': {'31715818': 0.4951883554458618,
  '82665667': 0.4868339002132416,
  '17388232': 0.4576263427734375,
  '3770726': 0.44607916474342346,
  '17123657': 0.4392583966255188,
  '28071965': 0.4381023645401001,
  '16057926': 0.43264660239219666,
  '9580772': 0.4294033646583557,
  '3874000': 0.427815705537796,
  '21746539': 0.42589065432548523},
 '3': {'503050': 0.6526093482971191,
  '5935987': 0.638474702835083,
  '18670': 0.6268690824508667,
  '3174305': 0.6208770871162415,
  '16016673': 0.6145445704460144,
  '18218379': 0.6068553924560547,
  '23665162': 0.6014305353164673,
  '22038539': 0.5981489419937134,
  '9291668': 0.5940287113189697,
  '7988832': 0.5914297699928284},
 '5': {'13734012': 0.7612267732620239,
  '18617259': 0.6205020546913147,
  '21550246': 0.6008192300796509,
  '1292369': 0.5936995148658752,
  '42240424': 0.5422430038452148,
  '29657303': 0.5349238514900208,
  '23124332': 0.5324510335922241,
  '695938': 0.5320730805397034,
  '1583041': 0.5057469010353088,
  '1958440':

In [None]:
# queries['PLAIN-1050']

In [None]:
# user_pseudo_queries['PLAIN-1050']

In [None]:
# user_pseudo_queries['PLAIN-12']

In [None]:
# queries['PLAIN-12']

In [None]:
# queries['PLAIN-91']

In [None]:
# user_pseudo_queries['PLAIN-91']

In [None]:
# top_documents['PLAIN-1050']

In [None]:
# qrels['PLAIN-1050']

In [None]:
!pip install pytrec_eval

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308216 sha256=02a1e9ce60d6450da33cad7dbc2d52e7a54f5878679f153aa28c29bc1a982549
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


In [None]:
import pytrec_eval

In [None]:
metrics = ['P', 'ndcg_cut', 'recall', 'map_cut']
cutoffs = [3, 5, 10]

# Build one measure string per metric (e.g. 'P.3,5,10') from the lists above,
# so the requested measures and the aggregated keys cannot drift apart.
measures = {f"{metric}.{','.join(str(cutoff) for cutoff in cutoffs)}" for metric in metrics}
evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)
result = evaluator.evaluate(top_documents)
print(result)

# Sum every metric@cutoff over the evaluated queries...
scores = {f'{metric}_{cutoff}': 0 for metric in metrics for cutoff in cutoffs}
for key in result:
    for metric in metrics:
        for cutoff in cutoffs:
            scores[f'{metric}_{cutoff}'] += result[key][f'{metric}_{cutoff}']

# ...and average over exactly those queries: the sums run over `result`, so
# the divisor must be its size (guarding against an empty evaluation).
run_length = len(result)
for score in scores:
    scores[score] = scores[score] / run_length if run_length else 0.0

{'1': {'P_3': 0.3333333333333333, 'P_5': 0.2, 'P_10': 0.1, 'recall_3': 1.0, 'recall_5': 1.0, 'recall_10': 1.0, 'ndcg_cut_3': 1.0, 'ndcg_cut_5': 1.0, 'ndcg_cut_10': 1.0, 'map_cut_3': 1.0, 'map_cut_5': 1.0, 'map_cut_10': 1.0}, '3': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, '5': {'P_3': 0.3333333333333333, 'P_5': 0.2, 'P_10': 0.1, 'recall_3': 1.0, 'recall_5': 1.0, 'recall_10': 1.0, 'ndcg_cut_3': 1.0, 'ndcg_cut_5': 1.0, 'ndcg_cut_10': 1.0, 'map_cut_3': 1.0, 'map_cut_5': 1.0, 'map_cut_10': 1.0}, '13': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0, 'ndcg_cut_10': 0.0, 'map_cut_3': 0.0, 'map_cut_5': 0.0, 'map_cut_10': 0.0}, '36': {'P_3': 0.0, 'P_5': 0.0, 'P_10': 0.0, 'recall_3': 0.0, 'recall_5': 0.0, 'recall_10': 0.0, 'ndcg_cut_3': 0.0, 'ndcg_cut_5': 0.0

In [None]:
scores

{'P_3': 0.08843537414965984,
 'P_5': 0.05918367346938778,
 'P_10': 0.04081632653061226,
 'ndcg_cut_3': 0.2153580697039053,
 'ndcg_cut_5': 0.22718436626656718,
 'ndcg_cut_10': 0.25688942238981033,
 'recall_3': 0.23979591836734693,
 'recall_5': 0.27040816326530615,
 'recall_10': 0.35544217687074825,
 'map_cut_3': 0.20195578231292516,
 'map_cut_5': 0.20909863945578228,
 'map_cut_10': 0.22260285066407515}