In [None]:
# Baixe o dataset do ms marco tiny
#curl -O https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv


#!pip3 install nmslib
#!python -m pip install --upgrade pip
#!pip install pyserini
#!pip install datasets
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install sacrebleu 
#!pip install scikit-learn 
#!pip install torch
#!pip install sentencepiece
#!pip install transformers
#!pip install pandas
#!pip install torch
#!pip install transformers
#!pip install tqdm
#!pip install sacrebleu
#!pip install scikit-learn
#!pip install jupyter notebok
#!pip install jsonlines

In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, AdamW
from tqdm import tqdm
import sacrebleu

# Atualizar a classe MSMARCODataset
class MSMARCODataset(Dataset):
    def __init__(self, data_file, tokenizer, max_len):
        self.data = pd.read_csv(data_file, delimiter="\t", header=None, names=["query", "relevant_passage", "non_relevant_passage"])
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        query = self.data.loc[index, "query"]
        relevant_passage = self.data.loc[index, "relevant_passage"]
        tokenized_inputs = self.tokenizer(relevant_passage, return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        tokenized_outputs = self.tokenizer(query, return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        return {"input_ids": tokenized_inputs["input_ids"].squeeze(0), "attention_mask": tokenized_inputs["attention_mask"].squeeze(0), "labels": tokenized_outputs["input_ids"].squeeze(0)}

# Load the dataset and split it into training and validation sets
data_file = "msmarco_triples.train.tiny.tsv"
tokenizer = T5Tokenizer.from_pretrained("t5-small")
max_len = 128
dataset = MSMARCODataset(data_file, tokenizer, max_len)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation datasets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize the model and optimizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)

# Train the seq2seq model and validate every X steps  
epochs = 2
validate_every_x_steps = 50
step_count = 0

for epoch in range(epochs):
    model.train()
    train_loss_accumulator = 0.0

    for batch in tqdm(train_dataloader):
        step_count += 1

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss_accumulator += loss.item()
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        # Validate the model every X steps
        if step_count % validate_every_x_steps == 0:
            model.eval()
            val_loss_accumulator = 0.0
            refs = []
            hyps = []

            with torch.no_grad():
                for val_batch in val_dataloader:
                    val_input_ids = val_batch["input_ids"].to(device)
                    val_attention_mask = val_batch["attention_mask"].to(device)
                    val_labels = val_batch["labels"].to(device)

                    val_outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
                    val_loss = val_outputs.loss
                    val_loss_accumulator += val_loss.item()

                    generated = model.generate(val_input_ids, attention_mask=val_attention_mask, max_length=max_len)
                    hyps.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
                    refs.extend(tokenizer.batch_decode(val_labels, skip_special_tokens=True))

            val_loss_avg = val_loss_accumulator / len(val_dataloader)
            train_loss_avg = train_loss_accumulator / validate_every_x_steps
            bleu = sacrebleu.corpus_bleu(hyps, [refs])
            print(f"Step: {step_count}, Train Loss: {train_loss_avg}, Validation Loss: {val_loss_avg}, Validation BLEU: {bleu.score}")

            train_loss_accumulator = 0.0
            model.train()

model.save_pretrained("doc2query_model")


## Gere as consultas expandidas para o TREC-COVID

In [None]:
#!pip install datasets
#!pip install pyserini

a = trec_covid_corpus["corpus"]
b = trec_covid_queries["queries"]

In [None]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the saved model
model_path = "doc2query_model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=128)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

trec_covid_corpus = load_dataset("BeIR/trec-covid", "corpus")
trec_covid_queries = load_dataset("BeIR/trec-covid", "queries")
trec_covid_expanded = []

In [None]:
print("Available keys (splits):", trec_covid_corpus.keys())

In [None]:
# Print the first 5 entries
for i in range(1000):
    print(f"Entry {i+1}:")
    print(trec_covid_corpus['corpus'][i])
    print()

In [None]:
#sorted_corpus = sorted(trec_covid_corpus["corpus"], key=lambda x: x["_id"])[:500]
# Print the first 5 entries
for i in range(500):
    print(f"Entry {i+1}:")
    print(trec_covid_queries['queries'][i])
    print()

In [None]:
from tqdm import tqdm
import json

def generate_expanded_queries(document, model, tokenizer, max_len=128, batch_size=500):
    # Tokenize the input
    tokenized_inputs = tokenizer(document, return_tensors="pt", max_length=max_len, padding="max_length", truncation=True)
    input_ids = tokenized_inputs["input_ids"].to(device)
    attention_mask = tokenized_inputs["attention_mask"].to(device)

    # Split the input into batches
    input_ids_batches = torch.split(input_ids, batch_size)
    attention_mask_batches = torch.split(attention_mask, batch_size)

    # Initialize the output buffer
    expanded_queries = []

    for input_ids_batch, attention_mask_batch in zip(input_ids_batches, attention_mask_batches):
        # Generate the output
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids_batch, attention_mask=attention_mask_batch, max_new_tokens=max_len)
        
        # Decode the output
        for output in outputs:
            expanded_query = tokenizer.decode(output, skip_special_tokens=True)
            expanded_queries.append(expanded_query)

    return expanded_queries

# Get 1000 sorted entries from the dataset
#sorted_corpus = sorted(trec_covid_corpus["corpus"], key=lambda x: x["_id"])[:100]
sorted_corpus = trec_covid_corpus["corpus"]

# Print the first 5 entries
for i in range(5):
    print(f"Entry {i+1}:")
    print(sorted_corpus[i])
    print()

for doc in tqdm(sorted_corpus, desc="Generating queries"):
    expanded_query = generate_expanded_queries(doc["text"], model, tokenizer, max_len=128, batch_size=500)
    trec_covid_expanded.append({"id": doc["_id"], "text": doc["text"], "expanded_query": expanded_query})
    #print("Original Document Text:")
    #print(doc["text"])
    #print("Expanded Queries:")
    #print(expanded_query)  
    #print(f"Generated {len(trec_covid_expanded)} queries")

# Salve as consultas expandidas em um arquivo
with open("trec_covid_expanded.json", "w") as f:
    json.dump(trec_covid_expanded, f)    


#for doc in tqdm(trec_covid_corpus["corpus"], desc="Generating queries"):
#    expanded_query = generate_expanded_queries(doc["text"], model, tokenizer, max_len=20, batch_size=10000)
#    trec_covid_expanded.append({"id": doc["_id"], "text": doc["text"], "expanded_query": expanded_query})

## Gerando o Indice

In [None]:
import jsonlines
import os

workdir = "trec-covid/"
os.makedirs(workdir, exist_ok=True)

json_batch_size = 1 #len(sorted_corpus) // 10
j = 0

for i in range(0, len(sorted_corpus), json_batch_size):
    filename = f"{workdir}json_{j}.json"
    print(filename)
    with jsonlines.open(filename, mode='w') as writer:
        for item in sorted_corpus[i:i + json_batch_size]:
            writer.write(item)
    j += 1

In [None]:
import json
import pytrec_eval
from pyserini.search import SimpleSearcher



# Carregue o índice do TREC-COVID
searcher = SimpleSearcher("beir-v1.0.0-trec-covid-flat")

# BM25 sem expansão
def evaluate_bm25_no_expansion(searcher, trec_covid_queries, qrels, k=10):
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map_cut', 'ndcg_cut', 'recip_rank'})
    topics = {str(topic["id"]): topic["query"] for topic in trec_covid_queries["queries"]}
    qrun = {}
    for topic_id, query in topics.items():
        hits = searcher.search(query, k)
        qrun[topic_id] = {hit.docid: idx + 1 for idx, hit in enumerate(hits)}
    results = evaluator.evaluate(qrun)
    return results['ndcg_cut_10']

# BM25 com expansão
def evaluate_bm25_expansion(searcher, trec_covid_queries, trec_covid_expanded, qrels, k=10):
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map_cut', 'ndcg_cut', 'recip_rank'})
    topics = {str(topic["id"]): topic["query"] for topic in trec_covid_queries["queries"]}
    expanded_queries = {doc["id"]: doc["expanded_query"] for doc in trec_covid_expanded}
    qrun = {}
    for topic_id, query in topics.items():
        expanded_query = f"{query} {expanded_queries[topic_id]}"
        hits = searcher.search(expanded_query, k)
        qrun[topic_id] = {hit.docid: idx + 1 for idx, hit in enumerate(hits)}
    results = evaluator.evaluate(qrun)
    return results['ndcg_cut_10']

qrels = {str(qrel["query_id"]): {str(qrel["doc_id"]): qrel["relevance"] for qrel in trec_covid_queries["qrels"]} for qrel in trec_covid_queries["queries"]}

bm25_no_expansion_ndcg = evaluate_bm25_no_expansion(searcher, trec_covid_queries, qrels)
bm25_expansion_ndcg = evaluate_bm25_expansion(searcher, trec_covid_queries, trec_covid_expanded, qrels)

print(f"nDCG@10 do BM25 sem expansão: {bm25_no_expansion_ndcg}")
print(f"nDCG@10 do BM25 com expansão: {bm25_expansion_ndcg}")


# Load the data for evaluation

In [1]:
# Load the data for evaluation

from datasets import load_dataset
import json
import pandas as pd

trec_covid_corpus = load_dataset("BeIR/trec-covid", "corpus")
trec_covid_queries = load_dataset("BeIR/trec-covid", "queries")
# Convert datasets to dictionaries
trec_covid_corpus = trec_covid_corpus["corpus"].to_dict()
trec_covid_queries = trec_covid_queries["queries"].to_dict()

# Load the trec-covid-qrels dataset
trec_covid_qrels = load_dataset("BeIR/trec-covid-qrels")

# Create a DataFrame to store the qrels data
qrels = pd.DataFrame()
qrels["query_id"] = trec_covid_qrels['test']["query-id"]
qrels["corpus_id"] = trec_covid_qrels['test']["corpus-id"]
qrels["score"] = trec_covid_qrels['test']["score"]

# Create a dictionary from qrels data
qrels_dict = {}
for query_id, corpus_id, score in zip(qrels["query_id"], qrels["corpus_id"], qrels["score"]):
    query_id = str(query_id)
    corpus_id = corpus_id.strip()
    score = int(score)
    if query_id not in qrels_dict:
        qrels_dict[query_id] = {}
    qrels_dict[query_id][corpus_id] = score



Found cached dataset trec-covid (C:/Users/crist/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset trec-covid (C:/Users/crist/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (C:/Users/crist/.cache/huggingface/datasets/BeIR___csv/BeIR--trec-covid-qrels-1766e3af5b0b856a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

# Generating the indexes

In [2]:
# Generating the indexes

import pytrec_eval
from pathlib import Path
import os
import subprocess
import shutil


trec_covid_expanded = []

# Load the expanded queries
with open("trec_covid_expanded.json", "r") as f:
    trec_covid_expanded = json.load(f)

#Concatenate expanded queries to their respective documents
for doc in trec_covid_expanded:
    doc["text"] = f"{ doc['text']} {doc['expanded_query']}"

# Write the original and expanded TREC-COVID datasets to jsonl files
with open("trec_covid_original.jsonl", "w") as f:
    for doc_id, text in zip(trec_covid_corpus["_id"], trec_covid_corpus["text"]):
        doc = {"id": doc_id, "text": text}
        f.write(json.dumps(doc) + "\n")

with open("trec_covid_expanded.jsonl", "w") as f:
    for doc in trec_covid_expanded:
        f.write(json.dumps(doc) + "\n")

# Define the directories where the indexes will be stored
expanded_index_directory = "trec_covid_expanded_index"
original_index_directory = "trec_covid_original_index"

# Create directories for original and expanded datasets
original_data_directory = "trec_covid_original_data"
expanded_data_directory = "trec_covid_expanded_data"

# Create the directories if they do not exist
Path(expanded_index_directory).mkdir(parents=True, exist_ok=True)
Path(original_index_directory).mkdir(parents=True, exist_ok=True)
Path(original_data_directory).mkdir(parents=True, exist_ok=True)
Path(expanded_data_directory).mkdir(parents=True, exist_ok=True)

# Move the jsonl files into their corresponding directories
shutil.move("trec_covid_original.jsonl", f"{original_data_directory}/trec_covid_original.jsonl")
shutil.move("trec_covid_expanded.jsonl", f"{expanded_data_directory}/trec_covid_expanded.jsonl")


# Index the original dataset
print('Index the original dataset')
result = subprocess.run([
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", original_data_directory,
    "--index", original_index_directory,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "12",
    "--storePositions", "--storeDocvectors", "--storeRaw"
], capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

# Index the expanded dataset
print('Index the expanded dataset')
result = subprocess.run([
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", expanded_data_directory,
    "--index", expanded_index_directory,
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "12",
    "--storePositions", "--storeDocvectors", "--storeRaw"
], capture_output=True, text=True)
print(result.stdout)
print(result.stderr)



Index the original dataset
2023-04-18 23:20:11,857 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-04-18 23:20:11,859 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-04-18 23:20:11,859 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: trec_covid_original_data
2023-04-18 23:20:11,859 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-04-18 23:20:11,859 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-04-18 23:20:11,860 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 12
2023-04-18 23:20:11,860 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-04-18 23:20:11,860 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-04-18 23:20:11,861 INFO  [main] index.IndexCollection

# Evaluating

In [4]:
# Evaluating
import pyserini
from pyserini.search import LuceneSearcher
print('Load the indexes (either downloaded or built index)')
expanded_searcher = LuceneSearcher(expanded_index_directory)
original_searcher = LuceneSearcher(original_index_directory)

def evaluate_bm25_expansion(searcher, queries, expanded_queries, qrels):
    run = {}    
    for query, expanded_query in zip(queries, expanded_queries):
        query_id = str(query["query_id"])  # Updated
        hits = searcher.search(expanded_query["expanded_query"], k=10)
        run[query_id] = {hit.docid: idx + 1 for idx, hit in enumerate(hits)}
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg_cut'})
    metrics = evaluator.evaluate(run)
    ndcg = {key: value['ndcg_cut_10'] for key, value in metrics.items()}
    mean_ndcg = sum(ndcg.values()) / len(ndcg)
    return mean_ndcg

def evaluate_bm25_no_expansion(searcher, queries, qrels):
    run = {}
    for query in queries:
        query_id = str(query["query_id"])  # Updated
        hits = searcher.search(query["text"], k=10)  # Updated
        run[query_id] = {hit.docid: idx + 1 for idx, hit in enumerate(hits)}
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg_cut'})
    metrics = evaluator.evaluate(run)
    ndcg = {key: value['ndcg_cut_10'] for key, value in metrics.items()}
    mean_ndcg = sum(ndcg.values()) / len(ndcg)
    return mean_ndcg

# Create a list of query dictionaries
query_list = [{"query_id": query_id, "text": text} for query_id, text in zip(trec_covid_queries["_id"], trec_covid_queries["text"])]

# Evaluate the BM25 performance on both indexes using nDCG@10
qrels = qrels_dict

bm25_expanded_ndcg = evaluate_bm25_expansion(
    expanded_searcher, query_list, trec_covid_expanded, qrels
)
bm25_no_expansion_ndcg = evaluate_bm25_no_expansion(
    original_searcher, query_list, qrels
)

print(f"nDCG@10 for BM25 without expansion: {bm25_no_expansion_ndcg}")
print(f"nDCG@10 for BM25 with expansion: {bm25_expanded_ndcg}")



Load the indexes (either downloaded or built index)


JavaException: No methods called search in io/anserini/search/SimpleSearcher matching your arguments, requested: (['what was the clinical features of 40 patients'], 10), available: ['(Ljava/lang/String;)[Lio/anserini/search/SimpleSearcher$Result;', '(Ljava/lang/String;I)[Lio/anserini/search/SimpleSearcher$Result;', '(Lio/anserini/search/query/QueryGenerator;Ljava/lang/String;I)[Lio/anserini/search/SimpleSearcher$Result;', '(Lorg/apache/lucene/search/Query;I)[Lio/anserini/search/SimpleSearcher$Result;']

In [None]:
print("trec_covid_queries (first 3 items):")
for key, value in list(trec_covid_queries.items())[:3]:
    print(f"{key}: {value}")

print("\ntrec_covid_expanded (first item):")
print(trec_covid_expanded[0] if len(trec_covid_expanded) > 0 else "Empty list")

print("\nqrels_dict (first item):")
first_key = list(qrels_dict.keys())[0]
print(f"{first_key}: {qrels_dict[first_key]}")

In [None]:
trec_covid_qrels = load_dataset("BeIR/trec-covid", "queries")
a = trec_covid_qrels['queries']
print(a['title'])
print(a['text'])

num_lines_to_print = 5

for i in range(num_lines_to_print):
    doc_id = trec_covid_corpus["id"][i]
    text = trec_covid_corpus["text"][i]
    print(f"Document {i + 1}:")
    print(f"ID: {doc_id}")
    print(f"Text: {text}\n")

In [None]:
trec_covid_qrels = load_dataset("BeIR/trec-covid", "qrels")
qrels_dict = {}
for qrel in trec_covid_qrels["qrels"]:
    query_id = str(qrel["query_id"])
    doc_id = str(qrel["doc_id"])
    relevance = qrel["relevance"]
    if query_id not in qrels_dict:
        qrels_dict[query_id] = {}
    qrels_dict[query_id][doc_id] = relevance

In [None]:
# Print some relevant specs of the trec_covid_queries dictionary
print(f"Number of queries: {len(trec_covid_queries)}")
print("First 5 queries:")
for i in range(5):
    query_id = trec_covid_queries["_id"][i]
    query_text = trec_covid_queries["text"][i]
    print(f"Query {query_id}: {query_text}")

In [None]:
# Print the first three queries
print("First three queries:")
for i in range(3):
    print(f"Query {trec_covid_queries['_id'][i]}: {trec_covid_queries['text'][i]}")


In [None]:
# Print some relevant specs of the trec_covid_queries dictionary
print(f"Number of queries: {len(trec_covid_queries)}")
print("First 5 queries:")
for i in range(5):
    query_id = trec_covid_queries["_id"][i]
    query_text = trec_covid_queries["text"][i]
    print(f"Query {query_id}: {query_text}")


In [None]:
# Print the number of expanded queries and the text of the first 5 queries
print(f"Number of queries: {len(trec_covid_queries)}")
print("First 5 queries:")
for i in range(5):
    print(f"Query {trec_covid_queries['_id'][i]}: {trec_covid_queries['text'][i][:50]}...")

In [None]:
# Print the number of queries and the relevant documents for the first 5 queries
print(f"Number of queries in qrels_dict: {len(qrels_dict)}")
print("Relevant documents for first 5 queries:")
for i in range(5):
    query_id = list(qrels_dict.keys())[i]
    relevant_docs = list(qrels_dict[query_id].keys())[:5]
    print(f"Query {query_id}: {relevant_docs}")