In [None]:
# Baixe o dataset do ms marco tiny
#curl -O https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv


#!pip3 install nmslib
#!python -m pip install --upgrade pip
#!pip install pyserini
#!pip install datasets
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install sacrebleu 
#!pip install scikit-learn 
#!pip install torch
#!pip install sentencepiece
#!pip install transformers
#!pip install pandas
#!pip install torch
#!pip install transformers
#!pip install tqdm
#!pip install sacrebleu
#!pip install scikit-learn
#!pip install jupyter notebok
#!pip install jsonlines

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, AdamW
from tqdm import tqdm
import sacrebleu

# Atualizar a classe MSMARCODataset
class MSMARCODataset(Dataset):
    def __init__(self, data_file, tokenizer, max_len):
        self.data = pd.read_csv(data_file, delimiter="\t", header=None, names=["query", "relevant_passage", "non_relevant_passage"])
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        query = self.data.loc[index, "query"]
        relevant_passage = self.data.loc[index, "relevant_passage"]
        tokenized_inputs = self.tokenizer(relevant_passage, return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        tokenized_outputs = self.tokenizer(query, return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        return {"input_ids": tokenized_inputs["input_ids"].squeeze(0), "attention_mask": tokenized_inputs["attention_mask"].squeeze(0), "labels": tokenized_outputs["input_ids"].squeeze(0)}

# Load the dataset and split it into training and validation sets
data_file = "msmarco_triples.train.tiny.tsv"
tokenizer = T5Tokenizer.from_pretrained("t5-small")
max_len = 128
dataset = MSMARCODataset(data_file, tokenizer, max_len)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for training and validation datasets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize the model and optimizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)

# Train the seq2seq model and validate every X steps  
epochs = 2
validate_every_x_steps = 50
step_count = 0

for epoch in range(epochs):
    model.train()
    train_loss_accumulator = 0.0

    for batch in tqdm(train_dataloader):
        step_count += 1

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss_accumulator += loss.item()
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        # Validate the model every X steps
        if step_count % validate_every_x_steps == 0:
            model.eval()
            val_loss_accumulator = 0.0
            refs = []
            hyps = []

            with torch.no_grad():
                for val_batch in val_dataloader:
                    val_input_ids = val_batch["input_ids"].to(device)
                    val_attention_mask = val_batch["attention_mask"].to(device)
                    val_labels = val_batch["labels"].to(device)

                    val_outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
                    val_loss = val_outputs.loss
                    val_loss_accumulator += val_loss.item()

                    generated = model.generate(val_input_ids, attention_mask=val_attention_mask, max_length=max_len)
                    hyps.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
                    refs.extend(tokenizer.batch_decode(val_labels, skip_special_tokens=True))

            val_loss_avg = val_loss_accumulator / len(val_dataloader)
            train_loss_avg = train_loss_accumulator / validate_every_x_steps
            bleu = sacrebleu.corpus_bleu(hyps, [refs])
            print(f"Step: {step_count}, Train Loss: {train_loss_avg}, Validation Loss: {val_loss_avg}, Validation BLEU: {bleu.score}")

            train_loss_accumulator = 0.0
            model.train()

model.save_pretrained("doc2query_model")


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
  8%|▊         | 50/619 [03:17<8:47:09, 55.59s/it]

Step: 50, Train Loss: 5.651070511341095, Validation Loss: 0.4829001305759817, Validation BLEU: 0.0


 16%|█▌        | 100/619 [06:32<8:01:45, 55.70s/it]

Step: 100, Train Loss: 0.7496192157268524, Validation Loss: 0.4414199940536333, Validation BLEU: 0.0


 24%|██▍       | 149/619 [06:43<01:39,  4.74it/s]  

## Gere as consultas expandidas para o TREC-COVID

In [None]:
#!pip install datasets
#!pip install pyserini

a = trec_covid_corpus["corpus"]
b = trec_covid_queries["queries"]

In [None]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the saved model
model_path = "doc2query_model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=128)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

trec_covid_corpus = load_dataset("BeIR/trec-covid", "corpus")
trec_covid_queries = load_dataset("BeIR/trec-covid", "queries")
trec_covid_expanded = []

In [None]:
print("Available keys (splits):", trec_covid_corpus.keys())

In [None]:
# Print the first 5 entries
for i in range(1000):
    print(f"Entry {i+1}:")
    print(trec_covid_corpus['corpus'][i]['text'])
    print()

In [None]:
#sorted_corpus = sorted(trec_covid_corpus["corpus"], key=lambda x: x["_id"])[:500]
# Print the first 5 entries
for i in range(500):
    print(f"Entry {i+1}:")
    print(sorted_corpus[i])
    print()

In [2]:
from tqdm import tqdm
import json

def generate_expanded_queries(document, model, tokenizer, max_len=128, batch_size=500):
    # Tokenize the input
    tokenized_inputs = tokenizer(document, return_tensors="pt", max_length=max_len, padding="max_length", truncation=True)
    input_ids = tokenized_inputs["input_ids"].to(device)
    attention_mask = tokenized_inputs["attention_mask"].to(device)

    # Split the input into batches
    input_ids_batches = torch.split(input_ids, batch_size)
    attention_mask_batches = torch.split(attention_mask, batch_size)

    # Initialize the output buffer
    expanded_queries = []

    for input_ids_batch, attention_mask_batch in zip(input_ids_batches, attention_mask_batches):
        # Generate the output
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids_batch, attention_mask=attention_mask_batch, max_new_tokens=max_len)
        
        # Decode the output
        for output in outputs:
            expanded_query = tokenizer.decode(output, skip_special_tokens=True)
            expanded_queries.append(expanded_query)

    return expanded_queries

# Get 1000 sorted entries from the dataset
sorted_corpus = sorted(trec_covid_corpus["corpus"], key=lambda x: x["_id"])[:100]

# Print the first 5 entries
for i in range(5):
    print(f"Entry {i+1}:")
    print(sorted_corpus[i])
    print()

for doc in tqdm(sorted_corpus, desc="Generating queries"):
    expanded_query = generate_expanded_queries(doc["text"], model, tokenizer, max_len=128, batch_size=500)
    trec_covid_expanded.append({"id": doc["_id"], "text": doc["text"], "expanded_query": expanded_query})
    print("Original Document Text:")
    print(doc["text"])
    print("Expanded Queries:")
    print(expanded_query)  
    print(f"Generated {len(trec_covid_expanded)} queries")

# Salve as consultas expandidas em um arquivo
with open("trec_covid_expanded.json", "w") as f:
    json.dump(trec_covid_expanded, f)    


#for doc in tqdm(trec_covid_corpus["corpus"], desc="Generating queries"):
#    expanded_query = generate_expanded_queries(doc["text"], model, tokenizer, max_len=20, batch_size=10000)
#    trec_covid_expanded.append({"id": doc["_id"], "text": doc["text"], "expanded_query": expanded_query})

NameError: name 'trec_covid_corpus' is not defined

## Gerando o Indice

In [None]:
import jsonlines
import os

workdir = "trec-covid/"
os.makedirs(workdir, exist_ok=True)

json_batch_size = 1 #len(sorted_corpus) // 10
j = 0

for i in range(0, len(sorted_corpus), json_batch_size):
    filename = f"{workdir}json_{j}.json"
    print(filename)
    with jsonlines.open(filename, mode='w') as writer:
        for item in sorted_corpus[i:i + json_batch_size]:
            writer.write(item)
    j += 1

In [None]:
import json
import pytrec_eval
from pyserini.search import SimpleSearcher



# Carregue o índice do TREC-COVID
searcher = SimpleSearcher("beir-v1.0.0-trec-covid-flat")

# BM25 sem expansão
def evaluate_bm25_no_expansion(searcher, trec_covid_queries, qrels, k=10):
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map_cut', 'ndcg_cut', 'recip_rank'})
    topics = {str(topic["id"]): topic["query"] for topic in trec_covid_queries["queries"]}
    qrun = {}
    for topic_id, query in topics.items():
        hits = searcher.search(query, k)
        qrun[topic_id] = {hit.docid: idx + 1 for idx, hit in enumerate(hits)}
    results = evaluator.evaluate(qrun)
    return results['ndcg_cut_10']

# BM25 com expansão
def evaluate_bm25_expansion(searcher, trec_covid_queries, trec_covid_expanded, qrels, k=10):
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map_cut', 'ndcg_cut', 'recip_rank'})
    topics = {str(topic["id"]): topic["query"] for topic in trec_covid_queries["queries"]}
    expanded_queries = {doc["id"]: doc["expanded_query"] for doc in trec_covid_expanded}
    qrun = {}
    for topic_id, query in topics.items():
        expanded_query = f"{query} {expanded_queries[topic_id]}"
        hits = searcher.search(expanded_query, k)
        qrun[topic_id] = {hit.docid: idx + 1 for idx, hit in enumerate(hits)}
    results = evaluator.evaluate(qrun)
    return results['ndcg_cut_10']

qrels = {str(qrel["query_id"]): {str(qrel["doc_id"]): qrel["relevance"] for qrel in trec_covid_queries["qrels"]} for qrel in trec_covid_queries["queries"]}

bm25_no_expansion_ndcg = evaluate_bm25_no_expansion(searcher, trec_covid_queries, qrels)
bm25_expansion_ndcg = evaluate_bm25_expansion(searcher, trec_covid_queries, trec_covid_expanded, qrels)

print(f"nDCG@10 do BM25 sem expansão: {bm25_no_expansion_ndcg}")
print(f"nDCG@10 do BM25 com expansão: {bm25_expansion_ndcg}")
