<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_09/blob/main/DL_reranking_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TREC-COVID DL reranking evaluation

This notebook runs reranking tests over the BM25 results retrieved for the TREC-COVID queries.

## Prepare the environment

In [1]:
import os
import sys

import tqdm

In [2]:
# True when the google.colab module is already loaded, i.e. the notebook is running inside Google Colab.
IN_COLAB='google.colab' in sys.modules
# NOTE(review): not referenced by any other cell visible in this notebook — presumably a leftover
# experiment-tracking toggle; confirm before removing.
LINK_WITH_COMET=False

In [3]:
if IN_COLAB:
    from google.colab import drive

    WORKING_FOLDER="/content/drive/MyDrive/unicamp/ia368v_dd/aula_09"

    drive.mount('/content/drive', force_remount=True)

    os.chdir(WORKING_FOLDER)

    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
    


    !pip install pyserini -q
    !pip install faiss-cpu -q

    # !apt-get install maven -qq
    # !git clone --recurse-submodules https://github.com/castorini/pyserini.git
    # !cd pyserini
    # !cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
    # !cd tools/eval/ndeval && make && cd ../../..

    PYSERINI_FOLDER="/content/drive/MyDrive/unicamp/ia368v_dd/pyserini/"

    TREC_EVAL_FULLPATH=PYSERINI_FOLDER+"tools/eval/trec_eval.9.0.4/trec_eval"
    PYSERINI_TOOLS_FOLDER=PYSERINI_FOLDER+"tools/scripts/msmarco/"

    !pip install transformers -q

    # !git clone --recurse-submodules https://github.com/castorini/anserini.git
    # !cd anserini
    # !mvn clean package appassembler:assemble

    os.environ["ANSERINI_CLASSPATH"]="content/drive/MyDrive/unicamp/ia368v_dd/aula_09/pyserini/anserini/target"

    !chmod +x /content/drive/MyDrive/unicamp/ia368v_dd/pyserini/tools/eval/trec_eval.9.0.4/trec_eval

    PYSERINI_PREVIOUS_RUN="run.trec_covid_original_complete_20230503_135137.txt"
else:
    WORKING_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_09/"
    PYSERINI_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/pyserini/"
    
    TREC_EVAL_FULLPATH=PYSERINI_FOLDER+"tools/eval/trec_eval.9.0.4/trec_eval"
    
    os.environ["ANSERINI_CLASSPATH"]="/media/eduseiti/bigdata01/unicamp/ia368v_dd/anserini/target"

    PYSERINI_PREVIOUS_RUN="run.trec_covid_original_complete_20230501_141634.txt"

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m106.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 k

In [4]:
import pandas as pd
import pickle
import numpy as np

import json

import time

import re

from datetime import datetime

from scipy import stats

In [5]:
TREC_COVID_MERGED_FILE="trec_covid_merged_data.tsv"
TREC_COVID_DOCUMENTS_FILE="trec_covid_original_title_text_merged.tsv"

TREC_COVID_QUERIES="trec_covid_queries.tsv"
TREC_COVID_QRELS="trec_covid_qrels.tsv"

API_KEYS_FILE="../api_keys_20230324.json"

pd.set_option('display.max_colwidth', None)

In [6]:
TREC_COVID_ORIGINAL_TITLE_TEXT_MERGED_FILENAME="trec_covid_original_title_text_merged.tsv"
TREC_COVID_ORIGINAL_FOLDER="trec_covid_original"
TREC_COVID_ORIGINAL_INDEX_FOLDER="trec_covid_original/index"
TREC_COVID_ORIGINAL_RUNS_FOLDER="trec_covid_original/runs"

In [7]:
TREC_COVID_RERANKING_RUNS_FOLDER="trec_covid_reranking_runs"

In [8]:
import torch

from transformers import get_linear_schedule_with_warmup, get_constant_schedule
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils import data
from transformers import BatchEncoding

## Set the random seed

In [9]:
# Seed for the NumPy random generator created below.
RANDOM_SEED = 6

# Only a NumPy Generator is seeded here; this cell does not seed torch or Python's `random` module.
# NOTE(review): `rng` is not used by any other cell visible in this notebook.
rng = np.random.default_rng(RANDOM_SEED)

### Initialize reranking model parameters

In [10]:
MODEL_NAME='microsoft/MiniLM-L12-H384-uncased'
MS_MARCO_PRETRAINED_MODEL="pretrain_20230315_180741"

MAX_TOKENS_LENGTH=512

RETURN_OVERFLOWING_TOKENS=True

In [11]:
TRAIN_OUTPUT_FOLDER="trained_models"

In [12]:
if RETURN_OVERFLOWING_TOKENS:
    TREC_COVID_TOKENIZED_BM25_RUN="trec_covid_tokenized_{}.pkl"
else:
    TREC_COVID_TOKENIZED_BM25_RUN="trec_covid_tokenized_no_overflow_{}.pkl"

In [13]:
PYSERINI_TEST_RUN_RERANKED_FILENAME_FORMAT="run.trec_covid_reranking_{}_{}_{}.txt"

In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
device

device(type='cuda')

In [17]:
# Reuse the BM25 run from an earlier session when its file is still present in the runs folder.
run_ready = os.path.exists(os.path.join(TREC_COVID_ORIGINAL_RUNS_FOLDER, PYSERINI_PREVIOUS_RUN))

if run_ready:
    pyserini_runfile = PYSERINI_PREVIOUS_RUN
else:
    # pyserini_runfile is not assigned here; the BM25 search cell creates it for the new run.
    print("Need to execute Pyserini run...")

### Convert TREC COVID to Pyserini's format

In [18]:
if not run_ready:
    # Only runs when no previous BM25 run file was found (run_ready is False).
    # The script path is relative, so it is resolved against the current working directory.
    !python3 ./pyserini/tools/scripts/msmarco/convert_collection_to_jsonl.py \
        --collection-path {TREC_COVID_ORIGINAL_TITLE_TEXT_MERGED_FILENAME} \
        --output-folder {TREC_COVID_ORIGINAL_FOLDER}

### Create a Pyserini BM25 index for the entire TREC COVID dataset

In [19]:
if not run_ready:
    # NOTE: this changes the process working directory and does not change it back in this cell;
    # the BM25 search cell ends with os.chdir(WORKING_FOLDER), which restores it.
    os.chdir(PYSERINI_FOLDER)

    # Index the converted collection; input and index locations are given relative to WORKING_FOLDER
    # so they do not depend on the directory change above.
    !python3 -m pyserini.index.lucene \
        --collection JsonCollection \
        --input {WORKING_FOLDER}/{TREC_COVID_ORIGINAL_FOLDER} \
        --index {WORKING_FOLDER}/{TREC_COVID_ORIGINAL_INDEX_FOLDER} \
        --generator DefaultLuceneDocumentGenerator \
        --threads 9 \
        --storePositions --storeDocvectors --storeRaw

## Perform BM25 search over TREC-COVID queries

In [20]:
if not run_ready:
    # The timestamp makes every new run file name unique.
    execution_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Name of the run file produced below; the later cells read the run through this variable.
    pyserini_runfile = "run.trec_covid_original_complete_{}.txt".format(execution_timestamp)

    # Retrieve up to 1000 hits per query with BM25 and write them in TREC run format.
    !python3 -m pyserini.search.lucene \
        --index {WORKING_FOLDER}/{TREC_COVID_ORIGINAL_INDEX_FOLDER} \
        --topics {WORKING_FOLDER}/{TREC_COVID_QUERIES} \
        --output {WORKING_FOLDER}/{TREC_COVID_ORIGINAL_RUNS_FOLDER}/{pyserini_runfile} \
        --output-format trec \
        --hits 1000 \
        --bm25 --k1 0.82 --b 0.68 \
        --threads 8

    # Go back to the working folder: the index-building cell switched to PYSERINI_FOLDER.
    os.chdir(WORKING_FOLDER)

## Now prepare the BM25 results for reranking

### Load the TREC COVID documents

In [21]:
trec_covid_docs_df = pd.read_csv(TREC_COVID_DOCUMENTS_FILE, sep='\t', header=None, names=['corpus-id', 'corpus-title-text'])

display(trec_covid_docs_df.head())

print(trec_covid_docs_df.shape)

Unnamed: 0,corpus-id,corpus-title-text
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi ArabiaOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high."
1,02tnwd4m,"Nitric oxide: a pro-inflammatory mediator in lung disease?Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung."
2,ejv2xln0,"Surfactant protein-D and pulmonary host defenseSurfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases."
3,2b73a28n,"Role of endothelin-1 in lung diseaseEndothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease."
4,9785vg6d,"Gene expression in epithelial cells in response to pneumovirus infectionRespiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner."


(171325, 2)


### Load the TREC COVID queries

In [22]:
trec_covid_queries_df = pd.read_csv(TREC_COVID_QUERIES, sep='\t', header=None, names=['query-id', 'query-text'])

display(trec_covid_queries_df)

Unnamed: 0,query-id,query-text
0,1,what is the origin of COVID-19
1,2,how does the coronavirus respond to changes in the weather
2,3,will SARS-CoV2 infected people develop immunity? Is cross protection possible?
3,4,what causes death from Covid-19?
4,5,what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
5,6,what types of rapid testing for Covid-19 have been developed?
6,7,are there serological tests that detect antibodies to coronavirus?
7,8,how has lack of testing availability led to underreporting of true incidence of Covid-19?
8,9,how has COVID-19 affected Canada
9,10,has social distancing had an impact on slowing the spread of COVID-19?


### Now, load the BM25 run

In [23]:
bm25_run_df = pd.read_csv(os.path.join(TREC_COVID_ORIGINAL_RUNS_FOLDER, pyserini_runfile), 
                          sep=" ", 
                          header=None, 
                          names=['query-id', 'Q0', 'doc-id', 'doc-order', 'doc-score', 'comment'])

In [24]:
bm25_run_df.shape

(50000, 6)

In [25]:
bm25_run_df.head()

Unnamed: 0,query-id,Q0,doc-id,doc-order,doc-score,comment
0,1,Q0,dv9m19yk,1,7.7299,Anserini
1,1,Q0,hmvo5b0q,2,6.4755,Anserini
2,1,Q0,0paafp5j,3,6.4311,Anserini
3,1,Q0,96zsd27n,4,6.431099,Anserini
4,1,Q0,5d7zien3,5,6.2127,Anserini


### Check whether the data has already been tokenized

In [26]:
# Pickle file holding the tokenized data for the current BM25 run (named after the run file).
tokenized_cache_path = TREC_COVID_TOKENIZED_BM25_RUN.format(os.path.splitext(pyserini_runfile)[0])

if os.path.exists(tokenized_cache_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load cache files written by this notebook.
    with open(tokenized_cache_path, "rb") as inputFile:
        tokenized_data = pickle.load(inputFile)

    # Restore the three objects stored by the tokenization cell.
    trec_queries_tokens, trec_docs_tokens, bm25_run_with_all_data_df = (
        tokenized_data[cached_name]
        for cached_name in ('trec_queries_tokens', 'trec_docs_tokens', 'bm25_run_with_all_data_df')
    )

    tokenized_data_read = True
else:
    tokenized_data_read = False

    print("Need to create the tokenized BM25 run data...")

### Build the test data to be tokenized

#### First, filter the TREC COVID topics text using the corresponding IDs on the run

In [27]:
if not tokenized_data_read:
    filtered_topics = trec_covid_queries_df.merge(bm25_run_df, left_on='query-id', right_on='query-id', how='inner')

    display(filtered_topics)

    bm25_run_with_all_data_df = filtered_topics.merge(trec_covid_docs_df, left_on='doc-id', right_on='corpus-id', how='inner')

    display(bm25_run_with_all_data_df)
else:
    print("Has already read the tokenized data...")

Has already read the tokenized data...


#### Now, tokenize both topics and returned texts

In [28]:
# Tokenize the queries and the retrieved documents, then cache the result on disk.
# Skipped entirely when the tokenized data was already loaded from the cache file.
if not tokenized_data_read:
    # One tokenized query per row of the merged run, so queries and documents stay aligned by row.
    trec_queries_tokens = tokenizer(bm25_run_with_all_data_df['query-text'].tolist(), 
                                    truncation=True, 
                                    max_length=MAX_TOKENS_LENGTH, 
                                    return_length=True)

    print(stats.describe(trec_queries_tokens['length']))

    # The document budget is the model limit minus the longest query, so that a query followed by
    # a document chunk never exceeds MAX_TOKENS_LENGTH once they are concatenated.
    trec_docs_tokens = tokenizer(bm25_run_with_all_data_df['corpus-title-text'].tolist(), 
                                 truncation=True,
                                 return_overflowing_tokens=RETURN_OVERFLOWING_TOKENS, 
                                 max_length=MAX_TOKENS_LENGTH - np.max(trec_queries_tokens['length']), 
                                 return_length=True)

    print(stats.describe(trec_docs_tokens['length']))
    
    # Check whether documents were split into extra (overflowing) chunks.

    original_length = bm25_run_with_all_data_df.shape[0]

    if 'overflow_to_sample_mapping' in trec_docs_tokens:
        # More mapping entries than run rows means some documents produced more than one chunk.
        if original_length < len(trec_docs_tokens['overflow_to_sample_mapping']):
            print("Added {} overflowing texts...".format(len(trec_docs_tokens['overflow_to_sample_mapping']) - original_length))
    else:
        # No mapping was returned: add an identity mapping (entry i -> row i) so the downstream
        # code, which always indexes 'overflow_to_sample_mapping', keeps working.

        trec_docs_tokens['overflow_to_sample_mapping'] = np.array(list(range(bm25_run_with_all_data_df.shape[0])))


    # Save the tokenized data under a file name derived from the run file name, so the next
    # execution can load it instead of tokenizing again.

    with open(TREC_COVID_TOKENIZED_BM25_RUN.format(os.path.splitext(pyserini_runfile)[0]), "wb") as outputFile:
        pickle.dump({'trec_queries_tokens': trec_queries_tokens,
                     'trec_docs_tokens': trec_docs_tokens,
                     'bm25_run_with_all_data_df': bm25_run_with_all_data_df}, outputFile, pickle.HIGHEST_PROTOCOL)    
else:
    print("Has already read the tokenized data...")    

Has already read the tokenized data...


### Build the concatenated topic + document to feed the model

Drop the leading `[CLS]` token from each document token sequence before appending it to the query tokens.

In [29]:
# Each document chunk is paired with the query of the run row it came from; overflowing chunks of
# the same document share that row's query.
chunk_to_sample = trec_docs_tokens['overflow_to_sample_mapping']

# For every field: query tokens followed by the chunk tokens without their first position
# (the chunk's own leading special token).
test_input_ids = [
    trec_queries_tokens['input_ids'][sample] + chunk[1:]
    for sample, chunk in zip(chunk_to_sample, trec_docs_tokens['input_ids'])
]

test_token_type_ids = [
    trec_queries_tokens['token_type_ids'][sample] + chunk[1:]
    for sample, chunk in zip(chunk_to_sample, trec_docs_tokens['token_type_ids'])
]

test_attention_mask = [
    trec_queries_tokens['attention_mask'][sample] + chunk[1:]
    for sample, chunk in zip(chunk_to_sample, trec_docs_tokens['attention_mask'])
]

In [30]:
x_test = {'input_ids': test_input_ids, 
          'token_type_ids': test_token_type_ids, 
          'attention_mask': test_attention_mask}

Create fixed placeholder targets (all ones); they are only needed because the Dataset class expects a targets array.

In [31]:
y_test = np.ones(len(trec_docs_tokens['input_ids']), dtype=bool)

## Initialize some model structures before doing anything

In [32]:
def collate_fn(batch):
    """Pad a list of per-sample feature dicts into a single batch of PyTorch tensors.

    Padding is delegated to the notebook-level `tokenizer`. The padded result is wrapped in a
    `BatchEncoding` before being returned.

    Args:
        batch: list of dicts as produced by `Dataset.__getitem__`.

    Returns:
        A `BatchEncoding` holding the padded tensors.
    """
    return BatchEncoding(tokenizer.pad(batch, return_tensors='pt'))


class Dataset(data.Dataset):
    """Map-style dataset over already-tokenized examples and their targets.

    `examples` is a dict of per-sample lists (at least 'input_ids' and 'attention_mask');
    `targets` is an indexable sequence holding one label per sample.
    """

    # Features copied from `examples` into every item. 'token_type_ids' is not forwarded.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, examples, targets):
        self.examples = examples
        self.targets = targets

    def __len__(self):
        """Number of samples, taken from the length of the 'input_ids' list."""
        return len(self.examples['input_ids'])

    def __getitem__(self, idx):
        """Return the features of sample `idx` plus its target as an int under 'labels'."""
        item = {key: self.examples[key][idx] for key in self._FEATURE_KEYS}
        item['labels'] = int(self.targets[idx])

        return item

In [33]:
def collect_reranking(model, dataloader, set_name):
    """Run the model over every batch of `dataloader` and collect the logits.

    The model is put in evaluation mode and gradients are disabled. The mean of the per-batch
    losses is printed; it is computed against whatever 'labels' the batches carry, so it is only
    informative when those labels are real targets.

    Args:
        model: sequence-classification model returning `.loss` and `.logits`.
        dataloader: iterable of batches supporting `.to(device)`.
        set_name: label used for the progress bar and for the printed loss line.

    Returns:
        List with one CPU logits tensor per batch, in dataloader order.
    """
    batch_losses = []
    batch_logits = []

    model.eval()

    progress_bar = tqdm.tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False,
                             bar_format='{l_bar}{bar:20}{r_bar}{bar:-20b}', colour='GREEN',
                             file=sys.stdout, position=0, leave=True)

    with torch.no_grad():
        for batch in progress_bar:
            outputs = model(**batch.to(device))

            batch_losses.append(outputs.loss.cpu().item())
            batch_logits.append(outputs.logits.cpu())

    print("{} loss: {:0.4f}".format(set_name, np.mean(batch_losses)))

    return batch_logits

### Create the dataset and the dataloader

In [34]:
dataset_test = Dataset(x_test, y_test)

### Make sure the dataloader preserves the samples order (no shuffling!!!)

In [35]:
batch_size=300

dataloader_test = data.DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [36]:
TREC_RESULT_LINE_FORMAT="{}\tQ0\t{}\t{}\t{}\tInPars_reranking\n"

In [37]:
def consolidate_reranking_scores_and_check_performance(pyserini_runfile, consolidation_approach, bm25_run_with_all_data_df, query_scores):
    
    consolidated_scores = []

    # Consolidate the scores according to the defined approach.
    
    for i in range(bm25_run_with_all_data_df.shape[0]):
        if consolidation_approach == 'mean':
            consolidated_scores.append(np.mean(query_scores[i]))
        else:
            consolidated_scores.append(np.max(query_scores[i]))
            
    bm25_run_with_all_data_df['reranking_scores'] = consolidated_scores
    
    test_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    reranked_run = PYSERINI_TEST_RUN_RERANKED_FILENAME_FORMAT.format(os.path.splitext(pyserini_runfile)[0], test_timestamp, consolidation_approach)
    
    if not os.path.exists(TREC_COVID_RERANKING_RUNS_FOLDER):
        os.makedirs(TREC_COVID_RERANKING_RUNS_FOLDER)
    
    with open(os.path.join(TREC_COVID_RERANKING_RUNS_FOLDER, reranked_run), 'w') as outputFile:
        for group_name, group_df in bm25_run_with_all_data_df.groupby('query-id'):
            group_df = group_df.sort_values('reranking_scores', ascending=False).reset_index(drop=True)

            for i, row in group_df.iterrows():
                outputFile.write(TREC_RESULT_LINE_FORMAT.format(group_name, row['doc-id'], i + 1, row['reranking_scores']))
                
    result = !{TREC_EVAL_FULLPATH} -c -mrecall.1000 -mmap -mndcg_cut.10 -mrecip_rank.100 \
                 {WORKING_FOLDER}/{TREC_COVID_QRELS} {WORKING_FOLDER}/{TREC_COVID_RERANKING_RUNS_FOLDER}/{reranked_run}

    results = {}

    for line in result:

        line = line.split('\t')

        results[line[0].strip()] = np.float32(line[-1])    
        
    return({"consolidated_scores": consolidated_scores,
            "reranked_run": reranked_run,
            "results": results})

In [38]:
def rerank_BM25_retrieved_texts(model_checkpoint, dataloader_test, trec_docs_tokens, pyserini_runfile, bm25_run_with_all_data_df):
    """Score the BM25 results with a checkpoint and evaluate 'max' and 'mean' consolidation.

    Args:
        model_checkpoint: path or name passed to `AutoModelForSequenceClassification.from_pretrained`.
        dataloader_test: dataloader yielding the query+document batches, in unshuffled order.
        trec_docs_tokens: tokenized documents; only 'overflow_to_sample_mapping' is read here.
        pyserini_runfile: name of the original BM25 run file.
        bm25_run_with_all_data_df: merged run DataFrame (modified by the consolidation step).

    Returns:
        Tuple (reranking_scores, max_results, mean_results): the per-batch logits and the result
        dicts of the two consolidation approaches.
    """
    # Load the checkpoint onto the notebook-level `device`.
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint).to(device)
    print('Parameters', model.num_parameters())

    reranking_scores = collect_reranking(model=model, dataloader=dataloader_test, set_name='TREC COVID')

    # Column 1 of the logits is used as the relevance score of each chunk.
    matches_relevance_score = np.concatenate([batch_logits[:, 1].numpy() for batch_logits in reranking_scores])

    # Group the chunk scores by the run row each chunk belongs to.
    sample_of_chunk = trec_docs_tokens['overflow_to_sample_mapping']
    query_scores = {}

    for chunk_position, match_score in enumerate(matches_relevance_score):
        query_scores.setdefault(sample_of_chunk[chunk_position], []).append(match_score)

    max_results = consolidate_reranking_scores_and_check_performance(pyserini_runfile, "max", bm25_run_with_all_data_df, query_scores)

    print("\n\nMax consolidation results:")
    print(max_results['results'])

    mean_results = consolidate_reranking_scores_and_check_performance(pyserini_runfile, "mean", bm25_run_with_all_data_df, query_scores)

    print("\n\nMean consolidation results:")
    print(mean_results['results'])

    return reranking_scores, max_results, mean_results

### Run the tests with different pretrained checkpoints

In [None]:
MSMARCO_rerank, MSMARCO_max, MSMARCO_mean = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, MS_MARCO_PRETRAINED_MODEL),
                                                                        dataloader_test,
                                                                        trec_docs_tokens,
                                                                        pyserini_runfile, 
                                                                        bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/435 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:52<00:00,  1.91s/it][32m[0m
TREC COVID loss: 1.6150


Max consolidation results:
{'map': 0.1859, 'recip_rank': 0.8343, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5796}


Mean consolidation results:
{'map': 0.1831, 'recip_rank': 0.8123, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5873}


In [None]:
LLM_rerank_best_eval, LLM_max_best_eval, LLM_mean_best_eval = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, 
                                                                                                       "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_01_epoch_20230503_005356_eval_0.1699"),
                                                                                          dataloader_test,
                                                                                          trec_docs_tokens,
                                                                                          pyserini_runfile, 
                                                                                          bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:49<00:00,  1.24s/it][32m[0m
TREC COVID loss: 3.5743


Max consolidation results:
{'map': 0.1842, 'recip_rank': 0.7632, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5376}


Mean consolidation results:
{'map': 0.1829, 'recip_rank': 0.7196, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5207}


In [None]:
LLM_rerank_06, LLM_max_06, LLM_mean_06 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_06_epoch_20230503_012701_0.0426"),
                                                                     dataloader_test,
                                                                     trec_docs_tokens,
                                                                     pyserini_runfile, 
                                                                     bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:50<00:00,  1.24s/it][32m[0m
TREC COVID loss: 5.5414


Max consolidation results:
{'map': 0.1848, 'recip_rank': 0.7892, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5905}


Mean consolidation results:
{'map': 0.1848, 'recip_rank': 0.7784, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5854}


In [None]:
LLM_rerank_08, LLM_max_08, LLM_mean_08 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_08_epoch_20230503_014021_0.0373"),
                                                                     dataloader_test,
                                                                     trec_docs_tokens,
                                                                     pyserini_runfile, 
                                                                     bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:48<00:00,  1.23s/it][32m[0m
TREC COVID loss: 5.5695


Max consolidation results:
{'map': 0.1863, 'recip_rank': 0.764, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5509}


Mean consolidation results:
{'map': 0.1864, 'recip_rank': 0.7478, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5355}


In [None]:
LLM_rerank_10, LLM_max_10, LLM_mean_10 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_10_epoch_20230503_015341_0.0295"),
                                                                     dataloader_test,
                                                                     trec_docs_tokens,
                                                                     pyserini_runfile, 
                                                                     bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:49<00:00,  1.23s/it][32m[0m
TREC COVID loss: 5.7646


Max consolidation results:
{'map': 0.1809, 'recip_rank': 0.7432, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5572}


Mean consolidation results:
{'map': 0.181, 'recip_rank': 0.7641, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5569}


In [None]:
LLM_rerank_13, LLM_max_13, LLM_mean_13 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_13_epoch_20230503_021342_0.0247"),
                                                                     dataloader_test,
                                                                     trec_docs_tokens,
                                                                     pyserini_runfile, 
                                                                     bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:45<00:00,  1.21s/it][32m[0m
TREC COVID loss: 6.2144


Max consolidation results:
{'map': 0.1752, 'recip_rank': 0.746, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5152}


Mean consolidation results:
{'map': 0.1756, 'recip_rank': 0.7455, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5123}


In [None]:
LLM_rerank_04, LLM_max_04, LLM_mean_04 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_04_epoch_20230503_011343_0.0489"),
                                                                     dataloader_test,
                                                                     trec_docs_tokens,
                                                                     pyserini_runfile, 
                                                                     bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:49<00:00,  1.23s/it][32m[0m
TREC COVID loss: 5.2219


Max consolidation results:
{'map': 0.1853, 'recip_rank': 0.7777, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5499}


Mean consolidation results:
{'map': 0.185, 'recip_rank': 0.7503, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5291}


### Tests without token overflow

In [None]:
# "Without token overflow" run: rerank with the `_06_epoch_` checkpoint.
# The `_no` suffix presumably stands for "no overflow" — TODO confirm.
LLM_rerank_06_no, LLM_max_06_no, LLM_mean_06_no = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_06_epoch_20230503_012701_0.0426",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/167 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 167/167 [03:25<00:00,  1.23s/it][32m[0m
TREC COVID loss: 5.5239


Max consolidation results:
{'map': 0.1863, 'recip_rank': 0.7834, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.589}


Mean consolidation results:
{'map': 0.1863, 'recip_rank': 0.7834, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.589}


In [None]:
# "Without token overflow" run: rerank with the `_08_epoch_` checkpoint.
LLM_rerank_08_no, LLM_max_08_no, LLM_mean_08_no = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_08_epoch_20230503_014021_0.0373",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 167/167 [03:21<00:00,  1.21s/it][32m[0m
TREC COVID loss: 5.5688


Max consolidation results:
{'map': 0.1877, 'recip_rank': 0.7491, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5468}


Mean consolidation results:
{'map': 0.1877, 'recip_rank': 0.7491, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5468}


In [None]:
# "Without token overflow" run: rerank with the `_07_epoch_` checkpoint.
LLM_rerank_07_no, LLM_max_07_no, LLM_mean_07_no = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_07_epoch_20230503_013342_0.0419",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 167/167 [03:21<00:00,  1.21s/it][32m[0m
TREC COVID loss: 5.2299


Max consolidation results:
{'map': 0.1763, 'recip_rank': 0.6811, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5096}


Mean consolidation results:
{'map': 0.1763, 'recip_rank': 0.6811, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5096}


In [None]:
# "Without token overflow" run: rerank with the model stored under
# MS_MARCO_PRETRAINED_MODEL inside TRAIN_OUTPUT_FOLDER.
MSMARCO_rerank_no, MSMARCO_max_no, MSMARCO_mean_no = rerank_BM25_retrieved_texts(
    os.path.join(TRAIN_OUTPUT_FOLDER, MS_MARCO_PRETRAINED_MODEL),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 167/167 [03:21<00:00,  1.20s/it][32m[0m
TREC COVID loss: 1.4422


Max consolidation results:
{'map': 0.1844, 'recip_rank': 0.826, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5793}


Mean consolidation results:
{'map': 0.1844, 'recip_rank': 0.826, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5793}


### Tests on fine-tuning the best MS MARCO model

In [39]:
# MS MARCO fine-tuning series: rerank with the `_00_epoch_` checkpoint
# (file-name suffix 0.2501).
LLM_rerank_00_ms, LLM_max_00_ms, LLM_mean_00_ms = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_00_epoch_20230503_175030_0.2501",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:52<00:00,  1.25s/it][32m[0m
TREC COVID loss: 2.3788


Max consolidation results:
{'map': 0.2037, 'recip_rank': 0.7927, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.621}


Mean consolidation results:
{'map': 0.2011, 'recip_rank': 0.7787, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6148}


In [40]:
# MS MARCO fine-tuning series: rerank with the `_01_epoch_` checkpoint
# (file-name suffix 0.1441).
LLM_rerank_01_ms, LLM_max_01_ms, LLM_mean_01_ms = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_01_epoch_20230503_175718_0.1441",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:51<00:00,  1.24s/it][32m[0m
TREC COVID loss: 2.7602


Max consolidation results:
{'map': 0.2009, 'recip_rank': 0.8463, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6042}


Mean consolidation results:
{'map': 0.2008, 'recip_rank': 0.7949, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5881}


In [41]:
# MS MARCO fine-tuning series: rerank with the `_02_epoch_` checkpoint
# (file-name suffix 0.1194).
LLM_rerank_02_ms, LLM_max_02_ms, LLM_mean_02_ms = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_02_epoch_20230503_180408_0.1194",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:46<00:00,  1.22s/it][32m[0m
TREC COVID loss: 3.0494


Max consolidation results:
{'map': 0.198, 'recip_rank': 0.8079, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5837}


Mean consolidation results:
{'map': 0.1996, 'recip_rank': 0.77, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5775}


In [42]:
# MS MARCO fine-tuning series: rerank with the `_03_epoch_` checkpoint
# (file-name suffix 0.1055).
LLM_rerank_03_ms, LLM_max_03_ms, LLM_mean_03_ms = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_03_epoch_20230503_181056_0.1055",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:47<00:00,  1.22s/it][32m[0m
TREC COVID loss: 3.2194


Max consolidation results:
{'map': 0.1975, 'recip_rank': 0.7609, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5679}


Mean consolidation results:
{'map': 0.1997, 'recip_rank': 0.7554, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5716}


### Fine-tuning test on the MS MARCO model with a smaller learning rate (1e-8)

In [43]:
# Lower-learning-rate series: rerank with the `_00_epoch_` checkpoint
# (file-name suffix 0.4395).
LLM_rerank_00_ms_lr_1, LLM_max_00_ms_lr_1, LLM_mean_00_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_00_epoch_20230503_204234_0.4395",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:45<00:00,  1.21s/it][32m[0m
TREC COVID loss: 1.6707


Max consolidation results:
{'map': 0.1872, 'recip_rank': 0.8253, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5879}


Mean consolidation results:
{'map': 0.1844, 'recip_rank': 0.8133, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5927}


In [40]:
# Lower-learning-rate series: rerank with the `_01_epoch_` checkpoint
# (file-name suffix 0.4281).
LLM_rerank_01_ms_lr_1, LLM_max_01_ms_lr_1, LLM_mean_01_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_01_epoch_20230503_204914_0.4281",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:52<00:00,  1.25s/it][32m[0m
TREC COVID loss: 1.7174


Max consolidation results:
{'map': 0.1882, 'recip_rank': 0.838, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5903}


Mean consolidation results:
{'map': 0.1854, 'recip_rank': 0.8263, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5988}


In [41]:
# Lower-learning-rate series: rerank with the `_02_epoch_` checkpoint
# (file-name suffix 0.4163).
# NOTE(review): the `01_ms_lr_2` suffix breaks the `<epoch>_ms_lr_1` pattern
# used by the other cells of this series; presumably `02_ms_lr_1` was meant.
# Confirm no later cell depends on the current names before renaming.
LLM_rerank_01_ms_lr_2, LLM_max_01_ms_lr_2, LLM_mean_01_ms_lr_2 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_02_epoch_20230503_205547_0.4163",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:45<00:00,  1.21s/it][32m[0m
TREC COVID loss: 1.7605


Max consolidation results:
{'map': 0.1891, 'recip_rank': 0.8413, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.595}


Mean consolidation results:
{'map': 0.1863, 'recip_rank': 0.8297, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.605}


In [42]:
# Lower-learning-rate series: rerank with the `_03_epoch_` checkpoint
# (file-name suffix 0.3915).
# NOTE(review): the `01_ms_lr_3` suffix breaks the `<epoch>_ms_lr_1` pattern
# used by the other cells of this series; presumably `03_ms_lr_1` was meant.
# Confirm no later cell depends on the current names before renaming.
LLM_rerank_01_ms_lr_3, LLM_max_01_ms_lr_3, LLM_mean_01_ms_lr_3 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_03_epoch_20230503_210220_0.3915",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:47<00:00,  1.22s/it][32m[0m
TREC COVID loss: 1.7971


Max consolidation results:
{'map': 0.19, 'recip_rank': 0.8415, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5968}


Mean consolidation results:
{'map': 0.1871, 'recip_rank': 0.8299, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6094}


In [43]:
# Lower-learning-rate series: rerank with the `_04_epoch_` checkpoint
# (file-name suffix 0.3876).
LLM_rerank_04_ms_lr_1, LLM_max_04_ms_lr_1, LLM_mean_04_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_04_epoch_20230503_210854_0.3876",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:47<00:00,  1.22s/it][32m[0m
TREC COVID loss: 1.8296


Max consolidation results:
{'map': 0.1907, 'recip_rank': 0.8415, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5976}


Mean consolidation results:
{'map': 0.1879, 'recip_rank': 0.8299, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6091}


In [39]:
# Lower-learning-rate series: rerank with the `_05_epochs_` (plural) file.
# NOTE(review): the recorded loss and metrics equal those of the `_04_epoch_`
# cell, so this is presumably the same weights saved at the end of training.
# Confirm, and drop one of the two runs if so.
LLM_rerank_05_ms_lr_1, LLM_max_05_ms_lr_1, LLM_mean_05_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_05_epochs_20230503_210958_0.3876",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:51<00:00,  1.25s/it][32m[0m
TREC COVID loss: 1.8296


Max consolidation results:
{'map': 0.1907, 'recip_rank': 0.8415, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5976}


Mean consolidation results:
{'map': 0.1879, 'recip_rank': 0.8299, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6091}


In [40]:
# Lower-learning-rate series: rerank with the `_00_epoch_` checkpoint
# time-stamped 20230503_211705 (file-name suffix 0.3742).
# NOTE(review): the variable index (06) differs from the checkpoint's epoch
# index (00); presumably a cumulative count over resumed runs — confirm.
LLM_rerank_06_ms_lr_1, LLM_max_06_ms_lr_1, LLM_mean_06_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_00_epoch_20230503_211705_0.3742",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:47<00:00,  1.22s/it][32m[0m
TREC COVID loss: 1.8541


Max consolidation results:
{'map': 0.1915, 'recip_rank': 0.8547, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5995}


Mean consolidation results:
{'map': 0.1886, 'recip_rank': 0.8429, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6113}


In [41]:
# Lower-learning-rate series: rerank with the `_01_epoch_` checkpoint
# time-stamped 20230503_212335 (file-name suffix 0.3734).
# NOTE(review): the variable index (07) differs from the checkpoint's epoch
# index (01); presumably a cumulative count over resumed runs — confirm.
LLM_rerank_07_ms_lr_1, LLM_max_07_ms_lr_1, LLM_mean_07_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_01_epoch_20230503_212335_0.3734",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:46<00:00,  1.22s/it][32m[0m
TREC COVID loss: 1.8813


Max consolidation results:
{'map': 0.1923, 'recip_rank': 0.8575, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6032}


Mean consolidation results:
{'map': 0.1893, 'recip_rank': 0.8479, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6155}


In [42]:
# Lower-learning-rate series: rerank with the `_04_epoch_` checkpoint
# time-stamped 20230503_214306 (file-name suffix 0.3554).
# NOTE(review): the variable index (09) differs from the checkpoint's epoch
# index (04); presumably a cumulative count over resumed runs — confirm.
LLM_rerank_09_ms_lr_1, LLM_max_09_ms_lr_1, LLM_mean_09_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_04_epoch_20230503_214306_0.3554",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:45<00:00,  1.21s/it][32m[0m
TREC COVID loss: 1.9405


Max consolidation results:
{'map': 0.1941, 'recip_rank': 0.8542, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6143}


Mean consolidation results:
{'map': 0.1912, 'recip_rank': 0.8645, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6207}


In [43]:
# Lower-learning-rate series: rerank with the `_00_epoch_` checkpoint
# time-stamped 20230503_224127 (file-name suffix 0.3430).
# NOTE(review): the recorded run of this cell aborted with OutOfMemoryError,
# so the three names below were not bound by it. Re-run on a fresh kernel or
# remove the cell.
LLM_rerank_10_ms_lr_1, LLM_max_10_ms_lr_1, LLM_mean_10_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_00_epoch_20230503_224127_0.3430",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:  80%|[32m████████████████    [0m| 149/186 [03:01<00:45,  1.22s/it][32m[0m


OutOfMemoryError: ignored

In [39]:
# Lower-learning-rate series: rerank with the `_06_epoch_` checkpoint
# time-stamped 20230503_235426 (file-name suffix 0.2963).
LLM_rerank_21_ms_lr_1, LLM_max_21_ms_lr_1, LLM_mean_21_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_06_epoch_20230503_235426_0.2963",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [03:50<00:00,  1.24s/it][32m[0m
TREC COVID loss: 2.0639


Max consolidation results:
{'map': 0.1996, 'recip_rank': 0.8695, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6256}


Mean consolidation results:
{'map': 0.196, 'recip_rank': 0.8723, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6262}


In [40]:
# Lower-learning-rate series: rerank with the `_08_epoch_` checkpoint
# time-stamped 20230504_000728 (file-name suffix 0.2839).
# NOTE(review): the recorded run of this cell aborted with OutOfMemoryError,
# so the three names below were not bound by it. Re-run on a fresh kernel or
# remove the cell.
LLM_rerank_23_ms_lr_1, LLM_max_23_ms_lr_1, LLM_mean_23_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_08_epoch_20230504_000728_0.2839",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   6%|[32m█▎                  [0m| 12/186 [00:14<03:26,  1.19s/it][32m[0m


OutOfMemoryError: ignored

In [39]:
# Lower-learning-rate series: rerank with the `_05_epoch_` checkpoint
# time-stamped 20230504_004406 (file-name suffix 0.2626).
LLM_rerank_32_ms_lr_1, LLM_max_32_ms_lr_1, LLM_mean_32_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_05_epoch_20230504_004406_0.2626",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [02:38<00:00,  1.17it/s][32m[0m
TREC COVID loss: 2.1194


Max consolidation results:
{'map': 0.2016, 'recip_rank': 0.8662, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6306}


Mean consolidation results:
{'map': 0.198, 'recip_rank': 0.874, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6338}


In [40]:
# Lower-learning-rate series: rerank with the `_10_epoch_` checkpoint
# time-stamped 20230504_005543 (file-name suffix 0.2475).
LLM_rerank_37_ms_lr_1, LLM_max_37_ms_lr_1, LLM_mean_37_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_10_epoch_20230504_005543_0.2475",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [02:35<00:00,  1.19it/s][32m[0m
TREC COVID loss: 2.1518


Max consolidation results:
{'map': 0.2025, 'recip_rank': 0.871, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6296}


Mean consolidation results:
{'map': 0.1989, 'recip_rank': 0.8663, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6354}


In [39]:
# Lower-learning-rate series: rerank with the `_23_epoch_` checkpoint
# time-stamped 20230504_012557 (file-name suffix 0.2231).
LLM_rerank_50_ms_lr_1, LLM_max_50_ms_lr_1, LLM_mean_50_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_23_epoch_20230504_012557_0.2231",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/186 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 186/186 [02:38<00:00,  1.17it/s][32m[0m
TREC COVID loss: 2.2356


Max consolidation results:
{'map': 0.2036, 'recip_rank': 0.8304, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6266}


Mean consolidation results:
{'map': 0.2001, 'recip_rank': 0.8374, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6273}


In [40]:
# Lower-learning-rate series: rerank with the `_15_epoch_` checkpoint
# time-stamped 20230504_010721 (file-name suffix 0.2386).
LLM_rerank_42_ms_lr_1, LLM_max_42_ms_lr_1, LLM_mean_42_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_15_epoch_20230504_010721_0.2386",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [02:35<00:00,  1.19it/s][32m[0m
TREC COVID loss: 2.1847


Max consolidation results:
{'map': 0.2031, 'recip_rank': 0.8503, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6365}


Mean consolidation results:
{'map': 0.1995, 'recip_rank': 0.8557, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6381}


In [41]:
# Lower-learning-rate series: rerank with the `_20_epoch_` checkpoint
# time-stamped 20230504_011900 (file-name suffix 0.2287).
LLM_rerank_47_ms_lr_1, LLM_max_47_ms_lr_1, LLM_mean_47_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_20_epoch_20230504_011900_0.2287",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [02:36<00:00,  1.19it/s][32m[0m
TREC COVID loss: 2.2162


Max consolidation results:
{'map': 0.2035, 'recip_rank': 0.8392, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6292}


Mean consolidation results:
{'map': 0.1999, 'recip_rank': 0.8445, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6297}


In [42]:
# Lower-learning-rate series: rerank with the `_16_epoch_` checkpoint
# time-stamped 20230504_010941 (file-name suffix 0.2360).
LLM_rerank_43_ms_lr_1, LLM_max_43_ms_lr_1, LLM_mean_43_ms_lr_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_16_epoch_20230504_010941_0.2360",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 186/186 [02:35<00:00,  1.19it/s][32m[0m
TREC COVID loss: 2.1927


Max consolidation results:
{'map': 0.2032, 'recip_rank': 0.8397, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6359}


Mean consolidation results:
{'map': 0.1995, 'recip_rank': 0.845, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.6356}


### Other reranking tests using models fine-tuned from a partially fine-tuned MS MARCO checkpoint

In [None]:
# Rerank with the `_10_epochs_` checkpoint of the `100_queries_expansion` run
# (file-name suffix 0.1437).
# NOTE(review): LLM_rerank / LLM_max / LLM_mean are assigned again by the
# `_02_epochs_` cell that follows, so these results are lost once both run.
LLM_rerank, LLM_max, LLM_mean = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_10_epochs_20230502_000854_0.1437",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:41<00:00,  1.89s/it][32m[0m
TREC COVID loss: 3.9083
Max consolidation results:


{'map': 0.1818, 'recip_rank': 0.6947, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4847}
Mean consolidation results:


{'map': 0.1819, 'recip_rank': 0.6982, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4899}


In [None]:
# Rerank with the `_02_epochs_` checkpoint of the `100_queries_expansion` run
# (file-name suffix 0.2561).
# NOTE(review): this rebinds LLM_rerank / LLM_max / LLM_mean, which the
# preceding `_10_epochs_` cell also assigns; give each run its own names if
# both results are needed later.
LLM_rerank, LLM_max, LLM_mean = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_02_epochs_20230502_112819_0.2561",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/435 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:34<00:00,  1.87s/it][32m[0m
TREC COVID loss: 2.9322


Max consolidation results:
{'map': 0.1974, 'recip_rank': 0.7895, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.587}


Mean consolidation results:
{'map': 0.1941, 'recip_rank': 0.7757, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5921}


In [None]:
# Rerank with the `_03_epochs_` checkpoint of the `100_queries_expansion` run
# (file-name suffix 0.1698).
LLM_rerank_3, LLM_max_3, LLM_mean_3 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_03_epochs_20230502_115502_0.1698",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:58<00:00,  1.93s/it][32m[0m
TREC COVID loss: 3.1475


Max consolidation results:
{'map': 0.1974, 'recip_rank': 0.8083, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5527}


Mean consolidation results:
{'map': 0.196, 'recip_rank': 0.815, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5406}


In [None]:
# Rerank with the `_01_epochs_` checkpoint of the `100_queries_expansion` run
# (file-name suffix 0.4151).
LLM_rerank_1, LLM_max_1, LLM_mean_1 = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_01_epochs_20230502_132029_0.4151",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [14:08<00:00,  1.95s/it][32m[0m
TREC COVID loss: 2.1366


Max consolidation results:
{'map': 0.1766, 'recip_rank': 0.655, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4468}


Mean consolidation results:
{'map': 0.1755, 'recip_rank': 0.6622, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4568}


In [None]:
# Rerank the BM25 run with the checkpoint whose file name is tagged
# `1000_queries_expansion` / `00_epoch`. The three returned values are
# stored under the `_1k_0` suffix (presumably the reranked run plus its
# max- and mean-consolidated variants -- confirm against the definition
# of rerank_BM25_retrieved_texts).
(
    LLM_rerank_1k_0,
    LLM_max_1k_0,
    LLM_mean_1k_0,
) = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_00_epoch_20230502_182845_eval_0.2185",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/435 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 435/435 [23:05<00:00,  3.19s/it][32m[0m
TREC COVID loss: 3.2673


Max consolidation results:
{'map': 0.1842, 'recip_rank': 0.7851, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5241}


Mean consolidation results:
{'map': 0.1838, 'recip_rank': 0.7911, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5083}


In [None]:
# Rerank the BM25 run with the checkpoint whose file name is tagged
# `1000_queries_expansion` / `10_epochs`. The three returned values are
# stored under the `_1k_2` suffix.
# NOTE(review): the `_2` suffix does not match the `10_epochs` tag in the
# checkpoint file name -- confirm which one reflects the intended model.
(
    LLM_rerank_1k_2,
    LLM_max_1k_2,
    LLM_mean_1k_2,
) = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_10_epochs_20230502_192235_0.0774",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [15:38<00:00,  2.16s/it][32m[0m
TREC COVID loss: 5.0743


Max consolidation results:
{'map': 0.1929, 'recip_rank': 0.8333, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5499}


Mean consolidation results:
{'map': 0.1937, 'recip_rank': 0.7811, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5304}


In [None]:
# Rerank the BM25 run with the checkpoint whose file name is tagged
# `1000_queries_expansion` / `03_epoch`. The three returned values are
# stored under the `_1k_3` suffix (presumably the reranked run plus its
# max- and mean-consolidated variants -- confirm against the definition
# of rerank_BM25_retrieved_texts).
(
    LLM_rerank_1k_3,
    LLM_max_1k_3,
    LLM_mean_1k_3,
) = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_03_epoch_20230502_204303_0.0687",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [33:31<00:00,  4.62s/it][32m[0m
TREC COVID loss: 5.5671


Max consolidation results:
{'map': 0.1986, 'recip_rank': 0.8166, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5683}


Mean consolidation results:
{'map': 0.1997, 'recip_rank': 0.7976, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5525}


In [None]:
# Rerank the BM25 run with the checkpoint whose file name is tagged
# `1000_queries_expansion` / `09_epoch`. The three returned values are
# stored under the `_1k_9` suffix (presumably the reranked run plus its
# max- and mean-consolidated variants -- confirm against the definition
# of rerank_BM25_retrieved_texts).
(
    LLM_rerank_1k_9,
    LLM_max_1k_9,
    LLM_mean_1k_9,
) = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_09_epoch_20230502_212226_0.0258",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [26:48<00:00,  3.70s/it][32m[0m
TREC COVID loss: 6.5316


Max consolidation results:
{'map': 0.1928, 'recip_rank': 0.7842, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.546}


Mean consolidation results:
{'map': 0.1942, 'recip_rank': 0.7537, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5204}


In [None]:
# Rerank the BM25 run with the checkpoint whose file name is tagged
# `1000_queries_expansion` / `05_epoch`. The three returned values are
# stored under the `_1k_5` suffix (presumably the reranked run plus its
# max- and mean-consolidated variants -- confirm against the definition
# of rerank_BM25_retrieved_texts).
(
    LLM_rerank_1k_5,
    LLM_max_1k_5,
    LLM_mean_1k_5,
) = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_05_epoch_20230502_205610_0.0484",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [16:00<00:00,  2.21s/it][32m[0m
TREC COVID loss: 5.9943


Max consolidation results:
{'map': 0.1921, 'recip_rank': 0.808, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5451}


Mean consolidation results:
{'map': 0.1945, 'recip_rank': 0.7715, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5284}


In [None]:

# Rerank the BM25 run with the checkpoint whose file name is tagged
# `1000_queries_expansion` / `08_epoch`. The three returned values are
# stored under the `_1k_18` suffix.
# NOTE(review): the `_18` suffix does not match the `08_epoch` tag in the
# checkpoint file name -- confirm whether this checkpoint comes from a
# resumed training run or the variable name is a typo.
(
    LLM_rerank_1k_18,
    LLM_max_1k_18,
    LLM_mean_1k_18,
) = rerank_BM25_retrieved_texts(
    os.path.join(
        TRAIN_OUTPUT_FOLDER,
        "checkpoint_eduseiti_1000_queries_expansion_20230502_02.jsonl_08_epoch_20230502_222731_0.0133",
    ),
    dataloader_test,
    trec_docs_tokens,
    pyserini_runfile,
    bm25_run_with_all_data_df,
)

Parameters 33360770
TREC COVID:  29%|[32m█████▊              [0m| 126/435 [04:15<10:25,  2.02s/it][32m[0m


KeyboardInterrupt: 