# TREC-COVID DL reranking evaluation

This notebook performs reranking tests over the BM25 results retrieved for the TREC-COVID queries.

## Prepare the environment

In [1]:
import os
import sys

import tqdm

In [2]:
# True when the `google.colab` module is already loaded, i.e. the notebook runs inside Google Colab.
IN_COLAB='google.colab' in sys.modules
# When True, comet_ml is installed/imported and a Comet experiment is created further down.
LINK_WITH_COMET=False

In [3]:
# Environment-specific setup: on Colab, mount Google Drive, move into the working folder
# and install the required packages; otherwise only define the local folders and tools.
# NOTE(review): TREC_EVAL_FULLPATH is defined only in the local branch, and the Colab
# WORKING_FOLDER has no trailing slash although later cells concatenate it directly with
# file names (e.g. `{WORKING_FOLDER}{TREC_COVID_QRELS}`) -- confirm before running on Colab.
if IN_COLAB:
    from google.colab import drive

    WORKING_FOLDER="/content/drive/MyDrive/unicamp/ia368v_dd/aula_09"

    drive.mount('/content/drive', force_remount=True)

    # Relative file names used by the notebook are resolved against the working folder.
    os.chdir(WORKING_FOLDER)
    
    !pip install transformers -q

    if LINK_WITH_COMET:
        !pip install comet_ml -q
else:
    # NOTE(review): machine-specific absolute paths; adjust them before running elsewhere.
    WORKING_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/ia368v_dd_class_09/"
    PYSERINI_FOLDER="/mnt/0060f889-4c27-409b-b0de-47f5427515e3/unicamp/ia368v_dd/pyserini/"
    
    # trec_eval binary invoked by the evaluation function to score the reranked runs.
    TREC_EVAL_FULLPATH=PYSERINI_FOLDER+"tools/eval/trec_eval.9.0.4/trec_eval"
    
    os.environ["ANSERINI_CLASSPATH"]="/media/eduseiti/bigdata01/unicamp/ia368v_dd/anserini/target"

In [4]:
import pandas as pd
import pickle
import numpy as np

import json

import time

import re

from datetime import datetime

from scipy import stats

if LINK_WITH_COMET:
    from comet_ml import Experiment

In [5]:
# NOTE(review): TREC_COVID_MERGED_FILE is not referenced in the visible part of the notebook.
TREC_COVID_MERGED_FILE="trec_covid_merged_data.tsv"
# Tab-separated corpus file, read as (corpus-id, corpus-title-text) pairs.
TREC_COVID_DOCUMENTS_FILE="trec_covid_original_title_text_merged.tsv"

# Tab-separated (query-id, query-text) topics; also passed to Pyserini as --topics.
TREC_COVID_QUERIES="trec_covid_queries.tsv"
# Relevance judgements passed to trec_eval.
TREC_COVID_QRELS="trec_covid_qrels.tsv"

# JSON file holding the API keys (only read when LINK_WITH_COMET is True).
API_KEYS_FILE="../api_keys_20230324.json"

# Show full cell contents when displaying DataFrames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

In [6]:
# Folders (relative to the working folder) holding the Lucene index and the BM25 run files.
# NOTE(review): TREC_COVID_ORIGINAL_FOLDER itself is not referenced in the visible cells.
TREC_COVID_ORIGINAL_FOLDER="trec_covid_original"
TREC_COVID_ORIGINAL_INDEX_FOLDER="trec_covid_original/index"
TREC_COVID_ORIGINAL_RUNS_FOLDER="trec_covid_original/runs"

In [7]:
import torch

from transformers import get_linear_schedule_with_warmup, get_constant_schedule
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils import data
from transformers import BatchEncoding

## Set the random seed

In [8]:
RANDOM_SEED = 6

# Seed PyTorch too: it is the library that actually runs the model in this notebook,
# so seeding only NumPy would leave any torch-side randomness uncontrolled.
torch.manual_seed(RANDOM_SEED)

# NumPy generator available for any sampling done in the notebook.
rng = np.random.default_rng(RANDOM_SEED)

### Link with COMET

In [9]:
if LINK_WITH_COMET:
    # The Comet API key is kept outside the notebook, in a local JSON file.
    with open(API_KEYS_FILE) as keys_file:
        api_keys = json.load(keys_file)

    experiment = Experiment(
        api_key=api_keys['comet_ml'],
        project_name="InPars reraking",
        workspace="eduseiti",
    )

### Initialize reranking model parameters

In [10]:
# Hugging Face model id; used in this notebook to load the tokenizer.
MODEL_NAME='microsoft/MiniLM-L12-H384-uncased'
# Checkpoint folder name (under TRAIN_OUTPUT_FOLDER) loaded as the baseline reranker.
MS_MARCO_PRETRAINED_MODEL="checkpoint_0.134_20230316_234651"

# Upper bound, in tokens, used when tokenizing queries and document chunks.
MAX_TOKENS_LENGTH=512

In [11]:
# Folder containing the model checkpoints that are evaluated below.
TRAIN_OUTPUT_FOLDER="trained_models"

In [12]:
# Pickle cache of the tokenized BM25 run; the placeholder receives the run file name without extension.
TREC_COVID_TOKENIZED_BM25_RUN="trec_covid_tokenized_{}.pkl"

In [13]:
# Reranked run file name; placeholders: BM25 run name (no extension), timestamp, consolidation approach.
PYSERINI_TEST_RUN_RERANKED_FILENAME_FORMAT="run.trec_covid_reranking_{}_{}_{}.txt"

In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
device

device(type='cuda')

## Perform BM25 search over TREC-COVID queries

In [17]:
# Reuse the previously generated BM25 run when its file is present; otherwise run a new
# Pyserini search and store the result under a timestamped file name.
if os.path.exists(os.path.join(TREC_COVID_ORIGINAL_RUNS_FOLDER, "run.trec_covid_original_complete_20230501_141634.txt")):
    pyserini_runfile = "run.trec_covid_original_complete_20230501_141634.txt"
else:
    execution_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    pyserini_runfile = "run.trec_covid_original_complete_{}.txt".format(execution_timestamp)

    # BM25 (k1=0.82, b=0.68), 1000 hits per topic, output written in TREC run format.
    !python3.8 -m pyserini.search.lucene \
        --index {WORKING_FOLDER}{TREC_COVID_ORIGINAL_INDEX_FOLDER} \
        --topics {WORKING_FOLDER}{TREC_COVID_QUERIES} \
        --output {WORKING_FOLDER}{TREC_COVID_ORIGINAL_RUNS_FOLDER}/{pyserini_runfile} \
        --output-format trec \
        --hits 1000 \
        --bm25 --k1 0.82 --b 0.68 \
        --threads 8

## Now prepare the BM25 results for reranking

### Load the TREC COVID documents

In [18]:
# The corpus file has no header row: each line is "<corpus-id>\t<title + text>".
trec_covid_docs_df = pd.read_csv(
    TREC_COVID_DOCUMENTS_FILE,
    sep='\t',
    header=None,
    names=['corpus-id', 'corpus-title-text'],
)

# Preview the first documents and report the corpus size.
display(trec_covid_docs_df.head())
print(trec_covid_docs_df.shape)

Unnamed: 0,corpus-id,corpus-title-text
0,ug7v899j,"Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi ArabiaOBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise (58.8%) were the most common symptoms, and crepitations (60%), and wheezes (40%) were the most common signs. Most patients with pneumonia had crepitations (79.2%) but only 25% had bronchial breathing. Immunocompromised patients were more likely than non-immunocompromised patients to present with pneumonia (8/9 versus 16/31, P = 0.05). Of the 24 patients with pneumonia, 14 (58.3%) had uneventful recovery, 4 (16.7%) recovered following some complications, 3 (12.5%) died because of M pneumoniae infection, and 3 (12.5%) died due to underlying comorbidities. The 3 patients who died of M pneumoniae pneumonia had other comorbidities. CONCLUSION: our results were similar to published data except for the finding that infections were more common in infants and preschool children and that the mortality rate of pneumonia in patients with comorbidities was high."
1,02tnwd4m,"Nitric oxide: a pro-inflammatory mediator in lung disease?Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models. On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Because of the overwhelming literature on NO• generation and activities in the respiratory tract, it would be beyond the scope of this commentary to review this area comprehensively. Instead, it focuses on recent evidence and concepts of the presumed contribution of NO• to inflammatory diseases of the lung."
2,ejv2xln0,"Surfactant protein-D and pulmonary host defenseSurfactant protein-D (SP-D) participates in the innate response to inhaled microorganisms and organic antigens, and contributes to immune and inflammatory regulation within the lung. SP-D is synthesized and secreted by alveolar and bronchiolar epithelial cells, but is also expressed by epithelial cells lining various exocrine ducts and the mucosa of the gastrointestinal and genitourinary tracts. SP-D, a collagenous calcium-dependent lectin (or collectin), binds to surface glycoconjugates expressed by a wide variety of microorganisms, and to oligosaccharides associated with the surface of various complex organic antigens. SP-D also specifically interacts with glycoconjugates and other molecules expressed on the surface of macrophages, neutrophils, and lymphocytes. In addition, SP-D binds to specific surfactant-associated lipids and can influence the organization of lipid mixtures containing phosphatidylinositol in vitro. Consistent with these diverse in vitro activities is the observation that SP-D-deficient transgenic mice show abnormal accumulations of surfactant lipids, and respond abnormally to challenge with respiratory viruses and bacterial lipopolysaccharides. The phenotype of macrophages isolated from the lungs of SP-D-deficient mice is altered, and there is circumstantial evidence that abnormal oxidant metabolism and/or increased metalloproteinase expression contributes to the development of emphysema. The expression of SP-D is increased in response to many forms of lung injury, and deficient accumulation of appropriately oligomerized SP-D might contribute to the pathogenesis of a variety of human lung diseases."
3,2b73a28n,"Role of endothelin-1 in lung diseaseEndothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease."
4,9785vg6d,"Gene expression in epithelial cells in response to pneumovirus infectionRespiratory syncytial virus (RSV) and pneumonia virus of mice (PVM) are viruses of the family Paramyxoviridae, subfamily pneumovirus, which cause clinically important respiratory infections in humans and rodents, respectively. The respiratory epithelial target cells respond to viral infection with specific alterations in gene expression, including production of chemoattractant cytokines, adhesion molecules, elements that are related to the apoptosis response, and others that remain incompletely understood. Here we review our current understanding of these mucosal responses and discuss several genomic approaches, including differential display reverse transcription-polymerase chain reaction (PCR) and gene array strategies, that will permit us to unravel the nature of these responses in a more complete and systematic manner."


(171325, 2)


### Load the TREC COVID queries

In [19]:
# The topics file has no header row: each line is "<query-id>\t<query-text>".
trec_covid_queries_df = pd.read_csv(
    TREC_COVID_QUERIES,
    sep='\t',
    header=None,
    names=['query-id', 'query-text'],
)

display(trec_covid_queries_df)

Unnamed: 0,query-id,query-text
0,1,what is the origin of COVID-19
1,2,how does the coronavirus respond to changes in the weather
2,3,will SARS-CoV2 infected people develop immunity? Is cross protection possible?
3,4,what causes death from Covid-19?
4,5,what drugs have been active against SARS-CoV or SARS-CoV-2 in animal studies?
5,6,what types of rapid testing for Covid-19 have been developed?
6,7,are there serological tests that detect antibodies to coronavirus?
7,8,how has lack of testing availability led to underreporting of true incidence of Covid-19?
8,9,how has COVID-19 affected Canada
9,10,has social distancing had an impact on slowing the spread of COVID-19?


### Now, load the BM25 run

In [20]:
# The run file is space-separated and has no header row.
bm25_run_path = os.path.join(TREC_COVID_ORIGINAL_RUNS_FOLDER, pyserini_runfile)
bm25_run_columns = ['query-id', 'Q0', 'doc-id', 'doc-order', 'doc-score', 'comment']

bm25_run_df = pd.read_csv(bm25_run_path, sep=" ", header=None, names=bm25_run_columns)

In [21]:
bm25_run_df.shape

(50000, 6)

In [22]:
bm25_run_df.head()

Unnamed: 0,query-id,Q0,doc-id,doc-order,doc-score,comment
0,1,Q0,dv9m19yk,1,7.7299,Anserini
1,1,Q0,hmvo5b0q,2,6.4755,Anserini
2,1,Q0,0paafp5j,3,6.4311,Anserini
3,1,Q0,96zsd27n,4,6.431099,Anserini
4,1,Q0,5d7zien3,5,6.2127,Anserini


### Check whether the data has already been tokenized

In [23]:
# Location of the tokenization cache that belongs to this particular BM25 run.
tokenized_run_path = TREC_COVID_TOKENIZED_BM25_RUN.format(os.path.splitext(pyserini_runfile)[0])

if not os.path.exists(tokenized_run_path):
    tokenized_data_read=False

    print("Need to create the tokenized BM25 run data...")
else:
    # NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open(tokenized_run_path, "rb") as cache_file:
        tokenized_data = pickle.load(cache_file)

    # Restore the three objects stored by the tokenization cell.
    trec_queries_tokens, trec_docs_tokens, bm25_run_with_all_data_df = (
        tokenized_data[entry]
        for entry in ('trec_queries_tokens', 'trec_docs_tokens', 'bm25_run_with_all_data_df')
    )

    tokenized_data_read=True

### Build the test data to be tokenized

#### First, filter the TREC COVID topics text using the corresponding IDs on the run

In [24]:
if tokenized_data_read:
    print("Has already read the tokenized data...")
else:
    # Attach the query text to every BM25 hit (both frames share the 'query-id' column).
    filtered_topics = trec_covid_queries_df.merge(bm25_run_df, on='query-id', how='inner')
    display(filtered_topics)

    # Attach the document text to every hit; hits whose doc-id is absent from the corpus are dropped.
    bm25_run_with_all_data_df = filtered_topics.merge(
        trec_covid_docs_df,
        left_on='doc-id',
        right_on='corpus-id',
        how='inner',
    )
    display(bm25_run_with_all_data_df)

Has already read the tokenized data...


#### Now, tokenize both topics and returned texts

In [25]:
if tokenized_data_read:
    print("Has already read the tokenized data...")    
else:
    # Tokenize the query of every BM25 hit (one entry per row of the merged frame).
    query_texts = bm25_run_with_all_data_df['query-text'].tolist()
    trec_queries_tokens = tokenizer(query_texts,
                                    truncation=True,
                                    max_length=MAX_TOKENS_LENGTH,
                                    return_length=True)
    print(stats.describe(trec_queries_tokens['length']))

    # Reserve room for the longest query so that query + document chunk stay within MAX_TOKENS_LENGTH.
    doc_max_length = MAX_TOKENS_LENGTH - np.max(trec_queries_tokens['length'])

    # Documents longer than the budget are split: the overflow is returned as extra entries,
    # linked to their source row through 'overflow_to_sample_mapping'.
    doc_texts = bm25_run_with_all_data_df['corpus-title-text'].tolist()
    trec_docs_tokens = tokenizer(doc_texts,
                                 truncation=True,
                                 return_overflowing_tokens=True,
                                 max_length=doc_max_length,
                                 return_length=True)
    print(stats.describe(trec_docs_tokens['length']))

    # Report how many extra chunks were created by the overflow handling.
    original_length = bm25_run_with_all_data_df.shape[0]
    chunk_count = len(trec_docs_tokens['overflow_to_sample_mapping'])

    if chunk_count > original_length:
        print("Added {} overflowing texts...".format(chunk_count - original_length))

    # Cache everything so the next execution can skip the tokenization.
    tokenized_run_path = TREC_COVID_TOKENIZED_BM25_RUN.format(os.path.splitext(pyserini_runfile)[0])

    with open(tokenized_run_path, "wb") as cache_file:
        pickle.dump({'trec_queries_tokens': trec_queries_tokens,
                     'trec_docs_tokens': trec_docs_tokens,
                     'bm25_run_with_all_data_df': bm25_run_with_all_data_df}, cache_file, pickle.HIGHEST_PROTOCOL)

Has already read the tokenized data...


### Build the concatenated topic + document to feed the model

Remove the leading `[CLS]` token from each document token sequence before appending it to the query tokens.

In [26]:
# There is one model input per document chunk. Overflowing chunks of the same document share
# the query of the row they came from, found through 'overflow_to_sample_mapping'.
num_chunks = len(trec_docs_tokens['input_ids'])
chunk_to_sample = trec_docs_tokens['overflow_to_sample_mapping']

# For each field, join the query sequence with the chunk sequence minus its first token.
test_input_ids, test_token_type_ids, test_attention_mask = (
    [
        trec_queries_tokens[field][chunk_to_sample[chunk_idx]] + trec_docs_tokens[field][chunk_idx][1:]
        for chunk_idx in range(num_chunks)
    ]
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [27]:
# Model inputs, one entry per document chunk, in the same order as trec_docs_tokens.
x_test = {'input_ids': test_input_ids, 
          'token_type_ids': test_token_type_ids, 
          'attention_mask': test_attention_mask}

Create fixed dummy targets, only because the Dataset class expects a target for each example.

In [28]:
# Constant placeholder labels (all True), one per document chunk; they are not real relevance judgements.
y_test = np.ones(len(trec_docs_tokens['input_ids']), dtype=bool)

## Initialize some model structures before doing anything

In [29]:
def collate_fn(batch):
    """Pad a list of examples to a common length and return them as PyTorch tensors.

    `batch` is the list of per-example dicts produced by the dataset; the padding is
    delegated to the notebook-level `tokenizer`.
    """
    padded_batch = tokenizer.pad(batch, return_tensors='pt')

    return BatchEncoding(padded_batch)


class Dataset(data.Dataset):
    """Map-style dataset over pre-tokenized examples and their targets.

    `examples` is a dict of parallel lists that must contain 'input_ids' and
    'attention_mask'; `targets` is an indexable of per-example labels.
    """

    def __init__(self, examples, targets):
        self.examples, self.targets = examples, targets

    def __len__(self):
        # The number of examples is given by the 'input_ids' list.
        return len(self.examples['input_ids'])

    def __getitem__(self, idx):
        # Only 'input_ids' and 'attention_mask' are forwarded; the label is cast to int.
        item = {field: self.examples[field][idx] for field in ('input_ids', 'attention_mask')}
        item['labels'] = int(self.targets[idx])

        return item

In [30]:
def collect_reranking(model, dataloader, set_name):
    """Run `model` in evaluation mode over `dataloader` and collect the logits of every batch.

    Args:
        model: model called as `model(**batch)`; its output must expose `.loss` and `.logits`.
        dataloader: iterable of batches supporting `.to(device)`.
        set_name: label shown in the progress bar and in the printed loss line.

    Returns:
        A list with one CPU logits tensor per batch, in dataloader order.

    The mean of the per-batch losses is printed as a side effect.
    """
    batch_losses, batch_logits = [], []

    model.eval()

    progress = tqdm.tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False, bar_format='{l_bar}{bar:20}{r_bar}{bar:-20b}', colour='GREEN', file=sys.stdout, position=0, leave=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in progress:
            outputs = model(**batch.to(device))

            batch_losses.append(outputs.loss.cpu().item())
            batch_logits.append(outputs.logits.cpu())

    print("{} loss: {:0.4f}".format(set_name, np.mean(batch_losses)))

    return batch_logits

### Create the dataset and the dataloader

In [31]:
dataset_test = Dataset(x_test, y_test)

### Make sure the dataloader preserves the samples order (no shuffling!!!)

In [32]:
batch_size=128

# shuffle=False keeps the batches in dataset order, so the i-th collected score
# corresponds to the i-th document chunk.
dataloader_test = data.DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [33]:
# One line of the reranked run file; placeholders: query id, doc id, rank, score.
TREC_RESULT_LINE_FORMAT="{}\tQ0\t{}\t{}\t{}\tInPars_reranking\n"

In [34]:
def consolidate_reranking_scores_and_check_performance(pyserini_runfile, consolidation_approach, bm25_run_with_all_data_df, query_scores):
    """Reduce the chunk scores to one score per BM25 hit, write a reranked TREC run and evaluate it.

    Args:
        pyserini_runfile: name of the BM25 run file; its name without extension is embedded
            in the reranked run file name.
        consolidation_approach: 'mean' averages the chunk scores of a hit; any other value
            (the notebook passes 'max') keeps the highest chunk score. The value is also
            embedded in the reranked run file name.
        bm25_run_with_all_data_df: DataFrame with one row per BM25 hit and at least the
            'query-id' and 'doc-id' columns. A 'reranking_scores' column is added to (or
            overwritten in) this frame in place.
        query_scores: dict mapping the positional row number of a hit (0 .. number of rows - 1)
            to the list of scores of its chunks.

    Returns:
        dict with 'consolidated_scores' (list, one score per row), 'reranked_run' (name of the
        written run file) and 'results' (metric name -> np.float32 as reported by trec_eval).

    Raises:
        KeyError: if a row has no entry in `query_scores`.
        RuntimeError: if trec_eval exits with a non-zero status.
    """
    # Local import: keeps this cell self-contained.
    import subprocess

    # Consolidate the scores according to the defined approach.
    reducer = np.mean if consolidation_approach == 'mean' else np.max

    consolidated_scores = [reducer(query_scores[row_idx])
                           for row_idx in range(bm25_run_with_all_data_df.shape[0])]

    bm25_run_with_all_data_df['reranking_scores'] = consolidated_scores

    test_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    reranked_run = PYSERINI_TEST_RUN_RERANKED_FILENAME_FORMAT.format(os.path.splitext(pyserini_runfile)[0], test_timestamp, consolidation_approach)

    # Write the hits of each query ordered by decreasing reranking score; ranks start at 1.
    with open(reranked_run, 'w') as outputFile:
        for group_name, group_df in bm25_run_with_all_data_df.groupby('query-id'):
            ranked_df = group_df.sort_values('reranking_scores', ascending=False).reset_index(drop=True)

            for i, row in ranked_df.iterrows():
                outputFile.write(TREC_RESULT_LINE_FORMAT.format(group_name, row['doc-id'], i + 1, row['reranking_scores']))

    # Call trec_eval through subprocess instead of the IPython `!` magic: it behaves the same
    # outside IPython, needs no shell quoting, and exposes the exit status.
    # os.path.join copes with WORKING_FOLDER with or without a trailing slash.
    # The run file is written relative to the current directory but evaluated from
    # WORKING_FOLDER, as in the original cell, so the notebook must run from WORKING_FOLDER.
    completed = subprocess.run([TREC_EVAL_FULLPATH,
                                "-c", "-mrecall.1000", "-mmap", "-mndcg_cut.10", "-mrecip_rank.100",
                                os.path.join(WORKING_FOLDER, TREC_COVID_QRELS),
                                os.path.join(WORKING_FOLDER, reranked_run)],
                               capture_output=True,
                               text=True)

    if completed.returncode != 0:
        raise RuntimeError("trec_eval failed with exit status {}: {}".format(completed.returncode, completed.stderr.strip()))

    # Each output line is tab-separated; the metric name comes first and its value last.
    results = {}

    for line in completed.stdout.splitlines():
        if not line.strip():
            continue

        fields = line.split('\t')

        results[fields[0].strip()] = np.float32(fields[-1])

    return({"consolidated_scores": consolidated_scores,
            "reranked_run": reranked_run,
            "results": results})

In [35]:
def rerank_BM25_retrieved_texts(model_checkpoint, dataloader_test, trec_docs_tokens, pyserini_runfile, bm25_run_with_all_data_df):
    """Score every document chunk with a checkpoint and evaluate the reranked BM25 run.

    Args:
        model_checkpoint: path of the sequence-classification checkpoint to load.
        dataloader_test: dataloader yielding the query + document chunk batches, unshuffled.
        trec_docs_tokens: tokenized documents; 'overflow_to_sample_mapping' gives, for each
            chunk, the row of the BM25 hit it belongs to.
        pyserini_runfile: BM25 run file name, forwarded to the consolidation step.
        bm25_run_with_all_data_df: DataFrame with one row per BM25 hit, forwarded to the
            consolidation step.

    Returns:
        Tuple (per-batch logits, 'max' consolidation output, 'mean' consolidation output).
    """
    # Read the model checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint).to(device)
    print('Parameters', model.num_parameters())

    # Rerank the BM25 retrieved documents
    reranking_scores = collect_reranking(model=model, dataloader=dataloader_test, set_name='TREC COVID')

    # The logit at index 1 is taken as the relevance score of each chunk.
    matches_relevance_score = np.concatenate([batch_logits[:, 1].numpy() for batch_logits in reranking_scores])

    # Group the chunk scores by the BM25 hit they belong to.
    chunk_to_sample = trec_docs_tokens['overflow_to_sample_mapping']
    query_scores = {}

    for chunk_idx, match_score in enumerate(matches_relevance_score):
        query_scores.setdefault(chunk_to_sample[chunk_idx], []).append(match_score)

    max_results = consolidate_reranking_scores_and_check_performance(pyserini_runfile, "max", bm25_run_with_all_data_df, query_scores)

    print("\n\nMax consolidation results:")
    print(max_results['results'])

    mean_results = consolidate_reranking_scores_and_check_performance(pyserini_runfile, "mean", bm25_run_with_all_data_df, query_scores)

    print("\n\nMean consolidation results:")
    print(mean_results['results'])

    return reranking_scores, max_results, mean_results

### Run the tests with differently pretrained checkpoints

In [37]:
# Baseline: rerank with the checkpoint named by MS_MARCO_PRETRAINED_MODEL.
MSMARCO_rerank, MSMARCO_max, MSMARCO_mean = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, MS_MARCO_PRETRAINED_MODEL),
                                                                        dataloader_test,
                                                                        trec_docs_tokens,
                                                                        pyserini_runfile, 
                                                                        bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:49<00:00,  1.91s/it][32m[0m
TREC COVID loss: 2.3508


Max consolidation results:
{'map': 0.1822, 'recip_rank': 0.8427, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5618}


Mean consolidation results:
{'map': 0.176, 'recip_rank': 0.843, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5519}


In [40]:
# Rerank with the checkpoint whose folder name is tagged `10_epochs`.
# NOTE(review): another cell assigns the same LLM_rerank / LLM_max / LLM_mean names for the
# `02_epochs` checkpoint, so whichever cell runs last overwrites the other's results.
LLM_rerank, LLM_max, LLM_mean = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_10_epochs_20230502_000854_0.1437"),
                                                            dataloader_test,
                                                            trec_docs_tokens,
                                                            pyserini_runfile, 
                                                            bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:41<00:00,  1.89s/it][32m[0m
TREC COVID loss: 3.9083
Max consolidation results:


{'map': 0.1818, 'recip_rank': 0.6947, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4847}
Mean consolidation results:


{'map': 0.1819, 'recip_rank': 0.6982, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4899}


In [36]:
# Rerank with the checkpoint whose folder name is tagged `02_epochs`.
# NOTE(review): another cell assigns the same LLM_rerank / LLM_max / LLM_mean names for the
# `10_epochs` checkpoint, so whichever cell runs last overwrites the other's results.
LLM_rerank, LLM_max, LLM_mean = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_02_epochs_20230502_112819_0.2561"),
                                                            dataloader_test,
                                                            trec_docs_tokens,
                                                            pyserini_runfile, 
                                                            bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID:   0%|[32m                    [0m| 0/435 [00:00<?, ?it/s][32m[0m

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:34<00:00,  1.87s/it][32m[0m
TREC COVID loss: 2.9322


Max consolidation results:
{'map': 0.1974, 'recip_rank': 0.7895, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.587}


Mean consolidation results:
{'map': 0.1941, 'recip_rank': 0.7757, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5921}


In [38]:
# Rerank with the checkpoint whose folder name is tagged `03_epochs`.
LLM_rerank_3, LLM_max_3, LLM_mean_3 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_03_epochs_20230502_115502_0.1698"),
                                                                  dataloader_test,
                                                                  trec_docs_tokens,
                                                                  pyserini_runfile, 
                                                                  bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [13:58<00:00,  1.93s/it][32m[0m
TREC COVID loss: 3.1475


Max consolidation results:
{'map': 0.1974, 'recip_rank': 0.8083, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5527}


Mean consolidation results:
{'map': 0.196, 'recip_rank': 0.815, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.5406}


In [39]:
# Rerank with the checkpoint whose folder name is tagged `01_epochs`.
LLM_rerank_1, LLM_max_1, LLM_mean_1 = rerank_BM25_retrieved_texts(os.path.join(TRAIN_OUTPUT_FOLDER, "checkpoint_eduseiti_100_queries_expansion_20230501_01.jsonl_01_epochs_20230502_132029_0.4151"),
                                                                  dataloader_test,
                                                                  trec_docs_tokens,
                                                                  pyserini_runfile, 
                                                                  bm25_run_with_all_data_df)

Parameters 33360770
TREC COVID: 100%|[32m████████████████████[0m| 435/435 [14:08<00:00,  1.95s/it][32m[0m
TREC COVID loss: 2.1366


Max consolidation results:
{'map': 0.1766, 'recip_rank': 0.655, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4468}


Mean consolidation results:
{'map': 0.1755, 'recip_rank': 0.6622, 'recall_1000': 0.3943, 'ndcg_cut_10': 0.4568}


In [None]:
# Load the checkpoint named by MS_MARCO_PRETRAINED_MODEL and move it to the selected device.
# (Fixed: the path was built with the non-existent `os.path.joint`, which raises AttributeError.)
model = AutoModelForSequenceClassification.from_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER, MS_MARCO_PRETRAINED_MODEL)).to(device)
print('Parameters', model.num_parameters())

### Rerank the BM25 retrieved texts

In [None]:
reranking_scores = collect_reranking(model=model, dataloader=dataloader_test, set_name='TREC COVID')

#### Consider the logit for class 1 (True) as the relevance score

In [None]:
# Flatten the per-batch logits into one array holding the column-1 logit of every chunk.
matches_relevance_score = np.concatenate([batch_logits[:, 1].numpy() for batch_logits in reranking_scores])

In [None]:
matches_relevance_score.shape

### Merge the results in the topics x docs dataframe

#### First, map the document tokens to the document referred in the BM25 results

This needs to be done because some documents were longer than the maximum length allowed by the model and were therefore split into several chunks.

In [None]:
# Group the chunk scores by the BM25 hit (row) each chunk was generated from.
query_scores = {}

for chunk_idx, match_score in enumerate(matches_relevance_score):
    sample_idx = trec_docs_tokens['overflow_to_sample_mapping'][chunk_idx]

    query_scores.setdefault(sample_idx, []).append(match_score)

In [None]:
len(query_scores)

In [None]:
len(trec_docs_tokens['input_ids'])

In [None]:
# Number of chunk scores collected for each BM25 hit, indexed by the hit's row number.
score_count = np.zeros(len(query_scores))

for sample_idx, sample_scores in query_scores.items():
    score_count[sample_idx] = len(sample_scores)

In [None]:
np.unique(score_count, return_counts=True)

#### Consolidate a single score for each document referred to in the BM25 results

In [None]:
max_results = consolidate_reranking_scores_and_check_performance(pyserini_runfile, "max", bm25_run_with_all_data_df, query_scores)

In [None]:
max_results['results']

In [None]:
mean_results = consolidate_reranking_scores_and_check_performance(pyserini_runfile, "mean", bm25_run_with_all_data_df, query_scores)

In [None]:
mean_results['results']