In [1]:
import numpy as np
import pandas as pd
import torch

# Run-wide switches and input locations.
DEV_MODE = True  # read by later cells to choose the query split and to enable scoring
PATH_COLLECTION_DATA = "data/subtask_4b/subtask4b_collection_data.pkl"

# 'high' lets PyTorch use faster, reduced-precision float32 matmul kernels
# (see the torch.set_float32_matmul_precision documentation).
torch.set_float32_matmul_precision("high")

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)
df_collection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7718 entries, 162 to 1056448
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   cord_uid          7718 non-null   object        
 1   source_x          7718 non-null   object        
 2   title             7718 non-null   object        
 3   doi               7677 non-null   object        
 4   pmcid             4959 non-null   object        
 5   pubmed_id         6233 non-null   object        
 6   license           7718 non-null   object        
 7   abstract          7718 non-null   object        
 8   publish_time      7715 non-null   object        
 9   authors           7674 non-null   object        
 10  journal           6668 non-null   object        
 11  mag_id            0 non-null      float64       
 12  who_covidence_id  528 non-null    object        
 13  arxiv_id          20 non-null     object        
 14  label             7718 n

In [2]:
# Choose which query split to read: 'dev' when DEV_MODE is set, 'test' otherwise.
if DEV_MODE:
    query_variant = 'dev'
else:
    query_variant = 'test'

# The query tweets are stored as a tab-separated file, one file per split.
PATH_QUERY_DATA = f'data/subtask_4b/subtask4b_query_tweets_{query_variant}.tsv'
df_query = pd.read_csv(PATH_QUERY_DATA, sep='\t')
df_query.head()

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy


In [3]:
# Candidate lists written by an earlier stage (tab-separated). The 'preds'
# column holds each candidate list as the *string* form of a Python list, so
# it has to be parsed before use.
PARTIAL_PREDICTION_FILE = 'partial-predictions/classifier/predictions.tsv'
partial_predictions = pd.read_csv(
    PARTIAL_PREDICTION_FILE,
    sep='\t',
)
partial_predictions.head()

Unnamed: 0,post_id,preds,cord_uid
0,16,"['hg3xpej0', 'styavbvi', '3qvh482o', '1adt71pk...",3qvh482o
1,69,"['r58aohnu', 'icgsbelo', '0emibwp3', '8je46886...",r58aohnu
2,73,"['sts48u9i', 'gruir7aw', 'u6tbu9jf', 'hcfoj5l1...",sts48u9i
3,93,"['3sr2exq9', 'k0f4cwig', '8j3bb6zx', 'sv48gjkk...",3sr2exq9
4,96,"['ybwwmyqy', '3l6ipiwk', 'qh6rif48', 'rs3umc1x...",ybwwmyqy


In [4]:
from rankers.pairwise_ranker import PairwiseRanker
from tqdm import tqdm
from os import listdir

# Register DataFrame.progress_apply so long-running applies show a progress bar.
tqdm.pandas()

base_name = "models/pairwise-classifier-large"


def _checkpoint_sort_key(entry_name):
    """Sort key that places the highest-numbered checkpoint last.

    A plain string sort orders 'checkpoint-900' after 'checkpoint-1000' and
    puts unrelated entries such as 'runs' after every 'checkpoint-*' name.
    Entries ending in '-<digits>' (or consisting only of digits) are ordered by
    that number; everything else is ranked before them, alphabetically.
    """
    suffix = entry_name.rpartition("-")[2]
    has_step = suffix.isascii() and suffix.isdigit()
    return (int(suffix) if has_step else -1, entry_name)


dir_list = listdir(base_name)
if not dir_list:
    # Fail with a clear message instead of an IndexError on dir_list[-1].
    raise FileNotFoundError(f"no checkpoints found in {base_name!r}")
dir_list.sort(key=_checkpoint_sort_key)

# After the numeric-aware sort the last entry is the most recent checkpoint.
latest_checkpoint = dir_list[-1]

model_name = f"{base_name}/{latest_checkpoint}"

pairwise_ranker = PairwiseRanker(model_name)


def get_top_cord_uids(query, top_k=10):
    """Re-rank one tweet's pre-selected candidate documents with the pairwise ranker.

    Args:
        query: A row of ``df_query`` (any mapping providing ``"post_id"`` and
            ``"tweet_text"``).
        top_k: Number of leading candidates taken from ``partial_predictions``
            and handed to the ranker. Defaults to 10, the previously
            hard-coded value.

    Returns:
        Whatever ``pairwise_ranker.rank_avg_prob`` returns for the reduced
        corpus (presumably a ranked list of cord_uids -- confirm against
        ``PairwiseRanker``).

    Raises:
        IndexError: ``partial_predictions`` has no row for the query's post_id.
        ValueError: The stored ``preds`` value is not a literal list/tuple.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import ast

    tweet_id = query["post_id"]
    matches = partial_predictions.loc[partial_predictions["post_id"] == tweet_id, "preds"]
    if matches.empty:
        raise IndexError(f"no partial predictions found for post_id {tweet_id!r}")

    raw_preds = matches.iloc[0]
    if isinstance(raw_preds, str):
        try:
            # literal_eval only accepts Python literals, so a tampered TSV
            # cannot execute code the way eval() would.
            candidate_uids = ast.literal_eval(raw_preds)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                f"malformed 'preds' entry for post_id {tweet_id!r}: {raw_preds!r}"
            ) from err
    else:
        # Already-parsed values (e.g. when the frame was built in memory).
        candidate_uids = raw_preds

    if not isinstance(candidate_uids, (list, tuple)):
        raise ValueError(
            f"'preds' for post_id {tweet_id!r} must be a list, got {type(candidate_uids).__name__}"
        )
    selected_docs_uids = list(candidate_uids[:top_k])

    # isin() keeps df_collection's row order; the final ordering is decided by the ranker.
    reduced_corpus = df_collection[df_collection['cord_uid'].isin(selected_docs_uids)]
    pair_topk = pairwise_ranker.rank_avg_prob(query["tweet_text"], reduced_corpus, use_cache=True)

    return pair_topk

# Re-rank every tweet's candidate documents with the pairwise ranker, one
# ranker call per row, and store the result in the 'bi_cross_pair' column.
# NOTE(review): the previous comment mentioned BM25, but no BM25 model is used here.
df_query['bi_cross_pair'] = df_query.progress_apply(get_top_cord_uids, axis=1)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /answerdotai/ModernBERT-base/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
  0%|          | 0/1400 [00:00<?, ?it/s]W0526 23:53:34.137000 48985 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode
100%|██████████| 1400/1400 [13:51:16<00:00, 35.63s/it]  


In [None]:
from eval_scripts.eval import get_performance_mrr, get_avg_gold_in_pred, create_pred_file
if DEV_MODE:
    # Scoring compares the gold 'cord_uid' column with the re-ranked
    # 'bi_cross_pair' predictions, so it only runs in dev mode.
    mrr_results = get_performance_mrr(df_query, 'cord_uid', 'bi_cross_pair')
    gold_results = get_avg_gold_in_pred(
        df_query,
        'cord_uid',
        'bi_cross_pair',
        list_k=[5, 10],
    )
    # Each result dict is printed as {k: score at k}, framed by marker lines.
    for line in (">>>", mrr_results, gold_results, "<<<"):
        print(line)

# Write the predictions out in both modes; the gold column is included only in dev mode.
create_pred_file(
    df_query,
    "bi_cross_pair",
    prediction_size=5,
    include_gold=DEV_MODE,
    base_folder="partial-predictions/pairwise",
)

>>>
{1: np.float64(0.6528571428571428), 5: np.float64(0.7024761904761904), 10: np.float64(0.7089934807256235)}
{5: np.float64(0.7792857142857142), 10: np.float64(0.8278571428571428)}
<<<
