In [1]:
import numpy as np
import pandas as pd

PATH_COLLECTION_DATA = 'data/subtask_4b/subtask4b_collection_data.pkl'

df_collection = pd.read_pickle(PATH_COLLECTION_DATA)
df_collection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7718 entries, 162 to 1056448
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   cord_uid          7718 non-null   object        
 1   source_x          7718 non-null   object        
 2   title             7718 non-null   object        
 3   doi               7677 non-null   object        
 4   pmcid             4959 non-null   object        
 5   pubmed_id         6233 non-null   object        
 6   license           7718 non-null   object        
 7   abstract          7718 non-null   object        
 8   publish_time      7715 non-null   object        
 9   authors           7674 non-null   object        
 10  journal           6668 non-null   object        
 11  mag_id            0 non-null      float64       
 12  who_covidence_id  528 non-null    object        
 13  arxiv_id          20 non-null     object        
 14  label             7718 n

In [2]:
PATH_QUERY_DATA = 'data/dev-tweets/subtask4b_query_tweets_dev.tsv'
df_query = pd.read_csv(PATH_QUERY_DATA, sep = '\t')[:5]
df_query.head()

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy


In [3]:
# Data prep for bi-encoder
from tqdm import tqdm
from rankers.bi_encoder_ranker import BiEncoderRanker

tqdm.pandas()

def flatten_corpus(entry):    
    title = entry["title"]
    authors = entry["authors"]
    abstract = entry["abstract"]
    journal = entry["journal"]

    paper_data = f"{title} [SEP] {authors} [SEP] {abstract} [SEP] {journal}"
    return paper_data

cord_uids = df_collection[:]['cord_uid'].tolist()
corpus = df_collection.progress_apply(lambda x: flatten_corpus(x), axis = 1)
corpus = corpus.tolist()

100%|██████████| 7718/7718 [00:00<00:00, 64981.11it/s]


In [4]:
bi_model_name = "multi-qa-mpnet-base-cos-v1"
bi_enc_ranker = BiEncoderRanker(bi_model_name, corpus)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: multi-qa-mpnet-base-cos-v1


Batches:   0%|          | 0/242 [00:00<?, ?it/s]

In [5]:
from rankers.gemma_ranker import GemmaRanker
llm_ranker = GemmaRanker()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


In [6]:
from os import listdir
from rankers.cross_embedding_ranker import CrossRanker

dir_list = listdir("models/cross-embedding")
dir_list.sort()

latest_checkpoint = dir_list[-1]

cross_model_name = f"models/cross-embedding/{latest_checkpoint}"
cross_ranker = CrossRanker(cross_model_name)

def get_top_cord_uids_bi(query):
    doc_scores = bi_enc_ranker.get_scores(query)
    indices = np.argsort(-doc_scores[0])[:100]
    
    bi_topk = [cord_uids[x] for x in indices]
    reduced_corpus = df_collection[df_collection['cord_uid'].isin(bi_topk)]

    return reduced_corpus

def get_top_cord_uids_cross(query):
    reduced_corpus = get_top_cord_uids_bi(query)    
    
    doc_scores = cross_ranker.get_scores(query, reduced_corpus)    
    indices = np.argsort(-doc_scores)[:5]

    cross_topk = [reduced_corpus.iloc[x]["cord_uid"] for x in indices] 
    reduced_corpus = df_collection[df_collection['cord_uid'].isin(cross_topk)]

    return reduced_corpus

def get_llm_sorted_uids(query):
    reduced_corpus = get_top_cord_uids_cross(query)

    llm_topk = llm_ranker.sort_cached_bubble(query, reduced_corpus)

    return llm_topk

# Retrieve topk candidates using the BM25 model
df_query.loc[:,'bi_cross'] = df_query.loc[:, 'tweet_text'].progress_apply(lambda x: get_llm_sorted_uids(x))
df_query.head()

 60%|██████    | 3/5 [07:06<04:55, 147.74s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 5/5 [19:48<00:00, 237.76s/it]


Unnamed: 0,post_id,tweet_text,cord_uid,bi_cross
0,16,covid recovery: this study from the usa reveal...,3qvh482o,"[1adt71pk, styavbvi, rthsl7a9, hg3xpej0, 3qvh4..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"[r58aohnu, 1s8jzzwg, a0dvzhc8, v3n2r4ia, 9mn6t..."
2,73,I recall early on reading that researchers who...,sts48u9i,"[gruir7aw, hcfoj5l1, u6tbu9jf, sts48u9i, ujq9m..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,"[3sr2exq9, kca5r5hr, k0f4cwig, xk6rp4e7, 6qkjw..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,"[wdw0a2kl, ouvq2wpq, 3l6ipiwk, ybwwmyqy, qh6ri..."


In [None]:
from eval_scripts.eval import get_performance_mrr, get_avg_gold_in_pred

mrr_results = get_performance_mrr(df_query, 'cord_uid', 'bi_cross')
gold_results = get_avg_gold_in_pred(df_query, 'cord_uid', 'bi_cross', list_k=[1, 3, 5, 10])
# Printed MRR@k results in the following format: {k: MRR@k}
print(">>>")
print(mrr_results)
print(gold_results)
print("<<<")

>>>
{1: np.float64(0.4), 5: np.float64(0.54), 10: np.float64(0.54)}
{1: np.float64(0.4), 5: np.float64(1.0), 10: np.float64(1.0), 100: np.float64(1.0)}
<<<
