In [6]:
import numpy as np
import pandas as pd

PATH_COLLECTION_DATA = 'data/subtask_4b/subtask4b_collection_data.pkl'

df_collection = pd.read_pickle(PATH_COLLECTION_DATA)
df_collection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7718 entries, 162 to 1056448
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   cord_uid          7718 non-null   object        
 1   source_x          7718 non-null   object        
 2   title             7718 non-null   object        
 3   doi               7677 non-null   object        
 4   pmcid             4959 non-null   object        
 5   pubmed_id         6233 non-null   object        
 6   license           7718 non-null   object        
 7   abstract          7718 non-null   object        
 8   publish_time      7715 non-null   object        
 9   authors           7674 non-null   object        
 10  journal           6668 non-null   object        
 11  mag_id            0 non-null      float64       
 12  who_covidence_id  528 non-null    object        
 13  arxiv_id          20 non-null     object        
 14  label             7718 n

In [7]:
PATH_QUERY_DATA = 'data/eval-tweets/eval-tweets.tsv'
df_query = pd.read_csv(PATH_QUERY_DATA, sep = '\t')
sub_df_query = df_query[:100]
sub_df_query.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cord_uid
0,11163,"does the #lyme bacteria ""colonize""? interestin...",tffhk1hy
1,9641,nice preprint from the outstanding dr. joshua ...,wq92nfnd
2,10765,an analysis by and others concluded that halt...,e85xdrcw
3,3048,Neutralization of SARS-CoV-2 lineage B.1.1.7 p...,aefzogn3
4,2467,that T-cell immunity suggests we may already h...,4gr6i8rf


In [8]:
import os

dir_list = os.listdir("models/cross-embedding")
dir_list.sort()

latest_checkpoint = dir_list[-1]

model_name = f"models/cross-embedding/{latest_checkpoint}"

print(model_name)

models/cross-embedding/checkpoint-4276


In [9]:
from tqdm import tqdm
from rankers.cross_embedding_ranker import CrossRanker
from rank_bm25 import BM25Okapi

tqdm.pandas()

corpus = df_collection[:][['title', 'abstract']].apply(lambda x: f"{x['title']} {x['abstract']}", axis=1).tolist()

tokenized_corpus = [doc.split(' ') for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

cord_uids = df_collection[:]['cord_uid'].tolist()
cross_ranker = CrossRanker(model_name)

def get_top_cord_uids_bm25(query):
    tokenized_query = query.split(' ')
    doc_scores = bm25.get_scores(tokenized_query)
    indices = np.argsort(-doc_scores)[:100]
    bm25_topk = [cord_uids[x] for x in indices]

    reduced_corpus = df_collection[df_collection['cord_uid'].isin(bm25_topk)]

    return reduced_corpus


def get_top_cord_uids(query):
    reduced_corpus = get_top_cord_uids_bm25(query)
    
    doc_scores = cross_ranker.get_scores(query, reduced_corpus)
    
    indices = np.argsort(-doc_scores)[:10]

    cross_topk = [reduced_corpus.iloc[x]["cord_uid"] for x in indices]

    return cross_topk

# Retrieve topk candidates using the BM25 model
sub_df_query.loc[:,'hybrid'] = sub_df_query.loc[:, 'tweet_text'].progress_apply(lambda x: get_top_cord_uids(x))
sub_df_query.head()


DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /allenai/scibert_scivocab_cased/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /allenai/scibert_scivocab_cased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /allenai/scibert_scivocab_cased/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /allenai/scibert_scivocab_cased/resolve/main/vocab.txt HTTP/1.1" 200 0
100%|██████████| 100/100 [02:54<00:00,  1.74s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df_query.loc[:,'hybrid'] = sub_df_query.loc[:, 't

Unnamed: 0.1,Unnamed: 0,tweet_text,cord_uid,hybrid
0,11163,"does the #lyme bacteria ""colonize""? interestin...",tffhk1hy,"[tffhk1hy, h8x5cpz4, mjm2kap6, lvbutmio, cv7tb..."
1,9641,nice preprint from the outstanding dr. joshua ...,wq92nfnd,"[wq92nfnd, opjfy3xr, vx1hjh26, dt2pew66, 0imnd..."
2,10765,an analysis by and others concluded that halt...,e85xdrcw,"[e85xdrcw, tvzn0112, 5yscqct1, xtnyad3j, pf45e..."
3,3048,Neutralization of SARS-CoV-2 lineage B.1.1.7 p...,aefzogn3,"[jvgq2p61, aefzogn3, n3yuvf8v, 8aosx41e, ybcr7..."
4,2467,that T-cell immunity suggests we may already h...,4gr6i8rf,"[9p2pzsx0, nln6pl3h, ouno4jpl, tzp03hmr, 6e444..."


In [10]:
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data.loc[:,"in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
    return d_performance

results = get_performance_mrr(sub_df_query, 'cord_uid', 'hybrid')
# Printed MRR@k results in the following format: {k: MRR@k}
print(results)

{1: np.float64(0.64), 5: np.float64(0.6920000000000001), 10: np.float64(0.6946666666666665)}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,"in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
