In [61]:
import os
import logging
import time
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Tuple
from datetime import datetime
from transformers import AutoTokenizer, AutoModel
import torch
from sentence_transformers import SentenceTransformer

In [62]:
# Shared format for every log line emitted by this notebook.
log_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

# Name used to recognise the handler installed by this cell on later re-runs.
_CONSOLE_HANDLER_NAME = "notebook-console"


def configure_console_logging(level: int = logging.INFO) -> logging.Handler:
    """Install (at most once) a console handler on the root logger.

    Re-running a notebook cell that calls ``addHandler`` unconditionally
    stacks one more handler per run, so every message is printed several
    times. This function looks for a handler it installed previously and
    reuses it instead of adding a new one.

    Args:
        level: Threshold set on the root logger. Defaults to ``logging.INFO``
            so that the ``logging.info`` calls in this notebook are shown
            on a fresh kernel (the root logger defaults to WARNING).

    Returns:
        The console handler attached to the root logger.
    """
    root = logging.getLogger()
    root.setLevel(level)

    handler = None
    for candidate in root.handlers:
        if candidate.name == _CONSOLE_HANDLER_NAME:
            handler = candidate
            break

    if handler is None:
        handler = logging.StreamHandler()
        handler.name = _CONSOLE_HANDLER_NAME
        root.addHandler(handler)

    handler.setFormatter(log_format)
    return handler


# Console-only logging; add a logging.FileHandler here if a log file is needed.
console_handler = configure_console_logging()

# Root logger, kept under this name for any cell that refers to `logger`.
logger = logging.getLogger()


In [63]:
!ls ../../ # home

CHANGELOG.md                     data
LICENSE                          models
README.md                        notebooks
activate-3.10.sh                 poetry.lock
activate-3.12.sh                 pyproject.toml
climatesense_checkthat2025_task4


In [105]:

def corpus_titles(df: pd.DataFrame) -> List[Tuple[str, str]]:
    """Build a (cord_uid, title) corpus from the collection frame.

    Args:
        df: Frame with at least the columns ``cord_uid`` and ``title``.

    Returns:
        One ``(cord_uid, title)`` tuple per row, in row order. A missing
        title is returned as ``""`` so that every text is a string.
    """
    # Missing values would otherwise surface as float NaN and break any
    # consumer that calls string methods on the text.
    titles = df['title'].fillna("")
    return list(zip(df['cord_uid'], titles))

def corpus_titles_abstracts(df: pd.DataFrame) -> List[Tuple[str, str]]:
    """Build a (cord_uid, title + blank line + abstract) corpus.

    Args:
        df: Frame with at least the columns ``cord_uid``, ``title`` and
            ``abstract``.

    Returns:
        One ``(cord_uid, text)`` tuple per row, in row order, where ``text``
        is ``title + "\\n\\n" + abstract``. A missing title or abstract is
        treated as ``""`` instead of raising on the concatenation.
    """
    titles = df['title'].fillna("")
    abstracts = df['abstract'].fillna("")
    return [
        (uid, title + "\n\n" + abstract)
        for uid, title, abstract in zip(df['cord_uid'], titles, abstracts)
    ]


def corpus_abstracts(df: pd.DataFrame) -> List[Tuple[str, str]]:
    """Build a (cord_uid, abstract) corpus from the collection frame.

    Args:
        df: Frame with at least the columns ``cord_uid`` and ``abstract``.

    Returns:
        One ``(cord_uid, abstract)`` tuple per row, in row order. A missing
        abstract is returned as ``""`` so that every text is a string.
    """
    # Missing values would otherwise surface as float NaN and break any
    # consumer that calls string methods on the text.
    abstracts = df['abstract'].fillna("")
    return list(zip(df['cord_uid'], abstracts))


In [65]:
# Input and output locations, relative to the notebook's working directory.
CORPUS = "../../data/raw/task4/subtask_4b/subtask4b_collection_data.pkl"
DEVEL = "../../data/processed/task4/subtask_4b/subtask4b_query_tweets_dev_clean.tsv"
TRAIN = "../../data/processed/task4/subtask_4b/subtask4b_query_tweets_train_clean.tsv"
# Prefix for every file this notebook writes (note the trailing underscore).
OUTPUT_PREFIX = "../../data/processed/task4/subtask_4b/subtask4b_output_"
# Load the collection
import pickle

# NOTE(review): unpickling executes code embedded in the file; only load
# collection files that come from a trusted source.
# The context manager closes the file handle once the frame is loaded.
with open(CORPUS, 'rb') as corpus_file:
    st4b_collection_df = pickle.load(corpus_file)
st4b_collection_df.head(2)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800


In [106]:
# Prepare input data and corpus

# Query tweets: the train and dev splits are both tab-separated files.
st4b_train_df, st4b_dev_df = (
    pd.read_csv(split_path, sep="\t") for split_path in (TRAIN, DEVEL)
)

# Three text views of the same collection, one per corpus builder.
st4b_corpus_abstracts = corpus_abstracts(st4b_collection_df)
st4b_corpus_titles_abstracts = corpus_titles_abstracts(st4b_collection_df)
st4b_corpus_titles = corpus_titles(st4b_collection_df)


In [72]:
class BaseRetriever:
    """Interface shared by the retrievers defined in this notebook."""

    def retrieve(self, query: str, top_k: int):
        """Return the ``top_k`` best matches for ``query``.

        Subclasses return a list of ``(identifier, doc_text)`` tuples; this
        base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError

class BM25Retriever(BaseRetriever):
    """Lexical retriever: BM25Okapi over whitespace-split tokens."""

    def __init__(self, corpus: List[Tuple[str, str]]):
        """Index ``corpus``, a list whose entries hold the text at position 1."""
        # rank_bm25 is only needed by this retriever, so it is imported here.
        from rank_bm25 import BM25Okapi

        self.corpus = corpus
        self.documents = [entry[1] for entry in corpus]
        # Tokenisation is a plain str.split() on whitespace.
        self.bm25 = BM25Okapi([text.split() for text in self.documents])

    def retrieve(self, query: str, top_k: int):
        """Return the ``top_k`` corpus entries with the highest BM25 score."""
        query_scores = self.bm25.get_scores(query.split())
        # argsort is ascending; reverse it so the best score comes first.
        best_first = np.argsort(query_scores)[::-1]
        hits = []
        for position in best_first[:top_k]:
            hits.append(self.corpus[position])
        return hits

class VectorRetriever(BaseRetriever):
    """Dense retriever: cosine similarity over SentenceTransformer embeddings."""

    def __init__(self, corpus: List[Tuple[str, str]], model_name: str):
        """Load ``model_name`` and embed the text (position 1) of every corpus entry."""
        self.corpus = corpus
        self.documents = [entry[1] for entry in corpus]

        logging.info(f"Loading model '{model_name}' for embeddings...")
        encoder = SentenceTransformer(model_name)
        encoder.eval()
        self.model = encoder

        # The whole corpus is embedded once, up front, and kept as a tensor.
        self.doc_embeddings = encoder.encode(
            self.documents,
            convert_to_tensor=True,
            show_progress_bar=True,
        )

    def retrieve(self, query: str, top_k: int):
        """Return the first two fields of the ``top_k`` most similar corpus entries."""
        query_vector = self.model.encode(query, convert_to_tensor=True)
        similarities = torch.nn.functional.cosine_similarity(
            query_vector.unsqueeze(0), self.doc_embeddings, dim=1
        )
        best_first = torch.argsort(similarities, descending=True).cpu().numpy()

        hits = []
        for position in best_first[:top_k]:
            hits.append((self.corpus[position][0], self.corpus[position][1]))
        return hits

class PromptRetriever(BaseRetriever):
    """
    Placeholder retriever for Promptriever-style models.

    Loads the model through SentenceTransformer and ranks by cosine
    similarity; adapt as needed.
    NOTE(review): the default model id is kept as-is; confirm it resolves.
    """

    def __init__(self, corpus: List[Tuple[str, str]], model_name: str = "allenai/promptriever"):
        """Load ``model_name`` and embed the text (position 1) of every corpus entry."""
        self.corpus = corpus
        self.documents = [entry[1] for entry in corpus]
        logging.info(f"Loading Promptriever model '{model_name}'...")
        encoder = SentenceTransformer(model_name)
        self.model = encoder
        self.doc_embeddings = encoder.encode(self.documents, convert_to_tensor=True)

    def retrieve(self, query: str, top_k: int):
        """Return the first two fields of the ``top_k`` most similar corpus entries."""
        query_vector = self.model.encode(query, convert_to_tensor=True)
        similarities = torch.nn.functional.cosine_similarity(
            query_vector.unsqueeze(0), self.doc_embeddings, dim=1
        )
        best_first = torch.argsort(similarities, descending=True).cpu().numpy()

        hits = []
        for position in best_first[:top_k]:
            hits.append((self.corpus[position][0], self.corpus[position][1]))
        return hits

In [68]:
st4b_dev_df.head(5)

Unnamed: 0,post_id,tweet_text,cord_uid
0,16,covid recovery: this study from the usa reveal...,3qvh482o
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu
2,73,I recall early on reading that researchers who...,sts48u9i
3,93,You know you're credible when NIH website has ...,3sr2exq9
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy


In [117]:
# Compute metric
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = (1, 5, 10)):
    """Compute the mean reciprocal rank at each cut-off in ``list_k``.

    For every row, the gold id is looked up in the first ``k`` predictions;
    the row scores ``1 / rank`` (rank starting at 1) when it is found and
    ``0`` otherwise. The per-row scores are averaged.

    Args:
        data: DataFrame holding one query per row. It is not modified.
        col_gold: Name of the column with the gold id.
        col_pred: Name of the column with the ranked predictions (a
            sliceable sequence per row).
        list_k: Cut-offs to evaluate.

    Returns:
        Dict mapping each ``k`` to the mean reciprocal rank at ``k``
        (NaN when ``data`` has no rows).
    """
    d_performance = {}
    for k in list_k:
        reciprocal_ranks = []
        for gold, preds in zip(data[col_gold], data[col_pred]):
            # Slice once per row; index() returns the first occurrence.
            top_k = list(preds[:k])
            reciprocal_ranks.append(1 / (top_k.index(gold) + 1) if gold in top_k else 0)
        # Scores are kept in a local Series so the caller's frame gains no column.
        d_performance[k] = pd.Series(reciprocal_ranks, dtype=float).mean()
    return d_performance

In [122]:
# Set Experiment Parameters
# These values are matched against the if/elif chains in the experiment cell
# and are also embedded in the output file names.
input_type = "train" # one of: "dev", "train"
corpus_type = "titles_abstracts" # one of: "titles", "titles_abstracts", "abstracts"
retriever_name = "BM25" # "BM25" or any other key listed in _retriever_name
K = 10 # number of documents requested from the retriever for each tweet


In [123]:
# Guidelines: https://codalab.lisn.upsaclay.fr/competitions/22359#learn_the_details
# https://huggingface.co/spaces/mteb/leaderboard
#
# Runs one retrieval experiment with the parameters set above: retrieves the
# top-K documents for every tweet, writes the per-tweet results, appends a
# summary line to the run record and exports the predictions for submission.
import csv
import os.path
import socket

### All parameters (reference lists of the accepted values)

_K = [10]
_input_type = ["dev", "train"]
_corpus_type = ["titles", "titles_abstracts", "abstracts"]
_retriever_name = [
    "BM25",
    "gtr-t5-large",
    "gtr-t5-base",
    "gtr-t5-xl",
    "gtr-t5-xxl",
    "gooaq",
    "bge-large-en",
    "instructor-xl",
    "promptretriever",
    "conan-embedding-v2"
]

# Embedding retrievers: experiment key -> model id passed to VectorRetriever.
# NOTE(review): the ids are carried over unchanged; confirm each one resolves.
_VECTOR_MODEL_IDS = {
    "gtr-t5-large": "sentence-transformers/gtr-t5-large",
    "gtr-t5-base": "sentence-transformers/gtr-t5-base",
    "gtr-t5-xl": "sentence-transformers/gtr-t5-xl",
    "gtr-t5-xxl": "sentence-transformers/gtr-t5-xxl",
    "gooaq": "sentence-transformers/gooaq",
    "bge-large-en": "BAAI/bge-large-en",
    "instructor-xl": "hkunlp/instructor-xl",
    "conan-embedding-v2": "TencentBAC/Conan-embedding-v2",
}

# Number of top-ranked ids kept per tweet in the submission file.
_SUBMISSION_TOP_N = 5

### Summary file: one line is appended per run.
# OUTPUT_PREFIX already ends with "_", so this name contains a double
# underscore; it is kept unchanged so existing record files keep growing.
run_record_file = OUTPUT_PREFIX + "_run_record.tsv"

#### Per-run output name: host + parameters.
hostname = socket.gethostname()
run_name = OUTPUT_PREFIX + hostname + "_" + input_type + "_" + corpus_type + "_" + retriever_name + "_" + "K" + str(K)
output = []

start_time = time.time()

# Select the query set.
if input_type == "dev":
    df_in = st4b_dev_df
elif input_type == "train":
    df_in = st4b_train_df
else:
    raise ValueError(f"bad input type: {input_type!r}")

# Select the corpus view.
if corpus_type == "titles":
    st4b_corpus = st4b_corpus_titles
elif corpus_type == "titles_abstracts":
    st4b_corpus = st4b_corpus_titles_abstracts
elif corpus_type == "abstracts":
    st4b_corpus = st4b_corpus_abstracts
else:
    raise ValueError(f"bad corpus type: {corpus_type!r}")

# Build the retriever; this indexes / embeds the whole corpus and is counted
# in the processing time.
if retriever_name == "BM25":
    retriever = BM25Retriever(st4b_corpus)
elif retriever_name == "promptretriever":
    retriever = PromptRetriever(st4b_corpus)
elif retriever_name in _VECTOR_MODEL_IDS:
    retriever = VectorRetriever(st4b_corpus, _VECTOR_MODEL_IDS[retriever_name])
else:
    raise ValueError(f"bad retriever name: {retriever_name!r}")

for idx, row in tqdm(df_in.iterrows(), total=len(df_in), desc="Processing rows"):
    top_docs = retriever.retrieve(row["tweet_text"], K)

    # top_docs is a list of (cord_uid, text); keep the ids in rank order.
    top_cord_ids = [item[0] for item in top_docs]
    # True when the gold document is among the K retrieved ones.
    matches = row["cord_uid"] in top_cord_ids

    o = {
        "retriever" : retriever_name,
        "post_id" : row["post_id"],
        "tweet_text" : row["tweet_text"],
        "cord_uid" : row["cord_uid"],
        "K" : K,
        "topk" : top_cord_ids,
        "found" : matches
    }
    output.append(o)

end_time = time.time()
proc_time = (end_time - start_time)
# ".2f" = two decimals; the previous "2f" was a width of 2 with six decimals.
logging.info(f"Total processing time: {proc_time:.2f} seconds.")

df_out = pd.DataFrame(output)
df_out.to_csv(run_name + ".tsv", sep = "\t", index=False)

# Evaluate: "precision" is the share of tweets whose gold document was
# retrieved within the top K; MRR is computed at the metric's default cut-offs.
output_df = pd.DataFrame(output)
precision = float(df_out['found'].mean())
mrr = get_performance_mrr(output_df, 'cord_uid', 'topk')

### Keep a log of this experiment
# Plain floats keep the dict's text form independent of the numpy version.
mrr_record = {k: float(v) for k, v in mrr.items()}
run_record = [hostname, input_type, corpus_type, retriever_name, K, precision, mrr_record, proc_time, run_name]

# The header is written only when the record file is being created.
write_header = not os.path.isfile(run_record_file)
with open(run_record_file, 'a', newline='') as file:
    writer = csv.writer(file, delimiter='\t', lineterminator='\n')
    if write_header:
        writer.writerow(["hostname", "input_type", "corpus_type", "retriever_name", "K", "precision", "MRR@k", "proc_time", "run_name"])
    writer.writerow(run_record)

# Export predictions for submission
output_df['preds'] = output_df['topk'].apply(lambda x: x[:_SUBMISSION_TOP_N])
output_df[['post_id', 'preds']].to_csv(run_name + '_predictions.tsv', index=None, sep='\t')




Processing rows: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 12842/12842 [15:07<00:00, 14.15it/s]
2025-04-29 16:05:17,025 - INFO - Total processing time: 908.280115 seconds.
2025-04-29 16:05:17,025 - INFO - Total processing time: 908.280115 seconds.
2025-04-29 16:05:17,025 - INFO - Total processing time: 908.280115 seconds.
