In [1]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

from financerag.tasks import FinDER

import numpy as np 
import pandas as pd

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import CrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

warnings.filterwarnings('ignore')

## Read Data

In [3]:
data_folder_path = os.path.join('../..', 'data')

# Queries: the first CSV column becomes the index (the query id).
query_df = pd.read_csv(
    os.path.join(data_folder_path, "FinDER/queries.csv"),
    index_col=0,
)

# Corpus: same layout; rows whose `text` is missing are discarded up front
# because they cannot be embedded or re-ranked.
documents_df = pd.read_csv(
    os.path.join(data_folder_path, "FinDER/corpus.csv"),
    index_col=0,
).dropna(subset=['text'])

## Initialize Database

In [4]:
# Embedding model shared by index creation and querying: the same `embedder`
# object is handed to Chroma in the next cell for both the load and create paths.
EMBEDDING_MODEL_NAME = "msmarco-distilbert-base-v4"
embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [5]:
# Wrap every corpus passage in a LangChain Document; the corpus index value is
# kept (as a string) in the metadata so hits can be mapped back to corpus ids.
docs = [
    Document(page_content=passage, metadata={"id": str(doc_id)})
    for doc_id, passage in documents_df["text"].items()
]

persist_directory = ".chroma"

# NOTE: only the existence of the directory is checked; its contents are not
# validated against `documents_df` or the current embedding model.
if not os.path.exists(persist_directory):
    # No persisted store yet: embed all documents and write the store to disk.
    chroma_db = Chroma.from_documents(
        documents=docs,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    print("Created new ChromaDB and saved to .chroma")
else:
    # Reuse the store persisted by an earlier run instead of re-embedding.
    chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedder)
    print("Loaded existing ChromaDB from .chroma")


Loaded existing ChromaDB from .chroma


## Retrieve

In [23]:
# Retrieve 100 candidates per query; this matches the largest cut-off used
# in the evaluation section (k_values = [10, 50, 100]).
retriever = chroma_db.as_retriever(search_kwargs=dict(k=100))

In [24]:
# One row per query, each starting with its own (distinct) empty dict that
# the retrieval loop fills with {corpus_id: score}.
retrieved_df = pd.DataFrame(
    {"Documents": [{} for _ in query_df.index]},
    index=query_df.index,
)

In [25]:
for idx, query in query_df.text.items():
    # Nearest documents for this query from the vector store.
    hits = retriever.invoke(query)

    # Map corpus id -> score. Every hit gets the same placeholder score of 1,
    # so the retriever's ranking order is not encoded in these values.
    scored_hits = {str(doc.metadata["id"]): 1 for doc in hits}

    # Write through a single `.at[row, col]` access. The previous
    # `retrieved_df.loc[idx]["Documents"] = ...` was chained assignment: it
    # assigns into an intermediate row object, which is not guaranteed to
    # update `retrieved_df` (and silently does nothing under Copy-on-Write).
    retrieved_df.at[idx, "Documents"] = scored_hits

In [26]:
# {query_id: {corpus_id: score}} — the nested-dict layout passed to the
# evaluator further down.
retrieved_results = dict(zip(retrieved_df.index, retrieved_df["Documents"]))

In [27]:
# Spot check: this (query, document) pair is listed as relevant in the qrels,
# so it should be among the retrieved candidates (a KeyError means it is not).
spot_check_hits = retrieved_results["q00007"]
spot_check_hits["MSFT20231529"]

1

## Re-Rank

In [74]:
import torch

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def sigmoid(x):
    """Squash raw scores into (0, 1) with the logistic function.

    Args:
        x: A scalar or array-like of raw scores (anything `torch.tensor` accepts).

    Returns:
        A `torch.Tensor` with the same shape as `x`.
    """
    return 1 / (1 + torch.exp(-torch.tensor(x)))

for idx, query in query_df.text.items():
    candidate_ids = list(retrieved_results[idx])
    if not candidate_ids:
        # Nothing was retrieved for this query, so there is nothing to score.
        continue

    # Score all candidates of a query in one batched call instead of invoking
    # the model once per (query, document) pair.
    pairs = [(query, documents_df.loc[doc_id, "text"]) for doc_id in candidate_ids]
    raw_scores = cross_encoder.predict(pairs)
    normalized_scores = sigmoid(raw_scores).tolist()

    # Replace the candidate dict with a new one ordered by descending score;
    # the sort is stable, so ties keep their retrieval order.
    retrieved_results[idx] = dict(
        sorted(
            zip(candidate_ids, normalized_scores),
            key=lambda item: item[1],
            reverse=True,
        )
    )

## Evaluate

In [77]:
# Build the FinDER task with its default arguments; its `evaluate` method is
# used below to score the re-ranked results against the qrels.
finder_task = FinDER()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.
Using the latest cached version of the dataset since Linq-AI-Research/FinanceRAG couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'FinDER' at /Users/emrekuru/.cache/huggingface/datasets/Linq-AI-Research___finance_rag/FinDER/0.0.0/ba6f7470152e76b389ad48e3997f15381e6aecae (last modified on Mon Oct 21 09:52:22 2024).


In [78]:
# Relevance judgements: tab-separated rows of (query_id, corpus_id, score).
qrels_path = '../../data/resources/finder_qrels.tsv'
qrels = pd.read_csv(qrels_path, delimiter='\t')
qrels

Unnamed: 0,query_id,corpus_id,score
0,q00001,MSFT20230014,1
1,q00001,MSFT20230015,1
2,q00007,MSFT20231529,1
3,q00008,MSFT20231529,1
4,q00010,ADBE20231571,1
...,...,...,...
98,q00197,UNH20230438,1
99,q00200,GOOGL20230050,1
100,q00204,GOOGL20230680,1
101,q00210,BRK.A20230396,1


In [79]:
# Reshape the flat qrels table into {query_id: {corpus_id: score}}, the same
# nested layout as `retrieved_results`. A repeated (query, document) pair
# keeps the score of its last occurrence.
qrels_dict = {}
for query_id, corpus_id, score in zip(qrels['query_id'], qrels['corpus_id'], qrels['score']):
    qrels_dict.setdefault(query_id, {})[corpus_id] = score
qrels_dict

{'q00001': {'MSFT20230014': 1, 'MSFT20230015': 1},
 'q00007': {'MSFT20231529': 1},
 'q00008': {'MSFT20231529': 1},
 'q00010': {'ADBE20231571': 1,
  'ADBE20231572': 1,
  'ADBE20230728': 1,
  'ADBE20231573': 1},
 'q00019': {'CPNG20230732': 1},
 'q00021': {'CPNG20230658': 1},
 'q00022': {'CPNG20230553': 1},
 'q00027': {'LIN20231133': 1},
 'q00028': {'LIN20231195': 1},
 'q00030': {'LIN20230064': 1,
  'LIN20230065': 1,
  'LIN20230066': 1,
  'LIN20230067': 1},
 'q00034': {'LIN20230551': 1},
 'q00039': {'ORCL20230738': 1, 'ORCL20230739': 1},
 'q00042': {'ORCL20230129': 1, 'ORCL20230130': 1, 'ORCL20230131': 1},
 'q00043': {'ORCL20231527': 1, 'ORCL20231529': 1},
 'q00044': {'ORCL20231505': 1},
 'q00048': {'NVDA20231260': 1},
 'q00062': {'PG20230221': 1, 'PG20230805': 1},
 'q00067': {'PG20230429': 1},
 'q00070': {'PG20230438': 1},
 'q00071': {'PG20230440': 1},
 'q00078': {'DAL20230459': 1},
 'q00081': {'DAL20230513': 1},
 'q00092': {'TSLA20230391': 1},
 'q00095': {'TSLA20231453': 1, 'TSLA2023145

In [80]:
# Rank cut-offs at which each metric is reported.
k_values = [10, 50, 100]

# Score the re-ranked candidates against the relevance judgements.
results = finder_task.evaluate(
    qrels=qrels_dict,
    results=retrieved_results,
    k_values=k_values,
)

In [81]:
# `results` is read positionally: 0 -> NDCG, 1 -> MAP, 2 -> Recall, 3 -> Precision,
# each a dict keyed by "<METRIC>@<k>".
ndcg_scores = results[0]
map_scores = results[1]
recall_scores = results[2]
precision_scores = results[3]

# One row per cut-off k, one column per metric.
metrics_df = pd.DataFrame(
    {
        "MAP": [map_scores[f"MAP@{k}"] for k in k_values],
        "NDCG": [ndcg_scores[f"NDCG@{k}"] for k in k_values],
        "P@K": [precision_scores[f"P@{k}"] for k in k_values],
        "R@K": [recall_scores[f"Recall@{k}"] for k in k_values],
    },
    index=k_values,
)

metrics_df

Unnamed: 0,MAP,NDCG,P@K,R@K
10,0.22589,0.25574,0.04063,0.32812
50,0.23863,0.30929,0.01469,0.54531
100,0.23915,0.31293,0.00781,0.56094


### Compare With Just Retrieval

Metrics for retrieval alone, without the cross-encoder re-ranking step:

| K   | MAP     | NDCG    | P@K     | R@K    |
|-----|---------|---------|---------|--------|
| 10  | 0.00781 | 0.00986 | 0.00156 | 0.01562|
| 50  | 0.01750 | 0.06159 | 0.00656 | 0.24844|
| 100 | 0.02224 | 0.11573 | 0.00781 | 0.5609 |