In [1]:
import os
import sys
import numpy as np 
import pandas as pd

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

sys.path.append('..')

In [2]:
import warnings

warnings.filterwarnings('ignore')

## Initiliaze Database

In [3]:
init = False

In [4]:
embedder = HuggingFaceEmbeddings(model_name="msmarco-distilbert-base-v4")

In [5]:
vector_store = Chroma(
    embedding_function=embedder,  
    persist_directory="./chroma"  
)

## Read Data

In [6]:
data_folder_path = os.path.join('..', 'data')
query_df = pd.read_csv(os.path.join(data_folder_path, "FinDER/queries.csv"), index_col=0)
documents_df = pd.read_csv(os.path.join(data_folder_path, "FinDER/corpus.csv"), index_col=0)
documents_df.dropna(subset=['text'], inplace=True)
documents_df

Unnamed: 0,title,text
ADBE20230004,ADBE OVERVIEW,Adobe is a global technology company with a mi...
ADBE20230006,ADBE OFFERINGS,"We deliver a wide range of products, services ..."
ADBE20230007,ADBE OFFERINGS,"Digital Media. We provide products, services a..."
ADBE20230008,ADBE OFFERINGS,Digital Experience. We provide an integrated p...
ADBE20230010,ADBE OFFERINGS,"We offer a comprehensive suite of products, se..."
...,...,...
V20232000,V _______________,† Confidential treatment has been requested fo...
V20232001,V _______________,"* Management contract, compensatory plan or ar..."
V20232004,V _______________,+ Filed or furnished herewith. # Schedules hav...
V20232012,V SIGNATURES,Pursuant to the requirements of Section 13 or ...


## Chunk -> Embed -> Save the Documents to the DB

In [7]:
if init:

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,  # Maximum number of characters per chunk
        chunk_overlap=50,  # Number of characters that will overlap between chunks
    )

    for index, row in documents_df.iterrows():
        doc_content = row['text']
        chunks = text_splitter.split_text(doc_content)
        
        langchain_documents = [
            Document(page_content=chunk, metadata={"id": f"{index}_chunk_{i+1}", "document_id": index})
            for i, chunk in enumerate(chunks)
        ]
        
        vector_store.add_documents(langchain_documents)

## Retrieve

In [8]:
retrieved_df = pd.DataFrame([[[] ] for _ in query_df.index], index=query_df.index, columns=["Documents"])
retrieved_df

Unnamed: 0,Documents
q00001,[]
q00002,[]
q00003,[]
q00004,[]
q00005,[]
...,...
q00214,[]
q00215,[]
q00216,[]
q00217,[]


In [9]:
for query_index, query_row in query_df.iterrows():
    query = query_row['text']
    retrieved_docs = vector_store.similarity_search(query, k=3)
    retrieved_doc_ids = list(set([doc.metadata['document_id'] for doc in retrieved_docs]))
    retrieved_df.loc[query_index, "Documents"] = retrieved_doc_ids

In [10]:
retrieved_df

Unnamed: 0,Documents
q00001,"[MSFT20230254, MSFT20230966, MSFT20230216]"
q00002,"[JNJ20231935, V20230670, JNJ20231936]"
q00003,"[MSFT20230058, MSFT20230025, MSFT20230236]"
q00004,"[NVDA20230141, MSFT20230469, MSFT20230010]"
q00005,"[JNJ20231935, ADBE20231285, JPM20232533]"
...,...
q00214,"[BRK.A20230028, BRK.A20230514, BRK.A20230427]"
q00215,"[BRK.A20230421, BRK.A20230404, BRK.A20230920]"
q00216,"[UNH20230432, ORCL20231157, AAPL20230596]"
q00217,"[BRK.A20230063, ADBE20230747, BRK.A20230594]"


## Evaluate

In [11]:
def evaluate(query_df, retrieved_df):
    precisions = []
    recalls = []
    
    for i, row in query_df.iterrows():
        actual_ids = set(row['Related Documents'])
        retrieved_ids = set(retrieved_df.loc[i, 'Documents'])
        
        true_positives = len(actual_ids & retrieved_ids)  
        
        precision = true_positives / len(retrieved_ids) if retrieved_ids else 0
        precisions.append(precision)
        
        recall = true_positives / len(actual_ids) if actual_ids else 0
        recalls.append(recall)
    
    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0
    avg_f1 = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    
    return {
        'precision': avg_precision,
        'recall': avg_recall,
        'f1_score': avg_f1
    }

In [12]:
evaluate(query_df, retrieved_df)

{'precision': 0.0, 'recall': 0.0, 'f1_score': 0}

In [13]:
non_empty = query_df[query_df['Related Documents'].apply(lambda x: len(x) > 2)]
index = non_empty.index
non_empty

Unnamed: 0,text,Related Documents
q00001,What are the service and product offerings fro...,"['MSFT20230014', 'MSFT20230015']"
q00007,How much revenue does Microsoft generate from ...,['MSFT20231529']
q00008,MSFT remaining performance obligation,['MSFT20231529']
q00010,ADBE share repurchase,"['ADBE20231571', 'ADBE20231572', 'ADBE20230728..."
q00019,When did Coupang`s Farfetch consolidation start,['CPNG20230732']
...,...,...
q00197,What factors contributed to the significant in...,"['UNH20230432', 'UNH20230433', 'UNH20230436', ..."
q00200,Primary revenue source of Google Services,['GOOGL20230050']
q00204,Capex guidance Alphabet,['GOOGL20230680']
q00210,Who runs berkshire,['BRK.A20230396']


In [14]:
retrieved_df.loc[index]

Unnamed: 0,Documents
q00001,"[MSFT20230254, MSFT20230966, MSFT20230216]"
q00007,"[MSFT20230574, MSFT20230482]"
q00008,"[ADBE20231120, V20231038, MSFT20230508]"
q00010,"[PG20230165, MSFT20230455, AAPL20230871]"
q00019,"[CPNG20230539, CPNG20231413, CPNG20230684]"
...,...
q00197,"[GOOGL20230563, ADBE20230571, BRK.A20230790]"
q00200,"[GOOGL20230050, GOOGL20230392, AMZN20230278]"
q00204,"[AMZN20231097, V20231998, TSLA20231020]"
q00210,"[BRK.A20230080, BRK.A20230004, BRK.A20231350]"


In [17]:
results = pd.DataFrame(columns=["Ground Truth", "Retrieved"])
results["Ground Truth"] = non_empty['Related Documents']
results["Retrieved"] = retrieved_df.loc[index, 'Documents']
results.head(40)

Unnamed: 0,Ground Truth,Retrieved
q00001,"['MSFT20230014', 'MSFT20230015']","[MSFT20230254, MSFT20230966, MSFT20230216]"
q00007,['MSFT20231529'],"[MSFT20230574, MSFT20230482]"
q00008,['MSFT20231529'],"[ADBE20231120, V20231038, MSFT20230508]"
q00010,"['ADBE20231571', 'ADBE20231572', 'ADBE20230728...","[PG20230165, MSFT20230455, AAPL20230871]"
q00019,['CPNG20230732'],"[CPNG20230539, CPNG20231413, CPNG20230684]"
q00021,['CPNG20230658'],"[JNJ20231935, ADBE20231613, JPM20232533]"
q00022,['CPNG20230553'],"[AMZN20230439, AMZN20230279, DAL20230783]"
q00027,['LIN20231133'],"[LIN20231143, LIN20231727, LIN20231724]"
q00028,['LIN20231195'],"[LIN20230593, LIN20230179, LIN20231195]"
q00030,"['LIN20230064', 'LIN20230065', 'LIN20230066', ...","[LIN20230092, LIN20231636, LIN20230759]"
