In [1]:
import sys
import os
# Put the parent directory on sys.path before the financerag imports below
# (presumably so the package is importable from the repository checkout
# without being installed — TODO confirm).
sys.path.append(os.path.abspath('..'))

from financerag.tasks import FinDER
from financerag.retrieval import SentenceTransformerEncoder, DenseRetrieval
from financerag.rerank import CrossEncoderReranker
from sentence_transformers import CrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


## Initialize Dataset Task

In [2]:
# Create the FinDER task object; its queries and corpus are inspected in the
# following cells, and it drives retrieval, reranking and saving further down.
finder_task = FinDER()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.


In [3]:
# Show the task's queries (rendered below as a mapping of query ID -> query text).
finder_task.queries

{'q00001': 'What are the service and product offerings from Microsoft',
 'q00002': 'MSFT segment breakdown',
 'q00003': 'Who are Microsoft`s key customers?',
 'q00004': 'What is Microsoft`s business model',
 'q00005': 'MSFT Capex commitment',
 'q00006': 'Which recent M&A activities has Microsoft been involved in',
 'q00007': 'How much revenue does Microsoft generate from contracts with customers?',
 'q00008': 'MSFT remaining performance obligation',
 'q00009': 'Adobe subsidiaries of trademarks',
 'q00010': 'ADBE share repurchase',
 'q00011': 'fully diluted shares outstanding ADBE',
 'q00012': 'Who are the members of Adobe`s management team',
 'q00013': 'ADBE RPO',
 'q00014': 'ADBE KPI',
 'q00015': 'How are Coupang`s KPIs?"',
 'q00016': 'Coupang segment margin',
 'q00017': 'CPNG capital expenditure',
 'q00018': 'CPNG any recent M&A activities',
 'q00019': 'When did Coupang`s Farfetch consolidation start',
 'q00020': 'What is FLC, and how is its revenue recognized by Coupang',
 'q00021':

In [4]:
# Show the task's corpus (rendered below as a mapping of document ID -> dict
# with 'title' and 'text' entries).
finder_task.corpus

{'ADBE20230004': {'title': 'ADBE OVERVIEW',
  'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash creativity, accelerate document productivity and power businesses in a digital world.'},
 'ADBE20230006': {

## Setup Models

In [5]:
# Encoder used for dense retrieval; queries and documents are given
# different prompt prefixes ('query: ' vs. 'passage: ').
encoder_model = SentenceTransformerEncoder(
    query_prompt='query: ',
    doc_prompt='passage: ',
    model_name_or_path='intfloat/e5-small-v2',
)

# Retriever built on top of the encoder defined above.
retrieval_model = DenseRetrieval(model=encoder_model)

## Retrieve

In [6]:
# Run first-stage retrieval for every query of the task.
retrieval_result = finder_task.retrieve(retriever=retrieval_model)

print(f"Retrieved results for {len(retrieval_result)} queries. Here's an example of the top 5 documents for the first query:")

# Preview only the first query; nothing is printed when there are no results.
first_entry = next(iter(retrieval_result.items()), None)
if first_entry is not None:
    q_id, result = first_entry
    print(f"\nQuery ID: {q_id}")

    # Highest score first, then keep the five best documents.
    sorted_results = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
    for rank, (doc_id, score) in enumerate(sorted_results[:5], start=1):
        print(f"  Document {rank}: Document ID = {doc_id}, Score = {score}")

Retrieved results for 216 queries. Here's an example of the top 5 documents for the first query:

Query ID: q00001
  Document 1: Document ID = MSFT20230216, Score = 0.8815328478813171
  Document 2: Document ID = MSFT20231728, Score = 0.8768289089202881
  Document 3: Document ID = MSFT20230236, Score = 0.8760465979576111
  Document 4: Document ID = MSFT20230254, Score = 0.8750152587890625
  Document 5: Document ID = MSFT20230170, Score = 0.8748767971992493


In [7]:
# Show the retrieval results held on the task object (rendered below as
# query ID -> {document ID: score}).
finder_task.retrieve_results

{'q00001': {'MSFT20230262': 0.8312286138534546,
  'MSFT20230014': 0.8339003920555115,
  'MSFT20230029': 0.8315328359603882,
  'MSFT20230470': 0.8364467024803162,
  'MSFT20230971': 0.834015965461731,
  'GOOGL20230039': 0.8315824270248413,
  'MSFT20230164': 0.8398609161376953,
  'MSFT20230144': 0.837587296962738,
  'MSFT20230347': 0.8365101218223572,
  'MSFT20231767': 0.8353182673454285,
  'MSFT20230160': 0.8340560793876648,
  'MSFT20230256': 0.8325105905532837,
  'MSFT20230058': 0.841697633266449,
  'MSFT20230034': 0.8412221670150757,
  'MSFT20230509': 0.8400826454162598,
  'MSFT20230162': 0.8390372395515442,
  'MSFT20230035': 0.8377923369407654,
  'MSFT20230197': 0.8373541831970215,
  'MSFT20230377': 0.8368573784828186,
  'MSFT20230217': 0.8360196948051453,
  'MSFT20230159': 0.8355370759963989,
  'ORCL20230037': 0.8350878357887268,
  'MSFT20230486': 0.8341904282569885,
  'MSFT20230352': 0.8332923650741577,
  'MSFT20231766': 0.832577645778656,
  'MSFT20230521': 0.8419463038444519,
  'MS

## Rerank

In [8]:
# Cross-encoder used to rescore the retrieved candidates.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
reranker = CrossEncoderReranker(model=cross_encoder)

# Rerank the first-stage retrieval output.
reranking_result = finder_task.rerank(
    reranker=reranker,
    results=retrieval_result,
    top_k=100,
    batch_size=32,
)

print(f"Reranking results for {len(reranking_result)} queries. Here's an example of the top 5 documents for the first query:")

# Preview only the first query; nothing is printed when there are no results.
first_entry = next(iter(reranking_result.items()), None)
if first_entry is not None:
    q_id, result = first_entry
    print(f"\nQuery ID: {q_id}")

    # Highest score first, then keep the five best documents.
    sorted_results = sorted(result.items(), key=lambda pair: pair[1], reverse=True)
    for rank, (doc_id, score) in enumerate(sorted_results[:5], start=1):
        print(f"  Document {rank}: Document ID = {doc_id}, Score = {score}")

KeyboardInterrupt: 

In [None]:
# Show the reranking results held on the task object. This depends on the
# rerank cell above having run to completion.
finder_task.rerank_results

## Save the Result

In [None]:
# Directory (relative to this notebook) handed to save_results.
output_dir = '../results'
finder_task.save_results(output_dir=output_dir)
# NOTE(review): the path in this message is hard-coded here and is assumed to
# match where save_results writes its file — confirm against its implementation.
print(f"Results have been saved to {output_dir}/FinDER/results.csv")