In [1]:
import json
import logging
import os
from typing import List

import pandas as pd
from sentence_transformers import CrossEncoder

from financerag.rerank import CrossEncoderReranker
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder, BM25Retriever, BM25Model
from financerag.tasks import (
    BaseTask,
    ConvFinQA,
    FinDER,
    FinQABench,
    FinQA,
    FinanceBench,
    MultiHiertt,
    TATQA
)
# Step 1: Import necessary libraries
# --------------------------------------
# Import required libraries for document retrieval, reranking, and logging setup.
from sentence_transformers import CrossEncoder
import logging

from financerag.rerank import CrossEncoderReranker, RRFReranker
from financerag.retrieval import DenseRetrieval, BM25Retriever, SentenceTransformerEncoder
from financerag.tasks import FinDER

# Setup basic logging configuration to show info level messages.
logging.basicConfig(level=logging.INFO)


from task_runner import TaskRunner


import gc
import torch

  from tqdm.autonotebook import tqdm, trange


In [3]:
def run(task: BaseTask, results_dir: str = "./results"):
    print(f"Running {task.metadata.name}")
    encoder_model = SentenceTransformerEncoder(
        model_name_or_path='intfloat/e5-large-v2',
        query_prompt='query: ',
        doc_prompt='passage: ',
    )
    dense_retrieval_model = DenseRetrieval(
        model=encoder_model
    )

    bm25_model = BM25Model(task.corpus)
    sparse_model = BM25Retriever(
        model=bm25_model
    )

    retrieval_result = task.retrieve_hybrid(
        dense_retriever=dense_retrieval_model,
        sparse_retriever=sparse_model
    )

    print(f"Retrieved hybrid results for {len(retrieval_result)} queries. Here's an example of the top 5 documents for the first query:")

    for q_id, result in retrieval_result[0].items():
        print(f"\nQuery ID: {q_id}")
        # Sort the result to print the top 5 document ID and its score
        sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

        for i, (doc_id, score) in enumerate(sorted_results[:5]):
            print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

        break  # Only show the first query

    for q_id, result in retrieval_result[1].items():
        print(f"\nQuery ID: {q_id}")
        # Sort the result to print the top 5 document ID and its score
        sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

        for i, (doc_id, score) in enumerate(sorted_results[:5]):
            print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

        break  # Only show the first query

    # Delete references
    del encoder_model
    del dense_retrieval_model
    del bm25_model
    del sparse_model
    
    # Force garbage collection (CPU RAM)
    gc.collect()

    reranker = RRFReranker()
    
    
    # Clear GPU cache (Crucial for SentenceTransformer/PyTorch)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    task.rerank(
        reranker=reranker,
        results=retrieval_result,
        top_k=100,  # Rerank the top 100 documents
        k=6
    )

    del reranker

    task.save_results(output_dir='./results')
    task.save_original_results(output_dir='./results')

In [4]:
all_tasks: List[type] = [
    ConvFinQA,
    FinDER,
    FinQABench,
    FinQA,
    FinanceBench,
    MultiHiertt,
    TATQA
]

In [5]:
for task in all_tasks:
    current_task: BaseTask = task() # (corpus_file='corpus_prep.jsonl', query_file='queries_prep.jsonl')
    TaskRunner.run_custom(current_task, run, results_dir='./results')
    evaluate_result = TaskRunner.evaluate(task, results_dir='./results')
    print(current_task.metadata.name)
    print(TaskRunner.format_results(evaluate_result))

INFO:financerag.common.loader:Loading Corpus...
INFO:financerag.common.loader:Loaded 2066 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd4bff516', 'title': '', 'text': 'containerboard , kraft papers and saturating kraft .\nkapstone also owns victory packaging , a packaging solutions distribution company with facilities in the u.s. , canada and mexico .\nwe have included the financial results of kapstone in our corrugated packaging segment since the date of the acquisition .\non september 4 , 2018 , we completed the acquisition ( the 201cschl fcter acquisition 201d ) of schl fcter print pharma packaging ( 201cschl fcter 201d ) .\nschl fcter is a leading provider of differentiated paper and packaging solutions and a german-based supplier of a full range of leaflets and booklets .\nthe schl fcter acquisition allowed us to further enhance our pharmaceutical and automotive platform and expand our geographical footprint in europe to better serve our customers .\nwe have i

Running ConvFinQA


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 7/7 [00:00<00:00,  9.33it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 33/33 [00:43<00:00,  1.32s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/ConvFinQA
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/ConvFinQA/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/ConvFinQA/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/ConvFinQA
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/ConvFinQA/original_results.jsonl
INFO:financerag.ta

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: qd4982518
  Document 1: Document ID = dd4b9f7f6, Score = 0.8368977308273315
  Document 2: Document ID = dd4c4c208, Score = 0.8250285387039185
  Document 3: Document ID = dd4bbc202, Score = 0.8242160081863403
  Document 4: Document ID = dd4c5c572, Score = 0.8240017890930176
  Document 5: Document ID = dd4b87d18, Score = 0.8225685954093933

Query ID: qd4982518
  Document 1: Document ID = dd4be45d6, Score = 41.62077630564368
  Document 2: Document ID = dd4b87d18, Score = 41.555438014063846
  Document 3: Document ID = dd4b9f7f6, Score = 41.53959889054542
  Document 4: Document ID = dd4982158, Score = 41.18625771204589
  Document 5: Document ID = dd496d5d2, Score = 40.38274656385465
ConvFinQA
	NDCG@1: 0.23016     NDCG@5: 0.38796     NDCG@10: 0.42317
	MAP@1: 0.23016      MAP@5: 0.34259      MAP@10: 0.35673
	Recall@1: 0.23016   Recall@5: 0.52381   Recall@10: 0.63492
	P@1: 0.23016  

INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash crea

Running FinDER


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 4/4 [00:00<00:00, 18.64it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 217/217 [04:14<00:00,  1.17s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinDER
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinDER/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinDER/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinDER
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinDER/original_results.jsonl
INFO:financerag.tasks.BaseTask:

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q00001
  Document 1: Document ID = MSFT20230966, Score = 0.8739963173866272
  Document 2: Document ID = MSFT20230216, Score = 0.8645689487457275
  Document 3: Document ID = MSFT20230015, Score = 0.8594435453414917
  Document 4: Document ID = MSFT20230254, Score = 0.8580107688903809
  Document 5: Document ID = MSFT20230155, Score = 0.853409469127655

Query ID: q00001
  Document 1: Document ID = MSFT20230134, Score = 24.918260166638486
  Document 2: Document ID = MSFT20230011, Score = 22.886829812454828
  Document 3: Document ID = AAPL20230021, Score = 21.52816613237681
  Document 4: Document ID = AAPL20230240, Score = 21.44563038565722
  Document 5: Document ID = MSFT20230331, Score = 21.12678510971929
FinDER
	NDCG@1: 0.18750     NDCG@5: 0.31345     NDCG@10: 0.33999
	MAP@1: 0.15156      MAP@5: 0.26250      MAP@10: 0.27542
	Recall@1: 0.15156   Recall@5: 0.42500   Recall@10: 0.

INFO:financerag.common.loader:Loaded 92 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd4aa0660c', 'title': '', 'text': 'Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares which are reflected in thousands and per share amounts)\nYears ended\nSeptember 24,\n2022September 25,\n2021September 26,\n2020\nNet sales:\n   Products $ 316,199 $ 297,392 $ 220,747 \n   Services  78,129  68,425  53,768 \nTotal net sales  394,328  365,817  274,515 \nCost of sales:\n   Products  201,471  192,266  151,286 \n   Services  22,075  20,715  18,273 \nTotal cost of sales  223,546  212,981  169,559 \nGross margin  170,782  152,836  104,956 \nOperating expenses:\nResearch and development  26,251  21,914  18,752 \nSelling, general and administrative  25,094  21,973  19,916 \nTotal operating expenses  51,345  43,887  38,668 \nOperating income  119,437  108,949  66,288 \nOther income/(expense), net  (334)  258  803 \nIncome before provision for income taxes 

Running FinQABench


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.05it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQABench
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinQABench/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinQABench/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQABench
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinQABench/original_results.jsonl
INFO:financerag

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q4aa0b116
  Document 1: Document ID = d4aa0b1f2, Score = 0.9002195596694946
  Document 2: Document ID = d4aa0a52c, Score = 0.8528528809547424
  Document 3: Document ID = d4aa0a9e6, Score = 0.8339228630065918
  Document 4: Document ID = d4aa10314, Score = 0.8284483551979065
  Document 5: Document ID = d4aa0a7d4, Score = 0.8265162706375122

Query ID: q4aa0b116
  Document 1: Document ID = d4aa0b1f2, Score = 72.54431424782084
  Document 2: Document ID = d4aa0a52c, Score = 43.52782268393706
  Document 3: Document ID = d4aa1b854, Score = 35.78485595141424
  Document 4: Document ID = d4aa0a9e6, Score = 35.27641243736042
  Document 5: Document ID = d4aa0a7d4, Score = 31.24090461784376
FinQABench
	NDCG@1: 0.73333     NDCG@5: 0.85515     NDCG@10: 0.86703
	MAP@1: 0.73333      MAP@5: 0.82778      MAP@10: 0.83333
	Recall@1: 0.73333   Recall@5: 0.93333   Recall@10: 0.96667
	P@1: 0.73333  

INFO:financerag.common.loader:Loaded 2789 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd61d9e858', 'title': '', 'text': 'performance graph the performance graph below shows the five-year cumulative total stockholder return on applied common stock during the period from october 25 , 2009 through october 26 , 2014 .\nthis is compared with the cumulative total return of the standard & poor 2019s 500 stock index and the rdg semiconductor composite index over the same period .\nthe comparison assumes $ 100 was invested on october 25 , 2009 in applied common stock and in each of the foregoing indices and assumes reinvestment of dividends , if any .\ndollar amounts in the graph are rounded to the nearest whole dollar .\nthe performance shown in the graph represents past performance and should not be considered an indication of future performance .\ncomparison of 5 year cumulative total return* among applied materials , inc. , the s&p 500 index 201cs&p 201d is a registered 

Running FinQA


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 18/18 [00:03<00:00,  5.21it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 44/44 [02:03<00:00,  2.82s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQA
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinQA/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinQA/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQA
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinQA/original_results.jsonl


Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q61676968
  Document 1: Document ID = d616769d6, Score = 0.8724496364593506
  Document 2: Document ID = d61d31046, Score = 0.8685970902442932
  Document 3: Document ID = d61d2c15e, Score = 0.8656296730041504
  Document 4: Document ID = d61d92bf2, Score = 0.8333556056022644
  Document 5: Document ID = d61e30eec, Score = 0.8305553197860718

Query ID: q61676968
  Document 1: Document ID = d61d2a890, Score = 68.21771079275975
  Document 2: Document ID = d616769d6, Score = 67.9709916382047
  Document 3: Document ID = d61d2c15e, Score = 64.58270160560757
  Document 4: Document ID = d61d31046, Score = 64.3839305040667
  Document 5: Document ID = d61e30eec, Score = 55.57952028565895


INFO:financerag.tasks.BaseTask:Original results saved successfully to ./results/FinQA/original_results.jsonl
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2413
INFO:financerag.tasks.BaseTask:NDCG@5: 0.4068
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4432
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2413
INFO:financerag.tasks.BaseTask:MAP@5: 0.3576
INFO:financerag.tasks.BaseTask:MAP@10: 0.3730
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2413
INFO:financerag.tasks.BaseTask:Recall@5: 0.5552
INFO:financerag.tasks.BaseTask:Recall@10: 0.6657
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2413
INFO:financerag.tasks.BaseTask:P@5: 0.1110
INFO:financerag.tasks.BaseTask:P@10: 0.0666
INFO:financerag.common.loader:Loading C

FinQA
	NDCG@1: 0.24128     NDCG@5: 0.40683     NDCG@10: 0.44325
	MAP@1: 0.24128      MAP@5: 0.35761      MAP@10: 0.37305
	Recall@1: 0.24128   Recall@5: 0.55523   Recall@10: 0.66570
	P@1: 0.24128        P@5: 0.11105        P@10: 0.06657



INFO:financerag.common.loader:Loaded 180 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd2af2336', 'title': 'PEPSICO_2022_10K', 'text': '6) Africa, Middle East and South Asia (AMESA), which includes all of our beverage and convenient food businesses in\nAfrica, the Middle East and South Asia; and\n7) Asia Pacific, Australia and New Zealand and China Region (APAC), which includes all of our beverage and convenient\nfood businesses in Asia Pacific, Australia and New Zealand, and China region.'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 150 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'qd2ac917a', 'text': 'What is the FY2019 - FY2020 total revenue growth rate for Block (formerly known as Square)? Answer in units of percents and round to one decimal place. Approach the question asked by assuming the standpoint of an investment banking analyst who only has access to the statement of income.'}
INFO:sentence_transf

Running FinanceBench


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 3/3 [00:01<00:00,  2.38it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 3/3 [00:06<00:00,  2.26s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinanceBench
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinanceBench/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinanceBench/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinanceBench
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinanceBench/original_results.jsonl
INFO:

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: qd2ac917a
  Document 1: Document ID = dd2acce74, Score = 0.8359518051147461
  Document 2: Document ID = dd2acd630, Score = 0.832131028175354
  Document 3: Document ID = dd2ac8f0e, Score = 0.8138458728790283
  Document 4: Document ID = dd2ac55e8, Score = 0.801262617111206
  Document 5: Document ID = dd2abd7b2, Score = 0.7998214364051819

Query ID: qd2ac917a
  Document 1: Document ID = dd2ac6718, Score = 54.6589281126885
  Document 2: Document ID = dd2adb8d4, Score = 54.523065246085565
  Document 3: Document ID = dd2adb5be, Score = 54.3433731245759
  Document 4: Document ID = dd2af18f0, Score = 47.643156136424956
  Document 5: Document ID = dd2afc3f4, Score = 47.27151639660315
FinanceBench
	NDCG@1: 0.28889     NDCG@5: 0.55853     NDCG@10: 0.62618
	MAP@1: 0.24444      MAP@5: 0.46870      MAP@10: 0.50135
	Recall@1: 0.24444   Recall@5: 0.77778   Recall@10: 0.97778
	P@1: 0.28889  

INFO:financerag.common.loader:Loaded 10475 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd8e4ea4ac', 'title': '', 'text': '|  | Years Ended December 31, |\n|  | 2006 | 2005 |\n|  | (In millions) |\n| Investment return | $192 | $-26 |\n| Expense | 45 | 11 |\n| In-force/Persistency | -7 | -33 |\n| Policyholder dividends and other | -39 | -11 |\n| Total | $191 | $-59 |\nAs of December 31, 2006 and 2005, DAC and VOBA for the Individual segment were $14.0 billion and $13.5 billion, respectively, and for the total Company were $20.8 billion and $19.7 billion, respectively.\nGoodwill Goodwill is the excess of cost over the fair value of net assets acquired.\nThe Company tests goodwill for impairment at least annually or more frequently if events or circumstances, such as adverse changes in the business climate, indicate that there may be justification for conducting an interim test.\nImpairment testing is performed using the fair value approach, which requires the use of es

Running MultiHiertt


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 16/16 [00:03<00:00,  5.24it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 164/164 [07:03<00:00,  2.58s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/MultiHiertt
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/MultiHiertt/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/MultiHiertt/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/MultiHiertt
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/MultiHiertt/original_results.jsonl


Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q82d4c6ec
  Document 1: Document ID = d88cdb45a, Score = 0.8187432885169983
  Document 2: Document ID = d8823b39c, Score = 0.8180525302886963
  Document 3: Document ID = d81a04fe4, Score = 0.8175972700119019
  Document 4: Document ID = d8f0945a0, Score = 0.8119762539863586
  Document 5: Document ID = d88be204e, Score = 0.809672474861145

Query ID: q82d4c6ec
  Document 1: Document ID = d85eb42c0, Score = 62.18286385925543
  Document 2: Document ID = d85fbdc98, Score = 59.68655470399278
  Document 3: Document ID = d8dae16c2, Score = 57.97245500476361
  Document 4: Document ID = d8f1c990c, Score = 57.53212209523874
  Document 5: Document ID = d8a223308, Score = 57.3913661503724


INFO:financerag.tasks.BaseTask:Original results saved successfully to ./results/MultiHiertt/original_results.jsonl
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1849
INFO:financerag.tasks.BaseTask:NDCG@5: 0.1131
INFO:financerag.tasks.BaseTask:NDCG@10: 0.1229
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0459
INFO:financerag.tasks.BaseTask:MAP@5: 0.0630
INFO:financerag.tasks.BaseTask:MAP@10: 0.0675
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0459
INFO:financerag.tasks.BaseTask:Recall@5: 0.0932
INFO:financerag.tasks.BaseTask:Recall@10: 0.1243
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.1849
INFO:financerag.tasks.BaseTask:P@5: 0.0753
INFO:financerag.tasks.BaseTask:P@10: 0.0500
INFO:financerag.common.loader:Loa

MultiHiertt
	NDCG@1: 0.18493     NDCG@5: 0.11308     NDCG@10: 0.12291
	MAP@1: 0.04594      MAP@5: 0.06299      MAP@10: 0.06747
	Recall@1: 0.04594   Recall@5: 0.09321   Recall@10: 0.12427
	P@1: 0.18493        P@5: 0.07534        P@10: 0.05000



INFO:financerag.common.loader:Loaded 2756 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd1b2e74c0', 'title': '', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.\nWrite-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019.\nThe average recorded investment of impaired leases and l

Running TAT-QA


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 26/26 [00:03<00:00,  6.70it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 44/44 [01:48<00:00,  2.47s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/TAT-QA
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/TAT-QA/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.


Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q1a73c1d4
  Document 1: Document ID = d1b3576b2, Score = 0.7951472997665405
  Document 2: Document ID = d1b2ee16c, Score = 0.7919341325759888
  Document 3: Document ID = d1b3af812, Score = 0.7904269695281982
  Document 4: Document ID = d1b323f38, Score = 0.7874468564987183
  Document 5: Document ID = d1b3c11c0, Score = 0.7868978977203369

Query ID: q1a73c1d4
  Document 1: Document ID = d1b30ab0a, Score = 16.313886939743774
  Document 2: Document ID = d1b3ae476, Score = 15.8173973365356
  Document 3: Document ID = d1b37f0ae, Score = 15.808281852435849
  Document 4: Document ID = d1b326d46, Score = 15.743305253666042
  Document 5: Document ID = d1b2f539a, Score = 15.673548908266431


INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/TAT-QA/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/TAT-QA
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/TAT-QA/original_results.jsonl
INFO:financerag.tasks.BaseTask:Original results saved successfully to ./results/TAT-QA/original_results.jsonl
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2791
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3950
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4290
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2791
INFO:financerag.tasks.BaseTask:MAP@5: 0.3624
INFO:financerag.tasks.BaseTask:MAP@10: 0.3767
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2791
INFO:financerag.ta

TAT-QA
	NDCG@1: 0.27912     NDCG@5: 0.39498     NDCG@10: 0.42903
	MAP@1: 0.27912      MAP@5: 0.36245      MAP@10: 0.37666
	Recall@1: 0.27912   Recall@5: 0.49197   Recall@10: 0.59639
	P@1: 0.27912        P@5: 0.09839        P@10: 0.05964



In [6]:
final_str = TaskRunner.save_metrics(tasks=all_tasks, title='hybrid', results_dir='results')
print(final_str)
TaskRunner.combine_results(tasks=all_tasks, results_dir='results')

INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.


INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2302
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3880
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4232
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2302
INFO:financerag.tasks.BaseTask:MAP@5: 0.3426
INFO:financerag.tasks.BaseTask:MAP@10: 0.3567
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2302
INFO:financerag.tasks.BaseTask:Recall@5: 0.5238
INFO:financerag.tasks.BaseTask:Recall@10: 0.6349
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2302
INFO:financerag.tasks.BaseTask:P@5: 0.1048
INFO:financerag.tasks.BaseTask:P@10: 0.0635
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1875
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3135
INFO:financerag.tasks.B

-- hybrid --

ConvFinQA
	NDCG@1: 0.23016     NDCG@5: 0.38796     NDCG@10: 0.42317
	MAP@1: 0.23016      MAP@5: 0.34259      MAP@10: 0.35673
	Recall@1: 0.23016   Recall@5: 0.52381   Recall@10: 0.63492
	P@1: 0.23016        P@5: 0.10476        P@10: 0.06349

FinDER
	NDCG@1: 0.18750     NDCG@5: 0.31345     NDCG@10: 0.33999
	MAP@1: 0.15156      MAP@5: 0.26250      MAP@10: 0.27542
	Recall@1: 0.15156   Recall@5: 0.42500   Recall@10: 0.49453
	P@1: 0.18750        P@5: 0.10938        P@10: 0.06719

FinQABench
	NDCG@1: 0.73333     NDCG@5: 0.85515     NDCG@10: 0.86703
	MAP@1: 0.73333      MAP@5: 0.82778      MAP@10: 0.83333
	Recall@1: 0.73333   Recall@5: 0.93333   Recall@10: 0.96667
	P@1: 0.73333        P@5: 0.18667        P@10: 0.09667

FinQA
	NDCG@1: 0.24128     NDCG@5: 0.40683     NDCG@10: 0.44325
	MAP@1: 0.24128      MAP@5: 0.35761      MAP@10: 0.37305
	Recall@1: 0.24128   Recall@5: 0.55523   Recall@10: 0.66570
	P@1: 0.24128        P@5: 0.11105        P@10: 0.06657

FinanceBench
	NDCG@1: 0.2888