In [1]:
import json
import logging
import os
from typing import List

import pandas as pd
from sentence_transformers import CrossEncoder

from financerag.rerank import CrossEncoderReranker
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder, BM25Retriever, BM25Model
from financerag.tasks import (
    BaseTask,
    ConvFinQA,
    FinDER,
    FinQABench,
    FinQA,
    FinanceBench,
    MultiHiertt,
    TATQA
)
# Step 1: Import necessary libraries
# --------------------------------------
# Import required libraries for document retrieval, reranking, and logging setup.
from sentence_transformers import CrossEncoder
import logging

from financerag.rerank import CrossEncoderReranker, RRFReranker
from financerag.retrieval import DenseRetrieval, BM25Retriever, SentenceTransformerEncoder
from financerag.tasks import FinDER

# Setup basic logging configuration to show info level messages.
logging.basicConfig(level=logging.INFO)


from task_runner import TaskRunner


import gc
import torch

  from tqdm.autonotebook import tqdm, trange


In [None]:
all_tasks: List[type] = [
    ConvFinQA,
    FinDER,
    FinQABench,
    FinQA,
    FinanceBench,
    MultiHiertt,
    TATQA
]

In [None]:
def run(task: BaseTask, results_dir: str = "./results"):
    print(f"Running {task.metadata.name}")
    encoder_model = SentenceTransformerEncoder(
        model_name_or_path='intfloat/e5-large-v2',
        query_prompt='query: ',
        doc_prompt='passage: ',
    )
    dense_retrieval_model = DenseRetrieval(
        model=encoder_model
    )

    bm25_model = BM25Model(task.corpus)
    sparse_model = BM25Retriever(
        model=bm25_model
    )

    retrieval_result = task.retrieve_hybrid(
        dense_retriever=dense_retrieval_model,
        sparse_retriever=sparse_model
    )

    print(f"Retrieved hybrid results for {len(retrieval_result)} queries. Here's an example of the top 5 documents for the first query:")

    for q_id, result in retrieval_result[0].items():
        print(f"\nQuery ID: {q_id}")
        # Sort the result to print the top 5 document ID and its score
        sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

        for i, (doc_id, score) in enumerate(sorted_results[:5]):
            print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

        break  # Only show the first query

    for q_id, result in retrieval_result[1].items():
        print(f"\nQuery ID: {q_id}")
        # Sort the result to print the top 5 document ID and its score
        sorted_results = sorted(result.items(), key=lambda x: x[1], reverse=True)

        for i, (doc_id, score) in enumerate(sorted_results[:5]):
            print(f"  Document {i + 1}: Document ID = {doc_id}, Score = {score}")

        break  # Only show the first query

    # Delete references
    del encoder_model
    del dense_retrieval_model
    del bm25_model
    del sparse_model
    
    # Force garbage collection (CPU RAM)
    gc.collect()

    reranker = RRFReranker()
    
    
    # Clear GPU cache (Crucial for SentenceTransformer/PyTorch)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    task.rerank(
        reranker=reranker,
        results=retrieval_result,
        top_k=100,  # Rerank the top 100 documents
        k=6
    )

    del reranker

    task.save_results(output_dir='./results')
    task.save_original_results(output_dir='./results')

In [4]:
for task in all_tasks:
    current_task: BaseTask = task(corpus_file='corpus_prep.jsonl', query_file='queries_prep.jsonl')
    TaskRunner.run_custom(current_task, run, results_dir='./results')
    evaluate_result = TaskRunner.evaluate(task, results_dir='./results')
    print(current_task.metadata.name)
    print(TaskRunner.format_results(evaluate_result))

INFO:financerag.common.loader:Loading Corpus...
INFO:financerag.common.loader:Loaded 2066 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd4bff516', 'title': '', 'text': 'containerboard , kraft papers and saturating kraft .\nkapstone also owns victory packaging , a packaging solutions distribution company with facilities in the u.s. , canada and mexico .\nwe have included the financial results of kapstone in our corrugated packaging segment since the date of the acquisition .\non september 4 , 2018 , we completed the acquisition ( the 201cschl fcter acquisition 201d ) of schl fcter print pharma packaging ( 201cschl fcter 201d ) .\nschl fcter is a leading provider of differentiated paper and packaging solutions and a german-based supplier of a full range of leaflets and booklets .\nthe schl fcter acquisition allowed us to further enhance our pharmaceutical and automotive platform and expand our geographical footprint in europe to better serve our customers .\nwe have i

Running ConvFinQA


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 7/7 [00:00<00:00,  7.98it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 33/33 [00:43<00:00,  1.32s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/ConvFinQA
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/ConvFinQA/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/ConvFinQA/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/ConvFinQA
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/ConvFinQA/original_results.jsonl
INFO:financerag.ta

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: qd4982518
  Document 1: Document ID = dd4b9f7f6, Score = 0.8448894619941711
  Document 2: Document ID = dd4b95a76, Score = 0.8383541107177734
  Document 3: Document ID = dd4c4c208, Score = 0.836830735206604
  Document 4: Document ID = dd4bbc202, Score = 0.8367441892623901
  Document 5: Document ID = dd4c4f7aa, Score = 0.8364241719245911

Query ID: qd4982518
  Document 1: Document ID = dd4982158, Score = 68.39888029898678
  Document 2: Document ID = dd496d5d2, Score = 67.16417497569476
  Document 3: Document ID = dd4be45d6, Score = 66.87313528496995
  Document 4: Document ID = dd4b87d18, Score = 66.75538979870812
  Document 5: Document ID = dd4b9f7f6, Score = 66.7295786426677
ConvFinQA
	NDCG@1: 0.23810     NDCG@5: 0.39528     NDCG@10: 0.43385
	MAP@1: 0.23810      MAP@5: 0.34987      MAP@10: 0.36581
	Recall@1: 0.23810   Recall@5: 0.53175   Recall@10: 0.65079
	P@1: 0.23810     

INFO:financerag.common.loader:Loaded 13867 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'ADBE20230004', 'title': 'ADBE OVERVIEW', 'text': 'Adobe is a global technology company with a mission to change the world through personalized digital experiences. For over four decades, Adobe’s innovations have transformed how individuals, teams, businesses, enterprises, institutions, and governments engage and interact across all types of media. Our products, services and solutions are used around the world to imagine, create, manage, deliver, measure, optimize and engage with content across surfaces and fuel digital experiences. We have a diverse user base that includes consumers, communicators, creative professionals, developers, students, small and medium businesses and enterprises. We are also empowering creators by putting the power of artificial intelligence (“AI”) in their hands, and doing so in ways we believe are responsible. Our products and services help unleash crea

Running FinDER


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 4/4 [00:00<00:00, 13.44it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 217/217 [02:31<00:00,  1.43it/s]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinDER
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinDER/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinDER/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinDER
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinDER/original_results.jsonl
INFO:financerag.tasks.BaseTask:

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q00001
  Document 1: Document ID = MSFT20230966, Score = 0.8539252877235413
  Document 2: Document ID = MSFT20230254, Score = 0.8452969789505005
  Document 3: Document ID = MSFT20230521, Score = 0.8438000679016113
  Document 4: Document ID = MSFT20230216, Score = 0.8412744998931885
  Document 5: Document ID = MSFT20230148, Score = 0.8291170597076416

Query ID: q00001
  Document 1: Document ID = MSFT20230134, Score = 52.51683902009371
  Document 2: Document ID = AAPL20230240, Score = 47.17830909983652
  Document 3: Document ID = ORCL20230061, Score = 46.770572979179235
  Document 4: Document ID = AMZN20230072, Score = 46.74749363158132
  Document 5: Document ID = AAPL20230021, Score = 46.69896817045515
FinDER
	NDCG@1: 0.21875     NDCG@5: 0.33058     NDCG@10: 0.36456
	MAP@1: 0.17500      MAP@5: 0.27995      MAP@10: 0.29655
	Recall@1: 0.17500   Recall@5: 0.43594   Recall@10: 0.

Generating train split: 92 examples [00:00, 28292.10 examples/s]
Casting the dataset: 100%|██████████| 92/92 [00:00<00:00, 45687.42 examples/s]
INFO:financerag.common.loader:Loaded 92 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd4aa0660c', 'title': '', 'text': 'Apple Inc.\nCONSOLIDATED STATEMENTS OF OPERATIONS\n(In millions, except number of shares which are reflected in thousands and per share amounts)\nYears ended\nSeptember 24,\n2022September 25,\n2021September 26,\n2020\nNet sales:\n   Products $ 316,199 $ 297,392 $ 220,747 \n   Services  78,129  68,425  53,768 \nTotal net sales  394,328  365,817  274,515 \nCost of sales:\n   Products  201,471  192,266  151,286 \n   Services  22,075  20,715  18,273 \nTotal cost of sales  223,546  212,981  169,559 \nGross margin  170,782  152,836  104,956 \nOperating expenses:\nResearch and development  26,251  21,914  18,752 \nSelling, general and administrative  25,094  21,973  19,916 \nTotal operating expenses  51,345  43,887

Running FinQABench


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 2/2 [00:00<00:00,  7.67it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.26it/s]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQABench
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinQABench/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinQABench/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQABench
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinQABench/original_results.jsonl
INFO:financerag

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q4aa0b116
  Document 1: Document ID = d4aa0b1f2, Score = 0.9047430753707886
  Document 2: Document ID = d4aa0a52c, Score = 0.8628849387168884
  Document 3: Document ID = d4aa0a9e6, Score = 0.840498149394989
  Document 4: Document ID = d4aa10314, Score = 0.8331577777862549
  Document 5: Document ID = d4aa0a7d4, Score = 0.8311948180198669

Query ID: q4aa0b116
  Document 1: Document ID = d4aa0b1f2, Score = 118.86940773018863
  Document 2: Document ID = d4aa0a52c, Score = 68.78424469539279
  Document 3: Document ID = d4aa0a9e6, Score = 57.981997116901546
  Document 4: Document ID = d4aa1b854, Score = 54.31517768917417
  Document 5: Document ID = d4aa0a7d4, Score = 53.549703762687635
FinQABench
	NDCG@1: 0.73333     NDCG@5: 0.85515     NDCG@10: 0.86703
	MAP@1: 0.73333      MAP@5: 0.82778      MAP@10: 0.83333
	Recall@1: 0.73333   Recall@5: 0.93333   Recall@10: 0.96667
	P@1: 0.73333

Generating train split: 2789 examples [00:00, 85004.02 examples/s]
Casting the dataset: 100%|██████████| 2789/2789 [00:00<00:00, 100649.73 examples/s]
INFO:financerag.common.loader:Loaded 2789 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd61d9e858', 'title': '', 'text': 'performance graph the performance graph below shows the five-year cumulative total stockholder return on applied common stock during the period from october 25 , 2009 through october 26 , 2014 .\nthis is compared with the cumulative total return of the standard & poor 2019s 500 stock index and the rdg semiconductor composite index over the same period .\nthe comparison assumes $ 100 was invested on october 25 , 2009 in applied common stock and in each of the foregoing indices and assumes reinvestment of dividends , if any .\ndollar amounts in the graph are rounded to the nearest whole dollar .\nthe performance shown in the graph represents past performance and should not be considered an indication 

Running FinQA


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 18/18 [00:02<00:00,  8.72it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 44/44 [00:59<00:00,  1.35s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQA
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinQA/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinQA/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinQA
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinQA/original_results.jsonl


Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q61676968
  Document 1: Document ID = d61d31046, Score = 0.880416750907898
  Document 2: Document ID = d61d2c15e, Score = 0.8794994950294495
  Document 3: Document ID = d616769d6, Score = 0.8794718980789185
  Document 4: Document ID = d61e30eec, Score = 0.8405296802520752
  Document 5: Document ID = d61d2a890, Score = 0.8394184708595276

Query ID: q61676968
  Document 1: Document ID = d61d2a890, Score = 122.94560519072625
  Document 2: Document ID = d616769d6, Score = 119.97814019065902
  Document 3: Document ID = d61d2c15e, Score = 117.27416922513842
  Document 4: Document ID = d61d31046, Score = 115.17264395798294
  Document 5: Document ID = d61e30eec, Score = 99.02828757852983


INFO:financerag.tasks.BaseTask:Original results saved successfully to ./results/FinQA/original_results.jsonl
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2296
INFO:financerag.tasks.BaseTask:NDCG@5: 0.4028
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4400
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2296
INFO:financerag.tasks.BaseTask:MAP@5: 0.3497
INFO:financerag.tasks.BaseTask:MAP@10: 0.3653
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2296
INFO:financerag.tasks.BaseTask:Recall@5: 0.5639
INFO:financerag.tasks.BaseTask:Recall@10: 0.6773
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2296
INFO:financerag.tasks.BaseTask:P@5: 0.1128
INFO:financerag.tasks.BaseTask:P@10: 0.0677
INFO:financerag.common.loader:Loading C

FinQA
	NDCG@1: 0.22965     NDCG@5: 0.40279     NDCG@10: 0.44000
	MAP@1: 0.22965      MAP@5: 0.34966      MAP@10: 0.36533
	Recall@1: 0.22965   Recall@5: 0.56395   Recall@10: 0.67733
	P@1: 0.22965        P@5: 0.11279        P@10: 0.06773



Generating train split: 180 examples [00:00, 58575.12 examples/s]
Casting the dataset: 100%|██████████| 180/180 [00:00<00:00, 93080.35 examples/s]
INFO:financerag.common.loader:Loaded 180 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'dd2af2336', 'title': 'PEPSICO_2022_10K', 'text': '6) Africa, Middle East and South Asia (AMESA), which includes all of our beverage and convenient food businesses in\nAfrica, the Middle East and South Asia; and\n7) Asia Pacific, Australia and New Zealand and China Region (APAC), which includes all of our beverage and convenient\nfood businesses in Asia Pacific, Australia and New Zealand, and China region.'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 150 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'qd2ac917a', 'text': 'What is the FY2019 - FY2020 total revenue growth rate for Block (formerly known as Square)? Answer in units of percents and round to one decimal place. Approach th

Running FinanceBench


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 3/3 [00:00<00:00,  4.62it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 3/3 [00:03<00:00,  1.07s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinanceBench
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/FinanceBench/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/FinanceBench/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/FinanceBench
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/FinanceBench/original_results.jsonl
INFO:

Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: qd2ac917a
  Document 1: Document ID = dd2acce74, Score = 0.8439295291900635
  Document 2: Document ID = dd2acd630, Score = 0.8393127918243408
  Document 3: Document ID = dd2ac8f0e, Score = 0.8217950463294983
  Document 4: Document ID = dd2ac6718, Score = 0.8063642978668213
  Document 5: Document ID = dd2abd7b2, Score = 0.8059011697769165

Query ID: qd2ac917a
  Document 1: Document ID = dd2adb8d4, Score = 64.31881308918526
  Document 2: Document ID = dd2ac6718, Score = 64.21580715214485
  Document 3: Document ID = dd2adb5be, Score = 64.14408119195403
  Document 4: Document ID = dd2afc3f4, Score = 56.34574905380225
  Document 5: Document ID = dd2ac8626, Score = 53.99007902241
FinanceBench
	NDCG@1: 0.37778     NDCG@5: 0.61532     NDCG@10: 0.67682
	MAP@1: 0.31111      MAP@5: 0.53815      MAP@10: 0.56592
	Recall@1: 0.31111   Recall@5: 0.78889   Recall@10: 0.97778
	P@1: 0.37778   

Generating train split: 10475 examples [00:00, 276932.46 examples/s]
Casting the dataset: 100%|██████████| 10475/10475 [00:00<00:00, 414128.76 examples/s]
INFO:financerag.common.loader:Loaded 10475 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd8e4ea4ac', 'title': '', 'text': '|  | Years Ended December 31, |\n|  | 2006 | 2005 |\n|  | (In millions) |\n| Investment return | $192 | $-26 |\n| Expense | 45 | 11 |\n| In-force/Persistency | -7 | -33 |\n| Policyholder dividends and other | -39 | -11 |\n| Total | $191 | $-59 |'}
INFO:financerag.common.loader:Loading Queries...
INFO:financerag.common.loader:Loaded 974 Queries.
INFO:financerag.common.loader:Query Example: {'id': 'q82d4c6ec', 'text': 'What was the sum of Fourth Quarter without those Fourth Quarter smaller than 0, in 2012? (in million)\n\nFourth Quarter, sum'}
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransfo

Running MultiHiertt


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 16/16 [00:01<00:00,  8.73it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 164/164 [04:00<00:00,  1.47s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/MultiHiertt
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/MultiHiertt/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/MultiHiertt/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/MultiHiertt
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/MultiHiertt/original_results.jsonl


Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q82d4c6ec
  Document 1: Document ID = d81a04f9e, Score = 0.8390624523162842
  Document 2: Document ID = d8d3fbbaa, Score = 0.8388010859489441
  Document 3: Document ID = d88cdb45a, Score = 0.8372913599014282
  Document 4: Document ID = d81a04fe4, Score = 0.8363529443740845
  Document 5: Document ID = d8f8bc700, Score = 0.8358454704284668

Query ID: q82d4c6ec
  Document 1: Document ID = d8f1c990c, Score = 70.34721232170189
  Document 2: Document ID = d87a9864e, Score = 66.17892232593634
  Document 3: Document ID = d89cc9268, Score = 65.12928260533583
  Document 4: Document ID = d8dae16c2, Score = 62.941028756641124
  Document 5: Document ID = d86f5ca32, Score = 62.397952123922174


INFO:financerag.tasks.BaseTask:Original results saved successfully to ./results/MultiHiertt/original_results.jsonl
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1781
INFO:financerag.tasks.BaseTask:NDCG@5: 0.1144
INFO:financerag.tasks.BaseTask:NDCG@10: 0.1182
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0464
INFO:financerag.tasks.BaseTask:MAP@5: 0.0651
INFO:financerag.tasks.BaseTask:MAP@10: 0.0683
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0464
INFO:financerag.tasks.BaseTask:Recall@5: 0.0940
INFO:financerag.tasks.BaseTask:Recall@10: 0.1105
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.1781
INFO:financerag.tasks.BaseTask:P@5: 0.0753
INFO:financerag.tasks.BaseTask:P@10: 0.0452
INFO:financerag.common.loader:Loa

MultiHiertt
	NDCG@1: 0.17808     NDCG@5: 0.11440     NDCG@10: 0.11816
	MAP@1: 0.04639      MAP@5: 0.06513      MAP@10: 0.06825
	Recall@1: 0.04639   Recall@5: 0.09398   Recall@10: 0.11054
	P@1: 0.17808        P@5: 0.07534        P@10: 0.04521



Generating train split: 2756 examples [00:00, 197446.44 examples/s]
Casting the dataset: 100%|██████████| 2756/2756 [00:00<00:00, 156167.28 examples/s]
INFO:financerag.common.loader:Loaded 2756 Documents.
INFO:financerag.common.loader:Corpus Example: {'id': 'd1b2e74c0', 'title': '', 'text': 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.\nWrite-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of

Running TAT-QA


INFO:financerag.retrieval.dense:Encoding queries...
Batches: 100%|██████████| 26/26 [00:02<00:00, 11.04it/s]
INFO:financerag.retrieval.dense:Sorting corpus by document length...
INFO:financerag.retrieval.dense:Encoding corpus in batches... This may take a while.
INFO:financerag.retrieval.dense:Encoding batch 1/1...
Batches: 100%|██████████| 44/44 [01:30<00:00,  2.06s/it]
INFO:financerag.retrieval.bm25:Tokenizing queries with lower cases


Retrieved hybrid results for 2 queries. Here's an example of the top 5 documents for the first query:

Query ID: q1a73c1d4
  Document 1: Document ID = d1b2ee16c, Score = 0.8175246119499207
  Document 2: Document ID = d1b3af812, Score = 0.8169326186180115
  Document 3: Document ID = d1b3576b2, Score = 0.8161620497703552
  Document 4: Document ID = d1b3aad30, Score = 0.8156298995018005
  Document 5: Document ID = d1b371530, Score = 0.8155505657196045

Query ID: q1a73c1d4
  Document 1: Document ID = d1b30ab0a, Score = 18.479142477631633
  Document 2: Document ID = d1b326d46, Score = 18.247789249761176
  Document 3: Document ID = d1b2f539a, Score = 18.182879631761047
  Document 4: Document ID = d1b3be84e, Score = 18.09071465674939
  Document 5: Document ID = d1b3bf28a, Score = 17.2841490621574


INFO:financerag.tasks.BaseTask:Output directory set to: ./results/TAT-QA
INFO:financerag.tasks.BaseTask:Saving top 10 results to CSV file: ./results/TAT-QA/results.csv
INFO:financerag.tasks.BaseTask:Writing header ['query_id', 'corpus_id'] to CSV.
INFO:financerag.tasks.BaseTask:Top 10 results saved successfully to ./results/TAT-QA/results.csv
INFO:financerag.tasks.BaseTask:Output directory set to: ./results/TAT-QA
INFO:financerag.tasks.BaseTask:Saving original results to JSONL file: ./results/TAT-QA/original_results.jsonl
INFO:financerag.tasks.BaseTask:Original results saved successfully to ./results/TAT-QA/original_results.jsonl
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.3012
INFO:financerag.tasks.BaseTask:NDCG@5: 0.4184
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4536
INFO:financerag.task

TAT-QA
	NDCG@1: 0.30120     NDCG@5: 0.41837     NDCG@10: 0.45355
	MAP@1: 0.30120      MAP@5: 0.38273      MAP@10: 0.39731
	Recall@1: 0.30120   Recall@5: 0.52610   Recall@10: 0.63454
	P@1: 0.30120        P@5: 0.10522        P@10: 0.06345



INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.1781
INFO:financerag.tasks.BaseTask:NDCG@5: 0.1144
INFO:financerag.tasks.BaseTask:NDCG@10: 0.1182
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.0464
INFO:financerag.tasks.BaseTask:MAP@5: 0.0651
INFO:financerag.tasks.BaseTask:MAP@10: 0.0683
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.0464
INFO:financerag.tasks.BaseTask:Recall@5: 0.0940
INFO:financerag.tasks.BaseTask:Recall@10: 0.1105
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.1781
INFO:financerag.tasks.BaseTask:P@5: 0.0753
INFO:financerag.tasks.BaseTask:P@10: 0.0452
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=Fa

-- query expansion + keyword extraction + hybrid --

ConvFinQA
	NDCG@1: 0.23810     NDCG@5: 0.39528     NDCG@10: 0.43385
	MAP@1: 0.23810      MAP@5: 0.34987      MAP@10: 0.36581
	Recall@1: 0.23810   Recall@5: 0.53175   Recall@10: 0.65079
	P@1: 0.23810        P@5: 0.10635        P@10: 0.06508

FinDER
	NDCG@1: 0.21875     NDCG@5: 0.33058     NDCG@10: 0.36456
	MAP@1: 0.17500      MAP@5: 0.27995      MAP@10: 0.29655
	Recall@1: 0.17500   Recall@5: 0.43594   Recall@10: 0.53021
	P@1: 0.21875        P@5: 0.11250        P@10: 0.07188

FinQABench
	NDCG@1: 0.73333     NDCG@5: 0.85515     NDCG@10: 0.86703
	MAP@1: 0.73333      MAP@5: 0.82778      MAP@10: 0.83333
	Recall@1: 0.73333   Recall@5: 0.93333   Recall@10: 0.96667
	P@1: 0.73333        P@5: 0.18667        P@10: 0.09667

FinQA
	NDCG@1: 0.22965     NDCG@5: 0.40279     NDCG@10: 0.44000
	MAP@1: 0.22965      MAP@5: 0.34966      MAP@10: 0.36533
	Recall@1: 0.22965   Recall@5: 0.56395   Recall@10: 0.67733
	P@1: 0.22965        P@5: 0.11279        P@10

In [5]:
final_str = TaskRunner.save_metrics(tasks=all_tasks, title='query expansion + keyword extraction + hybrid', results_dir='results')
print(final_str)
TaskRunner.combine_results(tasks=all_tasks, results_dir='results')

INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:NDCG@1: 0.2381
INFO:financerag.tasks.BaseTask:NDCG@5: 0.3953
INFO:financerag.tasks.BaseTask:NDCG@10: 0.4339
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:MAP@1: 0.2381
INFO:financerag.tasks.BaseTask:MAP@5: 0.3499
INFO:financerag.tasks.BaseTask:MAP@10: 0.3658
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:Recall@1: 0.2381
INFO:financerag.tasks.BaseTask:Recall@5: 0.5317
INFO:financerag.tasks.BaseTask:Recall@10: 0.6508
INFO:financerag.tasks.BaseTask:

INFO:financerag.tasks.BaseTask:P@1: 0.2381
INFO:financerag.tasks.BaseTask:P@5: 0.1064
INFO:financerag.tasks.BaseTask:P@10: 0.0651
INFO:financerag.tasks.BaseTask:For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=Fa

-- query expansion + keyword extraction + hybrid --

ConvFinQA
	NDCG@1: 0.23810     NDCG@5: 0.39528     NDCG@10: 0.43385
	MAP@1: 0.23810      MAP@5: 0.34987      MAP@10: 0.36581
	Recall@1: 0.23810   Recall@5: 0.53175   Recall@10: 0.65079
	P@1: 0.23810        P@5: 0.10635        P@10: 0.06508

FinDER
	NDCG@1: 0.21875     NDCG@5: 0.33058     NDCG@10: 0.36456
	MAP@1: 0.17500      MAP@5: 0.27995      MAP@10: 0.29655
	Recall@1: 0.17500   Recall@5: 0.43594   Recall@10: 0.53021
	P@1: 0.21875        P@5: 0.11250        P@10: 0.07188

FinQABench
	NDCG@1: 0.73333     NDCG@5: 0.85515     NDCG@10: 0.86703
	MAP@1: 0.73333      MAP@5: 0.82778      MAP@10: 0.83333
	Recall@1: 0.73333   Recall@5: 0.93333   Recall@10: 0.96667
	P@1: 0.73333        P@5: 0.18667        P@10: 0.09667

FinQA
	NDCG@1: 0.22965     NDCG@5: 0.40279     NDCG@10: 0.44000
	MAP@1: 0.22965      MAP@5: 0.34966      MAP@10: 0.36533
	Recall@1: 0.22965   Recall@5: 0.56395   Recall@10: 0.67733
	P@1: 0.22965        P@5: 0.11279        P@10