In [5]:
import json
import os

import pandas as pd

from financerag.tasks import (
    BaseTask,
    ConvFinQA,
    FinanceBench,
    FinDER,
    FinQA,
    FinQABench,
    MultiHiertt,
    TATQA,
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from task_runner import TaskRunner

In [7]:
import logging

# Loggers whose messages clutter the cell output.
noisy_loggers = [
    'financerag',
    'financerag.common.loader',
    'sentence_transformers',
    'transformers',  # sentence_transformers usually relies on this
]

# Raise each logger's threshold to ERROR so lower-severity records are ignored.
for noisy_logger in map(logging.getLogger, noisy_loggers):
    noisy_logger.setLevel(logging.ERROR)

In [8]:
from pprint import pprint
# Task classes to process; the evaluation cells iterate over them in this order.
all_tasks = [ConvFinQA, FinDER, FinQABench, FinQA, FinanceBench, MultiHiertt, TATQA]

In [9]:
# NOTE(review): dead code — this cell contains only a commented-out helper.
# The reports printed below come from TaskRunner.save_metrics; presumably it
# replaced this function (TODO confirm), in which case the cell can be deleted.
# def format_results(results):
#     formatted_string = ""
#     for metric_group in results:
        
#         # Format each "Metric: value" with fixed width for perfect alignment
#         formatted = [
#             f"{k}: {v:.5f}".ljust(20)
#             for k, v in metric_group.items()
#         ]

#         # Print 3 per line like your output
#         for i in range(0, len(formatted), 3):
#             formatted_string += ("\t" + "".join(formatted[i:i+3]).rstrip()) + "\n"
    
#     return formatted_string

In [None]:
# Run TaskRunner.evaluate for every task against the baseline results directory,
# then print the report string returned by TaskRunner.save_metrics.
baseline_dir = 'results_backup/results'
for task_cls in all_tasks:
    # Instantiate without loading the dataset (load_data=False).
    task = task_cls(load_data=False)
    results = TaskRunner.evaluate(task, results_dir=baseline_dir)
final_str = TaskRunner.save_metrics(
    tasks=all_tasks,
    title='Baseline',
    results_dir=baseline_dir,
)
print(final_str)

-- Baseline --

ConvFinQA
	NDCG@1: 0.23016     NDCG@5: 0.34283     NDCG@10: 0.38143
	MAP@1: 0.23016      MAP@5: 0.30688      MAP@10: 0.32288
	Recall@1: 0.23016   Recall@5: 0.45238   Recall@10: 0.57143
	P@1: 0.23016        P@5: 0.09048        P@10: 0.05714

FinDER
	NDCG@1: 0.25000     NDCG@5: 0.33630     NDCG@10: 0.37007
	MAP@1: 0.21875      MAP@5: 0.30514      MAP@10: 0.32170
	Recall@1: 0.21875   Recall@5: 0.39687   Recall@10: 0.48646
	P@1: 0.25000        P@5: 0.11250        P@10: 0.07188

FinQABench
	NDCG@1: 0.83333     NDCG@5: 0.86667     NDCG@10: 0.87630
	MAP@1: 0.83333      MAP@5: 0.85556      MAP@10: 0.85889
	Recall@1: 0.83333   Recall@5: 0.90000   Recall@10: 0.93333
	P@1: 0.83333        P@5: 0.18000        P@10: 0.09333

FinQA
	NDCG@1: 0.24128     NDCG@5: 0.31790     NDCG@10: 0.34974
	MAP@1: 0.24128      MAP@5: 0.29511      MAP@10: 0.30816
	Recall@1: 0.24128   Recall@5: 0.38663   Recall@10: 0.48547
	P@1: 0.24128        P@5: 0.07733        P@10: 0.04855

FinanceBench
	NDCG@1: 0.55

In [11]:
# Evaluate every task and print the report for the query-expansion run.
#
# The directory is named once so that evaluation and reporting always refer to
# the same run. Before this change, evaluate() was pointed at
# 'results_backup/results' (the baseline directory) while save_metrics()
# reported from 'results', so the loop did not evaluate the run being reported.
# NOTE(review): assumes the query-expansion outputs live in 'results' — confirm.
query_expansion_dir = 'results'
for task_cls in all_tasks:
    # Instantiate without loading the dataset (load_data=False); a separate
    # name keeps the loop from shadowing the task class with its instance.
    task = task_cls(load_data=False)
    results = TaskRunner.evaluate(task, results_dir=query_expansion_dir)
final_str = TaskRunner.save_metrics(
    tasks=all_tasks,
    title="Baseline + query_expansion(2nd place w. llama3.1:8b)",
    results_dir=query_expansion_dir,
)
print(final_str)
    

-- Baseline + query_expansion(2nd place w. llama3.1:8b) --

ConvFinQA
	NDCG@1: 0.19841     NDCG@5: 0.32859     NDCG@10: 0.35899
	MAP@1: 0.19841      MAP@5: 0.29008      MAP@10: 0.30237
	Recall@1: 0.19841   Recall@5: 0.44444   Recall@10: 0.53968
	P@1: 0.19841        P@5: 0.08889        P@10: 0.05397

FinDER
	NDCG@1: 0.26562     NDCG@5: 0.35299     NDCG@10: 0.37919
	MAP@1: 0.23750      MAP@5: 0.31953      MAP@10: 0.33501
	Recall@1: 0.23750   Recall@5: 0.41563   Recall@10: 0.48125
	P@1: 0.26562        P@5: 0.11875        P@10: 0.07188

FinQABench
	NDCG@1: 0.80000     NDCG@5: 0.87599     NDCG@10: 0.88650
	MAP@1: 0.80000      MAP@5: 0.85667      MAP@10: 0.86083
	Recall@1: 0.80000   Recall@5: 0.93333   Recall@10: 0.96667
	P@1: 0.80000        P@5: 0.18667        P@10: 0.09667

FinQA
	NDCG@1: 0.21512     NDCG@5: 0.29774     NDCG@10: 0.32438
	MAP@1: 0.21512      MAP@5: 0.27321      MAP@10: 0.28384
	Recall@1: 0.21512   Recall@5: 0.37209   Recall@10: 0.45640
	P@1: 0.21512        P@5: 0.07442     