In [1]:
from datasets import load_dataset
import pandas as pd 

  from .autonotebook import tqdm as notebook_tqdm


# FinDER

In [2]:
# Load the "FinDER" configuration of the FinanceRAG dataset repository.
finder_data = load_dataset(
    path="Linq-AI-Research/FinanceRAG",
    name="FinDER",
)

In [3]:
# Display the loaded dataset object to inspect which parts it contains
# and the features / row counts of each.
finder_data

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 13867
    })
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 216
    })
})

In [4]:
# Flatten every document of the corpus into a plain record, renaming the
# dataset's "_id" field to "id". This iterates the whole corpus, not a sample.
corpus_data = [
    {"id": doc["_id"], "title": doc["title"], "text": doc["text"]}
    for doc in finder_data["corpus"]
]

# Build the frame in a single chain (no in-place mutation, so re-running the
# cell is safe): index the rows by document id, then blank the index name so
# no "id" label is attached to the index.
df_corpus = pd.DataFrame(corpus_data).set_index("id").rename_axis("")
df_corpus

Unnamed: 0,title,text
,,
ADBE20230004,ADBE OVERVIEW,Adobe is a global technology company with a mi...
ADBE20230006,ADBE OFFERINGS,"We deliver a wide range of products, services ..."
ADBE20230007,ADBE OFFERINGS,"Digital Media. We provide products, services a..."
ADBE20230008,ADBE OFFERINGS,Digital Experience. We provide an integrated p...
ADBE20230010,ADBE OFFERINGS,"We offer a comprehensive suite of products, se..."
...,...,...
V20232000,V _______________,† Confidential treatment has been requested fo...
V20232001,V _______________,"* Management contract, compensatory plan or ar..."
V20232004,V _______________,+ Filed or furnished herewith. # Schedules hav...


In [5]:
# Keep only the id and text of each query; the "_id" field is renamed to "id".
queries_data = [
    {"id": record["_id"], "text": record["text"]}
    for record in finder_data["queries"]
]

# One row per query, indexed by query id, with the index name set to ''.
df_queries = pd.DataFrame(queries_data).set_index("id").rename_axis("")
df_queries

Unnamed: 0,text
,
q00001,What are the service and product offerings fro...
q00002,MSFT segment breakdown
q00003,Who are Microsoft`s key customers?
q00004,What is Microsoft`s business model
q00005,MSFT Capex commitment
...,...
q00214,How many distinct insurance underwriting group...
q00215,What is the ticker symbol for Berkshire Hathaw...
q00216,What is the largest operating segment of the B...


In [43]:
# Read the tab-separated qrels table; the code below relies on it having
# "query_id" and "corpus_id" columns.
qrels = pd.read_csv('resources/finder_qrels.tsv', sep='\t')

# Map each query id to the list of corpus ids listed for it in the table.
qrels_dict = {
    query_id: list(corpus_ids)
    for query_id, corpus_ids in qrels.groupby('query_id')["corpus_id"]
}
qrels_dict

{'q00001': ['MSFT20230014', 'MSFT20230015'],
 'q00007': ['MSFT20231529'],
 'q00008': ['MSFT20231529'],
 'q00010': ['ADBE20231571', 'ADBE20231572', 'ADBE20230728', 'ADBE20231573'],
 'q00019': ['CPNG20230732'],
 'q00021': ['CPNG20230658'],
 'q00022': ['CPNG20230553'],
 'q00027': ['LIN20231133'],
 'q00028': ['LIN20231195'],
 'q00030': ['LIN20230064', 'LIN20230065', 'LIN20230066', 'LIN20230067'],
 'q00034': ['LIN20230551'],
 'q00039': ['ORCL20230738', 'ORCL20230739'],
 'q00042': ['ORCL20230129', 'ORCL20230130', 'ORCL20230131'],
 'q00043': ['ORCL20231527', 'ORCL20231529'],
 'q00044': ['ORCL20231505'],
 'q00048': ['NVDA20231260'],
 'q00062': ['PG20230221', 'PG20230805'],
 'q00067': ['PG20230429'],
 'q00070': ['PG20230438'],
 'q00071': ['PG20230440'],
 'q00078': ['DAL20230459'],
 'q00081': ['DAL20230513'],
 'q00092': ['TSLA20230391'],
 'q00095': ['TSLA20231453', 'TSLA20231454'],
 'q00099': ['NFLX20230692', 'NFLX20230006'],
 'q00103': ['NFLX20230387'],
 'q00108': ['NFLX20230380'],
 'q00111': [

In [38]:
def change_na_with_list(x):
    """Replace a missing scalar value with an empty list.

    Args:
        x: Any value, typically one cell of a DataFrame column.

    Returns:
        ``x`` unchanged when it is a list or any non-missing value;
        a new empty list when ``x`` is a missing scalar (``None``, ``NaN``,
        ``pd.NA``, ...).
    """
    if isinstance(x, list):
        return x
    # pd.isna() returns an element-wise array for array-like inputs, and
    # using that in a condition raises "truth value is ambiguous". Only
    # scalars are tested for missingness; everything else passes through.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return []
    return x

In [41]:
# Look up each query id (the frame's index) in qrels_dict; ids without an
# entry come back as NaN, which change_na_with_list turns into [].
related_documents = df_queries.index.to_series().map(qrels_dict)
df_queries["Related Documents"] = related_documents.apply(change_na_with_list)
df_queries

Unnamed: 0,text,Related Documents
,,
q00001,What are the service and product offerings fro...,"[MSFT20230014, MSFT20230015]"
q00002,MSFT segment breakdown,[]
q00003,Who are Microsoft`s key customers?,[]
q00004,What is Microsoft`s business model,[]
q00005,MSFT Capex commitment,[]
...,...,...
q00214,How many distinct insurance underwriting group...,[]
q00215,What is the ticker symbol for Berkshire Hathaw...,[]
q00216,What is the largest operating segment of the B...,[]


# Other Tasks