## FiQA-2018

In [4]:
# Step 1: Import packages
import pandas as pd
from datasets import load_dataset

# Step 2: Pull the FiQA-2018 data from the Hugging Face Hub
print("🔹 Loading FiQA-2018...")
# Two configurations are fetched: "corpus" (the answers) and "queries" (the questions)
fiqa_corpus = load_dataset("BeIR/fiqa", "corpus")
fiqa_queries = load_dataset("BeIR/fiqa", "queries")

# List the split names each loaded object exposes, so later cells can index
# them with the right key
corpus_split_names = fiqa_corpus.keys()
print("Corpus splits:", corpus_split_names)
query_split_names = fiqa_queries.keys()
print("Query splits:", query_split_names)

# Show record 0 of whichever split is listed first in the corpus
print("\n✅ Corpus sample:")
first_corpus_split = list(corpus_split_names)[0]
print(fiqa_corpus[first_corpus_split][0])

# Same peek for the queries
print("\n✅ Query sample:")
first_query_split = list(query_split_names)[0]
print(fiqa_queries[first_query_split][0])



🔹 Loading FiQA-2018...
Corpus splits: dict_keys(['corpus'])
Query splits: dict_keys(['queries'])

✅ Corpus sample:
{'_id': '3', 'title': '', 'text': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything."}

✅ Query sample:
{'_id': '0', 'title': '', 'text': 'What is considered a business expense on a business trip?'}


In [8]:
from pathlib import Path
from random import Random, sample  # `sample` stays importable in case other cells use it

# Tunables for the synthetic pairing below
N_QA_PAIRS = 1000  # cap on the number of pairs, to keep the cell fast
QA_SEED = 42  # fixed seed so the pairing and the preview repeat across runs
FIQA_QA_CSV = Path("../data/fiqa_dataset/fiqa_synthetic_qa.csv")

# Get the queries and corpus lists
queries = fiqa_queries["queries"]
corpus = list(fiqa_corpus["corpus"])

if len(queries) == 0 or not corpus:
    raise ValueError("FiQA queries and corpus must both be non-empty to build QA pairs")

# NOTE(review): each question is paired with a randomly drawn corpus passage, so
# nothing in this cell ties the "answer" to its question. If genuine
# question -> answer pairs are intended, a relevance mapping is needed — TODO confirm.
rng = Random(QA_SEED)  # private generator: leaves the global `random` state untouched
n_pairs = min(N_QA_PAIRS, len(queries))  # never index past the last query
qa_pairs = [
    {"question": queries[i]["text"], "answer": rng.choice(corpus)["text"]}
    for i in range(n_pairs)
]

# Convert to DataFrame
df_fiqa_qa = pd.DataFrame(qa_pairs)

print("✅ Sample QA pair:")
print(df_fiqa_qa.sample(1, random_state=QA_SEED))

# Save as CSV for LLaMA training; create the target folder first so a fresh
# checkout without ../data/fiqa_dataset/ does not fail on write
FIQA_QA_CSV.parent.mkdir(parents=True, exist_ok=True)
df_fiqa_qa.to_csv(FIQA_QA_CSV, index=False)


✅ Sample QA pair:
                             question  \
291  How to categorize shared income?   

                                                answer  
291  I can go on a rant about how much I know, try ...  


## Financial PhraseBank Data


In [10]:
# --- Load Financial PhraseBank from CSV ---
import pandas as pd

csv_path = "../data/FinancialPhraseBank/financial_phrasebank.csv"  # Update to your actual filename

# header=None makes pandas treat the first line as data; the two column names
# are supplied here. The file is decoded as ISO-8859-1.
phrasebank_raw = pd.read_csv(
    csv_path,
    header=None,
    names=["Sentiment", "Phrase"],
    encoding="ISO-8859-1",
)

# Only rows whose Sentiment is one of these values are kept
valid_sentiments = ["positive", "neutral", "negative"]

# Integer encoding of the sentiment labels
label_map = {"negative": 0, "neutral": 1, "positive": 2}

# Build the cleaned frame in one chain. `.loc` + `.assign` return a new frame,
# so the label column is never written onto a filtered slice (the pattern that
# raises SettingWithCopyWarning), and `phrasebank_raw` stays intact for re-runs.
df_phrasebank = (
    phrasebank_raw
    .loc[lambda frame: frame["Sentiment"].isin(valid_sentiments)]
    .assign(label=lambda frame: frame["Sentiment"].map(label_map))
)

# Preview
print("✅ PhraseBank sample:")
print(df_phrasebank.sample(5))

# Save cleaned version
df_phrasebank.to_csv("../data/financial_phrasebank_cleaned.csv", index=False)

✅ PhraseBank sample:
     Sentiment                                             Phrase  label
376   positive  The disposal of Autotank will also strengthen ...      2
738   positive  Cencorp estimates that its net sales in the la...      2
58     neutral  At the request of Finnish media company Alma M...      1
2400   neutral  All YIT Capital Markets Day materials will be ...      1
1908   neutral  Panostaja treats its negotiating partners , su...      1
