In [9]:
import json
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from convfinqaloader import convfinqadfloader
from transformers import pipeline

In [2]:
# Display configuration for wide text columns: set the display width option
# to -1 and show up to 1000 characters per cell before truncating.
# NOTE(review): pandas documents None (not -1) as the auto-detect value for
# display.width -- confirm that -1 is intended.
pd.options.display.width = -1
pd.options.display.max_colwidth = 1000

### 1. Load and Flatten ConvFinQA JSON data
---

In [3]:
# Load the ConvFinQA training file through the project-local loader. The
# result is used as a pandas DataFrame (`df`) by the cells below.
df = convfinqadfloader("data/convfinqatrain.json")

### 2. Combine relevant text fields for retrieval
---

In [4]:
def create_combined_text(row):
    """
    Build the retrieval text for one example by joining its labelled fields.

    The fields 'pre_text', 'dialogue_text', 'post_text' and 'execution_answer'
    are read in that order. A field is skipped when it is absent or null
    according to ``pd.notnull``. Every remaining value is converted with
    ``str`` so that a non-string value (for example a numeric field) cannot
    raise ``TypeError`` during concatenation.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``.get(key)``, e.g. a DataFrame row (Series)
        or a plain dict.

    Returns
    -------
    str
        The labelled segments joined by " | ", or an empty string when no
        field is present.
    """
    # (label, column) pairs in output order; one loop replaces four
    # near-identical branches.
    labelled_fields = (
        ("Pre-Text: ", 'pre_text'),
        ("Dialogue: ", 'dialogue_text'),
        ("Post-Text: ", 'post_text'),
        ("Execution Answer: ", 'execution_answer'),
    )
    texts = []
    for label, column in labelled_fields:
        value = row.get(column)
        if pd.notnull(value):
            # str() on every field, not just the answer, keeps non-string
            # values from breaking the concatenation.
            texts.append(label + str(value))
    return " | ".join(texts)


In [5]:
# Create a new column 'combined_text' by applying create_combined_text to
# every row (axis=1); this column is the corpus used for TF-IDF retrieval below.
df['combined_text'] = df.apply(create_combined_text, axis=1)

### 3. Build a TF-IDF retrieval index
---

In [6]:
# Fit a TF-IDF vocabulary (English stop words removed) on the combined texts
# and keep the resulting document-term matrix for similarity lookups.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
print(f"TF-IDF matrix created with shape: {tfidf_matrix.shape}")

TF-IDF matrix created with shape: (4378, 12743)


In [7]:
def query_dataset(query, top_n=3):
    """
    Retrieve the rows of ``df`` most similar to ``query``.

    The query is vectorized with the fitted module-level ``vectorizer`` and
    compared against ``tfidf_matrix`` using cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query.
    top_n : int, default 3
        Maximum number of rows to return. ``0`` yields an empty result.

    Returns
    -------
    pandas.DataFrame
        A copy of the best-matching rows, ordered by descending similarity,
        with an added 'score' column holding the cosine similarity.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    query_vec = vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Reverse first, then take the head. The previous form `[-top_n:][::-1]`
    # returned *every* row for top_n == 0 because `[-0:]` is the whole array.
    # For positive top_n the selected indices and their order are unchanged.
    top_indices = cosine_sim.argsort()[::-1][:top_n]
    results = df.iloc[top_indices].copy()
    results['score'] = cosine_sim[top_indices]
    return results

### 4. Set up a Gen-AI model using HF Transformers
---

In [10]:
# Answer generation uses a Hugging Face text-to-text pipeline backed by the
# "t5-small" checkpoint.
generator = pipeline(task="text2text-generation", model="t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use mps:0


In [11]:
def generate_answer(query, context_docs, max_length=200):
    """
    Generate an answer to ``query`` from the given context documents.

    The documents are joined with newlines and embedded in a
    "question / context / answer" prompt, which is passed to the module-level
    ``generator`` with sampling disabled.

    Parameters
    ----------
    query : str
        The question to answer.
    context_docs : iterable of str
        Context passages, joined in the given order.
    max_length : int, default 200
        Forwarded to the generator as ``max_length``.

    Returns
    -------
    str
        The 'generated_text' of the generator's first result.
    """
    prompt_parts = [
        f"question: {query}",
        "context: " + "\n".join(context_docs),
        "answer:",
    ]
    prompt = "\n".join(prompt_parts)
    outputs = generator(prompt, max_length=max_length, do_sample=False)
    first_output = outputs[0]
    return first_output['generated_text']

### 5. Execute RAG pipeline
---

In [12]:
# Example question used to drive the retrieval and generation cells below.
query = "what was the percentage increase in litigation reserves in 2012?"

In [13]:
# Retrieve the examples most similar to the query.
retrieved_results = query_dataset(query, top_n=3)

print("\nTop retrieved examples for query:", query)
context_docs = []
for i, row in retrieved_results.iterrows():
    full_text = row['combined_text']
    # The 200-character snippet is a preview for display only.
    snippet = full_text[:200] + "..." if len(full_text) > 200 else full_text
    print(f"ID: {row['id']}, Turn index: {row.get('turn_index')}, Score: {row['score']:.3f}")
    print("Context snippet:", snippet)
    print("----------")
    # Give the generator the full retrieved text rather than the truncated
    # preview: the preview only ever covers the start of the pre-text, so the
    # dialogue, post-text and answer fields never reached the model. Exact
    # duplicates are skipped so the same passage is not repeated in the prompt.
    # NOTE(review): generate_answer forwards the prompt without truncation, so
    # long contexts may exceed the model's input limit -- confirm.
    if full_text not in context_docs:
        context_docs.append(full_text)

# Generate the answer from the de-duplicated, untruncated contexts.
generated_answer = generate_answer(query, context_docs)
print("\nGenerated Answer:")
print(generated_answer)


Top retrieved examples for query: what was the percentage increase in litigation reserves in 2012?
ID: Single_DVN/2015/page_117.pdf-2, Turn index: 0, Score: 0.268
Context snippet: Pre-Text: devon energy corporation and subsidiaries notes to consolidated financial statements 2013 ( continued ) proved undeveloped reserves the following table presents the changes in devon 2019s to...
----------
ID: Single_DVN/2015/page_117.pdf-2, Turn index: 0, Score: 0.268
Context snippet: Pre-Text: devon energy corporation and subsidiaries notes to consolidated financial statements 2013 ( continued ) proved undeveloped reserves the following table presents the changes in devon 2019s to...
----------
ID: Single_DVN/2015/page_117.pdf-2, Turn index: 2, Score: 0.265
Context snippet: Pre-Text: devon energy corporation and subsidiaries notes to consolidated financial statements 2013 ( continued ) proved undeveloped reserves the following table presents the changes in devon 2019s to...
----------

Generated A