In [16]:
import os
import json
import pandas as pd
import ast
import numpy as np
import faiss
import openai

from typing import List
from IPython.display import Markdown, display, update_display
from dotenv import load_dotenv
from convfinqaloader import convfinqadfloader
from transformers import pipeline
from openai import OpenAI

In [2]:
# Notebook-wide pandas display settings.
# 'display.max_colwidth' is the fully qualified name of the option that the
# short pattern 'max_colwidth' resolves to.
pd.set_option("display.width", -1)
pd.set_option("display.max_colwidth", 1000)

In [None]:
# Load environment variables through python-dotenv (called with override=True),
# then read the OpenAI key from the process environment.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Lightweight sanity check only: the key must be present, start with
# 'sk-proj-' and be longer than 10 characters. Nothing is sent to the API here.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# NOTE: this rebinds the name `openai` from the imported module to a client
# instance; later cells call `openai.chat.completions.create` on it.
openai = OpenAI()

API key looks good so far


In [17]:
# Client used by the embedding helpers. The key is read explicitly from the
# environment (a missing variable raises KeyError here); per the original
# author's note this matches the client's default and could be omitted.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### 1. Load and Flatten ConvFinQA JSON data
---

In [3]:
# Load the ConvFinQA training JSON through the project-local loader.
# Later cells use the result as a pandas DataFrame (`df.apply`, `df.iloc`);
# the exact column schema is decided by `convfinqadfloader` — TODO confirm.
df = convfinqadfloader("data/convfinqatrain.json")

### 2. Combine relevant text fields for retrieval
---

In [4]:
def create_combined_text(row):
    """
    Build the retrieval context string for a single row.

    The fields 'pre_text', 'dialogue_text', 'post_text' and 'execution_answer'
    are visited in that order; every field that is present and not null
    contributes one labelled segment, and the segments are joined with " | ".
    Only 'execution_answer' is converted with str(); the other fields are
    concatenated as-is and are therefore expected to already be strings.

    Parameters:
      row: A mapping-like row (e.g. a pandas Series) supporting .get() and [].

    Returns:
      str: The combined text, or an empty string when no field is available.
    """
    # (label prefix, field name, whether the value must be stringified)
    field_specs = (
        ("Pre-Text: ", "pre_text", False),
        ("Dialogue: ", "dialogue_text", False),
        ("Post-Text: ", "post_text", False),
        ("Execution Answer: ", "execution_answer", True),
    )

    segments = []
    for label, field, stringify in field_specs:
        if pd.notnull(row.get(field)):
            value = row[field]
            segments.append(label + (str(value) if stringify else value))
    return " | ".join(segments)

In [5]:
# Create a new column 'combined_text' by calling create_combined_text once per
# row (axis=1). The column is written onto the existing `df` object, so
# re-running this cell simply recomputes and overwrites it.
df['combined_text'] = df.apply(create_combined_text, axis=1)

### 3. Build a retrieval index using OpenAI Embeddings and FAISS
---

In [37]:
def get_embedding(text, model="text-embedding-ada-002"):
    """
    Embed a single text string with the OpenAI embeddings endpoint.

    Parameters:
      text: The string to embed; it is sent as a one-element input list.
      model: Name of the embedding model to request.

    Returns:
      numpy.ndarray: The first returned embedding as a float32 array.
    """
    request = {"input": [text], "model": model}
    api_result = client.embeddings.create(**request)
    # Exactly one input was sent, so only the first data item is read.
    first_item = api_result.data[0]
    return np.array(first_item.embedding, dtype=np.float32)

In [38]:
def compute_embeddings(texts, engine="text-embedding-ada-002"):
    """
    Embed every text in ``texts`` and stack the vectors row-wise.

    Each text is embedded with its own call to ``get_embedding`` (one request
    per text), in the order given.

    Parameters:
      texts: Iterable of strings to embed.
      engine: Embedding model name, forwarded as ``model`` to ``get_embedding``.

    Returns:
      numpy.ndarray: One row per input text, as produced by ``np.vstack``.
    """
    return np.vstack([get_embedding(doc, model=engine) for doc in texts])

Run on 100 documents to save time (and credits!)

In [44]:
# Embed only the first 100 rows to keep the run cheap. To embed every document,
# change the slice below to `df.copy()` (commenting the line out would leave
# `subset_df` undefined for the next statement).
# NOTE: query_dataset maps Faiss row positions back through `df.iloc`, so the
# embedded rows must stay a leading slice of `df`.
subset_df = df.iloc[:100].copy()
doc_texts = subset_df['combined_text'].tolist()
print(f"Computing embeddings for {len(doc_texts)} documents...")
document_embeddings = compute_embeddings(doc_texts)
# Vector width, needed to size the Faiss index.
embedding_dim = document_embeddings.shape[1]

Computing embeddings for 100 documents...


In [45]:
# Normalize embeddings for cosine similarity (using inner product search).
# The call's return value is not used: `document_embeddings` itself is what is
# added to the index afterwards, so the normalization is relied on to happen
# in place on that array.
faiss.normalize_L2(document_embeddings)

In [46]:
# Build a flat inner-product Faiss index sized to the embedding width and load
# the (already normalized) document vectors into it.
index = faiss.IndexFlatIP(embedding_dim)
index.add(document_embeddings)
print(f"Faiss index built with {index.ntotal} vectors.")

Faiss index built with 100 vectors.


In [47]:
def query_dataset(query, top_n=3, engine="text-embedding-ada-002"):
    """
    Given a query string, compute its embedding and retrieve the most similar
    documents using the module-level Faiss ``index``.

    The query vector is L2-normalized before the search so that the
    inner-product scores are comparable with the normalized document vectors.

    Parameters:
      query: The query string.
      top_n: Maximum number of documents to retrieve.
      engine: Embedding model name, forwarded as ``model`` to ``get_embedding``.

    Returns:
      pandas.DataFrame: A copy of the matching rows of ``df`` (in the order the
      index search returned them) with an added 'score' column. Fewer than
      ``top_n`` rows are returned when the index cannot supply that many hits.
    """
    query_embedding = get_embedding(query, model=engine).reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, top_n)

    hit_positions = np.asarray(indices[0])
    hit_scores = np.asarray(distances[0])

    # Faiss pads the result with label -1 when it has fewer than top_n vectors.
    # Passing -1 to df.iloc would silently return the *last* row of df as a
    # bogus hit, so padded slots are dropped here.
    found = hit_positions >= 0

    # Positions are row positions inside the index; they are mapped onto `df`
    # by position, which assumes vectors were added in df row order.
    results = df.iloc[hit_positions[found]].copy()
    results['score'] = hit_scores[found]
    return results

### 4. Query the model using gpt-4o-mini
---

In [50]:
def generate_answer(query, context_docs, max_tokens=200):
    """
    Generate an answer by combining the query with retrieved context using
    OpenAI's gpt-4o-mini chat model.

    A system prompt is added so the model acts as a friendly financial analyst
    bot that does not make up answers and only uses the data it is given to
    answer numerical questions.

    Parameters:
      query: The question string.
      context_docs: The retrieved context, either a single string or an
        iterable of context strings (joined with newlines). None is treated
        as empty context.
      max_tokens: Maximum number of tokens for the generated answer.

    Returns:
      str: The generated answer with surrounding whitespace removed, or an
      empty string when the model returns no text content.
    """
    # Combine the retrieved context documents into a single string so the
    # prompt contains plain text rather than the repr of a Python list.
    if context_docs is None:
        context = ""
    elif isinstance(context_docs, str):
        context = context_docs
    else:
        context = "\n".join(str(doc) for doc in context_docs)

    # Adjacent string literals are concatenated verbatim, so the separating
    # space between the two sentences has to be written explicitly.
    system_prompt = (
        "You are a friendly financial analyst bot who is extremely knowledgeable "
        "about financial valuations, technical analysis and quantitative finance. "
        "Do not make up answers; only use the data provided to answer finance "
        "questions including financial calculation questions."
    )

    # Create a messages list with a system prompt and the user's prompt.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}\nContext: {context}\nAnswer:"},
    ]

    # Call the Chat Completion API through the same `client` the embedding
    # helpers use, instead of the rebound `openai` name.
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=messages,
        max_tokens=max_tokens,
        temperature=0.0,
        top_p=1.0,
        n=1,
        # Generation stops at the first newline, i.e. answers are single-line.
        stop=["\n"],
    )

    # The message content can be None (no text returned); normalise to "".
    content = response.choices[0].message.content
    return (content or "").strip()

### 5. Execute the RAG pipeline
---

In [53]:
query = "what is the decrease in receivables compared to the same period a year ago?"
retrieved_results = query_dataset(query, top_n=3)
print("\nTop retrieved examples for query:", query)
context_docs = []
for _row_label, row in retrieved_results.iterrows():
    full_text = row['combined_text']
    # Shorten only what is printed; 200 characters is a display preview.
    snippet = (full_text[:200] + "...") if len(full_text) > 200 else full_text
    print(f"ID: {row['id']}, Turn index: {row.get('turn_index')}, Score: {row['score']:.3f}")
    print("Context snippet:", snippet)
    print("----------")
    # Hand the model the complete document. Passing the 200-character preview
    # would cut the context off inside the pre-text, before the dialogue and
    # figures the question is about.
    context_docs.append(full_text)

generated_answer = generate_answer(query, context_docs)
print("\nGenerated Answer:")
print(generated_answer)


Top retrieved examples for query: what is the decrease in receivables compared to the same period a year ago?
ID: Single_JKHY/2009/page_28.pdf-3, Turn index: 2, Score: 0.804
Context snippet: Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segmen...
----------
ID: Single_JKHY/2009/page_28.pdf-3, Turn index: 3, Score: 0.804
Context snippet: Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segmen...
----------
ID: Single_JKHY/2009/page_28.pdf-3, Turn index: 1, Score: 0.802
Context snippet: Pre-Text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segmen...
----------

