In [1]:
import json
import os
from tqdm import tqdm
from typing import List, Dict
from datasets import Dataset
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    context_entity_recall,
    answer_similarity,
    answer_correctness
)
from ragas import evaluate
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv
from openai import OpenAI
from operator import itemgetter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [2]:
# Load environment variables
load_dotenv(override=True)
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

### RAG Helper Function
---

In [3]:
def process_documents(json_data: List[Dict], target_samples: int = 100):
    """Process documents until we get exactly target_samples valid QA pairs"""
    queries = []
    ground_truths = []
    documents = []
    valid_count = 0
    
    print(f"Processing documents to find {target_samples} valid samples...")
    for entry in tqdm(json_data):
        # Check for valid QA data
        if entry.get("qa"):
            qa_data = entry["qa"]
            if isinstance(qa_data, dict) and "question" in qa_data and "answer" in qa_data:
                # Process the valid QA pair
                queries.append(qa_data["question"])
                answer = qa_data["answer"]
                if isinstance(answer, list):
                    ground_truths.append(answer[0])
                else:
                    ground_truths.append(answer)
                
                # Process document content
                doc_content = []
                if entry.get("pre_text"):
                    doc_content.extend(entry["pre_text"])
                if entry.get("table_ori"):
                    table_str = "\n".join(" | ".join(str(cell) for cell in row) for row in entry["table_ori"])
                    doc_content.append(table_str)
                if entry.get("post_text"):
                    doc_content.extend(entry["post_text"])
                
                documents.append(Document(
                    page_content="\n".join(doc_content),
                    metadata={"id": entry.get("id")}
                ))
                
                valid_count += 1
                if valid_count == target_samples:
                    break
    
    print(f"Found {valid_count} valid samples after processing {len(json_data)} documents")
    
    if valid_count < target_samples:
        raise ValueError(f"Could only find {valid_count} valid samples, but {target_samples} were requested")
    
    return queries, ground_truths, documents

In [4]:
def setup_rag_chain(documents):
    """Set up the RAG chain using RetrievalQA"""
    # Initialize models and vectorstore
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    
    # Initialize Chroma vectorstore
    vectorstore = Chroma(
        collection_name="text_table_rag",
        embedding_function=embedding_model
    )
    
    # Add documents to vectorstore
    vectorstore.add_documents(documents)
    
    # Create prompt template
    PROMPT_TEMPLATE = """
    Go through the context and answer the given question strictly based on context.
    Use two sentences maximum and keep the answer concise.
    
    Context: {context}
    Question: {question}
    Answer:
    """
    
    # Create RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PromptTemplate.from_template(PROMPT_TEMPLATE)}
    )
    
    return qa_chain

In [5]:
def evaluate_rag_pipeline(qa_chain, queries: List[str], ground_truths: List[str]):
    """Evaluate the RAG pipeline using RAGAS metrics"""
    results = []
    contexts = []
    
    print("Generating answers and collecting contexts...")
    for query in tqdm(queries):
        # Get response from RAG chain
        result = qa_chain({"query": query})
        results.append(result['result'])
        
        # Get contexts
        sources = result["source_documents"]
        contents = [source.page_content for source in sources]
        contexts.append(contents)
    
    # Prepare data for RAGAS evaluation
    data = {
        "question": queries,
        "answer": results,
        "contexts": contexts,
        "ground_truth": ground_truths  # Note: changed from ground_truths to ground_truth to match RAGAS expectations
    }
    
    # Create dataset and evaluate
    dataset = Dataset.from_dict(data)
    score = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            context_entity_recall,
            answer_similarity,
            answer_correctness
        ]
    )
    
    return score, results

In [6]:
def main():
    # Load data
    print("Loading data...")
    with open('./data/convfinqatrain.json', 'r') as f:
        json_data = json.load(f)
    
    # Process documents
    print("Processing documents...")
    queries, ground_truths, documents = process_documents(json_data, target_samples=100)
    print(f"Processed {len(queries)} questions and documents")
    
    # Setup RAG chain
    print("\nSetting up RAG pipeline...")
    qa_chain = setup_rag_chain(documents)
    
    # Evaluate the pipeline
    print("\nEvaluating RAG pipeline...")
    score, generated_answers = evaluate_rag_pipeline(qa_chain, queries, ground_truths)
    
    # Print results
    print("\nEvaluation Results:")
    score_df = score.to_pandas()
    print(score_df)
    
    # Save results
    score_df.to_csv("EvaluationScores.csv", encoding="utf-8", index=False)
    
    # Print mean scores
    metrics_to_average = [
        'faithfulness',
        'answer_relevancy', 
        'context_precision', 
        'context_recall',
        'context_entity_recall', 
        'semantic_similarity', 
        'answer_correctness'
    ]
    print("\nMean Scores:")
    print(score_df[metrics_to_average].mean(axis=0))
    
    # Print detailed comparison
    print("\nDetailed Comparison:")
    for i, (query, ground_truth, generated) in enumerate(zip(queries, ground_truths, generated_answers), 1):
        print(f"\nExample {i}:")
        print(f"Query: {query}")
        print(f"Ground Truth: {ground_truth}")
        print(f"Generated: {generated}")
        print("-" * 80)
    
    return score_df

In [8]:
if __name__ == "__main__":
    results = main()

Loading data...
Processing documents...
Processing documents to find 100 valid samples...


  4%|▍         | 131/3037 [00:00<00:00, 154297.62it/s]

Found 100 valid samples after processing 3037 documents
Processed 100 questions and documents

Setting up RAG pipeline...






Evaluating RAG pipeline...
Generating answers and collecting contexts...


100%|██████████| 100/100 [05:00<00:00,  3.01s/it]


Evaluating:   0%|          | 0/700 [00:00<?, ?it/s]

Exception raised in Job[119]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-saeSWFy7xaaLxaQqs0dGYL2n on tokens per min (TPM): Limit 200000, Used 195644, Requested 6102. Please try again in 523ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[81]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-saeSWFy7xaaLxaQqs0dGYL2n on tokens per min (TPM): Limit 200000, Used 194750, Requested 6040. Please try again in 237ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[122]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-saeSWFy7xaaLxaQqs0dGYL2n on tokens per min (TPM): Limit 200000, 


Evaluation Results:
                                           user_input  \
0   what was the percentage change in the net cash...   
1   what was the percent of the growth in the reve...   
2   what was the percentage change in net sales fr...   
3   what was the difference in percentage cumulati...   
4   what portion of the total shares subject to ou...   
..                                                ...   
95  what were 2001 total segment revenues in billi...   
96  what is the rate of return of an investment in...   
97  what was the percentage change in the reserve ...   
98  with 2014 closing stock price , what is the to...   
99  what was the percentage cumulative total share...   

                                   retrieved_contexts  \
0   [31mar201122064257 positions which were requir...   
1   [table of contents research and development ex...   
2   [in a new business model such as the retail se...   
3   [shareowner return performance graph the follo...   
4   [duri