In [1]:
import os
import json
import pandas as pd
import ast
import numpy as np
import faiss
import torch
 
from typing import List, Dict, Any, Tuple
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer

### Helper Function 1: Load data from source JSON into pandas DataFrame
---

In [2]:
def convfinqadfloader(filepath: str, max_rows: int=1000) -> pd.DataFrame:
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if max_rows is not None:
        data = data[:max_rows]

    # Initialize list to store flattened data
    flattened_data = []
    
    # Process each record in the JSON data
    for item in data:
        # Create base item with common fields
        base_item = {
            'id': item.get('id'),
            'pre_text': ' '.join(item.get('pre_text', [])),
            'post_text': ' '.join(item.get('post_text', [])),
            'filename': item.get('filename'),
            'table': str(item.get('table')),
        }

    # handle annotation which can be either a dictionary or a list
        annotation = item.get('annotation', {})
        if isinstance(annotation, dict):
            # Extract dialogue information
            dialogue_break = annotation.get('dialogue_break', [])
            turn_program = annotation.get('turn_program', [])
            qa_split = annotation.get('qa_split', [])
            exe_ans_list = annotation.get('exe_ans_list', [])
        
        # Create a row for each turn in the dialogue
            for idx in range(len(dialogue_break)):
                turn_data = {
                    'dialogue_text': dialogue_break[idx] if idx < len(dialogue_break) else None,
                    'turn_program': turn_program[idx] if idx < len(turn_program) else None,
                    'qa_split': qa_split[idx] if idx < len(qa_split) else None,
                    'execution_answer': exe_ans_list[idx] if idx < len(exe_ans_list) else None,
                    'turn_index': idx
                }
                
                # Combine base item with turn data
                combined_data = {**base_item, **turn_data}
                flattened_data.append(combined_data)
        
        # Handle potential QA pairs stored directly
        if 'qa' in item:
            qa_data = {
                'question': item['qa'].get('question'),
                'answer': item['qa'].get('answer'),
                'explanation': item['qa'].get('explanation'),
                'program': item['qa'].get('program'),
                'execution_answer': item['qa'].get('exe_ans'),
                'turn_index': 0  # Single QA pair
            }
            combined_data = {**base_item, **qa_data}
            flattened_data.append(combined_data)
    
    # Create DataFrame from flattened data
    df = pd.DataFrame(flattened_data)

    # Convert appropriate columns to proper numeric types
    numeric_columns = ['turn_index', 'qa_split', 'execution_answer']
    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df

### Helper Function 2: Initialise embedding and generator models
---

In [3]:
def initialize_models(embedding_model_name: str = "sentence-transformers/all-mpnet-base-v2",
                     generator_name: str = "google/flan-t5-base"):
    """Initialize all required models."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_tokenizer = None
    generator_tokenizer = None

    # Initialize models
    embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
    embedding_model = AutoModel.from_pretrained(embedding_model_name).to(device)
    generator_tokenizer = AutoTokenizer.from_pretrained(generator_name)
    generator_model = AutoModelForSeq2SeqLM.from_pretrained(generator_name).to(device)
    
    embedding_dim = embedding_model.config.hidden_size
    
    print(f"Models initialized successfully. Using device: {device}")
    return (embedding_tokenizer, embedding_model, generator_tokenizer, 
            generator_model, device, embedding_dim)

### Helper Function 3: Get Embedding Function
---

In [4]:
def get_embedding(text: str, tokenizer, model, device, embedding_dim):
    """Get embeddings for a text using HuggingFace model."""
    try:
        inputs = tokenizer(text, return_tensors="pt", 
                         max_length=512, truncation=True,
                         padding=True).to(device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            embedding = torch.mean(outputs.last_hidden_state, dim=1)
            
        return embedding.cpu().numpy().astype(np.float32)[0]
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return np.zeros(embedding_dim, dtype=np.float32)

### Helper Function 4: Process and Index Data
---

In [5]:
def merge_text_content(row: pd.Series) -> str:
    """Merge all text content from a DataFrame row into a single coherent text."""
    merged_parts = []
    
    if pd.notna(row.get('pre_text')):
        merged_parts.append(f"Background Information: {row['pre_text']}")
    
    if pd.notna(row.get('table')):
        try:
            table_data = eval(row['table'])
            table_text = []
            for row_idx, table_row in enumerate(table_data):
                row_text = " | ".join(str(cell) for cell in table_row)
                table_text.append(f"Row {row_idx + 1}: {row_text}")
            merged_parts.append("Tabular Data: " + " ".join(table_text))
        except Exception as e:
            print(f"Error processing table: {e}")
    
    if pd.notna(row.get('dialogue_text')):
        merged_parts.append(f"Dialogue: {row['dialogue_text']}")
    
    if pd.notna(row.get('question')):
        merged_parts.append(f"Question Context: {row['question']}")
    if pd.notna(row.get('answer')):
        merged_parts.append(f"Reference Answer: {row['answer']}")
    
    if pd.notna(row.get('post_text')):
        merged_parts.append(f"Additional Context: {row['post_text']}")
    
    return " ".join(merged_parts)

In [6]:
def create_chunks(text: str, chunk_size: int = 512, chunk_overlap: int = 128) -> List[str]:
    """Split text into overlapping chunks while preserving semantic coherence."""
    sentences = text.split('. ')
    chunks = []
    current_chunk = []
    current_size = 0
    chunks = []
    
    for sentence in sentences:
        sentence_size = len(sentence)
        
        if current_size + sentence_size > chunk_size and current_chunk:
            chunks.append('. '.join(current_chunk) + '.')
            
            overlap_size = 0
            overlap_chunk = []
            for s in reversed(current_chunk):
                if overlap_size + len(s) <= chunk_overlap:
                    overlap_chunk.insert(0, s)
                    overlap_size += len(s)
                else:
                    break
            
            current_chunk = overlap_chunk
            current_size = overlap_size
        
        current_chunk.append(sentence)
        current_size += sentence_size
    
    if current_chunk:
        chunks.append('. '.join(current_chunk) + '.')
    
    return chunks

In [7]:
def process_and_index_data(df: pd.DataFrame, embedding_tokenizer, embedding_model, 
                          device, embedding_dim):
    """Process and index the DataFrame content using FAISS."""
    print("Processing documents...")
    all_chunks = []
    chunk_sources = []
    
    for _, row in tqdm(df.iterrows(), total=len(df)):
        merged_text = merge_text_content(row)
        text_chunks = create_chunks(merged_text)
        
        for chunk in text_chunks:
            metadata = {
                'id': row.get('id'),
                'filename': row.get('filename'),
                'turn_index': row.get('turn_index'),
                'contains_question': 'Question:' in chunk,
                'contains_answer': 'Answer:' in chunk
            }
            all_chunks.append((chunk, metadata))
    
    chunks = [chunk[0] for chunk in all_chunks]
    chunk_sources = [chunk[1] for chunk in all_chunks]
    
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    
    print(f"Creating embeddings for {len(chunks)} chunks...")
    embeddings = []
    batch_size = 32
    
    for i in tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i + batch_size]
        batch_embeddings = [get_embedding(text, embedding_tokenizer, embedding_model, 
                                        device, embedding_dim) for text in batch]
        embeddings.extend(batch_embeddings)
    
    embeddings_array = np.array(embeddings, dtype=np.float32)
    faiss_index.add(embeddings_array)
    
    return chunks, chunk_sources, faiss_index

### Helper Function 5: Retriever Function
---

In [8]:
def retrieve(query: str, embedding_tokenizer, embedding_model, device, embedding_dim,
            chunks, chunk_sources, faiss_index, k: int = 3) -> List[Dict]:
    """Retrieve the k most relevant chunks for a query using FAISS."""
    query_embedding = get_embedding(query, embedding_tokenizer, embedding_model, 
                                  device, embedding_dim)
    
    distances, indices = faiss_index.search(
        query_embedding.reshape(1, -1),
        k
    )
    
    retrieved_chunks = []
    for i, idx in enumerate(indices[0]):
        retrieved_chunks.append({
            'text': chunks[idx],
            'similarity': float(1 / (1 + distances[0][i])),
            'metadata': chunk_sources[idx]
        })
    
    return retrieved_chunks

### Helper Function 6: Generate Answer
---

In [9]:
def generate_answer(query: str, retrieved_chunks: List[Dict], 
                   generator_tokenizer, generator_model, device) -> Dict:
    """Generate an answer using the retrieved context."""
    sorted_chunks = sorted(retrieved_chunks, key=lambda x: x['similarity'], reverse=True)
    
    context_parts = []
    for chunk in sorted_chunks:
        chunk_text = chunk['text']
        for marker in ['Question:', 'Answer:', 'Reference Answer:']:
            if marker in chunk_text:
                chunk_text = chunk_text.split(marker)[0]
        context_parts.append(chunk_text.strip())
    
    context = ' '.join(context_parts)
    
    prompt = (
        f"Based on the following context, please provide a detailed and accurate "
        f"answer to the question. Consider all relevant information from the context.\n\n"
        f"Context: {context}\n\n"
        f"Question: {query}\n\n"
        f"Answer:"
    )
    
    inputs = generator_tokenizer(prompt, return_tensors="pt", 
                               max_length=1024, truncation=True).to(device)
    
    outputs = generator_model.generate(
        inputs["input_ids"],
        max_length=200,
        min_length=30,
        num_beams=4,
        temperature=0.7
    )
    
    generated_answer = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    source_documents = {chunk['metadata'].get('filename') for chunk in retrieved_chunks}
    
    return {
        'answer': generated_answer,
        'source_documents': list(source_documents),
        'context_used': context,
        'top_chunk_similarity': retrieved_chunks[0]['similarity'] if retrieved_chunks else 0
    }

### Helper Function 7: Evaluation Pipeline
---

In [10]:
def evaluate_pipeline(test_df: pd.DataFrame, models_and_data: tuple) -> Dict[str, float]:
    """Evaluate the RAG pipeline's performance on a test dataset."""
    (embedding_tokenizer, embedding_model, generator_tokenizer, generator_model,
     device, embedding_dim, chunks, chunk_sources, faiss_index) = models_and_data
    
    print("Evaluating pipeline performance...")
    
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    metrics = {
        'retrieval_precision': [],
        'rouge1_f1': [],
        'rouge2_f1': [],
        'rougeL_f1': [],
        'answer_similarity': []
    }
    
    for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
        if pd.isna(row.get('question')) or pd.isna(row.get('answer')):
            continue
            
        retrieved = retrieve(row['question'], embedding_tokenizer, embedding_model,
                           device, embedding_dim, chunks, chunk_sources, faiss_index)
        
        result = generate_answer(row['question'], retrieved, generator_tokenizer,
                               generator_model, device)
        
        relevant_docs = set([row.get('filename')])
        retrieved_docs = set(result['source_documents'])
        if relevant_docs and retrieved_docs:
            precision = len(relevant_docs.intersection(retrieved_docs)) / len(retrieved_docs)
            metrics['retrieval_precision'].append(precision)
        
        rouge_scores = rouge_scorer_instance.score(result['answer'], row['answer'])
        metrics['rouge1_f1'].append(rouge_scores['rouge1'].fmeasure)
        metrics['rouge2_f1'].append(rouge_scores['rouge2'].fmeasure)
        metrics['rougeL_f1'].append(rouge_scores['rougeL'].fmeasure)
        
        ref_embedding = get_embedding(row['answer'], embedding_tokenizer, embedding_model,
                                    device, embedding_dim)
        gen_embedding = get_embedding(result['answer'], embedding_tokenizer, embedding_model,
                                    device, embedding_dim)
        
        similarity = cosine_similarity(
            ref_embedding.reshape(1, -1),
            gen_embedding.reshape(1, -1)
        )[0][0]
        metrics['answer_similarity'].append(similarity)
    
    evaluation_results = {
        'avg_retrieval_precision': np.mean(metrics['retrieval_precision']),
        'avg_rouge1_f1': np.mean(metrics['rouge1_f1']),
        'avg_rouge2_f1': np.mean(metrics['rouge2_f1']),
        'avg_rougeL_f1': np.mean(metrics['rougeL_f1']),
        'avg_answer_similarity': np.mean(metrics['answer_similarity']),
        'num_evaluated_samples': len(metrics['rouge1_f1'])
    }
    
    return evaluation_results

In [13]:
def run_pipeline(data_path: str, max_rows: int = 100):
    """Run the complete pipeline from data loading to evaluation."""
    # Initialize models
    print("Initializing models...")
    model_components = initialize_models()
    (embedding_tokenizer, embedding_model, generator_tokenizer, 
     generator_model, device, embedding_dim) = model_components
    
    # Load data
    print("\nLoading data...")
    df = convfinqadfloader(data_path, max_rows)
    
    # Split into train/test
    print("\nSplitting data into train/test sets...")
    train_size = int(0.8 * len(df))
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]
    
    # Process and index training data
    print("\nProcessing and indexing training data...")
    chunks, chunk_sources, faiss_index = process_and_index_data(
        train_df, embedding_tokenizer, embedding_model, device, embedding_dim
    )
    
    # Combine all components needed for evaluation
    models_and_data = (embedding_tokenizer, embedding_model, generator_tokenizer,
                      generator_model, device, embedding_dim, chunks, chunk_sources,
                      faiss_index)
    
    # Evaluate pipeline
    print("\nEvaluating pipeline...")
    evaluation_results = evaluate_pipeline(test_df, models_and_data)
    
    # Print results
    print("\nEvaluation Results:")
    print(f"Number of evaluated samples: {evaluation_results['num_evaluated_samples']}")
    print(f"Average Retrieval Precision: {evaluation_results['avg_retrieval_precision']:.4f}")
    print(f"Average ROUGE-1 F1: {evaluation_results['avg_rouge1_f1']:.4f}")
    print(f"Average ROUGE-2 F1: {evaluation_results['avg_rouge2_f1']:.4f}")
    print(f"Average ROUGE-L F1: {evaluation_results['avg_rougeL_f1']:.4f}")
    print(f"Average Answer Similarity: {evaluation_results['avg_answer_similarity']:.4f}")
    
    return evaluation_results, models_and_data

In [14]:
results, models_and_data = run_pipeline("./data/convfinqatrain.json")

Initializing models...
Models initialized successfully. Using device: cpu

Loading data...

Splitting data into train/test sets...

Processing and indexing training data...
Processing documents...


100%|██████████| 344/344 [00:00<00:00, 9239.62it/s]


Creating embeddings for 3568 chunks...


100%|██████████| 112/112 [04:32<00:00,  2.44s/it]



Evaluating pipeline...
Evaluating pipeline performance...


100%|██████████| 87/87 [02:18<00:00,  1.59s/it]


Evaluation Results:
Number of evaluated samples: 16
Average Retrieval Precision: 0.0000
Average ROUGE-1 F1: 0.0022
Average ROUGE-2 F1: 0.0000
Average ROUGE-L F1: 0.0022
Average Answer Similarity: 0.2468



