In [160]:
# =============================
# 1. Dataset Preparation
# =============================
import json
import time
import pandas as pd
from rag_system import RAGSystem, load_and_chunk_text, CHUNK_SIZES
import os
import sys
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import pipeline
from rouge_score import rouge_scorer
import faiss
import pickle
from sentence_transformers import SentenceTransformer

In [161]:
# DEBUG: Print first 5 FT and RAG results for inspection
print('First 5 FT results:')
for i, res in enumerate(ft_results[:5]):
    print(f'Q{i+1}:', res.get('question'))
    print('Answer:', res.get('answer'))
    print('Confidence:', res.get('confidence'))
    print('Time:', res.get('time'))
    print('---')
print('\nFirst 5 RAG results:')
for i, res in enumerate(rag_results[:5]):
    print(f'Q{i+1}:', res.get('question'))
    print('Answer:', res.get('answer'))
    print('Confidence:', res.get('confidence'))
    print('Time:', res.get('time'))
    print('---')

First 5 FT results:
Q1: What was Mahindra & Mahindra's total income from operations in 2023-24?
Answer: Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.
Confidence: 0.2663975656032562
Time: None
---

First 5 RAG results:
Q1: What was Mahindra & Mahindra's total income from operations in 2023-24?
Answer: A.
Confidence: 71.75397491455078
Time: None
---
Q2: What was the PAT (Profit After Tax) for M&M standalone in 2023-24?
Answer: The PAT was a tax on M&M.
Confidence: 55.32597351074219
Time: None
---
Q3: What was M&M's automotive volume in 2023-24?
Answer: Input validation failed: Query not financial.
Confidence: 0.0
Time: None
---
Q4: What was the tractor volume for Mahindra in 2023-24?
Answer: Input validation failed: Query not financial.
Confidence: 0.0
Time: None
---
Q5: What is Mahindra's market share in SUVs?
Answer: The market share in SUVs is a key indicator of the
industry's growth prospects. The market share in SUVs is

a key indicator of the in

In [162]:
def load_test_questions():
    """Load test questions for evaluation, using exact vanilla GPT-2 questions."""
    import pandas as pd
    vanilla_df = pd.read_csv("../evaluation/vanilla_gpt2_results.csv")
    vanilla_questions = list(vanilla_df['Question'])
    # Map from old question to ground truth/category if needed
    question_map = {
        "What was Mahindra & Mahindra's total income from operations in 2023-24?": {
            "ground_truth": "Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.",
            "category": "relevant_high_confidence"
        },
        "What was the PAT (Profit After Tax) for M&M standalone in 2023-24?": {
            "ground_truth": "The PAT for M&M standalone in 2023-24 was ₹8,172 crores, representing a 64% increase compared to F23.",
            "category": "relevant_high_confidence"
        },
        "What was M&M's automotive volume in 2023-24?": {
            "ground_truth": "Automotive volume for M&M in 2023-24 was 7,00,000 units.",
            "category": "relevant_high_confidence"
        },
        "What was the tractor volume for Mahindra in 2023-24?": {
            "ground_truth": "Tractor volume for Mahindra in 2023-24 was 3,50,000 units.",
            "category": "relevant_high_confidence"
        },
        "What is Mahindra's market share in SUVs?": {
            "ground_truth": "Mahindra's market share in SUVs is 20.4%.",
            "category": "relevant_high_confidence"
        },
        "What is Mahindra's market share in farm equipment?": {
            "ground_truth": "Mahindra's market share in farm equipment is 41.7%.",
            "category": "relevant_high_confidence"
        },
        "What was the capex plan announced by Mahindra Group?": {
            "ground_truth": "Mahindra Group announced an investment of INR 37,000 Crores across Auto, Farm and Services businesses (excluding Tech Mahindra) in F25, F26 and F27.",
            "category": "relevant_high_confidence"
        },
        "What milestone did Mahindra Finance achieve in F24?": {
            "ground_truth": "Mahindra Finance's loan book crossed the threshold of one lakh crores, increasing by 23% over the previous year.",
            "category": "relevant_high_confidence"
        }
        # Add more mappings as needed for all vanilla questions you want to evaluate
    }
    test_questions = []
    for q in vanilla_questions:
        meta = question_map.get(q, {"ground_truth": "", "category": ""})
        test_questions.append({
            "question": q,
            "ground_truth": meta["ground_truth"],
            "category": meta["category"]
        })
    return test_questions



In [163]:
# =============================
# 2. Fine-Tuned Model Loading (Robust)
# =============================
import os
FT_MODEL_PATH = "../raft_finetuned_gpt2/final_model"  # Correct relative path from scripts folder
fine_tuned_model = None
fine_tuned_tokenizer = None
fine_tuned_model_loaded = False
print(f"Checking model path: {FT_MODEL_PATH}")
if os.path.exists(FT_MODEL_PATH):
    print("Files in model directory:", os.listdir(FT_MODEL_PATH))
    try:
        fine_tuned_model = GPT2LMHeadModel.from_pretrained(FT_MODEL_PATH, local_files_only=True)
        fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(FT_MODEL_PATH, local_files_only=True)
        if fine_tuned_tokenizer.pad_token is None:
            fine_tuned_tokenizer.add_special_tokens({'pad_token': fine_tuned_tokenizer.eos_token})
        fine_tuned_model_loaded = True
        print("Fine-tuned model and tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading fine-tuned model or tokenizer: {e}")
        fine_tuned_model = None
        fine_tuned_tokenizer = None
        fine_tuned_model_loaded = False
else:
    print("Model path does not exist. Please check the path and try again.")
    fine_tuned_model_loaded = False

Checking model path: ../raft_finetuned_gpt2/final_model
Files in model directory: ['config.json', 'generation_config.json', 'merges.txt', 'model.safetensors', 'special_tokens_map.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.json']
Fine-tuned model and tokenizer loaded successfully.
Fine-tuned model and tokenizer loaded successfully.


In [164]:
# =============================
# 3. Metric Functions
# =============================
# Import necessary library for ROUGE
try:
    from rouge_score import rouge_scorer
    rouge_available = True
except ImportError:
    print("rouge_score library not found. Please install it (`pip install rouge_score`) to use ROUGE metric.")
    rouge_available = False

def compute_confidence_score(question, answer, model, tokenizer):
    """
    Compute confidence score for a generated answer using average negative log-likelihood.
    Confidence = exp(-loss), where loss is cross-entropy per token.
    Based on the provided logic.
    """
    # Encode full sequence (question + answer)
    text = question + " " + answer
    # Ensure the tokenizer has padding enabled and returns tensors
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Move inputs to the same device as the model if necessary
    if model.parameters():
        device = next(model.parameters()).device
        inputs = {key: value.to(device) for key, value in inputs.items()}


    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])

    # Loss is average negative log likelihood
    neg_log_likelihood = outputs.loss.item()

    # Confidence = exp(-loss) → higher = more confident
    confidence = torch.exp(-outputs.loss).item()

    return confidence, neg_log_likelihood

In [165]:

# =============================
# 4. Evaluation Functions
# =============================
def evaluate_system(system_name, qa_function, test_questions):
    """Evaluate a QA system on test questions"""
    results = []
    
    for q_data in test_questions:
        query = q_data["question"]
        
        start_time = time.time()
        
        if system_name == "RAG":
            # RAG system evaluation
            is_valid, validation_msg = qa_function.input_validation(query)
            if not is_valid:
                answer = f"Input validation failed: {validation_msg}"
                confidence = 0.0
            else:
                retrieved_docs = qa_function.hybrid_retrieval(query, k=3)
                answer = qa_function.generate_response(query, retrieved_docs)
                confidence = qa_function.get_confidence_score(query, retrieved_docs)
        else:
            # Fine-tuned system evaluation
            answer, confidence, _ = qa_function(query)
        
        response_time = time.time() - start_time
        
        # Simple correctness evaluation (would need manual review in practice)
        correctness = "Manual Review Required"
        if q_data["category"] == "irrelevant" and "validation failed" in answer.lower():
            correctness = "Correct (Rejected irrelevant query)"
        elif q_data["category"] == "relevant_high_confidence" and confidence > 50:
            correctness = "Likely Correct (High confidence)"
        elif q_data["category"] == "relevant_low_confidence" and confidence < 50:
            correctness = "Expected Low Confidence"
        
        results.append({
            "question": query,
            "ground_truth": q_data["ground_truth"],
            "category": q_data["category"],
            "answer": answer,
            "confidence": confidence,
            "response_time": response_time,
            "correctness": correctness
        })
    
    return results


In [166]:
def create_comparison_table(rag_results, ft_results):
    """Create a detailed comparison table"""
    comparison_data = []
    
    for i, (rag_result, ft_result) in enumerate(zip(rag_results, ft_results)):
        comparison_data.append({
            "Question_ID": f"Q{i+1}",
            "Question": rag_result["question"][:50] + "...",
            "Category": rag_result["category"],
            "RAG_Answer": rag_result["answer"][:100] + "...",
            "RAG_Confidence": f"{rag_result['confidence']:.1f}%",
            "RAG_Time": f"{rag_result['response_time']:.2f}s",
            "RAG_Correctness": rag_result["correctness"],
            "FT_Answer": ft_result["answer"][:100] + "...",
            "FT_Confidence": f"{ft_result['confidence']:.1f}%",
            "FT_Time": f"{ft_result['response_time']:.2f}s",
            "FT_Correctness": ft_result["correctness"]
        })
    
    return pd.DataFrame(comparison_data)

In [167]:
def analyze_results(rag_results, ft_results):
    """Analyze and compare the results"""
    analysis = {}
    
    # Calculate average metrics
    rag_avg_confidence = sum([r["confidence"] for r in rag_results]) / len(rag_results)
    ft_avg_confidence = sum([r["confidence"] for r in ft_results]) / len(ft_results)
    
    rag_avg_time = sum([r["response_time"] for r in rag_results]) / len(rag_results)
    ft_avg_time = sum([r["response_time"] for r in ft_results]) / len(ft_results)
    
    analysis["average_confidence"] = {
        "RAG": rag_avg_confidence,
        "Fine-Tuned": ft_avg_confidence
    }
    
    analysis["average_response_time"] = {
        "RAG": rag_avg_time,
        "Fine-Tuned": ft_avg_time
    }
    
    # Count correct responses by category
    rag_correct_by_category = {}
    ft_correct_by_category = {}
    
    for result in rag_results:
        category = result["category"]
        if category not in rag_correct_by_category:
            rag_correct_by_category[category] = 0
        if "Correct" in result["correctness"] or "Likely Correct" in result["correctness"]:
            rag_correct_by_category[category] += 1
    
    for result in ft_results:
        category = result["category"]
        if category not in ft_correct_by_category:
            ft_correct_by_category[category] = 0
        if "Correct" in result["correctness"] or "Likely Correct" in result["correctness"]:
            ft_correct_by_category[category] += 1
    
    analysis["correctness_by_category"] = {
        "RAG": rag_correct_by_category,
        "Fine-Tuned": ft_correct_by_category
    }
    
    return analysis

In [168]:
# =============================
# 5. Evaluation Execution
# =============================
def main():
    print("Starting Testing and Evaluation...")
    # Dataset Preparation
    test_questions = load_test_questions()
    print(f"Loaded {len(test_questions)} test questions.")
    # Model Initialization
    # Use correct paths to the data folder
    chunks_2022_23 = load_and_chunk_text(REPORT_2022_23, CHUNK_SIZES)
    chunks_2023_24 = load_and_chunk_text(REPORT_2023_24, CHUNK_SIZES)
    
    all_chunks = {
        size: chunks_2022_23[size] + chunks_2023_24[size]
        for size in CHUNK_SIZES
    }
    
    rag_system = RAGSystem()
    rag_system.add_documents(all_chunks)
    print("RAG system initialized.")
    
    # Load the fine-tuned model and tokenizer
    fine_tuned_model_path = "/content/drive/My Drive/CAI/raft_finetuned_gpt2/final_model"

    try:
        fine_tuned_model = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path)
        fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_model_path)

        # Add padding token if necessary, consistent with training
        if fine_tuned_tokenizer.pad_token is None:
            fine_tuned_tokenizer.add_special_tokens({'pad_token': fine_tuned_tokenizer.eos_token})
            fine_tuned_model.resize_token_embeddings(len(fine_tuned_tokenizer))


        print(f"Successfully loaded fine-tuned model from {fine_tuned_model_path}")
    except Exception as e:
        print(f"Error loading fine-tuned model: {e}")
        fine_tuned_model = None
        fine_tuned_tokenizer = None

    # Ensure the retrieval system components are loaded
    if 'index' not in locals() or index is None:
        print("FAISS index not found. Attempting to load...")
        index_save_path = '/content/drive/My Drive/CAI/faiss_index.bin'
        documents_save_path = '/content/drive/My Drive/CAI/document_data.pkl'
        try:
            index = faiss.read_index(index_save_path)
            with open(documents_save_path, 'rb') as f:
                document_data = pickle.load(f)
            document_texts = document_data['texts']
            document_filenames = document_data['filenames']
            print("Successfully loaded FAISS index and document data for evaluation.")
        except Exception as e:
            print(f"Error loading FAISS index or document data for evaluation: {e}")
            index = None
            document_texts = []

    if 'retriever_model' not in locals() or retriever_model is None:
         print("Retriever model not found. Initializing...")
         try:
             retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
             print("Successfully initialized retriever model.")
         except Exception as e:
             print(f"Error initializing retriever model: {e}")
             retriever_model = None
    

In [169]:
test_questions = load_test_questions()

In [170]:
# Evaluation
print("Evaluating RAG system...")
import re

def clean_rag_answer(answer, query):
    # Remove the question if echoed at the start
    answer = answer.strip()
    if answer.lower().startswith(query.lower()):
        answer = answer[len(query):].strip()
    # Remove 'Context:' and anything before it
    if 'Context:' in answer:
        answer = answer.split('Context:')[-1].strip()
    # Remove any leading 'Q1:', 'A1:', etc.
    answer = re.sub(r'^[QA]\d*:\s*', '', answer)
    # Remove everything after 'Question:' if present
    if 'Question:' in answer:
        answer = answer.split('Question:')[0].strip()
    # Optionally, keep only the first sentence (ends with . ! or ?)
    match = re.match(r'(.+?[.!?])\s', answer)
    if match:
        answer = match.group(1).strip()
    return answer.strip()

if 'test_questions' not in locals():
    test_questions = load_test_questions()
if 'REPORT_2022_23' not in locals():
    REPORT_2022_23 = "../data/MM-Annual-Report-2022-23_cleaned.txt"
if 'REPORT_2023_24' not in locals():
    REPORT_2023_24 = "../data/MM-Annual-Report-2023-24_cleaned.txt"
if 'rag_system' not in locals():
    # Re-initialize RAG system if not present
    chunks_2022_23 = load_and_chunk_text(REPORT_2022_23, CHUNK_SIZES)
    chunks_2023_24 = load_and_chunk_text(REPORT_2023_24, CHUNK_SIZES)
    all_chunks = {size: chunks_2022_23[size] + chunks_2023_24[size] for size in CHUNK_SIZES}
    rag_system = RAGSystem()
    rag_system.add_documents(all_chunks)

import time
rag_results = []
for q_data in test_questions:
    query = q_data["question"]
    start_time = time.time()
    # For all financial questions, skip input validation (assume all are financial)
    # is_valid, validation_msg = rag_system.input_validation(query)
    # if not is_valid:
    #     answer = f"Input validation failed: {validation_msg}"
    #     confidence = 0.0
    # else:
    retrieved_docs = rag_system.hybrid_retrieval(query, k=3)
    answer = rag_system.generate_response(query, retrieved_docs)
    answer = clean_rag_answer(answer, query)
    confidence = rag_system.get_confidence_score(query, retrieved_docs)
    response_time = time.time() - start_time
    rag_results.append({
        "question": query,
        "ground_truth": q_data["ground_truth"],
        "category": q_data["category"],
        "answer": answer,
        "confidence": confidence,
        "response_time": response_time,
        "correctness": "Manual Review Required"
    })
print("RAG evaluation complete.")
print(f"First RAG result: {rag_results[0] if rag_results else 'No results'}")

Evaluating RAG system...
RAG evaluation complete.
First RAG result: {'question': "What was Mahindra & Mahindra's total income from operations in 2023-24?", 'ground_truth': "Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.", 'category': 'relevant_high_confidence', 'answer': 'A.', 'confidence': 71.75397491455078, 'response_time': 11.430427551269531, 'correctness': 'Manual Review Required'}
RAG evaluation complete.
First RAG result: {'question': "What was Mahindra & Mahindra's total income from operations in 2023-24?", 'ground_truth': "Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.", 'category': 'relevant_high_confidence', 'answer': 'A.', 'confidence': 71.75397491455078, 'response_time': 11.430427551269531, 'correctness': 'Manual Review Required'}


In [171]:
import torch
print("Evaluating Fine-tuned system...")
import re
def clean_ft_answer(answer, query, qnum=None):
    answer = answer.strip()
    # If question numbering is present, extract only the answer for the current question
    if qnum is not None:
        # Look for 'A{qnum}:' and extract until next 'Q' or end
        pattern = rf'A{qnum}:(.*?)(?:Q\d+:|$)'
        match = re.search(pattern, answer, re.DOTALL)
        if match:
            return match.group(1).strip()
    # Fallback: Remove the question if echoed at the start
    if answer.lower().startswith(query.lower()):
        answer = answer[len(query):].strip()
    # Remove 'Context:' and anything before it
    if 'Context:' in answer:
        answer = answer.split('Context:')[-1].strip()
    # Remove any leading 'Q1:', 'A1:', etc.
    answer = re.sub(r'^[QA]\d*:\s*', '', answer)
    return answer.strip()

def fine_tuned_qa(query, qnum=None):
    if fine_tuned_model is None or fine_tuned_tokenizer is None:
        return "Model not loaded", 0.0, None
    # Encode input with attention mask for padding
    inputs = fine_tuned_tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs.get("attention_mask", None)
    # Generate answer
    with torch.no_grad():
        output_ids = fine_tuned_model.generate(input_ids, attention_mask=attention_mask, max_length=128, num_return_sequences=1)
        answer = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        answer = clean_ft_answer(answer, query, qnum)
        # Confidence score
        confidence, neg_log_likelihood = compute_confidence_score(query, answer, fine_tuned_model, fine_tuned_tokenizer)
    return answer, confidence, neg_log_likelihood

try:
    import time
    ft_results = []
    for idx, q_data in enumerate(test_questions):
        query = q_data["question"]
        qnum = idx + 1  # Assume Q1, Q2, ...
        start_time = time.time()
        answer, confidence, _ = fine_tuned_qa(query, qnum)
        response_time = time.time() - start_time
        ft_results.append({
            "question": query,
            "ground_truth": q_data["ground_truth"],
            "category": q_data["category"],
            "answer": answer,
            "confidence": confidence,
            "response_time": response_time,
            "correctness": "Manual Review Required"
        })
    print("Fine-Tuned GPT-2 evaluation complete.")
    print(f"First FT result: {ft_results[0] if ft_results else 'No results'}")
except Exception as e:
    print(f"Error during Fine-Tuned GPT-2 evaluation: {e}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Evaluating Fine-tuned system...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Fine-Tuned GPT-2 evaluation complete.
First FT result: {'question': "What was Mahindra & Mahindra's total income from operations in 2023-24?", 'ground_truth': "Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.", 'category': 'relevant_high_confidence', 'answer': "Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.", 'confidence': 0.2663975656032562, 'response_time': 7.0835185050964355, 'correctness': 'Manual Review Required'}


In [172]:
# Analysis (fix numpy float32 serialization)
def convert_numpy(obj):
    if isinstance(obj, dict):
        return {k: convert_numpy(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy(v) for v in obj]
    elif hasattr(obj, 'item') and callable(obj.item):
        return obj.item()
    else:
        return obj

analysis = analyze_results(rag_results, ft_results)
analysis_clean = convert_numpy(analysis)
with open("analysis_results.json", "w") as f:
    json.dump(analysis_clean, f, indent=2)

print("Analysis results saved to analysis_results.json")

Analysis results saved to analysis_results.json


In [173]:
# Summary
print("\n=== EVALUATION SUMMARY ===")
print(f"Average Confidence - RAG: {analysis['average_confidence']['RAG']:.1f}%, Fine-Tuned: {analysis['average_confidence']['Fine-Tuned']:.1f}%")
print(f"Average Response Time - RAG: {analysis['average_response_time']['RAG']:.2f}s, Fine-Tuned: {analysis['average_response_time']['Fine-Tuned']:.2f}s")
print("\nComparison Table (first 5 rows):")
print(comparison_df.head().to_string(index=False))
    
print("\nTesting and Evaluation Complete!")


=== EVALUATION SUMMARY ===
Average Confidence - RAG: 56.8%, Fine-Tuned: 0.5%
Average Response Time - RAG: 7.29s, Fine-Tuned: 6.57s

Comparison Table (first 5 rows):
Question_ID                                              Question                 Category                                                                                                RAG_Answer RAG_Confidence RAG_Time                     RAG_Correctness                                                                                                FT_Answer FT_Confidence FT_Time          FT_Correctness
         Q1            What was M&M's total income in 2023-24?... relevant_high_confidence M&M's net fair value gain on investments in equity shares at FVTOCI was $1,819,000,000.\n\nQuestion: W...          47.9%    8.00s              Manual Review Required  What was M&M's total income in 2023-24? Context: Q1: What was Mahindra & Mahindra's total income fro...          0.3%   5.13s  Manual Review Required
         Q2 What i

In [174]:
# Step 1: Prepare test questions
test_questions = load_test_questions()
print(f"Loaded {len(test_questions)} test questions.")



Loaded 11 test questions.


In [175]:
# Step 2: Chunk annual reports and initialize RAG system
chunks_2022_23 = load_and_chunk_text(REPORT_2022_23, CHUNK_SIZES)
chunks_2023_24 = load_and_chunk_text(REPORT_2023_24, CHUNK_SIZES)
all_chunks = {size: chunks_2022_23[size] + chunks_2023_24[size] for size in CHUNK_SIZES}
rag_system = RAGSystem()
rag_system.add_documents(all_chunks)
print("RAG system initialized.")


Token indices sequence length is longer than the specified maximum sequence length for this model (334984 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (359483 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (359483 > 1024). Running this sequence through the model will result in indexing errors


Loading cached RAG index from rag_index.pkl
RAG system initialized.


In [176]:

# Step 3: Evaluate RAG system
rag_results = evaluate_system("RAG", rag_system, test_questions)
print("RAG evaluation complete.")



RAG evaluation complete.


In [177]:
# RAG Evaluation (fix: prevent repeated answers)
rag_results = []
for idx, q in enumerate(test_questions):
    start_time = time.time()
    try:
        res = rag_system.answer_question(q)
        answer = res["answer"]
        # Clean answer: remove repeated answers, extra context, and whitespace
        # If answer contains multiple repeated blocks, keep only the first unique answer
        if isinstance(answer, str):
            # Split by newlines, remove duplicates, keep first non-empty unique line
            lines = [l.strip() for l in answer.split("\n") if l.strip()]
            seen = set()
            cleaned_lines = []
            for l in lines:
                if l not in seen:
                    cleaned_lines.append(l)
                    seen.add(l)
            # If the answer is repeated as a block, keep only the first block
            if len(cleaned_lines) > 0:
                answer = cleaned_lines[0]
            else:
                answer = ""
        else:
            answer = str(answer)
        confidence = res.get("confidence", 0)
        response_time = time.time() - start_time
        rag_results.append({
            "question": q,
            "answer": answer,
            "confidence": confidence,
            "inference_time": response_time
        })
    except Exception as e:
        rag_results.append({
            "question": q,
            "answer": f"Error: {e}",
            "confidence": 0,
            "inference_time": 0
        })
rag_df = pd.DataFrame(rag_results)
rag_df.head()

Unnamed: 0,question,answer,confidence,inference_time
0,{'question': 'What was Mahindra & Mahindra's t...,Error: 'RAGSystem' object has no attribute 'an...,0,0
1,{'question': 'What was the PAT (Profit After T...,Error: 'RAGSystem' object has no attribute 'an...,0,0
2,{'question': 'What was M&M's automotive volume...,Error: 'RAGSystem' object has no attribute 'an...,0,0
3,{'question': 'What was the tractor volume for ...,Error: 'RAGSystem' object has no attribute 'an...,0,0
4,{'question': 'What is Mahindra's market share ...,Error: 'RAGSystem' object has no attribute 'an...,0,0


In [178]:
# Step 6: Analyze results and save analysis (fix numpy float32 serialization)
def convert_numpy(obj):
    if isinstance(obj, dict):
        return {k: convert_numpy(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy(v) for v in obj]
    elif hasattr(obj, 'item') and callable(obj.item):
        return obj.item()
    else:
        return obj

analysis = analyze_results(rag_results, ft_results)
analysis_clean = convert_numpy(analysis)
with open("analysis_results.json", "w") as f:
    json.dump(analysis_clean, f, indent=2)
print("Analysis results saved to analysis_results.json")


KeyError: 'response_time'

In [None]:
# Step 7: Display summary and first 5 rows of comparison table
print("\n=== EVALUATION SUMMARY ===")
print(f"Average Confidence - RAG: {analysis['average_confidence']['RAG']:.1f}%, Fine-Tuned: {analysis['average_confidence']['Fine-Tuned']:.1f}%")
print(f"Average Response Time - RAG: {analysis['average_response_time']['RAG']:.2f}s, Fine-Tuned: {analysis['average_response_time']['Fine-Tuned']:.2f}s")
print("\nComparison Table (first 5 rows):")
print(comparison_df.head().to_string(index=False))
print("\nTesting and Evaluation Complete!")


=== EVALUATION SUMMARY ===
Average Confidence - RAG: 26.4%, Fine-Tuned: 0.4%
Average Response Time - RAG: 3.36s, Fine-Tuned: 2.97s

Comparison Table (first 5 rows):
Question_ID                                              Question                 Category                                                                                                RAG_Answer RAG_Confidence RAG_Time                     RAG_Correctness                                                                                                FT_Answer FT_Confidence FT_Time          FT_Correctness
         Q1            What was M&M's total income in 2023-24?... relevant_high_confidence M&M's net fair value gain on investments in equity shares at FVTOCI was $1,819,000,000.\n\nQuestion: W...          47.9%    8.00s              Manual Review Required  What was M&M's total income in 2023-24? Context: Q1: What was Mahindra & Mahindra's total income fro...          0.3%   5.13s  Manual Review Required
         Q2 What i

In [None]:
# Export only FT and RAG results as markdown, matching vanilla GPT-2 format and questions
import pandas as pd
vanilla_df = pd.read_csv("../evaluation/vanilla_gpt2_results.csv")
vanilla_questions = list(vanilla_df['Question'])

rag_df = pd.DataFrame(rag_results) if isinstance(rag_results, list) else rag_results
ft_df = pd.DataFrame(ft_results) if isinstance(ft_results, list) else ft_results

def get_result(df, question):
    row = df[df['question'] == question]
    if row.empty:
        return {'answer': '', 'confidence': '', 'response_time': '', 'correctness': ''}
    row = row.iloc[0]
    return {
        'answer': row.get('answer', ''),
        'confidence': row.get('confidence', ''),
        'response_time': row.get('response_time', ''),
        'correctness': row.get('correctness', '')
    }

# FT markdown
with open("../evaluation/ft_results.md", "w", encoding="utf-8") as f:
    f.write("# Fine-Tuned GPT-2 Model Results\n\n")
    for q in vanilla_questions:
        res = get_result(ft_df, q)
        f.write(f"## Question: {q}\n")
        f.write(f"**Model:** Fine-Tuned GPT-2\n")
        f.write(f"**Model Answer:** {res['answer']}\n")
        f.write(f"**Confidence:** {res['confidence']}\n")
        f.write(f"**Time (s):** {res['response_time']}\n")
        f.write(f"**Correct (Y/N):** {res['correctness']}\n\n---\n\n")
print("FT markdown exported to ../evaluation/ft_results.md")

# RAG markdown
with open("../evaluation/rag_results.md", "w", encoding="utf-8") as f:
    f.write("# RAG Model Results\n\n")
    for q in vanilla_questions:
        res = get_result(rag_df, q)
        f.write(f"## Question: {q}\n")
        f.write(f"**Model:** RAG\n")
        f.write(f"**Model Answer:** {res['answer']}\n")
        f.write(f"**Confidence:** {res['confidence']}\n")
        f.write(f"**Time (s):** {res['response_time']}\n")
        f.write(f"**Correct (Y/N):** {res['correctness']}\n\n---\n\n")
print("RAG markdown exported to ../evaluation/rag_results.md")

FT markdown exported to ../evaluation/ft_results.md
RAG markdown exported to ../evaluation/rag_results.md
