# Task 3: Building the RAG Core Logic and Evaluation

## Objective
To build the retrieval and generation pipeline and evaluate its effectiveness.

## Steps:
1. Implement the retriever function
2. Design a robust prompt template
3. Implement the generator with an LLM
4. Create a comprehensive evaluation framework
5. Run a qualitative evaluation with test questions
6. Analyze the results and provide recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import json
import os
import time
from typing import List, Dict, Any, Optional
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")
print(f"🔧 PyTorch version: {torch.__version__}")
print(f"🔧 CUDA available: {torch.cuda.is_available()}")

## 1. Load Vector Store and Initialize Components

In [None]:
# Load configuration and vector store
# The config file and the ChromaDB persistence directory share this path.
vector_store_path = '../vector_store'
config_path = os.path.join(vector_store_path, 'config.json')

# Fail fast, pointing at the prerequisite notebook, when the config is missing.
if not os.path.exists(config_path):
    print("❌ Vector store not found. Please run Task 2 first.")
    raise FileNotFoundError("Run notebook 02_embedding_creation.ipynb first")

# Load configuration
# The code below reads the 'model_name' and 'collection_name' keys from it.
with open(config_path, 'r') as f:
    config = json.load(f)

print("=== LOADING VECTOR STORE ===")
print(f"📋 Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

# Initialize embedding model
# The same model name recorded in the config is used to embed queries.
print(f"\n🔧 Loading embedding model: {config['model_name']}")
embedding_model = SentenceTransformer(config['model_name'])
print(f"✅ Embedding model loaded")

# Initialize ChromaDB client
# get_collection is called on an existing collection name from the config;
# presumably it raises if the collection is absent - confirm against ChromaDB docs.
print(f"\n🔧 Connecting to ChromaDB...")
chroma_client = chromadb.PersistentClient(path=vector_store_path)
collection = chroma_client.get_collection(name=config['collection_name'])
print(f"✅ Connected to collection: {config['collection_name']}")
print(f"📊 Collection count: {collection.count():,} chunks")

## 2. Implement Retriever Function

In [None]:
class ComplaintRetriever:
    """Semantic retriever over a vector-store collection of complaint chunks.

    Wraps a collection object exposing ``query(...)`` and an embedding model
    exposing ``encode(...)``; both are supplied by the caller.
    """

    def __init__(self, collection, embedding_model):
        self.collection = collection
        self.embedding_model = embedding_model

    def retrieve(self, query: str, n_results: int = 5,
                 product_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """Retrieve the complaint chunks closest to ``query``.

        Args:
            query: Free-text question to embed and search with.
            n_results: Maximum number of chunks requested from the collection.
            product_filter: When set to anything other than ``"All Products"``,
                restricts the search to chunks whose ``product`` metadata
                equals this value.

        Returns:
            A list of dicts with keys ``id``, ``text``, ``metadata``,
            ``similarity_score`` and ``distance``, in the order returned by
            the collection. ``metadata`` is always a dict (an entry the store
            returns as ``None`` is normalised to ``{}``). On any error the
            problem is printed and an empty list is returned (best effort).
        """
        try:
            # Generate query embedding
            query_embedding = self.embedding_model.encode(
                [query],
                normalize_embeddings=True,
                convert_to_numpy=True
            )[0]

            # Prepare filter
            where_clause = None
            if product_filter and product_filter != "All Products":
                where_clause = {"product": product_filter}

            # Search vector store
            results = self.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=n_results,
                where=where_clause,
                include=['documents', 'metadatas', 'distances']
            )

            # A single query was sent, so only the first result row is used.
            # NOTE(review): ``1 - distance`` is a cosine similarity only if the
            # collection was built with cosine distance - confirm in Task 2.
            return [
                {
                    'id': chunk_id,
                    'text': text,
                    'metadata': metadata or {},
                    'similarity_score': 1 - distance,
                    'distance': distance,
                }
                for chunk_id, text, metadata, distance in zip(
                    results['ids'][0],
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0],
                )
            ]

        except Exception as e:
            # Deliberate best effort: callers treat [] as "nothing found".
            print(f"Error retrieving chunks: {e}")
            return []

    def get_retrieval_stats(self, chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Summarise a list of chunks produced by :meth:`retrieve`.

        Returns an empty dict for an empty input. Otherwise returns the chunk
        count, mean/min/max similarity, the similarity of the first chunk, and
        the distinct products and issues (sorted; ``'Unknown'`` stands in for
        a missing metadata field).
        """
        if not chunks:
            return {}

        similarities = [chunk['similarity_score'] for chunk in chunks]
        metadatas = [chunk.get('metadata') or {} for chunk in chunks]
        products = {meta.get('product', 'Unknown') for meta in metadatas}
        issues = {meta.get('issue', 'Unknown') for meta in metadatas}

        return {
            'num_chunks': len(chunks),
            'avg_similarity': np.mean(similarities),
            'min_similarity': np.min(similarities),
            'max_similarity': np.max(similarities),
            # Sorted so repeated runs print the same thing.
            'products': sorted(products, key=str),
            'issues': sorted(issues, key=str),
            'top_similarity': similarities[0]
        }

# Initialize retriever
# `collection` and `embedding_model` come from the vector-store loading cell.
retriever = ComplaintRetriever(collection, embedding_model)
print("✅ Retriever initialized")

# Test retrieval
# `test_query` and `test_results` are reused by the prompt-template test cell.
test_query = "billing issues with credit cards"
print(f"\n🔍 Testing retrieval with query: '{test_query}'")

# Wall-clock timing of a single retrieval call (embedding + vector search).
start_time = time.time()
test_results = retriever.retrieve(test_query, n_results=3)
retrieval_time = time.time() - start_time

print(f"⏱️  Retrieval time: {retrieval_time*1000:.1f}ms")
print(f"📊 Retrieved {len(test_results)} chunks")

# retrieve() returns [] on failure, so only report stats when something came back.
if test_results:
    stats = retriever.get_retrieval_stats(test_results)
    print(f"📈 Average similarity: {stats['avg_similarity']:.3f}")
    print(f"🏷️  Products found: {stats['products']}")
    
    # Show at most the first two chunks, with text truncated to 100 characters.
    print(f"\n📝 Sample results:")
    for i, chunk in enumerate(test_results[:2]):
        print(f"  {i+1}. Similarity: {chunk['similarity_score']:.3f} | Product: {chunk['metadata']['product']}")
        print(f"     Text: {chunk['text'][:100]}...")

## 3. Design Prompt Template

### Prompt Engineering Strategy:
- **Clear Role Definition**: Establish the AI as a financial analyst assistant
- **Context Grounding**: Explicitly instruct to use only provided context
- **Structured Output**: Guide the format of responses
- **Fallback Handling**: Handle cases with insufficient information
- **Evidence-Based**: Encourage citing specific examples

In [None]:
class PromptTemplate:
    """Builds the grounded prompt sent to the generator.

    The template has two placeholders, ``{context}`` and ``{question}``, and
    ends with ``ANSWER:`` so the model's continuation is the answer.
    """

    def __init__(self):
        self.template = """You are a financial analyst assistant for CrediTrust Financial. Your task is to analyze customer complaints and provide helpful insights to product managers, support teams, and compliance officers.

Based on the following customer complaint excerpts, please answer the user's question. Use only the information provided in the context below. If the context doesn't contain enough information to answer the question completely, state that clearly.

CONTEXT:
{context}

QUESTION: {question}

INSTRUCTIONS:
- Provide a clear, concise answer based solely on the context above
- Highlight key patterns, trends, or recurring themes if relevant
- If specific numbers, frequencies, or statistics can be inferred, include them
- Mention the financial products involved when relevant
- If the context is insufficient, say "Based on the available complaints, I don't have enough information to fully answer this question."
- Keep your response focused and actionable for business stakeholders

ANSWER:"""

    def create_context(self, chunks: List[Dict[str, Any]], max_length: int = 2000) -> str:
        """Join retrieved chunks into one context string of at most ``max_length`` characters.

        Each chunk is rendered as ``[Source N - <product> - <issue>]: <text>``
        and chunks are separated by a blank line. Chunks are added in order
        until the next one would not fit. If even the first chunk is too long,
        it is truncated to the budget rather than dropped, so the context is
        never empty when chunks were retrieved. Missing metadata fields are
        rendered as ``Unknown``.
        """
        separator = "\n\n"
        context_parts = []
        remaining = max_length

        for i, chunk in enumerate(chunks):
            metadata = chunk.get('metadata') or {}
            # Format chunk with metadata
            chunk_text = (
                f"[Source {i+1} - {metadata.get('product', 'Unknown')} - "
                f"{metadata.get('issue', 'Unknown')}]: {chunk['text']}"
            )

            # The separator counts against the budget for every chunk but the first.
            cost = len(chunk_text) + (len(separator) if context_parts else 0)
            if cost > remaining:
                if not context_parts and remaining > 0:
                    # An oversized first chunk is cut rather than discarded.
                    context_parts.append(chunk_text[:remaining])
                break

            context_parts.append(chunk_text)
            remaining -= cost

        return separator.join(context_parts)

    def create_prompt(self, query: str, chunks: List[Dict[str, Any]]) -> str:
        """Return the full prompt: template filled with the context and ``query``."""
        context = self.create_context(chunks)

        return self.template.format(
            context=context,
            question=query
        )

    def get_context_stats(self, context: str) -> Dict[str, Any]:
        """Return character/word counts and source/product counts for ``context``.

        ``source_count`` counts occurrences of ``[Source``; ``products_mentioned``
        counts distinct product labels parsed from lines that start with
        ``[Source`` (the text between the first and second `` - ``).
        """
        source_products = {
            line.split(' - ')[1]
            for line in context.split('\n')
            if line.startswith('[Source') and ' - ' in line
        }
        return {
            'length': len(context),
            'word_count': len(context.split()),
            'source_count': context.count('[Source'),
            'products_mentioned': len(source_products)
        }

# Initialize prompt template
prompt_template = PromptTemplate()
print("✅ Prompt template initialized")

# Test prompt creation
# Reuses `test_query` / `test_results` from the retriever test cell.
test_prompt = prompt_template.create_prompt(test_query, test_results)
# The context is rebuilt separately so its stats exclude the fixed template text.
context_stats = prompt_template.get_context_stats(prompt_template.create_context(test_results))

print(f"\n📝 Test prompt statistics:")
print(f"  Total length: {len(test_prompt):,} characters")
print(f"  Context length: {context_stats['length']:,} characters")
print(f"  Context word count: {context_stats['word_count']:,} words")
print(f"  Number of sources: {context_stats['source_count']}")
print(f"  Products in context: {context_stats['products_mentioned']}")

# Only a 500-character preview is printed.
print(f"\n📄 Sample prompt (first 500 chars):")
print(test_prompt[:500] + "...")

## 4. Implement Generator with LLM

In [None]:
class ComplaintGenerator:
    """Generates answers from a RAG prompt, with a rule-based fallback.

    A Hugging Face text-generation pipeline is loaded at construction time.
    If loading or generation fails, answers come from
    ``_generate_fallback_response`` instead.
    """

    # Used when the loaded tokenizer exposes no EOS id (GPT-2 family EOS token).
    _DEFAULT_PAD_TOKEN_ID = 50256

    def __init__(self, model_name: str = "microsoft/DialoGPT-small"):
        self.model_name = model_name
        self.llm_pipeline = None
        # Maximum number of NEW tokens generated per answer.
        self.max_response_length = 500

        self._initialize_llm()

    def _initialize_llm(self):
        """Load the text-generation pipeline; leave ``llm_pipeline`` as None on failure."""
        try:
            print(f"🔧 Loading LLM: {self.model_name}")
            device = 0 if torch.cuda.is_available() else -1

            # Length limits and sampling options are passed per call in
            # generate_response, so no conflicting defaults are stored here.
            self.llm_pipeline = pipeline(
                "text-generation",
                model=self.model_name,
                device=device
            )
            print(f"✅ LLM loaded successfully")

        except Exception as e:
            print(f"⚠️  Could not load advanced LLM: {e}")
            print(f"🔄 Using fallback response generation")
            self.llm_pipeline = None

    def _pad_token_id(self) -> int:
        """Return the tokenizer's EOS id when available, else the GPT-2 default."""
        tokenizer = getattr(self.llm_pipeline, "tokenizer", None)
        eos_id = getattr(tokenizer, "eos_token_id", None)
        return eos_id if eos_id is not None else self._DEFAULT_PAD_TOKEN_ID

    def generate_response(self, prompt: str) -> str:
        """Generate an answer for ``prompt``.

        Uses the LLM when it is loaded; any generation error is printed and
        the rule-based fallback is used instead. Returns a fixed apology
        string if the model produces only whitespace.
        """
        if self.llm_pipeline is None:
            return self._generate_fallback_response(prompt)

        try:
            # The limit is expressed in new tokens. The previous
            # ``max_length=len(prompt) + N`` mixed a character count into a
            # token limit and could exceed the model's context window.
            response = self.llm_pipeline(
                prompt,
                max_new_tokens=self.max_response_length,
                # Trim the left of an over-long prompt so there is room to
                # generate; the QUESTION/INSTRUCTIONS/ANSWER tail is kept.
                handle_long_generation="hole",
                # Return only the continuation, not prompt + continuation.
                return_full_text=False,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self._pad_token_id()
            )

            generated_text = response[0]['generated_text']

            # Defensive: strip the prompt if the pipeline echoed it anyway.
            if generated_text.startswith(prompt):
                generated_text = generated_text[len(prompt):]
            answer = generated_text.strip()

            return answer if answer else "I apologize, but I couldn't generate a proper response based on the available information."

        except Exception as e:
            print(f"Error generating response: {e}")
            return self._generate_fallback_response(prompt)

    def _generate_fallback_response(self, prompt: str) -> str:
        """Build a keyword-based summary of the prompt's CONTEXT section.

        Expects the ``CONTEXT:`` and ``QUESTION:`` markers used by the prompt
        template; without them a generic apology is returned.
        """
        if "CONTEXT:" not in prompt or "QUESTION:" not in prompt:
            return "I apologize, but I couldn't process your question properly."

        # Context is the text between the first CONTEXT: and the next QUESTION:.
        context = prompt.partition("CONTEXT:")[2].partition("QUESTION:")[0].strip()
        if not context:
            return "I don't have enough relevant complaint information to answer this question."

        context_lower = context.lower()

        # Count sources
        source_count = context.count("[Source")

        # Extract products mentioned
        products = [
            product
            for product in ["Credit card", "Personal loan", "BNPL", "Savings account", "Money transfers"]
            if product.lower() in context_lower
        ]

        response = f"Based on {source_count} relevant complaint(s)"
        if products:
            response += f" related to {', '.join(products)}"
        response += f", here are the key insights:\n\n"

        # Each rule: a theme is reported when any of its keywords occurs.
        theme_rules = [
            (("billing", "charge"), "• Billing and charging issues are prominent concerns"),
            (("customer service", "support"), "• Customer service quality is a recurring theme"),
            (("fraud", "unauthorized"), "• Fraud and unauthorized transactions are reported"),
            (("payment",), "• Payment-related issues are frequently mentioned"),
            (("access", "login"), "• Account access problems are noted"),
            (("fee",), "• Fee-related complaints are present"),
        ]
        themes = [
            message
            for keywords, message in theme_rules
            if any(keyword in context_lower for keyword in keywords)
        ]

        if themes:
            response += "\n".join(themes) + "\n\n"

        response += "The complaints show various customer concerns that may require attention from the relevant product teams."

        return response

# Initialize generator
# Uses the class's default model name; the constructor attempts the model load.
generator = ComplaintGenerator()
print("✅ Generator initialized")

# Test generation
# Reuses `test_prompt` built in the prompt-template test cell.
print(f"\n🤖 Testing response generation...")
start_time = time.time()
test_response = generator.generate_response(test_prompt)
generation_time = time.time() - start_time

print(f"⏱️  Generation time: {generation_time:.2f} seconds")
print(f"📝 Response length: {len(test_response)} characters")
print(f"\n🤖 Generated response:")
print(test_response)

## 5. Complete RAG Pipeline

In [None]:
class RAGPipeline:
    """End-to-end RAG pipeline: retrieve chunks, build a prompt, generate an answer.

    The three collaborators are injected: a retriever with ``retrieve`` and
    ``get_retrieval_stats``, a prompt template with ``create_prompt`` and
    ``create_context``, and a generator with ``generate_response``.
    """

    def __init__(self, retriever, prompt_template, generator):
        self.retriever = retriever
        self.prompt_template = prompt_template
        self.generator = generator

    @staticmethod
    def _format_source(index: int, chunk: Dict[str, Any]) -> Dict[str, Any]:
        """Turn one retrieved chunk into a display record (text preview capped at 200 chars)."""
        metadata = chunk.get('metadata') or {}
        text = chunk['text']
        return {
            'index': index,
            'text': text[:200] + "..." if len(text) > 200 else text,
            'full_text': text,
            'product': metadata.get('product', 'Unknown'),
            'issue': metadata.get('issue', 'Unknown'),
            'similarity_score': round(chunk['similarity_score'], 3),
            'company': metadata.get('company', 'Unknown'),
            'state': metadata.get('state', 'Unknown')
        }

    def query(self, question: str, product_filter: Optional[str] = None,
              n_results: int = 5) -> Dict[str, Any]:
        """Answer ``question`` from retrieved complaint chunks.

        Args:
            question: The user's question.
            product_filter: Optional product name passed through to the retriever.
            n_results: Number of chunks to request from the retriever.

        Returns:
            A dict that always contains ``question``, ``answer``, ``sources``,
            ``context_used``, ``product_filter``, ``num_sources``,
            ``retrieval_time``, ``generation_time``, ``total_time`` and
            ``retrieval_stats``. When an exception occurs it is not raised;
            the dict additionally carries ``error`` and the answer describes
            the failure.
        """
        start_time = time.time()
        # Initialised up front so the error branch can report partial timings.
        retrieval_time = 0
        generation_time = 0

        try:
            # Step 1: Retrieve relevant chunks
            retrieval_start = time.time()
            retrieved_chunks = self.retriever.retrieve(
                question, n_results=n_results, product_filter=product_filter
            )
            retrieval_time = time.time() - retrieval_start

            if not retrieved_chunks:
                return {
                    'question': question,
                    'answer': "I couldn't find any relevant complaint information to answer your question.",
                    'sources': [],
                    'context_used': "",
                    'product_filter': product_filter,
                    'num_sources': 0,
                    'retrieval_time': retrieval_time,
                    'generation_time': 0,
                    'total_time': time.time() - start_time,
                    'retrieval_stats': {}
                }

            # Step 2: Create prompt
            prompt = self.prompt_template.create_prompt(question, retrieved_chunks)

            # Step 3: Generate response
            generation_start = time.time()
            answer = self.generator.generate_response(prompt)
            generation_time = time.time() - generation_start

            # Step 4: Format sources for display
            sources = [
                self._format_source(position, chunk)
                for position, chunk in enumerate(retrieved_chunks, start=1)
            ]

            return {
                'question': question,
                'answer': answer,
                'sources': sources,
                'context_used': self.prompt_template.create_context(retrieved_chunks),
                'product_filter': product_filter,
                'num_sources': len(sources),
                'retrieval_time': retrieval_time,
                'generation_time': generation_time,
                'total_time': time.time() - start_time,
                'retrieval_stats': self.retriever.get_retrieval_stats(retrieved_chunks)
            }

        except Exception as e:
            # Same shape as the success result so callers can index uniformly.
            return {
                'question': question,
                'answer': f"I encountered an error while processing your question: {str(e)}",
                'sources': [],
                'context_used': "",
                'product_filter': product_filter,
                'num_sources': 0,
                'retrieval_time': retrieval_time,
                'generation_time': generation_time,
                'total_time': time.time() - start_time,
                'retrieval_stats': {},
                'error': str(e)
            }

# Initialize complete RAG pipeline
# Wires together the retriever, prompt template and generator built above.
rag_pipeline = RAGPipeline(retriever, prompt_template, generator)
print("✅ Complete RAG pipeline initialized")

# Test complete pipeline
print(f"\n🔄 Testing complete RAG pipeline...")
test_result = rag_pipeline.query("What are the main issues customers face with credit cards?")

# .get(..., 0) is used because the error result of query() omits these keys.
print(f"\n📊 Pipeline Performance:")
print(f"  Retrieval time: {test_result.get('retrieval_time', 0)*1000:.1f}ms")
print(f"  Generation time: {test_result.get('generation_time', 0):.2f}s")
print(f"  Total time: {test_result.get('total_time', 0):.2f}s")
print(f"  Sources found: {test_result.get('num_sources', 0)}")

print(f"\n💬 Generated Answer:")
print(test_result['answer'])

# Show at most two sources; 'text' is the preview already truncated by query().
if test_result['sources']:
    print(f"\n📚 Top Sources:")
    for source in test_result['sources'][:2]:
        print(f"  {source['index']}. {source['product']} - Similarity: {source['similarity_score']}")
        print(f"     {source['text']}")

## 6. Comprehensive Evaluation Framework

In [None]:
# Evaluation question set.
# Each row below is (question, expected themes, product filter, category);
# a product filter of None means the question is asked across all products.
test_questions = [
    {
        'question': question,
        'expected_themes': expected_themes,
        'product_filter': product_filter,
        'category': category,
    }
    for question, expected_themes, product_filter, category in (
        (
            'What are the main issues customers face with credit cards?',
            ['billing', 'fees', 'fraud', 'unauthorized charges'],
            'Credit card',
            'product_specific',
        ),
        (
            'Why are people unhappy with BNPL services?',
            ['payment issues', 'terms', 'late fees'],
            'Buy Now, Pay Later (BNPL)',
            'product_specific',
        ),
        (
            'What problems do customers report with personal loans?',
            ['approval', 'terms', 'interest rates'],
            'Personal loan',
            'product_specific',
        ),
        (
            'What are common complaints about savings accounts?',
            ['fees', 'access', 'interest'],
            'Savings account',
            'product_specific',
        ),
        (
            'What issues do customers have with money transfers?',
            ['delays', 'fees', 'failed transfers'],
            'Money transfers',
            'product_specific',
        ),
        (
            'Which financial product has the most fraud-related complaints?',
            ['fraud comparison', 'unauthorized transactions'],
            None,
            'comparative',
        ),
        (
            'What are customers saying about customer service quality?',
            ['service quality', 'response time', 'helpfulness'],
            None,
            'cross_product',
        ),
        (
            'Are there any patterns in billing disputes across products?',
            ['billing issues', 'unauthorized charges', 'fee disputes'],
            None,
            'pattern_analysis',
        ),
        (
            'What are the most frequent complaint types?',
            ['complaint frequency', 'issue distribution'],
            None,
            'analytical',
        ),
        (
            'How do customers describe unauthorized transactions?',
            ['fraud descriptions', 'unauthorized activity'],
            None,
            'descriptive',
        ),
    )
]

# Report the size of the set and the distinct categories it covers.
print(f"📋 Defined {len(test_questions)} test questions across {len(set(q['category'] for q in test_questions))} categories")
print(f"📊 Categories: {', '.join(set(q['category'] for q in test_questions))}")

In [None]:
# Run comprehensive evaluation
def run_evaluation(rag_pipeline, test_questions):
    """Ask every test question through the pipeline and score each answer.

    Each item of ``test_questions`` must provide ``question``,
    ``expected_themes``, ``product_filter`` and ``category``. Returns one
    record per question (ids start at 1) holding the answer, simple heuristic
    checks, the quality score, timings and the sources.
    """
    print("=== RUNNING COMPREHENSIVE EVALUATION ===")
    print(f"Testing {len(test_questions)} questions...\n")

    records = []
    progress = tqdm(test_questions, desc="Evaluating questions")

    for question_id, test_case in enumerate(progress, start=1):
        question = test_case['question']
        expected_themes = test_case['expected_themes']
        product_filter = test_case['product_filter']
        category = test_case['category']

        # Run the full retrieve -> prompt -> generate path.
        result = rag_pipeline.query(question, product_filter=product_filter)
        sources = result['sources']
        answer = result['answer']
        answer_lower = answer.lower()

        has_sources = len(sources) > 0

        # Does the answer name any product at all (substring match)?
        product_keywords = ['credit card', 'loan', 'bnpl', 'savings', 'transfer', 'money']
        mentions_product = any(keyword in answer_lower for keyword in product_keywords)

        # Fraction of expected themes that appear verbatim in the answer.
        themes_found = sum(1 for theme in expected_themes if theme.lower() in answer_lower)
        if expected_themes:
            theme_coverage = themes_found / len(expected_themes)
        else:
            theme_coverage = 0

        quality_score = calculate_quality_score(result, has_sources, mentions_product, theme_coverage)

        if len(answer) > 150:
            answer_preview = answer[:150] + "..."
        else:
            answer_preview = answer

        records.append({
            'question_id': question_id,
            'question': question,
            'category': category,
            'product_filter': product_filter or 'All',
            'answer': answer,
            'answer_preview': answer_preview,
            'num_sources': len(sources),
            'has_sources': has_sources,
            'answer_length': len(answer),
            'mentions_product': mentions_product,
            'expected_themes': expected_themes,
            'themes_found': themes_found,
            'theme_coverage': theme_coverage,
            'quality_score': quality_score,
            'top_similarity': sources[0]['similarity_score'] if sources else 0,
            'retrieval_time': result.get('retrieval_time', 0),
            'generation_time': result.get('generation_time', 0),
            'total_time': result.get('total_time', 0),
            'sources': sources
        })

    return records

def calculate_quality_score(result, has_sources, mentions_product, theme_coverage):
    """Score an answer from 1 to 5: a base point plus one per satisfied check.

    Checks: sources were returned; the first source's similarity exceeds 0.7;
    the answer is 50-400 characters long; theme coverage exceeds 0.3 or a
    product is mentioned.
    """
    sources = result['sources']
    answer_length = len(result['answer'])

    checks = (
        bool(has_sources),
        bool(sources) and bool(sources[0]['similarity_score'] > 0.7),
        50 <= answer_length <= 400,
        bool(theme_coverage > 0.3 or mentions_product),
    )

    # One base point plus one per passed check, capped at 5.
    return min(1 + sum(checks), 5)

# Run evaluation
# One record per test question; the DataFrame feeds the analysis cells below.
evaluation_results = run_evaluation(rag_pipeline, test_questions)
results_df = pd.DataFrame(evaluation_results)

print(f"\n✅ Evaluation completed!")
print(f"📊 Processed {len(evaluation_results)} questions")

## 7. Evaluation Analysis and Results

In [None]:
# Calculate comprehensive metrics
def calculate_evaluation_metrics(results_df):
    """Aggregate per-question evaluation records into summary metrics.

    Args:
        results_df: DataFrame with the columns ``has_sources``,
            ``num_sources``, ``answer_length``, ``mentions_product``,
            ``top_similarity``, ``theme_coverage``, ``quality_score``,
            ``retrieval_time``, ``generation_time``, ``total_time`` and
            ``category``.

    Returns:
        A dict of overall metrics plus ``category_analysis``, a per-category
        dict of count, average quality, source rate and average similarity.
        Rates are percentages and ``avg_retrieval_time`` is in milliseconds.
        All values are built-in ``int``/``float`` so the result serialises to
        JSON as numbers rather than strings.
    """
    total_questions = len(results_df)

    metrics = {
        'total_questions': total_questions,
        # Counts are cast because pandas returns numpy integers, which the
        # json module cannot serialise natively.
        'questions_with_sources': int(results_df['has_sources'].sum()),
        'source_retrieval_rate': float(results_df['has_sources'].mean() * 100),
        'avg_sources_per_question': float(results_df['num_sources'].mean()),
        'avg_answer_length': float(results_df['answer_length'].mean()),
        'questions_mentioning_products': int(results_df['mentions_product'].sum()),
        'product_mention_rate': float(results_df['mentions_product'].mean() * 100),
        'avg_top_similarity': float(results_df['top_similarity'].mean()),
        'avg_theme_coverage': float(results_df['theme_coverage'].mean() * 100),
        'avg_quality_score': float(results_df['quality_score'].mean()),
        'high_quality_responses': int((results_df['quality_score'] >= 4).sum()),
        'avg_retrieval_time': float(results_df['retrieval_time'].mean() * 1000),  # ms
        'avg_generation_time': float(results_df['generation_time'].mean()),
        'avg_total_time': float(results_df['total_time'].mean())
    }

    # Category-wise analysis, in order of first appearance.
    category_metrics = {}
    for category in results_df['category'].unique():
        cat_df = results_df[results_df['category'] == category]
        category_metrics[category] = {
            'count': len(cat_df),
            'avg_quality': float(cat_df['quality_score'].mean()),
            'source_rate': float(cat_df['has_sources'].mean() * 100),
            'avg_similarity': float(cat_df['top_similarity'].mean())
        }

    metrics['category_analysis'] = category_metrics

    return metrics

# Aggregate the per-question records, then print the summary in one pass.
metrics = calculate_evaluation_metrics(results_df)

# Overall performance and timing sections.
print("\n".join([
    "=== EVALUATION RESULTS ===",
    f"\n📊 Overall Performance:",
    f"  Total questions: {metrics['total_questions']}",
    f"  Source retrieval rate: {metrics['source_retrieval_rate']:.1f}%",
    f"  Average sources per question: {metrics['avg_sources_per_question']:.1f}",
    f"  Product mention rate: {metrics['product_mention_rate']:.1f}%",
    f"  Average similarity score: {metrics['avg_top_similarity']:.3f}",
    f"  Average theme coverage: {metrics['avg_theme_coverage']:.1f}%",
    f"  Average quality score: {metrics['avg_quality_score']:.1f}/5",
    f"  High quality responses (≥4): {metrics['high_quality_responses']}/{metrics['total_questions']}",
    f"\n⏱️  Performance Timing:",
    f"  Average retrieval time: {metrics['avg_retrieval_time']:.1f}ms",
    f"  Average generation time: {metrics['avg_generation_time']:.2f}s",
    f"  Average total time: {metrics['avg_total_time']:.2f}s",
]))

# Per-category breakdown; category ids are shown in title case.
print(f"\n📈 Category Analysis:")
for category, cat_metrics in metrics['category_analysis'].items():
    print("\n".join([
        f"  {category.replace('_', ' ').title()}:",
        f"    Questions: {cat_metrics['count']}",
        f"    Avg quality: {cat_metrics['avg_quality']:.1f}/5",
        f"    Source rate: {cat_metrics['source_rate']:.1f}%",
        f"    Avg similarity: {cat_metrics['avg_similarity']:.3f}",
    ]))

In [None]:
# Create comprehensive visualizations
# 2x2 grid: score distribution, per-category quality, similarity vs quality, timings.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Quality score distribution
# Only scores that actually occur get a bar; the axis still shows ticks 1-5.
quality_counts = results_df['quality_score'].value_counts().sort_index()
ax1.bar(quality_counts.index, quality_counts.values, color='skyblue', alpha=0.7)
ax1.set_title('Distribution of Quality Scores', fontweight='bold', fontsize=14)
ax1.set_xlabel('Quality Score (1-5)')
ax1.set_ylabel('Number of Questions')
ax1.set_xticks(range(1, 6))
# Annotate each bar with its count, slightly above the bar top.
for i, v in enumerate(quality_counts.values):
    ax1.text(quality_counts.index[i], v + 0.1, str(v), ha='center', fontweight='bold')

# Performance by category
# Sorted ascending so the best-scoring category ends up at the top of the chart.
category_quality = results_df.groupby('category')['quality_score'].mean().sort_values(ascending=True)
ax2.barh(range(len(category_quality)), category_quality.values, color='lightcoral', alpha=0.7)
ax2.set_yticks(range(len(category_quality)))
ax2.set_yticklabels([cat.replace('_', ' ').title() for cat in category_quality.index])
ax2.set_title('Average Quality Score by Category', fontweight='bold', fontsize=14)
ax2.set_xlabel('Average Quality Score')
ax2.set_xlim(0, 5)
for i, v in enumerate(category_quality.values):
    ax2.text(v + 0.1, i, f'{v:.1f}', va='center', fontweight='bold')

# Similarity vs Quality correlation
# Point colour encodes how many sources were retrieved for the question.
scatter = ax3.scatter(results_df['top_similarity'], results_df['quality_score'], 
                     c=results_df['num_sources'], cmap='viridis', alpha=0.7, s=60)
ax3.set_title('Similarity Score vs Quality Score', fontweight='bold', fontsize=14)
ax3.set_xlabel('Top Similarity Score')
ax3.set_ylabel('Quality Score')
plt.colorbar(scatter, ax=ax3, label='Number of Sources')

# Response time analysis
# The dashed red line marks the mean total time.
ax4.hist(results_df['total_time'], bins=15, alpha=0.7, color='gold', edgecolor='black')
ax4.set_title('Distribution of Response Times', fontweight='bold', fontsize=14)
ax4.set_xlabel('Total Time (seconds)')
ax4.set_ylabel('Frequency')
ax4.axvline(results_df['total_time'].mean(), color='red', linestyle='--', 
           label=f'Mean: {results_df["total_time"].mean():.2f}s')
ax4.legend()

plt.suptitle('RAG System Evaluation Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation analysis
# Pairwise correlations between the numeric columns; only the quality_score
# column is reported, strongest positive first.
print(f"\n🔗 Correlation Analysis:")
correlations = results_df[['quality_score', 'top_similarity', 'num_sources', 'answer_length', 'theme_coverage']].corr()
print(f"Quality Score correlations:")
quality_corr = correlations['quality_score'].drop('quality_score').sort_values(ascending=False)
for metric, corr in quality_corr.items():
    print(f"  {metric}: {corr:.3f}")

In [None]:
# Best and worst questions by quality score, followed by one full sample answer.
print("=== DETAILED EVALUATION RESULTS ===")

print("\nTop performing questions:")
top_questions = results_df.nlargest(3, 'quality_score')[['question', 'category', 'quality_score', 'top_similarity', 'num_sources']]
for _, row in top_questions.iterrows():
    # The question is cut to 60 characters to keep the listing compact.
    print(f"\n✅ Q: {row['question'][:60]}...")
    print("   " + " | ".join([
        f"Category: {row['category']}",
        f"Quality: {row['quality_score']}/5",
        f"Similarity: {row['top_similarity']:.3f}",
        f"Sources: {row['num_sources']}",
    ]))

print("\nLowest performing questions:")
low_questions = results_df.nsmallest(3, 'quality_score')[['question', 'category', 'quality_score', 'top_similarity', 'num_sources']]
for _, row in low_questions.iterrows():
    print(f"\n⚠️  Q: {row['question'][:60]}...")
    print("   " + " | ".join([
        f"Category: {row['category']}",
        f"Quality: {row['quality_score']}/5",
        f"Similarity: {row['top_similarity']:.3f}",
        f"Sources: {row['num_sources']}",
    ]))

# Full answer for the first evaluated question.
print("\n=== SAMPLE RESPONSES ===")
sample_result = results_df.iloc[0]
print(f"\n📝 Question: {sample_result['question']}")
print(f"🤖 Answer: {sample_result['answer']}")
print(f"📊 Quality Score: {sample_result['quality_score']}/5")
print(f"📚 Sources: {sample_result['num_sources']}")
if sample_result['sources']:
    print(f"🔍 Top Source: {sample_result['sources'][0]['product']} - {sample_result['sources'][0]['similarity_score']:.3f}")

## 8. Save Results and Generate Report

In [None]:
# Save evaluation results
os.makedirs('../results', exist_ok=True)

# Save detailed results
results_df.to_csv('../results/rag_evaluation_results.csv', index=False)
print(f"✅ Detailed results saved to: ../results/rag_evaluation_results.csv")


def _json_default(value):
    """Serialise values the json module does not handle natively.

    numpy scalars are unwrapped to the matching built-in number or bool so
    they are written as JSON numbers; anything else falls back to ``str`` as
    before.
    """
    if isinstance(value, np.generic):
        return value.item()
    return str(value)


# Save metrics
# A plain ``default=str`` would write numpy counts as quoted strings.
with open('../results/evaluation_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2, default=_json_default)
print(f"✅ Metrics saved to: ../results/evaluation_metrics.json")

# Generate analysis report
def generate_analysis_report(metrics, results_df):
    """Summarise an evaluation run as strengths, weaknesses and recommendations.

    Args:
        metrics: Mapping of aggregate metrics. Keys read here:
            'source_retrieval_rate', 'avg_top_similarity',
            'product_mention_rate', 'avg_total_time',
            'avg_theme_coverage' and 'avg_answer_length'.
        results_df: Per-question results. Columns read here:
            'num_sources', 'answer_length' and 'quality_score'.

    Returns:
        dict with keys 'strengths', 'weaknesses' and 'recommendations',
        each mapping to a list of human-readable sentences.

    Raises:
        KeyError: If a required metric key or results column is missing.
    """
    strengths, weaknesses, recommendations = [], [], []

    # --- Strengths: aggregate metrics that clear their threshold ---
    retrieval_rate = metrics['source_retrieval_rate']
    if retrieval_rate > 80:
        strengths.append(f"High source retrieval rate ({retrieval_rate:.1f}%) - system consistently finds relevant information")

    top_similarity = metrics['avg_top_similarity']
    if top_similarity > 0.7:
        strengths.append(f"High similarity scores ({top_similarity:.3f}) indicate good semantic matching")

    mention_rate = metrics['product_mention_rate']
    if mention_rate > 60:
        strengths.append(f"Good product awareness ({mention_rate:.1f}%) - answers frequently mention relevant products")

    total_time = metrics['avg_total_time']
    if total_time < 3:
        strengths.append(f"Fast response times ({total_time:.2f}s average) suitable for interactive use")

    # --- Weaknesses: count the questions that fail each per-question check ---
    no_source_count = int((results_df['num_sources'] == 0).sum())
    if no_source_count:
        weaknesses.append(f"{no_source_count} questions returned no sources")

    short_answer_count = int((results_df['answer_length'] < 50).sum())
    if short_answer_count:
        weaknesses.append(f"{short_answer_count} questions produced very short answers")

    low_quality_count = int((results_df['quality_score'] < 3).sum())
    if low_quality_count:
        weaknesses.append(f"{low_quality_count} questions received low quality scores (<3/5)")

    theme_coverage = metrics['avg_theme_coverage']
    if theme_coverage < 40:
        weaknesses.append(f"Low theme coverage ({theme_coverage:.1f}%) - answers may miss expected topics")

    # --- Recommendations: conditional advice first, then the standing list ---
    if metrics['avg_top_similarity'] < 0.6:
        recommendations.append("Consider improving embedding model or chunking strategy for better semantic matching")

    # The length spread is only computed when the average length alone does not trigger
    answers_run_long = metrics['avg_answer_length'] > 300
    if answers_run_long or results_df['answer_length'].std() > 150:
        recommendations.append("Standardize answer length for consistency and readability")

    recommendations += [
        "Implement user feedback mechanism to continuously improve response quality",
        "Add confidence scores to help users assess answer reliability",
        "Consider fine-tuning the language model on financial complaint data",
        "Expand the evaluation dataset with more diverse question types",
        "Monitor system performance in production with real user queries",
    ]

    return {
        'strengths': strengths,
        'weaknesses': weaknesses,
        'recommendations': recommendations,
    }

# Generate analysis
analysis = generate_analysis_report(metrics, results_df)

# Print each report section under its heading, one bullet per entry
print("\n=== ANALYSIS SUMMARY ===")
report_sections = (
    ("\n✅ Strengths:", 'strengths'),
    ("\n⚠️ Areas for Improvement:", 'weaknesses'),
    ("\n💡 Recommendations:", 'recommendations'),
)
for heading, section_key in report_sections:
    print(heading)
    for entry in analysis[section_key]:
        print(f"  • {entry}")

# Save analysis
analysis_json_path = '../results/evaluation_analysis.json'
with open(analysis_json_path, 'w') as analysis_file:
    json.dump(analysis, analysis_file, indent=2)
print(f"\n✅ Analysis saved to: {analysis_json_path}")

# Closing banner with the headline numbers of this run
banner = "=" * 60
print("\n" + banner)
print("🎉 TASK 3 COMPLETED SUCCESSFULLY!")
print(banner)
print("✅ Implemented complete RAG pipeline with retriever, prompt template, and generator")
print(f"✅ Evaluated system with {len(test_questions)} comprehensive test questions")
print(f"✅ Achieved {metrics['source_retrieval_rate']:.1f}% source retrieval rate")
print(f"✅ Average quality score: {metrics['avg_quality_score']:.1f}/5")
print("✅ Generated detailed analysis and recommendations")
print("📁 Results saved to: ../results/")
print("\n🚀 Ready for Task 4: Creating Interactive Chat Interface")