# 🤖 Intelligent Database & CSV Chatbot with RAG

This notebook demonstrates the capabilities of our chatbot with Retrieval-Augmented Generation (RAG) enhancement.

## Features:
- **Local AI Models**: Uses Hugging Face Transformers
- **RAG Enhancement**: Improved context-aware responses
- **Database Integration**: Query and analyze SQL databases
- **CSV Analysis**: Business intelligence from CSV data
- **Completely Local**: No API keys required

In [None]:
# Setup and imports
import importlib
import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Add project root to path.
# NOTE: '__file__' is deliberately a string literal. abspath() resolves it
# against the current working directory, so the two dirname() calls yield the
# parent of the working directory.
project_root = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
# Re-running this cell must not keep stacking the same entry onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("🚀 Setting up Chatbot with RAG...")
print(f"Project root: {project_root}")
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

# Check available dependencies: module name -> version string, or None if missing
dependencies = {}


def _probe_dependency(module_name, label):
    """Try to import ``module_name`` and record the outcome in ``dependencies``.

    Args:
        module_name: Importable module name; also used as the ``dependencies`` key.
        label: Human-readable name used in the printed status line.

    Returns:
        The imported module, or ``None`` when the import raises ``ImportError``.
    """
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        dependencies[module_name] = None
        print(f"❌ {label} not available")
        return None
    # Not every package exposes __version__; do not let that abort the cell.
    version = getattr(module, '__version__', 'unknown')
    dependencies[module_name] = version
    print(f"✅ {label}: {version}")
    return module


# Keep the module names bound at cell level, as plain `import` statements would
# (they are None when the package is not installed).
transformers = _probe_dependency('transformers', 'Transformers')
sentence_transformers = _probe_dependency('sentence_transformers', 'Sentence Transformers')
chromadb = _probe_dependency('chromadb', 'ChromaDB')
plotly = _probe_dependency('plotly', 'Plotly')

print("\n" + "="*50)

In [None]:
# Initialize RAG Enhancement
print("🧠 Initializing RAG Enhancement...")

try:
    from src.chatbot.rag_enhancer import SimpleRAGEnhancer

    # Build the enhancer, then fetch its status once for the report below
    rag_enhancer = SimpleRAGEnhancer()
    rag_status = rag_enhancer.get_rag_status()

    print("✅ RAG initialized successfully!")
    print(f"📊 RAG Mode: {rag_status['rag_mode']}")
    print("🔧 Dependencies:")
    for dep, available in rag_status['dependencies_available'].items():
        if available:
            status = "✅"
        else:
            status = "❌"
        print(f"   {status} {dep}: {available}")

    # Report how much data the enhancer says it has indexed
    print("\n📈 Indexed Data:")
    indexed_data = rag_status['indexed_data']
    print(f"   CSV chunks: {indexed_data['csv_chunks']}")
    print(f"   Conversation history: {indexed_data['conversation_history']}")

except Exception as e:
    # A failed import gets its own message; any other failure is reported as
    # an initialization error. Either way the enhancer is marked unavailable.
    if isinstance(e, ImportError):
        print(f"❌ RAG Enhancement not available: {e}")
    else:
        print(f"❌ Error initializing RAG: {e}")
    rag_enhancer = None

print("\n" + "="*50)

In [None]:
# Create Sample Data for Testing
print("📊 Creating sample data for RAG testing...")

try:
    from data.create_sample_data import create_sample_sales_data, create_sample_customer_data

    # Create sample datasets
    sales_df = create_sample_sales_data()
    customer_df = create_sample_customer_data()

    print(f"✅ Sales data created: {sales_df.shape[0]} rows, {sales_df.shape[1]} columns")
    print(f"✅ Customer data created: {customer_df.shape[0]} rows, {customer_df.shape[1]} columns")

    # Display sample data
    print("\n📈 Sales Data Sample:")
    display(sales_df.head())

    print("\n👥 Customer Data Sample:")
    display(customer_df.head())

except Exception as e:
    # Creation (or display) failed: leave both names defined as None so that
    # later `sales_df is not None` checks work.
    print(f"❌ Error creating sample data: {e}")
    sales_df = None
    customer_df = None
else:
    # Indexing is a separate step with its own error handling: a failure here
    # must not be reported as a data-creation error, nor discard the
    # dataframes that were just built successfully.
    try:
        # Index data for RAG if available
        if rag_enhancer:
            print("\n🧠 Indexing data for RAG...")
            rag_enhancer.index_csv_data(sales_df)
            print("✅ Sales data indexed for RAG retrieval")

            # Show updated RAG status
            updated_status = rag_enhancer.get_rag_status()
            print("📊 RAG Status after indexing:")
            print(f"   CSV chunks: {updated_status['indexed_data']['csv_chunks']}")
    except Exception as e:
        print(f"❌ Error indexing data for RAG: {e}")

print("\n" + "="*50)

In [None]:
# RAG vs Traditional Approach Comparison
print("🔍 Testing RAG vs Traditional Approach...")

if not rag_enhancer or sales_df is None:
    # Nothing to compare without both the enhancer and the sample data
    print("❌ RAG enhancer or sample data not available")
else:
    # Prompts to classify; the trailing comment is the route each is expected to take
    test_questions = [
        "What's the total sales amount?",  # Traditional
        "Analyze sales patterns and trends in the data",  # RAG
        "Show me correlations between different variables",  # RAG
        "SELECT COUNT(*) FROM sales",  # Traditional
        "Explain the relationship between product categories and sales performance",  # RAG
        "Calculate the average order value"  # Traditional
    ]

    print("🤔 Testing question classification:")
    print("-" * 40)

    # Ask the enhancer how it would route each prompt
    for question in test_questions:
        should_use_rag = rag_enhancer.should_use_rag(question)
        if should_use_rag:
            approach = "🧠 RAG"
        else:
            approach = "📊 Traditional"
        print(f"{approach}: {question}")

    print("\n" + "="*50)
    print("🧠 RAG Retrieval Test:")
    print("-" * 40)

    # Short analytical phrases used to exercise retrieval over the CSV index
    analytical_queries = [
        "sales patterns",
        "product performance",
        "customer behavior",
        "revenue trends"
    ]

    for query in analytical_queries:
        relevant_docs = rag_enhancer.retrieve_relevant_data(query, "csv", top_k=2)
        print(f"Query: '{query}' → Found {len(relevant_docs)} relevant chunks")

        if not relevant_docs:
            continue

        # Preview only the first hit, clipped to 200 characters
        for i, doc in enumerate(relevant_docs[:1]):
            print(f"  📄 Chunk {i+1} (score: {doc.get('score', 'N/A')}):")
            chunk_text = doc['content']
            if len(chunk_text) > 200:
                content_preview = chunk_text[:200] + "..."
            else:
                content_preview = chunk_text
            print(f"     {content_preview}")
            print()

print("\n" + "="*50)

In [None]:
# Interactive RAG Demo
print("🎮 Interactive RAG-Enhanced Chatbot Demo")

def simple_llm_mock(query, context):
    """Stand-in for a real LLM call, used only by this demo.

    Chooses a canned lead-in from keywords found in the lower-cased query and
    returns it followed by the query text unchanged. Checks run in a fixed
    order ("sales", then "pattern"/"trend", then "correlation"), so the first
    match wins. ``context`` is accepted but never read.
    """
    lowered = query.lower()
    if "sales" in lowered:
        return f"Based on the data analysis: {query}"
    if any(term in lowered for term in ("pattern", "trend")):
        return f"Pattern analysis shows: {query}"
    if "correlation" in lowered:
        return f"Correlation analysis indicates: {query}"
    return f"Analysis result: {query}"

def demo_question(question):
    """Print how the demo answers ``question``, via RAG or the plain mock.

    Reads the cell-level ``rag_enhancer``. When it is falsy, the mock LLM
    answers directly. Otherwise the enhancer decides the route: RAG questions
    go through ``generate_rag_response`` (errors are printed, not raised),
    everything else goes to the mock with an empty context. Returns nothing.
    """
    print(f"\n❓ Question: '{question}'")
    print("-" * 60)

    # No enhancer available: answer straight from the mock
    if not rag_enhancer:
        print("❌ RAG not available - using traditional approach")
        print(f"📊 Response: {simple_llm_mock(question, '')}")
        return

    # Let the enhancer pick the route and report its decision
    should_use_rag = rag_enhancer.should_use_rag(question)
    decision = 'Use RAG' if should_use_rag else 'Use Traditional'
    print(f"🤔 RAG Decision: {decision}")

    if not should_use_rag:
        # Traditional route: mock LLM with no retrieved context
        traditional_response = simple_llm_mock(question, "")
        print(f"📊 Traditional Response: {traditional_response}")
        return

    # RAG route: the enhancer drives the mock LLM itself
    try:
        rag_response = rag_enhancer.generate_rag_response(question, simple_llm_mock)
        print(f"🧠 RAG Response: {rag_response}")
    except Exception as e:
        print(f"❌ RAG Error: {e}")

# Questions fed through demo_question(), one at a time
demo_questions = [
    "What's the total sales in the dataset?",
    "Analyze the sales patterns and identify trends",
    "Show me correlations between product categories and revenue",
    "Count the number of unique customers",
    "Explain the relationship between time and sales performance"
]

print("🚀 Running demo with sample questions...")
for question in demo_questions:
    demo_question(question)

# Closing banner and usage hints, emitted as one newline-separated print
print(
    "\n" + "="*60,
    "✅ Interactive demo complete!",
    "\n💡 Try your own questions:",
    "   - Analytical questions will use RAG",
    "   - Simple queries will use traditional approach",
    "   - RAG provides context-aware responses",
    sep="\n",
)

## 🎉 Demo Complete!

### What we demonstrated:
1. **RAG Enhancement Setup** - Automatic fallback to simple mode if dependencies are missing
2. **Data Indexing** - CSV data chunked and indexed for semantic retrieval
3. **Smart Question Classification** - Automatic detection of when to use RAG vs. the traditional approach
4. **Context-Aware Responses** - RAG provides relevant data context to improve answers
5. **Graceful Degradation** - Works with or without advanced dependencies

### Key Benefits of RAG:
- **Better Context**: Responses include relevant data chunks from your datasets
- **Improved Accuracy**: LLM has access to specific data points when answering
- **Flexible**: Automatically chooses the best approach based on question type
- **Local**: Everything runs on your machine, no API calls needed

### Next Steps:
1. **Install Advanced Dependencies**: `pip install sentence-transformers chromadb`
2. **Try the Web Interface**: `streamlit run local_demo.py`
3. **Upload Your Own Data**: Test with your CSV files
4. **Ask Analytical Questions**: Get insights from your data

### Learn More:
- Check out `local_demo.py` for the full web interface
- Run `python demo.py` for a command-line demo
- Explore the source code in `src/chatbot/rag_enhancer.py`