# RAG Chatbot for Primary School Students
## Knowledge Base: Cells and Chemistry of Life

This notebook demonstrates a complete RAG (Retrieval-Augmented Generation) chatbot system using:
- **Local LLM**: Ollama Gemma3:4b
- **Knowledge Base**: Cells and Chemistry of Life.pdf  
- **Vector Store**: FAISS with sentence transformers
- **Evaluation**: RAGAS framework with MLflow tracking (the RAGAS step requires an OpenAI API key in your `.env` file)
- **Target Audience**: Primary school students (ages 6-12)

In [None]:
import sys
import os

# Add the sibling ``src`` directory to the module search path.
# The path is built with os.path.join instead of the hand-written literal
# '..\src': "\s" is not a valid string escape (Python emits a warning for it),
# and a backslash separator only resolves on Windows. On Windows the joined
# result is the same '..\src' string as before.
sys.path.append(os.path.join('..', 'src'))

from dotenv import load_dotenv

# Kept ahead of the project imports, as in the original ordering --
# presumably so values from .env are visible to them; TODO confirm.
load_dotenv()

from src import Config, DocumentProcessor, VectorStore, RAGChatbot, RAGASEvaluator
import mlflow
import mlflow.sklearn
from datetime import datetime
import warnings

# NOTE(review): this silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

print("All modules imported successfully!")

# Build the shared configuration object and ask it whether an API key is present.
config = Config()
api_key_available = config.validate_api_key()

if not api_key_available:
    # Missing key: warn now, since the evaluation step depends on it.
    print("WARNING: OpenAI API key not found - RAGAS evaluation will fail")
    print("Please check your .env file")
else:
    print("OpenAI API key loaded successfully!")

# Select the MLflow experiment used by this notebook.
mlflow.set_experiment("RAG_Chatbot_RAGAS_Evaluation")
print("MLflow experiment initialized!")

# Show where the source PDF is expected, using the configured directory and file name.
print(f"Looking for PDF: {config.DATA_DIR}/{config.PDF_FILE}")

## 2. Document Processing
Load the "Cells and Chemistry of Life.pdf" document and split it into chunks for retrieval.

In [None]:
# Create the document processor from the shared config and run it on the PDF.
doc_processor = DocumentProcessor(config)
chunks = doc_processor.process_document()

if not chunks:
    # Nothing came back: point the user at the most likely cause.
    print("No chunks created. Please check if the PDF file exists in the data/ directory.")
else:
    # Summarise the result: chunk count, mean chunk length, and a short preview.
    print("\n Document Processing Results:")
    print(f"   Total chunks: {len(chunks)}")
    total_characters = sum(len(chunk['content']) for chunk in chunks)
    print(f"   Average chunk size: {total_characters // len(chunks)} characters")
    print("\n Sample chunk:")
    first_chunk_text = chunks[0]['content']
    print(f"   {first_chunk_text[:200]}...")

## 3. Vector Store Setup
Create embeddings and build a FAISS index for similarity search.

In [None]:
# Create the vector store from the shared config.
vector_store = VectorStore(config)

# Reuse a previously saved index when one loads; otherwise build it from the
# document chunks and save it for next time.
index_loaded = vector_store.load_index()
if not index_loaded:
    print("Creating new vector index...")
    vector_store.setup_from_chunks(chunks)
    vector_store.save_index()

# Report the size and dimensionality of the ready index.
print("\nVector Store Ready!")
print(f" Index size: {vector_store.index.ntotal} vectors")
print(f" Embedding dimension: {vector_store.dimension}")

## 4. RAG Chatbot Initialization
Initialize the chatbot with Ollama integration

In [None]:
try:
    chatbot = RAGChatbot(vector_store, config)
    print("RAG Chatbot initialized successfully!")
    # Echo the key settings, reading each config attribute just before printing it.
    reported_settings = (
        ("Model", "OLLAMA_MODEL"),
        ("Similarity threshold", "SIMILARITY_THRESHOLD"),
        ("Top-K retrieval", "TOP_K_DOCS"),
    )
    for label, attribute_name in reported_settings:
        print(f"{label}: {getattr(config, attribute_name)}")
except Exception as init_error:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f"Failed to initialize chatbot: {init_error!s}")

## 6. Custom Question Testing
Ask the chatbot your own questions. The examples below cover English, Chinese, and Malay, plus a few out-of-scope questions.

In [None]:
# Interactive question function with language detection
def ask_question(question: str):
    """Send one question to the notebook's chatbot and print a short report.

    The report shows the question, the detected language, the answer, the
    in-scope flag, and the confidence formatted to three decimals, followed
    by an 80-character separator line. Nothing is returned.

    Relies on the notebook-level ``chatbot`` object; its ``answer`` result is
    read by the keys 'detected_language', 'answer', 'in_scope' and
    'confidence'.
    """
    reply = chatbot.answer(question)

    print(f"Question: {question}")
    print(f"Detected Language: {reply['detected_language']}")
    print(f"Answer: {reply['answer']}")

    in_scope = reply['in_scope']
    confidence = reply['confidence']
    print(f"Details: In scope: {in_scope} | Confidence: {confidence:.3f}")

    separator = "-" * 80
    print(separator)

# Demo questions, grouped under the banner printed before each group.
# The same two in-scope questions are asked in English, Chinese, and Malay,
# followed by an out-of-scope weather question in each language.
question_groups = [
    ("=== ENGLISH QUESTIONS ===", [
        "What is the main function of mitochondria in a cell?",
        "What is diffusion?",
    ]),
    ("\n=== CHINESE QUESTIONS ===", [
        "细胞中线粒体的主要功能是什么？",  # What is the main function of mitochondria in a cell?
        "什么是扩散？",  # What is diffusion?
    ]),
    ("\n=== MALAY QUESTIONS ===", [
        "Apakah fungsi utama mitokondria dalam sel?",  # What is the main function of mitochondria in a cell?
        "Apakah itu resapan?",  # What is diffusion?
    ]),
    ("\n=== OUT-OF-SCOPE QUESTIONS ===", [
        "What is the weather today?",  # English
        "今天天气怎么样？",  # Chinese: What's the weather today?
        "Bagaimana cuaca hari ini?",  # Malay: What's the weather today?
    ]),
]

for banner, group_questions in question_groups:
    print(banner)
    for demo_question in group_questions:
        ask_question(demo_question)

## 7. RAGAS Evaluation with MLflow Tracking
Evaluate the chatbot performance using RAGAS metrics and track with MLflow

In [None]:
# Wrap the chatbot in the RAGAS evaluator.
evaluator = RAGASEvaluator(chatbot)

# Use the evaluator's own sample test cases.
test_cases = evaluator.create_sample_test_cases()

# List the questions that are about to be evaluated, numbered from 1.
print("Test cases for evaluation:")
for position, test_case in enumerate(test_cases, start=1):
    print(f"   {position}. {test_case['question']}")

case_count = len(test_cases)
print(f"\nRunning RAGAS evaluation with {case_count} test cases...")

# The chunk count is passed alongside the test cases -- presumably recorded
# with the MLflow run; verify against evaluate_with_mlflow.
scores = evaluator.evaluate_with_mlflow(test_cases, len(chunks))

# Print the scores against the configured target.
evaluator.print_evaluation_report(scores, config.TARGET_RAGAS_SCORE)

print("Evaluation complete! Run 'mlflow ui' to view results.")