# Pythonic Personal Wellness Assistant

## Task 1: Imports and Utility

In [None]:
import os
import openai
from getpass import getpass

# Prompt for the key with getpass so it is not echoed into the notebook output,
# then expose it both on the openai module and through the environment variable.
# NOTE(review): presumably the local aimakerspace wrappers read OPENAI_API_KEY
# from the environment — confirm in aimakerspace/openai_utils/.
openai.api_key = getpass("OpenAI API Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [None]:
# Enable nested async event loops in Jupyter notebooks.
# Jupyter runs its own event loop, which normally prevents asyncio.run() from working.
# nest_asyncio patches asyncio to allow nested loops, so the asyncio.run() call that
# builds the vector database later in this notebook can execute inside the kernel.
# Run this cell before any cell that calls asyncio.run().

import nest_asyncio
nest_asyncio.apply()

> **📦 Local Library: `aimakerspace/`**
> 
> The imports below come from the **local `aimakerspace/` package** included in this repository—not from PyPI. This custom library provides:
> - `text_utils.py` → `TextFileLoader`, `CharacterTextSplitter` for document loading and chunking
> - `vectordatabase.py` → `VectorDatabase` for in-memory vector storage and similarity search
> - `openai_utils/` → Wrappers for OpenAI embeddings, chat models, and prompt templates

In [None]:
# =============================================================================
# LOCAL LIBRARY IMPORTS: aimakerspace/
# These classes are defined in the local ./aimakerspace/ directory, NOT from PyPI
# =============================================================================
from aimakerspace.text_utils import TextFileLoader, CharacterTextSplitter  # ./aimakerspace/text_utils.py
from aimakerspace.vectordatabase import VectorDatabase  # ./aimakerspace/vectordatabase.py
import asyncio

## Task 2: Documents

> **📁 Local Data Directory: `data/`**
> 
> Source documents are stored in the **local `data/` directory**. This repository includes:
> - `data/HealthWellnessGuide.txt` — Primary knowledge base for the wellness assistant
> 
> The `TextFileLoader` class (from local `aimakerspace/text_utils.py`) handles reading these files.

### Loading Source Documents

In [None]:
# =============================================================================
# LOCAL DATA: Loading from ./data/ directory
# TextFileLoader is from local aimakerspace/text_utils.py
# =============================================================================
# The path is relative, so it resolves against the notebook's working directory.
text_loader = TextFileLoader("data/HealthWellnessGuide.txt")  # Local data file
documents = text_loader.load_documents()
# Last expression in the cell: the notebook displays the number of loaded documents.
len(documents)

In [None]:
# Sanity check: show the first 100 characters of the first loaded document.
print(documents[0][:100])

### Splitting Text Into Chunks

In [None]:
# CharacterTextSplitter is from local aimakerspace/text_utils.py
# Default: chunk_size=1000, chunk_overlap=200 (per the local library — confirm in text_utils.py)
# text_splitter is reused later to record chunk_size / chunk_overlap in the sources
# dataset, and split_documents is what gets embedded into the vector database.
text_splitter = CharacterTextSplitter()
split_documents = text_splitter.split_texts(documents)
# Last expression in the cell: the notebook displays the number of chunks produced.
len(split_documents)

In [None]:
# Inspect the first chunk, shown as a one-item slice of split_documents.
split_documents[0:1]

## Task 3: Embeddings and Vectors

> **📦 Local Library: `aimakerspace/vectordatabase.py`**
> 
> The `VectorDatabase` class (imported earlier from local `aimakerspace/`) provides:
> - In-memory storage for text → embedding vector mappings
> - `abuild_from_list()` — Async method to embed and store documents
> - `search_by_text()` — Cosine similarity search for semantic retrieval
> 
> Internally uses `EmbeddingModel` from `aimakerspace/openai_utils/embedding.py` which calls OpenAI's `text-embedding-3-small` (1536 dimensions).

### Populate Vector Database

In [None]:
# =============================================================================
# LOCAL LIBRARY: VectorDatabase from aimakerspace/vectordatabase.py
# Uses EmbeddingModel from aimakerspace/openai_utils/embedding.py internally
# =============================================================================
vector_db = VectorDatabase()
# abuild_from_list is awaited to completion through asyncio.run(); inside Jupyter this
# relies on the nest_asyncio patch applied earlier in the notebook.
# The name is rebound to the call's result — presumably the populated database
# itself; confirm in vectordatabase.py.
vector_db = asyncio.run(vector_db.abuild_from_list(split_documents))

### Validate Vector DB Retrieval

- currently uses semantic similarity

In [None]:
# Spot-check retrieval: ask for the top 3 matches for a sample query.
# Each hit is unpacked elsewhere in this notebook as a (chunk_text, score) pair.
vector_db.search_by_text("What exercises help with lower back pain?", k=3)

## Task 4: Prompts

> **📦 Local Library: `aimakerspace/openai_utils/`**
> 
> The prompt and chat utilities below come from the **local `aimakerspace/openai_utils/` directory**:
> - `prompts.py` → `UserRolePrompt`, `SystemRolePrompt`, `AssistantRolePrompt` for message formatting
> - `chatmodel.py` → `ChatOpenAI` wrapper for OpenAI chat completions (uses `gpt-4.1-mini`)

In [None]:
# =============================================================================
# LOCAL LIBRARY IMPORTS: aimakerspace/openai_utils/
# These classes are defined in the local ./aimakerspace/openai_utils/ directory
# =============================================================================
from aimakerspace.openai_utils.prompts import (  # ./aimakerspace/openai_utils/prompts.py
    UserRolePrompt,
    SystemRolePrompt,
    AssistantRolePrompt,
)

from aimakerspace.openai_utils.chatmodel import ChatOpenAI  # ./aimakerspace/openai_utils/chatmodel.py

# Chat client shared by this demo and by the RAG pipeline built later in the notebook.
chat_openai = ChatOpenAI()

# User turn: the template is a bare placeholder, so the message is exactly the
# text supplied as `content`.
user_prompt_template = "{content}"
user_role_prompt = UserRolePrompt(user_prompt_template)

# System turn: the persona is parameterised by a field of expertise.
system_prompt_template = "You are an expert in {expertise}, you always answer in a kind way."
system_role_prompt = SystemRolePrompt(system_prompt_template)

# Conversation sent to the model: system message first, then the user question.
messages = [
    system_role_prompt.create_message(expertise="Python"),
    user_role_prompt.create_message(content="What is the best way to write a loop?"),
]

# Reply returned by the chat wrapper; printed in the next cell.
response = chat_openai.run(messages)

In [None]:
# Show the model's reply produced by the previous cell.
print(response)

## Task 5: Retrieval Augmented Generation

> **📦 Local Library Components in RAG Pipeline**
> 
> The RAG pipeline below integrates all local `aimakerspace/` components:
> - `ChatOpenAI` — LLM wrapper from `aimakerspace/openai_utils/chatmodel.py`
> - `VectorDatabase` — Retriever from `aimakerspace/vectordatabase.py`
> - `SystemRolePrompt`, `UserRolePrompt` — Message formatters from `aimakerspace/openai_utils/prompts.py`
> 
> **RAG Flow**: Query → Vector similarity search (local) → Context augmentation → LLM generation

In [None]:
# =============================================================================
# RAG PROMPT TEMPLATES
# Using SystemRolePrompt and UserRolePrompt from local aimakerspace/openai_utils/prompts.py
# =============================================================================

# System prompt: grounding rules for the assistant, with two placeholders
# ({response_style}, {response_length}) that are filled in per request.
RAG_SYSTEM_TEMPLATE = """You are a helpful personal wellness assistant that answers health and wellness questions based strictly on provided context.

Instructions:
- Only answer questions using information from the provided context
- If the context doesn't contain relevant information, respond with "I don't have information about that in my wellness knowledge base"
- Be accurate and cite specific parts of the context when possible
- Keep responses {response_style} and {response_length}
- Only use the provided context. Do not use external knowledge.
- Include a gentle reminder that users should consult healthcare professionals for medical advice when appropriate
- Only provide answers when you are confident the context supports your response."""

# User prompt: carries the retrieved context, optional retrieval metadata
# ({context_count}, {similarity_scores}) and the question itself.
RAG_USER_TEMPLATE = """Context Information:
{context}

Number of relevant sources found: {context_count}
{similarity_scores}

Question: {user_query}

Please provide your answer based solely on the context above."""

# NOTE(review): `strict` and `defaults` are interpreted by the local prompt classes;
# presumably `defaults` supplies fallback values for placeholders the caller omits —
# confirm in aimakerspace/openai_utils/prompts.py.
# The RAG pipeline class in this notebook always passes both system placeholders
# explicitly, so these system defaults only matter for direct use of the prompt.
rag_system_prompt = SystemRolePrompt(
    RAG_SYSTEM_TEMPLATE,
    strict=True,
    defaults=dict(response_style="concise", response_length="brief"),
)

rag_user_prompt = UserRolePrompt(
    RAG_USER_TEMPLATE,
    strict=True,
    defaults=dict(context_count="", similarity_scores=""),
)

In [None]:
class RetrievalAugmentedQAPipeline:
    """
    Retrieve-then-generate question answering over the local vector store.

    Collaborators (all from the local ./aimakerspace/ directory):
        - ChatOpenAI (aimakerspace/openai_utils/chatmodel.py): produces the answer
        - VectorDatabase (aimakerspace/vectordatabase.py): returns (chunk, score) hits
        - rag_system_prompt / rag_user_prompt: notebook-level prompt objects built
          from SystemRolePrompt and UserRolePrompt

    Steps performed by run_pipeline():
        1. Look up the k best-matching chunks for the query
        2. Number the chunks as "[Source i]" and join them into one context string
        3. Render the system and user messages from the notebook-level prompts
        4. Ask the LLM and return its answer together with the retrieval details
    """

    def __init__(self, llm: ChatOpenAI, vector_db_retriever: VectorDatabase,
                 response_style: str = "detailed", include_scores: bool = False) -> None:
        # Chat wrapper from local aimakerspace/openai_utils/chatmodel.py
        self.llm = llm
        # Vector store from local aimakerspace/vectordatabase.py
        self.vector_db_retriever = vector_db_retriever
        # Fills the {response_style} placeholder of the system prompt.
        self.response_style = response_style
        # When true, similarity scores are shown to the model and returned to the caller.
        self.include_scores = include_scores

    def run_pipeline(self, user_query: str, k: int = 4, **system_kwargs) -> dict:
        """Answer `user_query` from the top-`k` retrieved chunks.

        Only `response_length` is read from `system_kwargs` (default "detailed");
        any other keyword argument is accepted but not used.

        Returns a dict with the keys "response", "context" (the raw retrieval
        hits), "context_count", "similarity_scores" (a list of formatted labels,
        or None when include_scores is off) and "prompts_used" (the rendered
        system and user messages).
        """
        hits = self.vector_db_retriever.search_by_text(user_query, k=k)
        hit_count = len(hits)

        # Number the sources from 1 so the model can cite them as "[Source i]".
        source_blocks = []
        score_labels = []
        for position, (chunk_text, score) in enumerate(hits, start=1):
            source_blocks.append(f"[Source {position}]: {chunk_text}\n\n")
            score_labels.append(f"Source {position}: {score:.3f}")
        context_text = "".join(source_blocks).strip()

        # Scores are exposed to the model and the caller only when requested.
        if self.include_scores:
            scores_line = f"Relevance scores: {', '.join(score_labels)}"
            reported_scores = score_labels
        else:
            scores_line = ""
            reported_scores = None

        system_message = rag_system_prompt.create_message(
            response_style=self.response_style,
            response_length=system_kwargs.get("response_length", "detailed"),
        )
        user_message = rag_user_prompt.create_message(
            user_query=user_query,
            context=context_text,
            context_count=hit_count,
            similarity_scores=scores_line,
        )

        answer = self.llm.run([system_message, user_message])

        return {
            "response": answer,
            "context": hits,
            "context_count": hit_count,
            "similarity_scores": reported_scores,
            "prompts_used": {
                "system": system_message,
                "user": user_message,
            },
        }

In [None]:
# =============================================================================
# INSTANTIATE RAG PIPELINE
# Combines local components: ChatOpenAI + VectorDatabase (both from aimakerspace/)
# Data source: ./data/HealthWellnessGuide.txt (loaded and chunked earlier)
# =============================================================================
# Wire the retriever and the chat model together. Scores are enabled so the
# result dict carries per-source similarity labels.
rag_pipeline = RetrievalAugmentedQAPipeline(
    vector_db_retriever=vector_db,  # Local VectorDatabase with embedded wellness docs
    llm=chat_openai,  # Local ChatOpenAI wrapper
    response_style="detailed",
    include_scores=True,
)

# NOTE: run_pipeline() only consumes `response_length` from its extra keyword
# arguments. Flags such as include_warnings / confidence_required would be
# silently ignored, so they are not passed here; the corresponding behaviour
# (professional-advice reminder, confidence requirement) is already hard-coded
# in RAG_SYSTEM_TEMPLATE.
result = rag_pipeline.run_pipeline(
    "What are some natural remedies for improving sleep quality?",
    k=3,
    response_length="comprehensive",
)

In [None]:
# Show the generated answer plus the retrieval details returned by run_pipeline().
print(f"Response: {result['response']}")
print(f"\nContext Count: {result['context_count']}")
# Prints a list of "Source i: score" labels, or None when the pipeline was built
# with include_scores=False.
print(f"Similarity Scores: {result['similarity_scores']}")

### Activity #1:

Enhance your Personal Wellness Assistant in some way! 

> **💡 Tip: Modify the Local Library**
> 
> Many enhancements require changes to the **local `aimakerspace/` library**:
> - Add new distance metrics → edit `aimakerspace/vectordatabase.py`
> - Support embedding dimension reduction → edit `aimakerspace/openai_utils/embedding.py`
> - Add PDF support → create new loader in `aimakerspace/text_utils.py`
> - Add new data sources → place files in `data/` directory

In [None]:
### YOUR CODE HERE
# Enhancement ideas that use local library/data:
# - Modify aimakerspace/vectordatabase.py for new similarity metrics
# - Modify aimakerspace/openai_utils/embedding.py for dimension reduction
# - Add new loaders to aimakerspace/text_utils.py (e.g., PDFFileLoader)
# - Add new data files to data/ directory

### Experiment Tracker - thought experiment

Use a DataFrame to capture how parameter adjustments impact RAG results. Key parameters to experiment with:
- **k**: Number of retrieved chunks (impacts context breadth vs. relevance)
- **response_style**: "concise" | "detailed" (affects output verbosity)
- **response_length**: "brief" | "comprehensive" (affects output depth)
- **chunk_size/overlap**: Requires rebuilding vector_db (see Task 2)

> NOTE: This is not part of the assignment, but it may help you understand these concepts if you're familiar with similar ML and data science workflows.

## RAG Evaluation Datasets

Building toward systematic evaluation requires three foundational datasets:

| Dataset | Purpose | Analogy |
|---------|---------|---------|
| **Sources** | Document your chunks | "What's in the filing cabinet?" |
| **Golden Testset** | Define what good looks like | "The answer key" |
| **Evaluation Inputs** | Capture what you actually got | "The student's work" |

These datasets enable **vibe checking** — building intuition about retrieval quality before introducing LLM-as-judge metrics in future sessions.

### Dataset 1: RAG Sources

Document every chunk with metadata. This helps you understand what's actually in your knowledge base.

In [None]:
import pandas as pd

# =============================================================================
# DATASET 1: WELLNESS RAG SOURCES
# Documents all chunks with metadata
# =============================================================================

def get_chunk_preview(chunk_text: str, max_len: int = 80) -> str:
    """Return a one-line preview of a chunk with line breaks made visible.

    The first ``max_len`` characters are kept, and each newline or carriage
    return in them is rewritten as the two-character sequence \\n or \\r, so
    chunking artifacts (e.g. fragments like ' Relief\\n') are easy to spot.
    When the chunk is longer than ``max_len``, "..." is appended to signal
    truncation.
    """
    # One pass over the text: map each control character to its visible escape.
    visible_escapes = {ord('\n'): '\\n', ord('\r'): '\\r'}
    head = chunk_text[:max_len].translate(visible_escapes)

    if len(chunk_text) > max_len:
        return head + "..."
    return head

# Build the sources dataset: one row per chunk, in the order the splitter produced them.
wellness_rag_sources = pd.DataFrame({
    "chunk_id": range(len(split_documents)),
    "chunk_content": split_documents,
    "preview": [get_chunk_preview(chunk) for chunk in split_documents],
    "char_count": [len(chunk) for chunk in split_documents],
    "word_count": [len(chunk.split()) for chunk in split_documents],
    # Metadata about chunking settings - read from the splitter instance.
    # These are scalars, so every row carries the same value.
    "chunk_size": text_splitter.chunk_size,
    "chunk_overlap": text_splitter.chunk_overlap,
    "source_file": "data/HealthWellnessGuide.txt",
})

# The full chunk text is left out of the display; the preview column stands in for it.
print(f"📚 RAG Sources Dataset: {len(wellness_rag_sources)} chunks\n")
display(wellness_rag_sources[["chunk_id", "preview", "char_count", "word_count", "chunk_size", "chunk_overlap", "source_file"]].head(10))

### Dataset 2: Golden Testset

Define "what good looks like" — curated question-answer pairs. This is your answer key for vibe checking.

In [None]:
# =============================================================================
# DATASET 2: WELLNESS GOLDEN TESTSET
# Curated Q&A pairs with reference answers - your "answer key"
# =============================================================================

# Define the golden testset.
# Each record has: question_id (int), question (the query sent to the pipeline),
# reference_answer (the expected answer used for manual comparison) and
# difficulty ("simple" or "multi-hop").
wellness_golden_testset_raw = [
    {
        "question_id": 1,
        "question": "What exercises help with lower back pain?",
        "reference_answer": "Recommended exercises for lower back pain include Cat-Cow Stretch, Bird Dog, Partial Crunches, Knee-to-Chest Stretch, and Pelvic Tilts.",
        "difficulty": "simple",
    },
    {
        "question_id": 2,
        "question": "What are natural remedies for improving sleep quality?",
        "reference_answer": "Natural sleep remedies include herbal teas (chamomile, valerian root), magnesium supplements, meditation, relaxation techniques, and good sleep hygiene practices.",
        "difficulty": "simple",
    },
    {
        "question_id": 3,
        "question": "How can I relieve neck and shoulder tension?",
        "reference_answer": "Exercises for neck and shoulder tension include Neck Rolls, Shoulder Shrugs, Chest Opener stretches, and Chin Tucks.",
        "difficulty": "simple",
    },
    {
        "question_id": 4,
        "question": "How can I manage stress naturally?",
        "reference_answer": "Natural stress management includes deep breathing exercises, meditation, regular physical activity, adequate sleep, and relaxation techniques.",
        "difficulty": "simple",
    },
    {
        "question_id": 5,
        "question": "What should I eat for better energy and nutrition?",
        "reference_answer": "For better energy, eat balanced meals with lean proteins, complex carbohydrates, healthy fats, and plenty of fruits and vegetables rich in vitamins and minerals.",
        "difficulty": "simple",
    },
    {
        "question_id": 6,
        "question": "How much water should I drink daily?",
        "reference_answer": "General hydration guidelines recommend drinking adequate water throughout the day, typically 8 glasses or adjusting based on activity level and climate.",
        "difficulty": "simple",
    },
    # The only "multi-hop" item: answering it draws on both the exercise and the sleep topics.
    {
        "question_id": 7,
        "question": "How do exercise and sleep work together for overall wellness?",
        "reference_answer": "Exercise improves sleep quality by reducing stress and promoting physical tiredness, while adequate sleep supports muscle recovery and provides energy for physical activity.",
        "difficulty": "multi-hop",
    },
]

# Build DataFrame: one row per golden question, columns taken from the record keys.
wellness_golden_testset = pd.DataFrame(wellness_golden_testset_raw)

# The reference answers are left out of the display to keep the table compact.
print(f"🎯 Golden Testset: {len(wellness_golden_testset)} questions\n")
display(wellness_golden_testset[["question_id", "question", "difficulty"]])

### Dataset 3: Evaluation Inputs

Capture what your retriever and LLM actually return for each golden testset question, so the retrieved contexts and generated response can be compared against the reference answer.

In [None]:
# =============================================================================
# DATASET 3: WELLNESS EVALUATION INPUTS
# Simple capture of RAG pipeline inputs and outputs (like GDELT dataset)
# =============================================================================

evaluation_inputs = []

for _, test_row in wellness_golden_testset.iterrows():
    # Run the RAG pipeline (one retrieval + one LLM call per golden question).
    # NOTE: this rebinds the notebook-level `result` created by the earlier demo cell.
    result = rag_pipeline.run_pipeline(test_row["question"], k=3)

    # Capture simple inputs/outputs
    evaluation_inputs.append({
        "user_input": test_row["question"],
        # Keep only the chunk text; the similarity score of each hit is dropped.
        "retrieved_contexts": [ctx for ctx, _score in result["context"]],
        "response": result["response"],
        "reference": test_row["reference_answer"],
    })

wellness_evaluation_inputs = pd.DataFrame(evaluation_inputs)

print(f"📊 Evaluation Inputs: {len(wellness_evaluation_inputs)} rows\n")
display(wellness_evaluation_inputs[["user_input", "response"]])

In [None]:
# View full dataset structure
print("📋 Full Dataset Columns:\n")
print(wellness_evaluation_inputs.columns.tolist())

# Look at the first evaluated question in detail.
first_row = wellness_evaluation_inputs.iloc[0]
response_text = first_row["response"]
# Only mark the preview as truncated when text was actually cut off.
response_preview = response_text[:200] + ("..." if len(response_text) > 200 else "")

print("\n📄 Sample row (first question):\n")
print(f"user_input: {first_row['user_input']}")
print(f"\nretrieved_contexts: {len(first_row['retrieved_contexts'])} chunks")
print(f"\nresponse (first 200 chars): {response_preview}")
print(f"\nreference: {first_row['reference']}")

### Vibe Check: Compare Response to Reference

Use this to manually review if the LLM response captures the key information from the reference answer.

In [None]:
# =============================================================================
# VIBE CHECK: Side-by-side comparison of response vs reference
# =============================================================================

# Number the questions by position rather than by index label, so the Q-numbers
# stay 1..N even if the DataFrame index is not the default 0..N-1 range.
for question_number, (_, row) in enumerate(wellness_evaluation_inputs.iterrows(), start=1):
    divider = "=" * 70
    response_text = row["response"]
    # Only mark the response as truncated when it really exceeds the 400-char preview.
    ellipsis = "..." if len(response_text) > 400 else ""

    print(divider)
    print(f"Q{question_number}: {row['user_input']}")
    print(divider)
    print(f"\n📝 RESPONSE (from LLM):\n{response_text[:400]}{ellipsis}")
    print(f"\n✓ REFERENCE (expected):\n{row['reference']}")
    print("\n")

In [None]:
# =============================================================================
# EXPORT DATASETS (Optional)
# =============================================================================

# Summarise the shape (rows × columns) of the three datasets built above.
print("📦 Dataset Summary:\n")
print(f"  wellness_rag_sources:        {wellness_rag_sources.shape[0]} rows × {wellness_rag_sources.shape[1]} cols")
print(f"  wellness_golden_testset:     {wellness_golden_testset.shape[0]} rows × {wellness_golden_testset.shape[1]} cols")
print(f"  wellness_evaluation_inputs:  {wellness_evaluation_inputs.shape[0]} rows × {wellness_evaluation_inputs.shape[1]} cols")

# Describe the columns of wellness_evaluation_inputs.
print("\n📋 Evaluation Inputs Schema (matches GDELT pattern):")
print("  • user_input:          The question")
print("  • retrieved_contexts:  List of retrieved chunks")
print("  • response:            LLM's generated answer")
print("  • reference:           Expected answer from golden testset")

# Uncomment to export
# wellness_rag_sources.to_csv("wellness_rag_sources.csv", index=False)
# wellness_golden_testset.to_csv("wellness_golden_testset.csv", index=False)
# wellness_evaluation_inputs.to_csv("wellness_evaluation_inputs.csv", index=False)