# M4: Putting it all together

## Concepts Embedding

In [None]:
# =============================================================================
# Query ChromaDB - Check how many concepts are stored
# =============================================================================
# Diagnostic cell: connects to the ChromaDB HTTP server, looks up the
# "concepts" collection and reports how many items it holds. Every failure
# is reported with a hint instead of a traceback, so the cell is always safe
# to run.

import sys
sys.path.insert(0, '/app')

import chromadb

collection_name = "concepts"

# The header is the same whether or not the connection succeeds, so print it
# once up front rather than repeating it in each branch.
print("=" * 60)
print("ChromaDB Statistics")
print("=" * 60)

try:
    client = chromadb.HttpClient(host="chromadb", port=8000)
except Exception as e:
    # Client construction failed: treat as "server unreachable".
    print(f"❌ Cannot connect to ChromaDB: {e}")
    print("\n🔧 Troubleshooting:")
    print("   1. Make sure ChromaDB container is running:")
    print("      docker-compose up -d chromadb")
    print("   2. Check container status:")
    print("      docker-compose ps chromadb")
    print("   3. Check logs:")
    print("      docker-compose logs chromadb")
else:
    try:
        collection = client.get_collection(collection_name)
        count = collection.count()
        metadata = collection.metadata
    except Exception as e:
        # The "missing collection" case is recognised by its message text.
        # Compare case-insensitively so both phrasings match regardless of
        # capitalisation.
        message = str(e).lower()
        if "does not exist" in message or "not found" in message:
            print(f"⚠️  Collection '{collection_name}' does not exist yet.")
            print("\n📝 Next steps:")
            print("   1. Make sure MongoDB has concepts (check the knowledge graph notebook)")
            print("   2. Run the embedding cell below to create the collection and embed concepts")
        else:
            print(f"❌ Error accessing collection: {e}")
    else:
        print(f"✅ Collection: {collection_name}")
        print(f"✅ Total Concepts: {count}")
        print(f"Metadata: {metadata}")

        if count == 0:
            print("\n⚠️  ChromaDB collection exists but is empty!")
            print("   Run the embedding cell below to populate it.")
        else:
            print(f"\n✅ ChromaDB has {count} concepts embedded and ready for semantic search!")


ChromaDB Statistics
⚠️  Collection 'concepts' does not exist yet.

📝 Next steps:
   1. Make sure MongoDB has concepts (check the knowledge graph notebook)
   2. Run the embedding cell below to create the collection and embed concepts


In [None]:
# =============================================================================
# M4: Embed Concepts into ChromaDB
# =============================================================================
# Builds a ConceptEmbedder, embeds every concept with clear_existing=True,
# then prints the embedder's stats.
# NOTE(review): clear_existing=True presumably wipes the collection before
# re-embedding, so re-running this cell repeats the full job — confirm.

import os
import sys
sys.path.insert(0, '/app')

from src.retrieval.concept_embeddings import ConceptEmbedder

# Prefer a connection string supplied through the environment so credentials
# do not have to live in the notebook. The fallback is the value this cell
# used originally (kept so the notebook still runs unchanged); it embeds a
# password and should not be reused outside local development.
# NOTE(review): MONGO_URI is an assumed variable name — align it with
# whatever the project's compose/.env files actually define.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb://erica:erica_password_123@mongodb:27017/",
)

# Initialize embedder
embedder = ConceptEmbedder(
    mongo_uri=mongo_uri,
    chroma_host="chromadb",
    chroma_port=8000,
)

# Embed all concepts (takes ~1-2 minutes for 3120 concepts)
embedder.embed_all_concepts(clear_existing=True)

# Check stats
print("\n" + "=" * 50)
print("ChromaDB Stats:")
print(embedder.get_stats())

Loading embedding model: all-MiniLM-L6-v2...
Model loaded. Embedding dimension: 384
Found 3120 concepts in MongoDB
  Embedded 100/3120 concepts
  Embedded 200/3120 concepts
  Embedded 300/3120 concepts
  Embedded 400/3120 concepts
  Embedded 500/3120 concepts
  Embedded 600/3120 concepts
  Embedded 700/3120 concepts
  Embedded 800/3120 concepts
  Embedded 900/3120 concepts
  Embedded 1000/3120 concepts
  Embedded 1100/3120 concepts
  Embedded 1200/3120 concepts
  Embedded 1300/3120 concepts
  Embedded 1400/3120 concepts
  Embedded 1500/3120 concepts
  Embedded 1600/3120 concepts
  Embedded 1700/3120 concepts
  Embedded 1800/3120 concepts
  Embedded 1900/3120 concepts
  Embedded 2000/3120 concepts
  Embedded 2100/3120 concepts
  Embedded 2200/3120 concepts
  Embedded 2300/3120 concepts
  Embedded 2400/3120 concepts
  Embedded 2500/3120 concepts
  Embedded 2600/3120 concepts
  Embedded 2700/3120 concepts
  Embedded 2800/3120 concepts
  Embedded 2900/3120 concepts
  Embedded 3000/3120 con

Loading embedding model: all-MiniLM-L6-v2...
Model loaded. Embedding dimension: 384
  Stopping...
  Stopping...
  Stopping...
  Stopping...
  Stopping...


Exception ignored in atexit callback: <function dump_compile_times at 0xffff0c9a7e20>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/torch/_dynamo/utils.py", line 845, in dump_compile_times
    log.info(compile_times(repr="str", aggregate=True))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/torch/_dynamo/utils.py", line 831, in compile_times
    out += tabulate(rows, headers=("Function", "Runtimes (s)"))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/site-packages/torch/_dynamo/utils.py", line 237, in tabulate
    import tabulate
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1138, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 1070, in _find_spec
  File "/usr/local/lib/python3.11/site-packages/streamlit/web/bootstrap.py", line 42, in signal_handler
    

  Stopping...


In [2]:
# =============================================================================
# Test Semantic Search
# =============================================================================
# Sends a few sample questions to `embedder.search` and prints the top five
# hits for each. Requires `embedder` from the embedding cell to be defined
# already in this kernel.

# Test queries
test_queries = [
    "What is backpropagation?",
    "How do neural networks learn?",
    "Explain gradient descent",
    "What is the difference between CNN and RNN?",
    "How does attention mechanism work in transformers?",
]

# Horizontal rule drawn above and below each query heading.
divider = "=" * 60

for query in test_queries:
    print(f"\n{divider}")
    print(f"Query: {query}")
    print(divider)

    results = embedder.search(query, top_k=5)

    # Number the hits from 1 for display.
    for rank, hit in enumerate(results, start=1):
        title, score, difficulty = hit['title'], hit['score'], hit['difficulty']
        print(f"\n{rank}. {title} (score: {score}, difficulty: {difficulty})")
        # Only the first 150 characters of the definition are shown.
        snippet = hit['definition'][:150]
        print(f"   {snippet}...")


Query: What is backpropagation?

1. Back-Propagation (score: 0.6454, difficulty: intermediate)
   An algorithm used to compute the gradient of the loss function with respect to the weights in a neural network, enabling the optimization of the netwo...

2. Backward Propagation (score: 0.6286, difficulty: intermediate)
   A method used in training neural networks to calculate the gradient of the loss function with respect to the weights of the network....

3. Backward Pass (score: 0.6013, difficulty: intermediate)
   The process of computing gradients of the loss function with respect to the parameters of a neural network, performed during backpropagation....

4. Stochastic Backpropagation (score: 0.6004, difficulty: intermediate)
   An optimization method used in neural network training that is faster for large and redundant problems, such as classification tasks....

5. Guided Backpropagation (score: 0.5644, difficulty: intermediate)
   A method for visualizing the important regions o

## Graph Retrieval

In [1]:
# =============================================================================
# Test Graph Retrieval
# =============================================================================
# Expands a single seed concept through GraphRetriever and prints the
# concepts, resources, examples, prerequisite chains and topological order
# it returns.

import sys
sys.path.insert(0, '/app')

from src.retrieval.graph_retriever import GraphRetriever

retriever = GraphRetriever(neo4j_uri="bolt://neo4j:7687")

# try/finally guarantees the retriever is closed even when a query or one of
# the prints below raises; otherwise a failed run would leave it open.
try:
    # Test with a concept
    seed_concepts = ["Gradient Descent"]
    subgraph = retriever.expand_seeds(seed_concepts)

    print(f"Seed concepts: {subgraph.seed_concepts}")
    print(f"\nRetrieved {len(subgraph.concepts)} concepts:")
    for concept in subgraph.concepts:
        print(f"  [{concept.depth}] {concept.title} ({concept.relation_to_seed} of {concept.seed_concept})")
        # A concept may have an empty definition; show a placeholder instead.
        print(f"      {concept.definition[:80]}..." if concept.definition else "      (no definition)")

    # Only the first five resources and examples are listed to keep output short.
    print(f"\nRetrieved {len(subgraph.resources)} resources:")
    for resource in subgraph.resources[:5]:
        print(f"  [{resource.resource_type}] {resource.title}")
        print(f"      Explains: {', '.join(resource.concepts_explained[:3])}")

    print(f"\nRetrieved {len(subgraph.examples)} examples:")
    for example in subgraph.examples[:5]:
        print(f"  [{example.example_type}] {example.concept}")
        print(f"      {example.text[:80]}...")

    print(f"\nPrerequisite chains:")
    for chain in subgraph.prereq_chain:
        print(f"  {' → '.join(chain)}")

    # Get topological order for explanation
    order = retriever.get_topological_order(subgraph.concepts)
    print(f"\nTopological order (simple → complex):")
    for i, title in enumerate(order, 1):
        print(f"  {i}. {title}")
finally:
    retriever.close()

Seed concepts: ['Gradient Descent']

Retrieved 15 concepts:
  [0] Gradient Descent (seed of Gradient Descent)
      An optimization algorithm used to minimize a function by iteratively moving towa...
  [1] L2 Regularization (prerequisite of Gradient Descent)
      A technique that adds a penalty equal to the square of the magnitude of coeffici...
  [1] Loss Function (prerequisite of Gradient Descent)
      A mathematical function that quantifies the difference between predicted and act...
  [1] Gradients (prerequisite of Gradient Descent)
      A vector of partial derivatives indicating the direction and rate of the steepes...
  [1] Back-Propagation (prerequisite of Gradient Descent)
      An algorithm used to compute the gradient of the loss function with respect to t...
  [1] Matrix Derivatives (prerequisite of Gradient Descent)
      The derivative of a function with respect to a matrix, often represented as a te...
  [1] Chain Rule (prerequisite of Gradient Descent)
      The chain

## Hybrid Retriever

In [1]:
# =============================================================================
# M4: Full Retrieval + Generation Pipeline Test
# =============================================================================
# End-to-end smoke test: retrieve context for one question with
# HybridRetriever, show what was retrieved, then hand the result to
# AnswerGenerator and print the generated answer.

import os
import sys
sys.path.insert(0, '/app')

from src.retrieval.hybrid_retriever import HybridRetriever
from src.generation.answer_generator import AnswerGenerator

# Prefer a connection string supplied through the environment so credentials
# do not have to live in the notebook. The fallback is the value this cell
# used originally (kept so the notebook still runs unchanged); it embeds a
# password and should not be reused outside local development.
# NOTE(review): MONGO_URI is an assumed variable name — align it with
# whatever the project's compose/.env files actually define.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb://erica:erica_password_123@mongodb:27017/",
)

# Initialize components
retriever = HybridRetriever(
    mongo_uri=mongo_uri,
    chroma_host="chromadb",
    neo4j_uri="bolt://neo4j:7687",
)

# Everything after the retriever is created runs under try/finally so the
# retriever is closed even if retrieval or generation fails part-way.
try:
    generator = AnswerGenerator()  # Uses OPENROUTER_API_KEY from environment

    # Test query
    query = "How does backpropagation work in neural networks?"

    print("=" * 60)
    print(f"Query: {query}")
    print("=" * 60)

    # Step 1: Retrieve
    print("\n[1] Retrieving context...")
    result = retriever.retrieve(query)
    print(result.summary())

    # Step 2: Show what we're sending to the LLM
    print("\n[2] Seed concepts found:")
    for match in result.semantic_matches:
        print(f"   - {match['title']} (score: {match['score']})")

    # Only the first eight ordered concepts are listed to keep output short.
    print("\n[3] Explanation order:")
    for i, title in enumerate(result.ordered_concepts[:8], 1):
        print(f"   {i}. {title}")

    # Step 3: Generate answer
    print("\n[4] Generating answer...")
    answer = generator.generate(result)

    print("\n" + "=" * 60)
    print("ERICA'S ANSWER:")
    print("=" * 60)
    print(answer)
finally:
    # Cleanup
    retriever.close()

Loading embedding model: all-MiniLM-L6-v2...
Model loaded. Embedding dimension: 384
Query: How does backpropagation work in neural networks?

[1] Retrieving context...
Query: How does backpropagation work in neural networks?
Seeds: Back-Propagation, Backward Propagation, Stochastic Backpropagation, Backward Pass, Forward Propagation
Concepts: 15
Resources: 58
Examples: 27
Order: Backward Propagation → Stochastic Backpropagation → Forward Propagation → Residual Networks → Recurrent Neural Networks (RNNs)...

[2] Seed concepts found:
   - Back-Propagation (score: 0.6162)
   - Backward Propagation (score: 0.5871)
   - Stochastic Backpropagation (score: 0.5814)
   - Backward Pass (score: 0.5661)
   - Forward Propagation (score: 0.5596)

[3] Explanation order:
   1. Backward Propagation
   2. Stochastic Backpropagation
   3. Forward Propagation
   4. Residual Networks
   5. Recurrent Neural Networks (RNNs)
   6. Hard Attention
   7. Feedforward Networks
   8. Back-Propagation

[4] G

## Launch Erica

In [1]:
# =============================================================================
# Launch Streamlit App
# =============================================================================
# Starts the Streamlit UI as a child process of the notebook kernel and
# returns immediately; the handle is kept in `process` so it can be stopped
# later.

import subprocess
import webbrowser  # not used in this cell; kept in case other cells rely on it

# Start Streamlit in background.
# The script path is relative, so it is resolved against the kernel's
# current working directory.
process = subprocess.Popen([
    "streamlit", "run", "src/app.py",
    "--server.port", "8501",
    "--server.address", "0.0.0.0"
])

print("Streamlit app starting at http://localhost:8501")
# Popen does not block, so there is nothing for Ctrl+C / kernel interrupt to
# stop here; the child has to be terminated explicitly through its handle.
print("To stop it, run: process.terminate()")

Streamlit app starting at http://localhost:8501
Press Ctrl+C to stop
