In [1]:
"""
============================================================================
Jupyter Notebook: Playground Pipeline Testing
============================================================================
Tests chunking, embedding, vector DB, and document ingestion exactly as
app_phase2.py does.

Workflow:
1. Load playground config
2. Create DocumentService via get_service_for_config
3. Test chunking strategies
4. Test embedding with different providers
5. Test vector DB operations (upsert, query)
6. Test full document ingestion (PDF, DOCX, TXT)
7. Test document listing and chunk retrieval
8. Test query/retrieval

All tests follow the same logic used in the Playground UI.
============================================================================
"""

import inspect
import logging
import os
import sys
import uuid
from datetime import datetime
from pathlib import Path

import yaml

# Configure logging for the notebook session: INFO level, with timestamp,
# logger name and level in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Session banner: records when and from which directory the notebook ran.
# Emitted with a single print call; sep="\n" puts each item on its own line.
print(
    "=" * 70,
    "PLAYGROUND PIPELINE TESTING",
    "=" * 70,
    f"Start time: {datetime.now()}",
    f"Working directory: {Path.cwd()}",
    "=" * 70,
    sep="\n",
)


PLAYGROUND PIPELINE TESTING
Start time: 2025-11-28 12:32:33.931966
Working directory: C:\Users\91917\Desktop\interview_preparation\Project\genai_multi_domain_platform


In [2]:
"""
============================================================================
CELL 1: Imports and Setup
============================================================================
"""
print("\n" + "=" * 70, "CELL 1: Imports and Setup", "=" * 70, sep="\n")

# Mandatory project imports: a failure here is allowed to stop the notebook.
from core.playground_config_manager import PlaygroundConfigManager
from core.config_manager import ConfigManager, DomainConfig
from core.services.document_service import DocumentService, ValidationError, ProcessingError

# Optional components. Each one is imported on its own so that a missing module
# is reported and replaced by a placeholder (None) instead of aborting the cell.
try:
    from core.factories.embedding_factory import EmbeddingFactory
except ImportError as exc:
    print(f"‚ö†Ô∏è  EmbeddingFactory not found: {exc}")
    EmbeddingFactory = None
else:
    print("‚úÖ EmbeddingFactory imported")

try:
    from core.factories.chunking_factory import ChunkingFactory
except ImportError as exc:
    print(f"‚ö†Ô∏è  ChunkingFactory not found: {exc}")
    ChunkingFactory = None
else:
    print("‚úÖ ChunkingFactory imported")

try:
    from core.factories.vectorstore_factory import VectorStoreFactory
except ImportError as exc:
    print(f"‚ö†Ô∏è  VectorStoreFactory not found: {exc}")
    VectorStoreFactory = None
else:
    print("‚úÖ VectorStoreFactory imported")

try:
    from core.pipeline.document_pipeline import DocumentPipeline
except ImportError as exc:
    print(f"‚ö†Ô∏è  DocumentPipeline not found: {exc}")
    DocumentPipeline = None
else:
    print("‚úÖ DocumentPipeline imported")

try:
    from core.utils.file_parsers import extract_text_from_file
except ImportError as exc:
    print(f"‚ö†Ô∏è  File parsers not found: {exc}")

    def extract_text_from_file(file_path, filename):
        """Fallback parser used when core.utils.file_parsers is unavailable.

        Reads ``file_path`` as UTF-8 text, dropping undecodable bytes, and
        returns ``{"text": <contents>, "metadata": {}}``. ``filename`` is
        accepted but not used by this fallback.
        """
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            raw_text = handle.read()
        return {"text": raw_text, "metadata": {}}
else:
    print("‚úÖ File parsers imported")

print("\n‚úÖ All imports completed (with fallbacks where needed)")



CELL 1: Imports and Setup





‚úÖ EmbeddingFactory imported
‚úÖ ChunkingFactory imported
‚úÖ VectorStoreFactory imported
‚úÖ DocumentPipeline imported
‚úÖ File parsers imported

‚úÖ All imports completed (with fallbacks where needed)





In [3]:
"""
============================================================================
CELL 2: Load Playground Config
============================================================================
"""
print("\n" + "=" * 70)
print("CELL 2: Load Playground Config")
print("=" * 70)

# Instance is kept for later cells (merge_with_global); listing, saving and
# loading below go through the class itself.
pg_mgr = PlaygroundConfigManager()

# List available configs
all_configs = PlaygroundConfigManager.list_configs()
print(f"\nüìã Available playground configs: {len(all_configs)}")
for idx, cfg in enumerate(all_configs, 1):
    print(f"  {idx}. {cfg['name']} (session: {cfg['session_id']})")

if not all_configs:
    print("\n‚ö†Ô∏è  No playground configs found!")
    print("Creating a test config...")

    # Create minimal test config
    test_config = {
        "domain_id": "pipeline_test",
        "name": "pipeline_test",
        "display_name": "Pipeline Test Configuration",
        "description": "Config for testing pipeline components",
        "vector_store": {
            "provider": "chromadb",
            "collection_name": "pipeline_test_collection",
            # NOTE(review): ".data/..." (leading dot, no slash) names a dot-directory;
            # confirm that "./data/..." was not the intended location.
            "persist_directory": ".data/chromadb/pipeline_test"
        },
        "chunking": {
            "strategy": "recursive",
            "recursive": {
                "chunk_size": 500,
                "overlap": 50
            }
        },
        "embeddings": {
            "provider": "sentence_transformers",
            "model_name": "all-MiniLM-L6-v2",
            "device": "cpu",
            "batch_size": 32,
            "normalize": True
        },
        "retrieval": {
            "strategies": ["hybrid"],
            "top_k": 10,
            "similarity": "cosine",
            "hybrid": {
                "alpha": 0.7
            }
        },
        "security": {
            "allowed_file_types": ["pdf", "docx", "txt"],
            "max_file_size_mb": 50
        }
    }

    # Short random suffix so repeated runs do not collide on the session id.
    session_id = str(uuid.uuid4())[:8]
    saved_path = PlaygroundConfigManager.save_config("pipeline_test", session_id, test_config)
    print(f"‚úÖ Created test config: {saved_path}")

    # Refresh list
    all_configs = PlaygroundConfigManager.list_configs()

    # The selection below indexes all_configs[0]; fail with a clear message
    # instead of a bare IndexError if the saved config is still not listed.
    if not all_configs:
        raise RuntimeError(
            f"Test config was saved to {saved_path} but list_configs() still returns "
            "no playground configs; cannot select a config for testing."
        )

# Select first config for testing
test_config_name = all_configs[0]['name']
test_config_file = all_configs[0]['filename']

print(f"\nüéØ Using config: {test_config_name}")
print(f"   File: {test_config_file}")



2025-11-28 12:32:46,459 - core.playground_config_manager - INFO - Global config loaded successfully



CELL 2: Load Playground Config

üìã Available playground configs: 4
  1. legal_test_config_28112025 (session: 28112025)
  2. finance_test_config_28112025 (session: 28112025)
  3. legal_test_config (session: 5369612f)
  4. finance_test_config (session: 6d98906f)

üéØ Using config: legal_test_config_28112025
   File: legal_test_config_28112025_28112025.yaml


In [4]:
# ============================================================================
# CELL 3: Load the selected playground config, merge it with the global
#         defaults, and validate the result as a DomainConfig
# ============================================================================

print("\n" + "=" * 70, "CELL 3: Load and Validate Config", "=" * 70, sep="\n")

# Raw playground settings, as stored in the file selected by the previous cell.
pg_config_dict = PlaygroundConfigManager.load_config(test_config_file)
print(f"‚úÖ Loaded playground config from: {test_config_file}")
print(f"   Keys: {list(pg_config_dict.keys())}")

# Overlay onto the global configuration.
merged_config = pg_mgr.merge_with_global(pg_config_dict)
print("‚úÖ Merged with global config")

# Identity fields: prefer the playground name, then an explicit domain_id,
# and finally the name of the selected config. Only missing keys are filled.
synth_domain_id = (
    pg_config_dict.get("playground_name")
    or pg_config_dict.get("domain_id")
    or test_config_name
)
for _identity_key in ("domain_id", "name", "display_name"):
    merged_config.setdefault(_identity_key, synth_domain_id)

# NOTE(review): the test config built in the previous cell stores its vector DB
# settings under "vector_store", while this cell reads "vectorstore" -- confirm
# that merge_with_global maps one onto the other.
print("\nüìã Config details:")
print(f"   Domain ID: {merged_config['domain_id']}")
for _label, _section, _field in (
    ("Vector Store", "vectorstore", "provider"),
    ("Chunking", "chunking", "strategy"),
    ("Embeddings", "embeddings", "provider"),
    ("Retrieval", "retrieval", "strategies"),
):
    print(f"   {_label}: {merged_config.get(_section, {}).get(_field)}")

# Validate with DomainConfig; any failure is reported and then re-raised so the
# notebook stops here rather than continuing with an invalid configuration.
try:
    domain_config = DomainConfig(**merged_config)
    print("\n‚úÖ DomainConfig validated successfully!")
    print(f"   Domain: {domain_config.domain_id}")
    print(f"   Vector Store: {domain_config.vectorstore.provider}")
    print(f"   Chunking: {domain_config.chunking.strategy}")
    print(f"   Embeddings: {domain_config.embeddings.provider}")
except Exception as exc:
    print(f"\n‚ùå Validation failed: {exc}")
    raise


2025-11-28 12:32:46,565 - core.playground_config_manager - INFO - Loaded playground config: legal_test_config_28112025_28112025.yaml



CELL 3: Load and Validate Config
‚úÖ Loaded playground config from: legal_test_config_28112025_28112025.yaml
   Keys: ['name', 'domain_id', 'description', 'vector_store', 'chunking', 'embeddings', 'retrieval', 'security', 'llm_rerank', 'playground_name', 'session_id', 'created_at', 'last_modified']
‚úÖ Merged with global config

üìã Config details:
   Domain ID: legal_test_config_28112025
   Vector Store: chromadb
   Chunking: recursive
   Embeddings: sentence_transformers
   Retrieval: ['hybrid']

‚úÖ DomainConfig validated successfully!
   Domain: legal_test_config_28112025
   Vector Store: chromadb
   Chunking: recursive
   Embeddings: sentence_transformers


In [5]:
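# Optional manual step, kept commented out: reinstall the embedding stack at
# pinned versions. Uncomment and run only if these packages need to be reset.
# !pip uninstall sentence-transformers huggingface-hub tokenizers -y
# !pip install sentence-transformers==2.7.0 huggingface-hub==0.23.0 tokenizers==0.19.1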


In [6]:
print("\n" + "=" * 70)
print("CELL 4: Create DocumentService")
print("=" * 70)

# This mimics get_service_for_config from app_phase2.py.
#
# Check the constructor signature up front instead of wrapping the call in
# `except TypeError`: a TypeError raised while the service initialises would
# otherwise be reported as "doesn't accept domain_config" and silently
# redirected into the YAML fallback.
ctor_params = inspect.signature(DocumentService).parameters
accepts_domain_config = "domain_config" in ctor_params or any(
    param.kind is inspect.Parameter.VAR_KEYWORD for param in ctor_params.values()
)

if accepts_domain_config:
    doc_service = DocumentService(domain_config=domain_config)
    print("‚úÖ DocumentService created with domain_config parameter")
else:
    # Fallback: write temp domain YAML
    print("‚ö†Ô∏è  DocumentService doesn't accept domain_config param")
    print("   Writing temporary domain YAML...")

    temp_domain_name = f"{synth_domain_id}_temp"
    temp_domain_file = Path("configs/domains") / f"{temp_domain_name}.yaml"
    temp_domain_file.parent.mkdir(parents=True, exist_ok=True)

    # model_dump() when the config object provides it, otherwise .dict().
    domain_dict = domain_config.model_dump() if hasattr(domain_config, 'model_dump') else domain_config.dict()
    # Explicit encoding so the file does not depend on the platform default.
    # NOTE(review): the file is left in configs/domains after the run -- confirm
    # whether it should be removed once the service has been created.
    with open(temp_domain_file, 'w', encoding='utf-8') as f:
        yaml.safe_dump(domain_dict, f)

    print(f"   Wrote: {temp_domain_file}")

    doc_service = DocumentService(domain_id=temp_domain_name)
    print(f"‚úÖ DocumentService created with temp domain: {temp_domain_name}")

print("\nüìä DocumentService initialized:")
print(f"   Domain: {getattr(doc_service, 'domain_id', 'N/A')}")
print(f"   Pipeline available: {hasattr(doc_service, 'pipeline')}")



CELL 4: Create DocumentService
‚ö†Ô∏è  DocumentService doesn't accept domain_config param
   Writing temporary domain YAML...


2025-11-28 12:32:46,637 - core.config_manager - INFO - ConfigManager initialized:
  Config dir: C:\Users\91917\Desktop\interview_preparation\Project\genai_multi_domain_platform\configs
  Global config: global_config.yaml
  Domains dir: configs\domains
  Templates dir: configs\templates
2025-11-28 12:32:46,639 - core.services.document_service - INFO - Initializing DocumentService for domain: legal_test_config_28112025_temp
2025-11-28 12:32:46,640 - core.config_manager - INFO - Loading domain config: legal_test_config_28112025_temp
2025-11-28 12:32:46,658 - core.config_manager - INFO - ‚úÖ Domain config loaded and validated: legal_test_config_28112025_temp
   Chunking: recursive
   Embeddings: sentence_transformers
   Vector Store: chromadb
   Retrieval: hybrid
2025-11-28 12:32:46,659 - core.pipeline.document_pipeline - INFO - Initializing DocumentPipeline for domain: legal_test_config_28112025
2025-11-28 12:32:46,660 - core.factories.embedding_factory - INFO - Creating embedder with pro

   Wrote: configs\domains\legal_test_config_28112025_temp.yaml


2025-11-28 12:32:51,718 - core.embeddings.sentence_transformer_embeddings - INFO - ‚úÖ Model loaded successfully!
   Model: all-mpnet-base-v2
   Dimension: 768
   Device: cpu
   Batch size: 32
   Normalize: True
2025-11-28 12:32:51,720 - core.factories.embedding_factory - INFO - Created SentenceTransformerEmbeddings: model=all-mpnet-base-v2, device=cpu, batch_size=32, normalize=True
2025-11-28 12:32:51,721 - core.pipeline.document_pipeline - INFO - ‚úÖ Embedding model created: all-mpnet-base-v2 (768-dim)
2025-11-28 12:32:51,722 - core.factories.chunking_factory - INFO - Creating chunker for strategy: recursive
2025-11-28 12:32:51,730 - core.chunking.recursive_chunker - INFO - Initialized RecursiveChunker: chunk_size=600, overlap=60, model=all-mpnet-base-v2
2025-11-28 12:32:51,731 - core.factories.chunking_factory - INFO - Created RecursiveChunker: chunk_size=600, overlap=60, model=all-mpnet-base-v2
2025-11-28 12:32:51,733 - core.pipeline.document_pipeline - INFO - ‚úÖ Chunker created: 

 create_vectorstore config provider='chromadb' collection_name='default_collection' index_type='hnsw' persist_directory='./data/chroma_db' cloud='aws' region='us-east-1' api_key=None dimension=None 


2025-11-28 12:32:52,575 - chromadb.telemetry.product.posthog - ERROR - Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
2025-11-28 12:32:52,582 - chromadb.telemetry.product.posthog - ERROR - Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
2025-11-28 12:32:52,606 - core.vectorstores.chromadb_store - INFO - ‚úÖ ChromaDB initialized successfully!
   Collection: default_collection
   Existing vectors: 0
   Distance metric: cosine
   Persist directory: C:\Users\91917\Desktop\interview_preparation\Project\genai_multi_domain_platform\data\chroma_db
2025-11-28 12:32:52,607 - core.factories.vectorstore_factory - INFO - ‚úÖ ChromaDB store created:
   Collection: default_collection
   Directory: ./data/chroma_db
2025-11-28 12:32:52,608 - core.pipeline.document_pipeline - INFO - ‚úÖ Vector store created: chromadb
2025-11-28 12:32:52,608 - core.pipeline.document_pipeline - INFO 

RuntimeError: Failed to initialize any retrieval strategies:
  - vector_similarity: VectorSimilarityRetrieval.__init__() got an unexpected keyword argument 'embedding_model'
