# Test Refactored Architecture

This notebook tests the new modular system design with:
- Stage-specific processors
- Clean data loaders  
- Proper separation of concerns
- Input validation


In [None]:
# Make the parent directory importable. A relative sys.path entry such as '..'
# is resolved against the current working directory, so this presumably
# assumes the notebook is launched from its own folder — TODO confirm layout.
import sys
sys.path.append('..')

# Project modules exercised by this notebook. These imports must stay below
# the sys.path tweak above, otherwise the packages may not be found.
from flows.processing.data_loaders import RawDataLoader, SummaryDataLoader
from flows.processing.stage_processors import summarize_item, categorize_item
from flows.processing.tasks import get_items, process_item
from pipeline.pipeline_state import PipelineStateManager
from pipeline.config import pipeline_stages
from src.utils.logging_utils import setup_logger

# Logger for this notebook; the second argument looks like the log file
# name — NOTE(review): confirm against setup_logger's signature.
logger = setup_logger("test_refactored", "test_refactored.log")

# Reaching this point means every import above resolved without error.
print("✓ All imports successful")
print("=" * 60)


## Step 1: Test Data Loaders


In [None]:
# Smoke-test RawDataLoader against one real JSON file from the raw-data folder.
# The loader instance is kept at module level because later cells reuse it.
raw_loader = RawDataLoader()

# Find a test file
import os
from pathlib import Path

# Path.rglob yields entries in arbitrary, filesystem-dependent order. Sorting
# makes the notebook pick the same file on every run and on every machine,
# so a failure seen by one person can be reproduced by another.
test_files = sorted(Path("../data/raw").rglob("*.json"))
if test_files:
    test_file = str(test_files[0])
    print(f"Testing with file: {test_file}")

    try:
        raw_data = raw_loader.load(test_file)
        print("✓ RawData loaded successfully:")
        print(f"  - ID: {raw_data.id}")
        print(f"  - Title: {raw_data.title}")
        print(f"  - Type: {raw_data.type}")
        print(f"  - Speakers: {raw_data.speakers}")
        print(f"  - Transcript length: {len(raw_data.transcript)} chars")
    except Exception as e:
        # Deliberately broad: this is an interactive smoke test, so any
        # failure (load error or missing attribute) is reported, not raised.
        print(f"✗ RawData loading failed: {e}")
else:
    print("⚠ No test files found in data/raw/")


## Step 2: Test Stage Processors


In [None]:
# Fetch the items queued for the SUMMARIZE stage and run the summarization
# processor on the first one, reporting the outcome instead of raising.
try:
    items = get_items(pipeline_stages.SUMMARIZE)
    print(f"✓ Found {len(items)} items ready for summarization")

    if not items:
        print("⚠ No items found for summarization")
    else:
        # Only the first item is exercised; this is a smoke test.
        item = items[0]
        print(f"  - Testing item: {item['id']}")
        print(f"  - Raw file: {item['raw_file_path']}")

        # Run the summarization processor on that item
        result = summarize_item(item)
        if not result['success']:
            print(f"✗ Summarization failed: {result['error']}")
        else:
            print("✓ Summarization successful:")
            print(f"  - Summary length: {len(result['result'].summary)} chars")
            print(f"  - Word count: {result['result'].summary_word_count}")

except Exception as exc:
    # Broad on purpose: any failure in this cell is printed, not propagated.
    print(f"✗ Error testing stage processors: {exc}")


## Step 3: Test Error Handling


In [None]:
# Exercise the failure paths: a missing file for the loader and a bogus item
# for the processing task.
print("Testing error handling...")

# Loading a path that does not exist is expected to raise FileNotFoundError;
# any other exception type is left to propagate.
try:
    raw_loader.load("non_existent_file.json")
except FileNotFoundError:
    print("✓ Correctly handled non-existent file")
else:
    print("✗ Should have failed with non-existent file")

# An item pointing at a fake path is expected to come back as a failed result
# (success flag falsy) rather than succeed.
invalid_item = dict(id="test", raw_file_path="fake_path.json")
result = process_item(invalid_item, pipeline_stages.SUMMARIZE)
if result['success']:
    print("✗ Should have failed with invalid item")
else:
    print("✓ Correctly handled invalid item processing")
    print(f"  Error: {result['error']}")

print("=" * 60)


## Summary

### ✅ **Improved System Design:**

**1. Single Responsibility Principle:**
- `RawDataLoader` → Only loads raw data
- `SummaryDataLoader` → Only loads summary data  
- `summarize_item` → Only handles summarization
- `categorize_item` → Only handles categorization

**2. Clean Abstractions:**
- Data loaders use dataclasses for type safety
- Stage processors have clear input/output contracts
- Error handling is consistent across all components

**3. Proper Validation:**
- Required field validation in data loaders
- Content validation in processors
- Graceful error handling with detailed messages

**4. Maintainable Architecture:**
- Easy to add new stages (just create new processor)
- Easy to modify data formats (just update dataclasses)
- Clear separation between loading, processing, and orchestration

**5. Problems Eliminated:**
- ✅ No more giant if/elif blocks
- ✅ No more hard-coded values scattered around
- ✅ No more mixed abstraction levels
- ✅ No more tight coupling between components
