# Export CCE to JSONL for RAG

This notebook exports validated candidate content to JSONL format suitable for RAG systems.


## Setup


In [None]:
# Install SDK
!pip install -e /content/dorc-clients/sdk/python


In [None]:
import os
import json
from datetime import datetime
from pathlib import Path
from dorc_client import DorcClient

# Configure environment
# NOTE: these are placeholder values; replace them with your own engine URL
# and tenant slug before running the rest of the notebook.
os.environ["DORC_ENGINE_URL"] = "https://your-engine-url.run.app"
os.environ["DORC_TENANT_SLUG"] = "my-tenant"

# Create client
# NOTE(review): presumably DorcClient() reads the DORC_* variables set above -
# confirm against the SDK documentation.
client = DorcClient()

# Create export directory.
# parents=True keeps this cell working when the parent directory does not exist
# yet (e.g. when the notebook is run outside Colab); exist_ok=True makes
# re-running the cell harmless.
export_dir = Path("/content/dorc_exports")
export_dir.mkdir(parents=True, exist_ok=True)
print(f"Export directory: {export_dir}")


## Step 1: Validate Content (or Use Existing Run)

Either validate new content or use an existing run_id from a previous validation.


In [None]:
# Option 1: submit a fresh document for validation.
# `candidate_id` and `candidate_text` are the two values handed to the client;
# `candidate_text` is also reused later as the exported text.
candidate_id = "export-test-001"
candidate_text = """# My Validated Document

This content has been validated and is ready for export.
"""

print("Validating content...")
response = client.validate(candidate_text=candidate_text, candidate_id=candidate_id)

# Keep the run identifier: the later cells look the run up by this value.
run_id = response.run_id

# Short report of the outcome of the validation call
print(f"Run ID: {run_id}")
print(f"Status: {response.pipeline_status}")
print(f"Summary: PASS={response.content_summary.pass_}, FAIL={response.content_summary.fail}, WARN={response.content_summary.warn}")

# Option 2: reuse a previous validation instead (uncomment and fill in)
# run_id = "your-existing-run-id"
# response = client.get_run(run_id)


## Step 2: Check Validation Status (Optional)

Verify that the validation passed, or set `force_export = True` in the cell below to proceed with the export despite failures.


In [None]:
# Fetch the run again so the figures below come from a fresh lookup
run_state = client.get_run(run_id)
summary = run_state.content_summary

print(f"Validation Status: {run_state.pipeline_status}")
print(f"PASS: {summary.pass_}, FAIL: {summary.fail}, WARN: {summary.warn}, ERROR: {summary.error}")

# Only the FAIL count decides which message is shown
has_failures = summary.fail > 0
if not has_failures:
    print("\n✅ Validation passed. Ready to export.")
else:
    print("\n⚠️  WARNING: Validation has failures. Export anyway?")
    print("Set 'force_export = True' below to proceed.")

# Manual override: flip to True to export even though failures were reported
force_export = False


## Step 3: Export to JSONL


In [None]:
# The status-check step is optional, so this cell fetches what it needs itself
# instead of relying on variables that only exist if that cell was run. This
# also keeps the exported summary in sync with the exported run state.
run_state = client.get_run(run_id)
summary = run_state.content_summary

# Default to the safe choice (no forced export) when the override was never set.
force_export = globals().get("force_export", False)

# Check if we should export
if summary.fail > 0 and not force_export:
    print("❌ Export skipped due to validation failures.")
    print("Set 'force_export = True' in the cell above to export anyway.")
else:
    chunks = client.list_chunks(run_id)

    # Treat a missing meta mapping as empty so `.get` and the `**` spread
    # below cannot fail on None.
    run_meta = run_state.meta or {}

    # Prepare export record
    export_record = {
        "tenant_slug": run_state.tenant_slug,
        "type": "cce",
        "id": run_meta.get("candidate_id", run_id),
        "run_id": run_id,
        "text": candidate_text,  # Note: In production, you'd retrieve this from storage
        "metadata": {
            # Spread the run's own metadata FIRST: the computed fields below
            # must win if the metadata happens to contain the same key.
            **run_meta,
            "pipeline_status": run_state.pipeline_status,
            "content_summary": {
                "pass": summary.pass_,
                "fail": summary.fail,
                "warn": summary.warn,
                "error": summary.error,
            },
            "inserted_at": run_state.inserted_at,
            "chunk_count": len(chunks),
        },
    }

    # Generate filename with timestamp (local time, one-second resolution)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = export_dir / f"cce_export_{timestamp}.jsonl"

    # Write JSONL file (one JSON object per line).
    # ensure_ascii=False emits non-ASCII characters as-is, so the file must be
    # opened as UTF-8 explicitly rather than with the platform default encoding.
    # default=str stringifies values json cannot encode natively instead of
    # aborting the export (e.g. if inserted_at is a datetime - TODO confirm
    # the actual type returned by the SDK).
    with open(filename, "w", encoding="utf-8") as f:
        json_line = json.dumps(export_record, ensure_ascii=False, default=str)
        f.write(json_line + "\n")

    print(f"✅ Exported to: {filename}")
    print("\nExport record:")
    print(f"  ID: {export_record['id']}")
    print(f"  Tenant: {export_record['tenant_slug']}")
    print(f"  Status: {export_record['metadata']['pipeline_status']}")
    print(f"  Chunks: {export_record['metadata']['chunk_count']}")
    print(f"  Content length: {len(export_record['text'])} characters")


## Step 4: Download Instructions


In [None]:
# `filename` is only bound when the export cell actually wrote a file, so look
# it up defensively: a skipped export must not crash this cell with NameError.
exported_file = globals().get("filename")

print("\n" + "="*60)
print("Download Instructions")
print("="*60)
if exported_file is None:
    print("\nNo exported file found. Run the export cell (Step 3) successfully first.")
else:
    print("\nTo download the exported file from Colab:")
    print("\n1. Use the Colab file browser:")
    print(f"   - Navigate to: {export_dir}")
    print("   - Right-click the .jsonl file")
    print("   - Select 'Download'")
    print("\n2. Or use Python code:")
    print("   from google.colab import files")
    print(f"   files.download('{exported_file}')")
    # Item 3 shows a shell copy, not the files module (that is item 2),
    # so the label says so.
    print("\n3. Or copy the file with a shell command:")
    print(f"   !cp {exported_file} /content/")
    print("   # Then download from /content/ via file browser")
print("\n" + "="*60)


In [None]:
# Optional: Download directly (uncomment to use)
# from google.colab import files
# files.download(str(filename))
