# DeepSeek OCR Pipeline - Google Cloud Run GPU

This notebook runs the 3-stage OCR pipeline on Google Cloud Run with GPU support.

**Prerequisites:**
- Google Cloud credentials configured (service account key or application default credentials)
- Cloud Run GPU enabled in your project
- Required Python packages (installed by the first code cell): `google-cloud-run`, `google-cloud-storage`, `google-cloud-build`, `google-auth`

**References:**
- [Cloud Run GPU Documentation](https://cloud.google.com/run/docs/configuring/services/gpu)
- [Supercharging Cloud Run with GPU Power](https://medium.com/google-cloud/supercharging-cloud-run-with-gpu-power-a-new-era-for-ai-workloads-3c54fcf60cae)

In [None]:
# Install required packages
!pip install -q google-cloud-run google-cloud-storage google-cloud-build google-auth

In [None]:
import os
import json
import time
from pathlib import Path

from google.cloud import run_v2
from google.cloud import storage
from google.protobuf import duration_pb2

In [None]:
# Option 1: Set credentials from a service account key file
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json"

# Option 2: If running locally, run this in terminal first:
# gcloud auth application-default login

# Verify credentials
from google.auth import default

# default() returns (credentials, project_id); project_id is None when no
# default project can be determined from the environment.
credentials, project = default()
if project is None:
    # Later cells build bucket and image names from this value, so a missing
    # project would otherwise end up as the literal text "None" in those names.
    print("Warning: credentials loaded but no default project was detected; "
          "set PROJECT_ID explicitly in the configuration cell.")
else:
    print(f"Authenticated with project: {project}")

## Configuration

In [None]:
# ---- GCP settings (edit for your project) ----
# `project` is the value returned by the authentication cell above.
PROJECT_ID = project
# Author's note: Cloud Run GPU available regions: us-central1, us-east4, europe-west4
REGION = "us-east4"
# GCS bucket that receives the pipeline output
BUCKET_NAME = "{}-ocr".format(PROJECT_ID)

# ---- Container image ----
# <region>-docker.pkg.dev/<project>/<repository>/<image>:<tag>
IMAGE_URI = "/".join(
    [
        f"{REGION}-docker.pkg.dev",
        str(PROJECT_ID),
        "deepseek-ocr",
        "deepseek-ocr:latest",
    ]
)

# ---- Project name (used as the job-name prefix and in the GCS path) ----
PROJECT_NAME = "deepseek-ocr"

# ---- Model and dataset ----
MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
DATASET_NAME = "HuggingFaceM4/FineVision"
DATASET_CONFIG = "olmOCR-mix-0225-documents"
MAX_SAMPLES = 20

# ---- Per-task resources requested for each job ----
GPU_TYPE = "nvidia-l4"  # Cloud Run supports L4 GPUs
GPU_COUNT = 1
MEMORY = "32Gi"
CPU = "8"

# ---- Output location ----
# One GCS location shared by every stage; the dataset is updated in place.
GCS_OUTPUT_URI = "gs://" + "/".join([BUCKET_NAME, PROJECT_NAME, "pipeline"])

# ---- Environment variables handed to every stage ----
# Values are strings because they are passed to the container as env vars.
BASE_ENV = dict(
    MODEL_ID=MODEL_NAME,
    DATASET_NAME=DATASET_NAME,
    DATASET_CONFIG=DATASET_CONFIG,
    MAX_SAMPLES=str(MAX_SAMPLES),
    HF_HUB_ENABLE_HF_TRANSFER="1",
    # Performance tuning
    EXTRACT_BATCH_SIZE="16",
    EXTRACT_MAX_CONCURRENCY="8",
    GPU_MEMORY_UTILIZATION="0.90",
    VLLM_STARTUP_TIMEOUT="900",
)

# Forward the Hugging Face token only when it is set to a non-empty value
# in the notebook's own environment.
BASE_ENV.update(
    {name: os.environ[name] for name in ("HF_TOKEN",) if os.environ.get(name)}
)

print(f"Project: {PROJECT_NAME}")
print(f"GCS Output URI: {GCS_OUTPUT_URI}")

## Create GCS Bucket

In [None]:
# Create GCS bucket if it doesn't exist
storage_client = storage.Client(project=PROJECT_ID)

# lookup_bucket() returns None only when the bucket is missing. Any other
# failure (permissions, network, bad credentials) now surfaces as its real
# error instead of being mistaken for "bucket not found" and triggering a
# create attempt.
bucket = storage_client.lookup_bucket(BUCKET_NAME)
if bucket is None:
    bucket = storage_client.create_bucket(BUCKET_NAME, location=REGION)
    print(f"Created bucket: gs://{BUCKET_NAME}")
else:
    print(f"Bucket already exists: gs://{BUCKET_NAME}")

## Build Container Image

**Note:** Building the container image requires one of the following:
1. A machine with Docker installed, from which you build the image and push it to Artifact Registry
2. Cloud Build (requires the `gcloud` CLI or the Cloud Build API)
3. A pre-built image

For simplicity, we'll create the Dockerfile here and you can build it separately.

In [None]:
# Create Dockerfile for Cloud Run GPU
# Note: Build context is parent directory (..) since llm_ocr is there
dockerfile_content = '''FROM vllm/vllm-openai:latest

# Install uv for fast dependency management
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"

# Copy pipeline code (from parent directory context)
WORKDIR /app
COPY llm_ocr/ /app/llm_ocr/
COPY google-cloud-run/gcr_job_runner.py /app/

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV PYTHONPATH=/app:$PYTHONPATH

# Entry point - uv run reads deps from inline script metadata (PEP 723)
ENTRYPOINT ["uv", "run", "/app/gcr_job_runner.py"]
'''

dockerfile_path = Path("Dockerfile.cloudrun")
dockerfile_path.write_text(dockerfile_content)
print(f"Created {dockerfile_path}")

# The repository created below must be the one IMAGE_URI pushes to, otherwise
# `docker push` targets a repository that was never created.
# IMAGE_URI has the form <host>/<project>/<repository>/<image>:<tag>.
artifact_repo = IMAGE_URI.split("/")[2]

print("\nTo build and push the image, run these commands from batch-ocr-inference/:")
print()
print("# Authenticate Docker with Artifact Registry")
print(f"gcloud auth configure-docker {REGION}-docker.pkg.dev")
print()
print("# Create Artifact Registry repository (if needed)")
print(f"gcloud artifacts repositories create {artifact_repo} --repository-format=docker --location={REGION} --project={PROJECT_ID}")
print()
print("# Build and push image (from batch-ocr-inference/ directory)")
print(f"docker build -f google-cloud-run/Dockerfile.cloudrun -t {IMAGE_URI} .")
print(f"docker push {IMAGE_URI}")

## Helper Functions (Python SDK)

In [None]:
def launch_stage(stage: str, env: dict = None, timeout_seconds: int = 3600):
    """Create (or recreate) the Cloud Run job for a pipeline stage and start it.

    The job is named ``f"{PROJECT_NAME}-{stage}"``. If a job with that name
    already exists it is deleted first and then created again with the current
    configuration. The execution is started but not awaited; use
    ``wait_for_job`` to block until it finishes.

    Args:
        stage: Pipeline stage (extract, describe, assemble). Also passed to the
            container as the ``PIPELINE_STAGE`` environment variable.
        env: Stage-specific environment variables (optional). These override
            entries of ``BASE_ENV`` that have the same key.
        timeout_seconds: Task timeout set on the job's task template, in
            seconds. Defaults to 3600, the value that was previously hard-coded.

    Returns:
        job_name: Name of the created/running job
    """
    import time

    from google.cloud import run_v2
    from google.protobuf import duration_pb2

    job_name = f"{PROJECT_NAME}-{stage}"

    # Create client with regional endpoint
    client_options = {"api_endpoint": f"{REGION}-run.googleapis.com"}
    client = run_v2.JobsClient(client_options=client_options)

    parent = f"projects/{PROJECT_ID}/locations/{REGION}"
    job_path = f"{parent}/jobs/{job_name}"

    # Merge base env with stage-specific env (stage-specific values win)
    full_env = {**BASE_ENV, "PIPELINE_STAGE": stage}
    if env:
        full_env.update(env)

    # Build environment variables list; values are coerced to str
    env_vars = [run_v2.EnvVar(name=k, value=str(v)) for k, v in full_env.items()]

    # Create job configuration
    job = run_v2.Job(
        template=run_v2.ExecutionTemplate(
            template=run_v2.TaskTemplate(
                # Disable GPU zonal redundancy (required for GPU jobs)
                gpu_zonal_redundancy_disabled=True,
                containers=[
                    run_v2.Container(
                        image=IMAGE_URI,
                        env=env_vars,
                        resources=run_v2.ResourceRequirements(
                            limits={
                                "cpu": CPU,
                                "memory": MEMORY,
                                "nvidia.com/gpu": str(GPU_COUNT),
                            }
                        ),
                    )
                ],
                node_selector=run_v2.NodeSelector(
                    accelerator=GPU_TYPE,
                ),
                timeout=duration_pb2.Duration(seconds=timeout_seconds),
                # A failed task is not retried
                max_retries=0,
            ),
        ),
        labels={"stage": stage},
    )

    # Delete existing job if it exists. This is best effort: a "not found"
    # error is the normal first-run case and is ignored silently; any other
    # error is reported as a warning and job creation is still attempted.
    try:
        client.get_job(name=job_path)
        print(f"Deleting existing job: {job_name}")
        delete_op = client.delete_job(name=job_path)
        delete_op.result()
        # Short pause after the delete operation reports done, before the
        # same job name is created again.
        time.sleep(5)
    except Exception as e:
        if "not found" not in str(e).lower() and "404" not in str(e):
            print(f"Warning: {e}")

    # Create job and wait for the create operation to finish
    print(f"Creating job: {job_name}")
    request = run_v2.CreateJobRequest(parent=parent, job=job, job_id=job_name)
    operation = client.create_job(request=request)
    operation.result()

    # Run job. The returned operation is intentionally not awaited so the
    # notebook cell returns immediately.
    print(f"Launching {stage} stage...")
    client.run_job(name=job_path)

    print(f"Started job: {job_name}")
    print(f"Console: https://console.cloud.google.com/run/jobs/details/{REGION}/{job_name}/executions?project={PROJECT_ID}")

    return job_name


def wait_for_job(job_name: str, poll_interval: int = 30, timeout: int = 3600):
    """Poll a Cloud Run job until its first listed execution succeeds or fails.

    Args:
        job_name: Name of the job (as returned by ``launch_stage``).
        poll_interval: Seconds to sleep between polls.
        timeout: Maximum number of seconds to wait overall.

    Returns:
        A dict with ``"status"`` (``"Completed"`` or ``"Failed"``) and
        ``"execution"`` (the execution object that was inspected).

    Raises:
        TimeoutError: If no execution succeeded or failed within ``timeout``
            seconds.
    """
    from google.cloud import run_v2
    import time

    client_options = {"api_endpoint": f"{REGION}-run.googleapis.com"}
    exec_client = run_v2.ExecutionsClient(client_options=client_options)

    parent = f"projects/{PROJECT_ID}/locations/{REGION}/jobs/{job_name}"
    # Monotonic clock: elapsed time is not affected by wall-clock adjustments.
    deadline = time.monotonic() + timeout

    print(f"Waiting for job {job_name}...")

    while time.monotonic() < deadline:
        # Only the first listed execution is inspected (this helper treats it
        # as the latest one), so take just that item instead of materialising
        # every page of results.
        latest = next(iter(exec_client.list_executions(parent=parent)), None)

        if latest is not None:
            if latest.succeeded_count > 0:
                print(f"  {job_name}: Completed ✓")
                return {"status": "Completed", "execution": latest}
            if latest.failed_count > 0:
                print(f"  {job_name}: Failed ✗")
                for cond in latest.conditions:
                    if cond.type_ == "Completed" and cond.state.name == "CONDITION_FAILED":
                        print(f"  Reason: {cond.message}")
                return {"status": "Failed", "execution": latest}
            print(f"  {job_name}: Running... (running={latest.running_count}, pending={latest.pending_count})")

        # Never sleep past the deadline, so the timeout is honoured closely.
        time.sleep(max(0, min(poll_interval, deadline - time.monotonic())))

    raise TimeoutError(f"Job {job_name} did not complete within {timeout}s")


def check_job_status(job_name: str):
    """Print a one-line status for every execution listed for ``job_name``.

    The status is derived from the execution's task counters, checked in this
    order: succeeded, failed, running, pending. If none of them is positive
    the execution is reported as UNKNOWN. Nothing is returned.
    """
    from google.cloud import run_v2

    executions_client = run_v2.ExecutionsClient(
        client_options={"api_endpoint": f"{REGION}-run.googleapis.com"}
    )
    job_resource = f"projects/{PROJECT_ID}/locations/{REGION}/jobs/{job_name}"

    # (counter attribute, label) pairs in precedence order; the first counter
    # that is positive decides the label.
    labels_by_counter = (
        ("succeeded_count", "SUCCEEDED ✓"),
        ("failed_count", "FAILED ✗"),
        ("running_count", "RUNNING..."),
        ("pending_count", "PENDING"),
    )

    print(f"Recent executions for {job_name}:")
    for run in executions_client.list_executions(parent=job_resource):
        label = next(
            (text for counter, text in labels_by_counter if getattr(run, counter) > 0),
            "UNKNOWN",
        )
        # Show only the last path segment of the execution's resource name
        short_name = run.name.rsplit("/", 1)[-1]
        print(f"  {short_name}: {label}")


# Import IO and rendering utilities from llm_ocr
from llm_ocr.gcr_io import load_dataset_from_gcs
from llm_ocr.document import render_sample_markdown, display_markdown


def display_samples(dataset, num_samples: int = 2):
    """Print a summary of ``dataset`` and preview its first few samples.

    Args:
        dataset: Object supporting ``len()``, integer indexing and a
            ``column_names`` attribute; each item must be dict-like and
            contain ``sample_id``.
        num_samples: Maximum number of samples to preview.

    For each previewed sample this shows the source image (if present), up to
    500 characters of the extracted markdown and of the final markdown (if
    present), and at most two extracted figures.
    """
    from IPython.display import display

    def _clip(text, limit=500):
        # Append '...' only when the text is longer than the limit.
        if len(text) > limit:
            return text[:limit] + '...'
        return text

    total = len(dataset)
    print(f"Dataset: {total} samples")
    print(f"Columns: {list(dataset.column_names)}")
    print()

    for idx in range(min(num_samples, total)):
        row = dataset[idx]
        print(f"=== Sample {idx}: {row['sample_id']} ===")

        if row.get('source_image'):
            print("Source image:")
            display(row['source_image'])

        # Prefer the 'document_markdown' column; fall back to the *_text one
        extracted_md = row.get('document_markdown') or row.get('document_markdown_text', '')
        if extracted_md:
            print(f"\nMarkdown preview ({len(extracted_md)} chars):")
            print(_clip(extracted_md))

        # Same fallback for the final markdown
        assembled_md = row.get('document_final_markdown') or row.get('document_final_markdown_text', '')
        if assembled_md:
            print(f"\nFinal markdown preview ({len(assembled_md)} chars):")
            print(_clip(assembled_md))

        figure_list = row.get('extracted_figures', [])
        if figure_list:
            print(f"\nExtracted figures: {len(figure_list)}")
            # Only the first two figures are rendered to keep the output short
            for figure in figure_list[:2]:
                display(figure)
        print()


## Stage 1: Extract

In [None]:
# Stage 1: Extract
# The only stage-specific setting is where the output goes in GCS.
stage1_env = dict(GCS_OUTPUT_URI=GCS_OUTPUT_URI)

# Creates the "<PROJECT_NAME>-extract" job and starts it; returns the job name.
stage1_job = launch_stage(stage="extract", env=stage1_env)

In [None]:
# Wait for extract to complete.
# wait_for_job blocks until the execution succeeds or fails, and raises
# TimeoutError if neither happens within its timeout.
stage1_result = wait_for_job(stage1_job)
# 'status' is "Completed" or "Failed", so a failed run is reported here too.
print(f"Extract stage completed: {stage1_result['status']}")

In [None]:
# Load and display samples after Extract.
# The dataset is read from the "dataset" folder under the shared output URI.
ds_extract = load_dataset_from_gcs(f"{GCS_OUTPUT_URI}/dataset")
display_samples(ds_extract, num_samples=2)

## Stage 2: Describe

In [None]:
# Stage 2: Describe
# Reads the dataset from the shared location and updates it in place
# (same location the extract stage wrote to).
stage2_env = dict(
    GCS_OUTPUT_URI=GCS_OUTPUT_URI,
    GCS_INPUT_URI=f"{GCS_OUTPUT_URI}/dataset",
)

# Creates the "<PROJECT_NAME>-describe" job and starts it; returns the job name.
stage2_job = launch_stage(stage="describe", env=stage2_env)

In [None]:
# Check on Stage 2 without blocking: prints one status line per execution.
# To block until the stage finishes instead, uncomment the two lines below:
# stage2_result = wait_for_job(stage2_job)
# print(f"Describe stage completed: {stage2_result['status']}")

check_job_status(stage2_job)

In [None]:
# Wait for describe to complete.
# The job name is built the same way launch_stage builds it, so it follows
# PROJECT_NAME instead of a hard-coded "deepseek-ocr" prefix. Using the name
# (not stage2_job) keeps this cell runnable without re-running the launch cell.
describe_result = wait_for_job(f"{PROJECT_NAME}-describe")
print(f"Describe stage completed: {describe_result['status']}")

In [None]:
# Load samples after Describe (same dataset location as after Extract,
# since the dataset is updated in place).
ds_describe = load_dataset_from_gcs(f"{GCS_OUTPUT_URI}/dataset")
# Preview is disabled here; uncomment to display two samples:
# display_samples(ds_describe, num_samples=2)

## Stage 3: Assemble

In [None]:
# Stage 3: Assemble
# Updates the dataset in place and also saves the final markdown files.
stage3_env = dict(
    GCS_OUTPUT_URI=GCS_OUTPUT_URI,
    GCS_INPUT_URI=f"{GCS_OUTPUT_URI}/dataset",
)

# Creates the "<PROJECT_NAME>-assemble" job and starts it; returns the job name.
stage3_job = launch_stage(stage="assemble", env=stage3_env)

In [None]:
# Check on Stage 3 without blocking: prints one status line per execution.
# To block until the stage finishes instead, uncomment the two lines below:
# stage3_result = wait_for_job(stage3_job)
# print(f"Assemble stage completed: {stage3_result['status']}")

check_job_status(stage3_job)

In [None]:
# Load and display final samples after Assemble.
# Run this only once the assemble job has finished; the dataset is read from
# the same in-place location used by the earlier stages.
ds_final = load_dataset_from_gcs(f"{GCS_OUTPUT_URI}/dataset")
display_samples(ds_final, num_samples=2)

In [None]:
# Display rendered markdown with images for the sample at index 1
# (the second sample, so the dataset must contain at least two samples).
# This properly renders figure: URIs using images from extracted_figures column
display_markdown(ds_final[1])


## Load Final Dataset

In [None]:
from datasets import load_from_disk
import tempfile

# Download and load final dataset
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(BUCKET_NAME)

# Every stage reads and writes the dataset at f"{GCS_OUTPUT_URI}/dataset", so
# the object prefix is derived from GCS_OUTPUT_URI by dropping "gs://<bucket>/".
# The trailing slash stops sibling prefixes (e.g. "dataset_old/") from matching.
dataset_prefix = GCS_OUTPUT_URI.split("/", 3)[3] + "/dataset/"
local_dir = tempfile.mkdtemp()

# Mirror the objects under the prefix into the temporary directory
downloaded_files = 0
blobs = bucket.list_blobs(prefix=dataset_prefix)
for blob in blobs:
    rel_path = blob.name[len(dataset_prefix):].lstrip("/")
    # Skip the prefix itself and "directory" placeholder objects
    if not rel_path or rel_path.endswith("/"):
        continue
    local_path = Path(local_dir) / rel_path
    local_path.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(str(local_path))
    downloaded_files += 1

# Fail with a message that names the GCS location rather than letting
# load_from_disk complain about an empty temporary directory.
if not downloaded_files:
    raise FileNotFoundError(
        f"No dataset files found under gs://{BUCKET_NAME}/{dataset_prefix}"
    )

dataset = load_from_disk(local_dir)
print(f"Loaded dataset: {dataset}")
dataset

In [None]:
# View a sample
sample = dataset[0]
print("Sample keys:", list(sample.keys()))
print("\nFinal markdown preview:")
# Use the same column fallback as display_samples; the trailing `or ""` also
# covers a column that is present but holds None, which cannot be sliced.
final_markdown = (
    sample.get("document_final_markdown")
    or sample.get("document_final_markdown_text")
    or ""
)
print(final_markdown[:1000])

## Cleanup

In [None]:
# Delete jobs (optional)
def delete_job(job_name: str):
    """Delete a Cloud Run job, waiting for the deletion to finish (best effort).

    Args:
        job_name: Name of the job inside PROJECT_ID / REGION.

    Any failure (including a job that does not exist) is printed rather than
    raised, so several deletions can be issued one after another.
    """
    from google.cloud.run_v2 import JobsClient

    # Use the same regional endpoint as the other helpers in this notebook.
    client = JobsClient(client_options={"api_endpoint": f"{REGION}-run.googleapis.com"})
    job_path = f"projects/{PROJECT_ID}/locations/{REGION}/jobs/{job_name}"
    try:
        # delete_job returns a long-running operation. Wait for it so that
        # "Deleted" is only printed once the deletion has actually completed,
        # and so that a failing deletion is reported below.
        client.delete_job(name=job_path).result()
        print(f"Deleted job: {job_name}")
    except Exception as e:
        print(f"Could not delete {job_name}: {e}")

# Uncomment to delete:
# delete_job("deepseek-ocr-extract")
# delete_job("deepseek-ocr-describe")
# delete_job("deepseek-ocr-assemble")