# 03 - Dashboard Launcher

**Zero-Spark** notebook (it runs no Spark code itself) that launches the production Streamlit dashboard.

Responsibilities:
- Verify infrastructure readiness (Garage, Kafka, Spark)
- Launch Streamlit app on port 8501
- Health checks and monitoring

## Run Notes
- Start Docker with `docker compose up -d --build` before running this notebook.
- Run cells top-to-bottom: bucket verification, config, infra checks, launch, wait, monitor, shutdown.
- Infra check now aborts early if Kafka/Spark/Garage stays unreachable after retries.
- Streamlit stdout/stderr is piped to notebook logs to troubleshoot missing data.

## Verify Bucket Data
Verify Gold tables in bucket before launching dashboard.

In [1]:
import logging
import os

import boto3
from deltalake import DeltaTable
from deltalake.exceptions import TableNotFoundError
from dotenv import load_dotenv

# Load settings from a .env file (python-dotenv) before reading the environment.
load_dotenv()

# Notebook-wide logging: INFO and above, with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- S3/Garage connection settings -----------------------------------------
# Each value comes from the environment; the second argument is the fallback
# used when the variable is unset.
bucket_name = os.environ.get("BUCKET_NAME", "datalake")
garage_endpoint = os.environ.get("GARAGE_ENDPOINT", "http://garage:3900")
access_key = os.environ.get("ACCESS_KEY", "minioadmin")
secret_key = os.environ.get("SECRET_KEY", "minioadmin")

# Options handed to DeltaTable when opening tables in the bucket.
storage_options = dict(
    AWS_ENDPOINT_URL=garage_endpoint,
    AWS_ACCESS_KEY_ID=access_key,
    AWS_SECRET_ACCESS_KEY=secret_key,
    AWS_REGION="us-east-1",
    AWS_S3_ALLOW_UNSAFE_SSL="true",
    AWS_ALLOW_HTTP="true",
)

# boto3 client pointed at the same endpoint, credentials and region.
s3_client = boto3.client(
    "s3",
    region_name=storage_options["AWS_REGION"],
    endpoint_url=garage_endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
)

logger.info("=" * 50)
logger.info("BUCKET DATA VERIFICATION")
logger.info("=" * 50)

# Existence probe. A failure is logged but does not stop the cell: the captured
# run shows HeadBucket returning 400 while the listing below still succeeds.
try:
    s3_client.head_bucket(Bucket=bucket_name)
    logger.info(f"OK: Bucket '{bucket_name}' exists")
except Exception as e:
    logger.error(f"ERROR: Bucket check failed: {e}")

# List at most 100 keys so the cell output stays readable.
logger.info(f"\nObjects in '{bucket_name}':")
try:
    response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=100)
    objects = response.get("Contents", [])
    for obj in objects:
        key = obj["Key"]
        size = obj["Size"]
        logger.info(f"  {key:50} ({size:>10} bytes)")
    if not objects:
        logger.warning("  (Empty bucket)")
    elif response.get("IsTruncated"):
        # Without this notice a capped listing looks complete, and keys that
        # sort later (e.g. under gold/) appear to be missing.
        logger.info(f"  ... listing truncated after {len(objects)} objects; more exist")
except Exception as e:
    logger.error(f"  Error listing objects: {e}")

logger.info(f"\nGold Layer Status:")
gold_paths = {
    "Traffic": f"s3a://{bucket_name}/gold/traffic_by_country",
    "Metrics": f"s3a://{bucket_name}/gold/metrics_by_category",
}

# Number of tables that failed for a reason other than "not created yet".
gold_errors = 0

for table_name, path in gold_paths.items():
    try:
        dt = DeltaTable(path, storage_options=storage_options)
        df = dt.to_pandas()
        record_count = len(df)
        cols = list(df.columns)
        # Show up to three column names; add an ellipsis only when some are hidden.
        col_preview = ', '.join(cols[:3]) + ("..." if len(cols) > 3 else "")
        logger.info(f"  OK - {table_name:15} | {record_count:5} rows | cols: {col_preview}")
    except (FileNotFoundError, TableNotFoundError):
        # deltalake signals a missing table with TableNotFoundError;
        # FileNotFoundError is kept for backward compatibility.
        logger.warning(f"  PENDING - {table_name:15} | Not yet created (waiting for data)")
    except Exception as e:
        gold_errors += 1
        logger.error(f"  ERROR - {table_name:15} | {str(e)[:50]}")

logger.info("\n" + "=" * 50)
if gold_errors:
    # Do not claim readiness when a table could not be read.
    logger.warning(
        f"Verification complete - {gold_errors} Gold table(s) could not be read; "
        "dashboard may show missing data"
    )
else:
    logger.info("Verification complete - Ready to launch dashboard")
logger.info("=" * 50 + "\n")

2026-01-23 18:13:33,806 - INFO - BUCKET DATA VERIFICATION
2026-01-23 18:13:33,816 - ERROR - ERROR: Bucket check failed: An error occurred (400) when calling the HeadBucket operation: Bad Request
2026-01-23 18:13:33,817 - INFO - 
Objects in 'datalake':
2026-01-23 18:13:33,837 - INFO -   bronze/flights/_delta_log/00000000000000000000.json (      2242 bytes)
2026-01-23 18:13:33,838 - INFO -   bronze/flights/_delta_log/00000000000000000001.json (      3091 bytes)
2026-01-23 18:13:33,839 - INFO -   bronze/flights/_delta_log/00000000000000000002.json (      3100 bytes)
2026-01-23 18:13:33,840 - INFO -   bronze/flights/_delta_log/00000000000000000003.json (      3092 bytes)
2026-01-23 18:13:33,840 - INFO -   bronze/flights/_delta_log/00000000000000000004.json (      3101 bytes)
2026-01-23 18:13:33,841 - INFO -   bronze/flights/_delta_log/00000000000000000005.json (      3088 bytes)
2026-01-23 18:13:33,842 - INFO -   bronze/flights/_delta_log/00000000000000000006.json (      3103 bytes)
2026-0

In [2]:
import os
import subprocess
import time
import requests
import logging
import socket
import threading
from typing import Dict, Tuple
from datetime import datetime

# Endpoints probed by the infrastructure health check, keyed by display name.
# The URL scheme (http://, https:// or tcp://) decides how each one is probed.
SERVICES: Dict[str, str] = dict([
    ("Kafka", "tcp://kafka1:9092"),
    ("Spark Master", "http://spark:8080"),
    # NOTE(review): port 3903 differs from the 3900 default of GARAGE_ENDPOINT
    # used elsewhere in this notebook - confirm this is the intended port.
    ("Garage S3", "tcp://garage:3903"),
])

# Kept as a string because it is passed directly as a command-line argument.
DASHBOARD_PORT = "8501"
DASHBOARD_URL = "http://localhost:" + DASHBOARD_PORT

logger.info("Configuration loaded")

2026-01-23 18:13:37,564 - INFO - Configuration loaded


In [3]:
def _check_http_endpoint(endpoint: str, timeout: float) -> Tuple[bool, str]:
    """GET the endpoint and report whether the response status is 2xx."""
    try:
        response = requests.get(endpoint, timeout=timeout)
    except requests.exceptions.Timeout:
        return False, "Timeout"
    except requests.exceptions.ConnectionError:
        return False, "Unreachable"
    except requests.exceptions.RequestException as e:
        # Other request failures (bad URL, too many redirects, ...) are not
        # reachability problems, so report them as errors.
        return False, f"Error: {str(e)[:30]}"

    if 200 <= response.status_code < 300:
        return True, f"OK ({response.status_code})"
    return False, f"HTTP {response.status_code}"


def _check_tcp_endpoint(endpoint: str, timeout: float) -> Tuple[bool, str]:
    """Open and immediately close a TCP connection to a tcp://host:port endpoint."""
    # Split on the LAST colon so bracketed IPv6 literals (tcp://[::1]:9092) work.
    host, _, port_str = endpoint[len("tcp://"):].rpartition(":")
    host = host.strip("[]")
    if not host or not port_str.isdigit():
        return False, "Invalid endpoint (expected tcp://host:port)"

    try:
        with socket.create_connection((host, int(port_str)), timeout=timeout):
            return True, "OK (TCP)"
    except OSError as e:
        # Covers refused connections, DNS failures and socket timeouts.
        return False, f"Unreachable ({str(e)[:20]})"


def health_check_service(name: str, endpoint: str) -> Tuple[bool, str]:
    """Check service connectivity via HTTP or TCP.

    Args:
        name: Display name of the service. Names containing "Kafka" get a
            10-second timeout; every other service gets 3 seconds.
        endpoint: URL starting with http://, https:// or tcp://host:port.

    Returns:
        (ok, status): ok is True when the service answered (2xx for HTTP,
        a successful connect for TCP); status is a short description for logs.

    This function never raises; unexpected failures come back as
    (False, "Error: ...").
    """
    # Kafka needs more time to accept connections
    timeout = 10 if "Kafka" in name else 3

    try:
        if endpoint.startswith(("http://", "https://")):
            return _check_http_endpoint(endpoint, timeout)
        if endpoint.startswith("tcp://"):
            return _check_tcp_endpoint(endpoint, timeout)
        return False, "Unsupported scheme"
    except Exception as e:
        return False, f"Error: {str(e)[:30]}"

def check_infrastructure() -> bool:
    """Probe every entry in SERVICES once and report overall readiness.

    Each service's status is logged right after it is checked.

    Returns:
        True when every health check succeeded, False otherwise.
    """
    logger.info("Infrastructure Health Check")
    logger.info("-" * 40)

    failures = 0
    for label in SERVICES:
        healthy, detail = health_check_service(label, SERVICES[label])
        logger.info(f"{label:20} {detail}")
        if not healthy:
            failures += 1

    if failures == 0:
        logger.info("All services healthy")
        return True

    logger.warning("Some services not ready")
    return False

# Retry budget for the infrastructure check: Kafka is the slow starter, so
# allow up to 10 attempts spaced 15 seconds apart.
max_retries = 10
wait_time = 15
ready = False

logger.info(f"Starting infrastructure checks (max {max_retries} attempts, {wait_time}s intervals)")
logger.info("Note: Kafka typically takes 30-60s to become ready after container start")
logger.info("")

# Poll until every service is healthy or the attempts are used up.
attempt = 0
while True:
    ready = check_infrastructure()
    if ready:
        break
    if attempt == max_retries - 1:
        logger.error("Infrastructure check failed after all retries")
        break
    logger.info(f"Retry in {wait_time}s (attempt {attempt + 1}/{max_retries})")
    logger.info("")
    time.sleep(wait_time)
    attempt += 1

# Abort the notebook with troubleshooting hints when the services never came up.
if not ready:
    banner = "=" * 60
    for message in (
        "",
        banner,
        "TROUBLESHOOTING:",
        "1. Verify Docker containers are running: docker ps",
        "2. Check Kafka logs: docker logs big-data-tan-kafka1-1",
        "3. Kafka may need more startup time (wait 60s and retry)",
        "4. Verify network: docker network ls",
        banner,
    ):
        logger.error(message)
    raise SystemExit("Infrastructure check failed; aborting dashboard launch")

logger.info("")
logger.info("✓ All infrastructure services ready - proceeding with dashboard launch")

2026-01-23 18:13:39,727 - INFO - Starting infrastructure checks (max 10 attempts, 15s intervals)
2026-01-23 18:13:39,729 - INFO - Note: Kafka typically takes 30-60s to become ready after container start
2026-01-23 18:13:39,731 - INFO - 
2026-01-23 18:13:39,731 - INFO - Infrastructure Health Check
2026-01-23 18:13:39,732 - INFO - ----------------------------------------
2026-01-23 18:13:39,735 - INFO - Kafka                OK (TCP)
2026-01-23 18:13:40,175 - INFO - Spark Master         OK (200)
2026-01-23 18:13:40,177 - INFO - Garage S3            OK (TCP)
2026-01-23 18:13:40,178 - INFO - All services healthy
2026-01-23 18:13:40,178 - INFO - 
2026-01-23 18:13:40,179 - INFO - ✓ All infrastructure services ready - proceeding with dashboard launch


In [None]:
def stream_logs(process: subprocess.Popen) -> None:
    """Pipe Streamlit stdout/stderr into notebook logs.

    Starts one daemon thread per captured stream. stdout lines are logged at
    INFO and stderr lines at ERROR, each prefixed with "[streamlit]".
    Whitespace-only lines are dropped. A stream that was not captured
    (``None``) is skipped.

    Note: the captured output of this notebook shows Streamlit's own startup
    messages arriving on stderr, so an ERROR-level "[streamlit]" entry is not
    necessarily a failure.

    Args:
        process: Child process, expected to be started with text-mode pipes.
    """
    def _pipe(stream, level: int) -> None:
        # Close the pipe at EOF so the file descriptor is released promptly.
        with stream:
            for line in iter(stream.readline, ""):
                text = line.rstrip()
                if text:  # skip blank lines instead of logging an empty "[streamlit] "
                    logger.log(level, f"[streamlit] {text}")

    for stream, level in ((process.stdout, logging.INFO), (process.stderr, logging.ERROR)):
        if stream is None:
            # Not started with PIPE for this stream; nothing to forward.
            continue
        threading.Thread(target=_pipe, args=(stream, level), daemon=True).start()


def launch_streamlit(app_path: str = "app.py", workdir: str = "/home/jovyan/work") -> subprocess.Popen:
    """Launch Streamlit application with live bucket access.

    Starts ``streamlit run <app_path>`` as a child process on DASHBOARD_PORT,
    bound to 0.0.0.0 and headless, and forwards its stdout/stderr to the
    notebook logger through ``stream_logs``.

    Args:
        app_path: Script passed to ``streamlit run``, resolved relative to
            ``workdir``. Defaults to the previously hard-coded "app.py".
        workdir: Working directory of the child process. Defaults to the
            previously hard-coded "/home/jovyan/work".

    Returns:
        The running ``subprocess.Popen`` handle. A returned handle only means
        the process was spawned, not that the dashboard is serving yet.

    Raises:
        Exception: whatever ``subprocess.Popen`` or ``stream_logs`` raises
            (for example a missing ``streamlit`` executable); it is logged
            with its traceback and re-raised.
    """
    logger.info("Launching Streamlit Dashboard")

    # Give the child the connection settings even when they are not set in this
    # process. Values already present in the environment win.
    env = os.environ.copy()
    for key, fallback in (
        ("GARAGE_ENDPOINT", "http://garage:3900"),
        ("BUCKET_NAME", "datalake"),
        ("ACCESS_KEY", "minioadmin"),
        ("SECRET_KEY", "minioadmin"),
    ):
        env.setdefault(key, fallback)

    cmd = [
        "streamlit", "run", app_path,
        "--server.port", DASHBOARD_PORT,
        "--server.address", "0.0.0.0",
        "--client.showErrorDetails", "true",
        "--logger.level", "debug",
        "--client.toolbarMode", "minimal",
        "--server.runOnSave", "true",
        "--server.headless", "true"
    ]

    try:
        process = subprocess.Popen(
            cmd,
            cwd=workdir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # line-buffered so log lines are forwarded as they are written
            env=env
        )
        stream_logs(process)
        logger.info(f"Streamlit started (PID: {process.pid})")
        logger.info(f"Environment: BUCKET_NAME={env.get('BUCKET_NAME')}, GARAGE_ENDPOINT={env.get('GARAGE_ENDPOINT')}")
        return process
    except Exception as e:
        # logger.exception keeps the traceback in the notebook log.
        logger.exception(f"Failed to launch Streamlit: {str(e)}")
        raise


# Re-running this cell must not spawn a second Streamlit: it would fight for
# the same port and the handle to the first process would be lost.
_existing_dashboard = globals().get("dashboard_process")
if _existing_dashboard is not None and _existing_dashboard.poll() is None:
    logger.warning(
        f"Streamlit already running (PID: {_existing_dashboard.pid}); "
        "reusing it instead of launching again"
    )
else:
    dashboard_process = launch_streamlit()

2026-01-23 18:13:43,053 - INFO - Launching Streamlit Dashboard
2026-01-23 18:13:43,060 - INFO - Streamlit started (PID: 1499)
2026-01-23 18:13:43,061 - INFO - Environment: BUCKET_NAME=datalake, GARAGE_ENDPOINT=http://garage:3900


2026-01-23 18:13:43,937 - INFO - [streamlit] 
2026-01-23 18:13:43,939 - ERROR - [streamlit] 2026-01-23 18:13:43.939 Starting server...
2026-01-23 18:13:43,940 - INFO - [streamlit] Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
2026-01-23 18:13:43,942 - ERROR - [streamlit] 2026-01-23 18:13:43.939 Serving static content from /opt/conda/lib/python3.12/site-packages/streamlit/static
2026-01-23 18:13:43,942 - INFO - [streamlit] 
2026-01-23 18:13:43,946 - ERROR - [streamlit] 2026-01-23 18:13:43.945 Server started on port 8501
2026-01-23 18:13:43,947 - ERROR - [streamlit] 2026-01-23 18:13:43.946 Runtime state: RuntimeState.INITIAL -> RuntimeState.NO_SESSIONS_CONNECTED
2026-01-23 18:13:44,123 - INFO - [streamlit] 
2026-01-23 18:13:44,124 - INFO - [streamlit]   You can now view your Streamlit app in your browser.
2026-01-23 18:13:44,125 - INFO - [streamlit] 
2026-01-23 18:13:44,125 - ERROR - [streamlit] 2026-01-23 18:13:44.124 Setting up signal handler
2026-0

In [5]:
logger.info(f"Waiting for Streamlit on {DASHBOARD_URL}...")

# Upper bound in wall-clock seconds. A monotonic deadline keeps the limit
# accurate even when individual requests take up to their 2-second timeout.
max_wait = 30
deadline = time.monotonic() + max_wait

while time.monotonic() < deadline:
    # Stop waiting as soon as the child process has died.
    if dashboard_process.poll() is not None:
        logger.error(f"Streamlit exited during startup (exit code {dashboard_process.returncode})")
        break

    try:
        response = requests.get(DASHBOARD_URL, timeout=2)
        if response.status_code == 200:
            logger.info("Dashboard is running")
            break
    except requests.exceptions.RequestException as e:
        # Connection refused and read timeouts are expected while the server
        # is still starting; keep polling.
        logger.debug(f"Dashboard not ready yet: {e}")

    time.sleep(1)
else:
    # Deadline reached without a 200 response.
    logger.warning("Dashboard startup timeout")

2026-01-23 18:13:46,947 - INFO - Waiting for Streamlit on http://localhost:8501...
2026-01-23 18:13:47,030 - INFO - Dashboard is running


In [6]:
# Summary banner: where the dashboard lives and which tables it reads.
lake_bucket = os.getenv('BUCKET_NAME', 'datalake')
for banner_line in (
    f"Dashboard Active - URL: {DASHBOARD_URL}",
    "Spark UI: http://localhost:4040",
    f"Data Lake: {lake_bucket}",
    "",
    "Dashboard pulling live data from:",
    f"  s3a://{lake_bucket}/gold/traffic_by_country",
    f"  s3a://{lake_bucket}/gold/metrics_by_category",
    "",
):
    logger.info(banner_line)

# Block this cell while Streamlit is alive, logging a heartbeat every 60 loop
# iterations (about once a minute). Interrupting the kernel ends the wait.
try:
    iteration = 1
    while dashboard_process.poll() is None:
        if iteration % 60 == 0:
            logger.info(f"Dashboard running ({iteration}s uptime) - pulling live data")
        time.sleep(1)
        iteration += 1
    # The loop only ends on its own when the child process has exited.
    logger.error("Streamlit process terminated")
except KeyboardInterrupt:
    logger.info("Shutdown requested")

2026-01-23 18:13:51,395 - INFO - Dashboard Active - URL: http://localhost:8501
2026-01-23 18:13:51,396 - INFO - Spark UI: http://localhost:4040
2026-01-23 18:13:51,397 - INFO - Data Lake: datalake
2026-01-23 18:13:51,399 - INFO - 
2026-01-23 18:13:51,400 - INFO - Dashboard pulling live data from:
2026-01-23 18:13:51,401 - INFO -   s3a://datalake/gold/traffic_by_country
2026-01-23 18:13:51,402 - INFO -   s3a://datalake/gold/metrics_by_category
2026-01-23 18:13:51,403 - INFO - 
2026-01-23 18:14:50,550 - INFO - Dashboard running (60s uptime) - pulling live data
2026-01-23 18:15:50,736 - INFO - Dashboard running (120s uptime) - pulling live data
2026-01-23 18:16:46,130 - INFO - Shutdown requested


In [21]:
def shutdown_dashboard():
    """Stop the Streamlit child process held in ``dashboard_process``.

    Sends terminate and waits up to 5 seconds; if the process is still alive
    it is killed and reaped. Any exception is logged, not raised, and
    "Shutdown complete" is logged in every case.
    """
    logger.info("Shutting down dashboard")

    try:
        already_exited = dashboard_process.poll() is not None
        if already_exited:
            logger.info("Process already stopped")
        else:
            logger.info(f"Terminating process (PID: {dashboard_process.pid})")
            dashboard_process.terminate()

            try:
                dashboard_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # Did not exit after terminate: escalate to kill and reap it.
                logger.warning("Forcing shutdown")
                dashboard_process.kill()
                dashboard_process.wait()
            else:
                logger.info("Streamlit terminated")
    except Exception as exc:
        logger.error(f"Shutdown error: {str(exc)}")

    logger.info("Shutdown complete")

# Stop the Streamlit process; only logs "Process already stopped" if it has already exited.
shutdown_dashboard()

2026-01-23 18:06:48,799 - INFO - Shutting down dashboard
2026-01-23 18:06:48,801 - INFO - Process already stopped
2026-01-23 18:06:48,801 - INFO - Shutdown complete
