# Hydra Routing Debug Notebook

This notebook tests each layer of the Hydra routing stack in order, so that a failure can be traced to the layer that causes it.

## Routing Layers:
1. DNS Resolution
2. Traefik Entrypoint (port 80)
3. Traefik Router Matching
4. Traefik Middlewares (forwardAuth, stripPrefix)
5. Traefik Service Backend
6. Docker Network Connectivity
7. Container Service Response
8. Full Route Test (Through Traefik)
9. ForwardAuth Flow Test
10. GPU Access Test (nvidia-smi) - optional
11. Kubernetes Setup Verification - optional
12. K8s GPU Pod Test - optional

In [None]:
import requests
import socket
import json
import subprocess
import urllib3

# Suppress SSL warnings for self-signed certs in test environment
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Base URL of the Traefik API; the entrypoint, router, middleware and service
# checks append their endpoint paths to it.
TRAEFIK_API = 'http://localhost:8081/api'
# Hostname resolved by the DNS check and used to build the routed URLs.
HYDRA_HOST = 'hydra.local'
# User whose container (student-<TEST_USER>) and /students/<TEST_USER>/ routes are probed.
TEST_USER = 'user1'

## Layer 1: DNS Resolution

In [None]:
def test_dns():
    """Check that HYDRA_HOST resolves to an IP address.

    Prints the resolved address on success; on failure prints the resolver
    error and a hint for adding an /etc/hosts entry.

    Returns:
        True if the lookup succeeds, False if it raises socket.gaierror.
    """
    try:
        resolved = socket.gethostbyname(HYDRA_HOST)
    except socket.gaierror as err:
        print(f'[FAIL] DNS: {err}')
        print('  Add "127.0.0.1 hydra.local" to /etc/hosts')
        return False

    print(f'[OK] DNS: {HYDRA_HOST} -> {resolved}')
    return True

test_dns()

## Layer 2: Traefik Entrypoint

In [None]:
def test_traefik_entrypoint():
    """List Traefik's entrypoints and confirm that one named 'web' exists.

    Queries ``{TRAEFIK_API}/entrypoints``, prints every entrypoint with its
    address, then looks for the entry whose name is exactly 'web'.

    Returns:
        True if a 'web' entrypoint is present; False if it is missing or if
        the request or response handling raises.
    """
    try:
        response = requests.get(f'{TRAEFIK_API}/entrypoints', timeout=5)
        entrypoints = response.json()

        print('Entrypoints:')
        for entry in entrypoints:
            print(f"  {entry['name']}: {entry.get('address', 'N/A')}")

        # First entry called 'web', or None when there is no such entry.
        web_entry = None
        for entry in entrypoints:
            if entry['name'] == 'web':
                web_entry = entry
                break

        if not web_entry:
            print('[FAIL] Web entrypoint not found')
            return False

        print(f'[OK] Web entrypoint: {web_entry.get("address")}')
        return True
    except Exception as err:
        print(f'[FAIL] Traefik API: {err}')
        return False

test_traefik_entrypoint()

## Layer 3: Traefik Router Matching

In [None]:
def test_traefik_routers():
    """Print every Traefik HTTP router and check that student routers exist.

    Queries ``{TRAEFIK_API}/http/routers`` and prints each router's status
    marker, name, rule, service and middlewares. A router counts as a student
    router when its name contains 'student' (case-insensitive).

    Returns:
        True if at least one student router is found; False if none is found
        or if the request or response handling raises.
    """
    try:
        response = requests.get(f'{TRAEFIK_API}/http/routers', timeout=5)
        routers = response.json()

        print('HTTP Routers:')
        print('-' * 50)
        for router in routers:
            marker = '[OK]' if router.get('status') == 'enabled' else '[--]'
            print(f"{marker} {router['name']}")
            print(f"     Rule: {router.get('rule', 'N/A')}")
            print(f"     Service: {router.get('service', 'N/A')}")
            print(f"     Middlewares: {router.get('middlewares', [])}")

        student_total = 0
        for router in routers:
            if 'student' in router['name'].lower():
                student_total += 1

        if not student_total:
            print('\n[FAIL] No student routers found')
            return False

        print(f'\n[OK] {student_total} student router(s)')
        return True
    except Exception as err:
        print(f'[FAIL] {err}')
        return False

test_traefik_routers()

## Layer 4: Traefik Middlewares

In [None]:
def test_traefik_middlewares():
    """Print Traefik's HTTP middlewares and check the ones the routes need.

    An auth middleware (name contains 'auth') is always required. A strip
    middleware (name contains 'strip') is required only when some student
    router exists whose name does not contain 'jupyter'.

    Returns:
        True if the required middlewares are present; False if one is missing
        or if a request or the response handling raises.
    """
    try:
        response = requests.get(f'{TRAEFIK_API}/http/middlewares', timeout=5)
        middlewares = response.json()

        print('HTTP Middlewares:')
        print('-' * 50)
        for middleware in middlewares:
            marker = '[OK]' if middleware.get('status') == 'enabled' else '[--]'
            print(f"{marker} {middleware['name']}")
            if 'forwardAuth' in middleware:
                target = middleware['forwardAuth'].get('address', 'N/A')
                print(f"     forwardAuth -> {target}")
            elif 'stripPrefix' in middleware:
                prefixes = middleware['stripPrefix'].get('prefixes', [])
                print(f"     stripPrefix: {prefixes}")

        def named_like(fragment):
            """Return the middlewares whose lower-cased name contains fragment."""
            return [m for m in middlewares if fragment in m['name'].lower()]

        auth_mw = named_like('auth')
        strip_mw = named_like('strip')

        # The router list decides whether a strip middleware is expected at all.
        routers = requests.get(f'{TRAEFIK_API}/http/routers', timeout=5).json()
        router_names = (router.get('name', '').lower() for router in routers)
        student_names = (name for name in router_names if 'student' in name)
        has_non_jupyter_routes = not all('jupyter' in name for name in student_names)

        if not auth_mw:
            print('\n[FAIL] Missing auth middleware')
            return False

        if has_non_jupyter_routes and not strip_mw:
            print('\n[FAIL] Missing strip middleware (non-jupyter routes present)')
            return False

        print(f'\n[OK] {len(auth_mw)} auth middleware(s)')
        if strip_mw:
            print(f'[OK] {len(strip_mw)} strip middleware(s)')
        return True
    except Exception as err:
        print(f'[FAIL] {err}')
        return False

test_traefik_middlewares()

## Layer 5: Traefik Services (Backend Resolution)

In [None]:
def test_traefik_services():
    """Print Traefik's HTTP services and check that student services exist.

    Queries ``{TRAEFIK_API}/http/services`` and prints each service with the
    server URLs of its load balancer, if it has one. A service counts as a
    student service when its name contains 'student' (case-insensitive).

    Returns:
        True if at least one student service is found; False if none is found
        or if the request or response handling raises.
    """
    try:
        response = requests.get(f'{TRAEFIK_API}/http/services', timeout=5)
        services = response.json()

        print('HTTP Services:')
        print('-' * 50)
        for service in services:
            marker = '[OK]' if service.get('status') == 'enabled' else '[--]'
            print(f"{marker} {service['name']}")
            if 'loadBalancer' not in service:
                continue
            for backend in service['loadBalancer'].get('servers', []):
                print(f"     -> {backend.get('url', 'N/A')}")

        student_total = len(
            [service for service in services if 'student' in service['name'].lower()]
        )
        if not student_total:
            print('\n[FAIL] No student services')
            return False

        print(f'\n[OK] {student_total} student service(s)')
        return True
    except Exception as err:
        print(f'[FAIL] {err}')
        return False

test_traefik_services()

## Layer 6: Docker Network Connectivity

In [None]:
def test_docker_network():
    """Check that Traefik and a student container share hydra_students_net.

    Runs ``docker network inspect hydra_students_net``, prints every attached
    container with its IPv4 address, and looks for container names containing
    'traefik' and 'student' (case-insensitive).

    Returns:
        True if both kinds of container are attached; False if either is
        missing, the inspect command fails, or anything raises.
    """
    try:
        inspect = subprocess.run(
            ['docker', 'network', 'inspect', 'hydra_students_net'],
            capture_output=True, text=True
        )
        if inspect.returncode:
            print('[FAIL] Network hydra_students_net not found')
            return False

        # The command prints a JSON list with one entry per inspected network.
        attached = json.loads(inspect.stdout)[0].get('Containers', {})

        print('Containers on hydra_students_net:')
        print('-' * 50)
        for container_id, details in attached.items():
            print(f"  {details.get('Name', container_id)}: {details.get('IPv4Address', 'N/A')}")

        lowered = [details.get('Name', '').lower() for details in attached.values()]
        traefik_missing = not any('traefik' in name for name in lowered)
        student_missing = not any('student' in name for name in lowered)

        if traefik_missing:
            print('\n[FAIL] Traefik not on network')
        if student_missing:
            print('\n[FAIL] No student containers on network')
        if traefik_missing or student_missing:
            return False

        print('\n[OK] Traefik and student containers connected')
        return True
    except Exception as err:
        print(f'[FAIL] {err}')
        return False

test_docker_network()

## Layer 7: Direct Container Service Test

In [None]:
def test_container_direct():
    """Probe the student container's services directly, bypassing Traefik.

    Looks up the IP address of ``student-{TEST_USER}`` on hydra_students_net
    via ``docker inspect`` and sends an HTTP GET to code-server (port 8443)
    and Jupyter (port 8888) on that address. Any HTTP response counts as
    reachable; only a failed request is reported as a failure.

    Returns:
        True if both services answer; False if docker cannot be run, the
        container or its IP cannot be found, or either request fails.
    """
    container = f'student-{TEST_USER}'
    # Select the IP from the specific network (hydra_students_net) so that a
    # container attached to several networks does not yield concatenated IPs.
    ip_template = (
        '{{range $key, $value := .NetworkSettings.Networks}}'
        '{{if eq $key "hydra_students_net"}}{{$value.IPAddress}}{{end}}{{end}}'
    )
    try:
        result = subprocess.run(
            ['docker', 'inspect', '-f', ip_template, container],
            capture_output=True, text=True
        )
    except OSError as e:
        # docker missing or not executable: report it like any other failed
        # check instead of aborting the whole test run with a traceback.
        print(f'[FAIL] Could not run docker: {e}')
        return False

    if result.returncode != 0:
        print(f'[FAIL] Container {container} not found')
        return False

    ip = result.stdout.strip()
    if not ip:
        print('[FAIL] Container has no IP on hydra_students_net')
        return False

    print(f'Container IP: {ip}')
    success = True

    # code-server runs on HTTP, not HTTPS (--auth none mode), so both probes
    # use plain http.
    for label, port in (('code-server', 8443), ('Jupyter', 8888)):
        print(f'\n{label} ({port}):')
        try:
            r = requests.get(f'http://{ip}:{port}/', timeout=5)
            print(f'  [OK] status={r.status_code}')
        except Exception as e:
            print(f'  [FAIL] {e}')
            success = False

    return success

test_container_direct()

## Layer 8: Full Route Test (Through Traefik)

In [None]:
def test_full_route():
    """Request the dashboard, VS Code and Jupyter URLs through Traefik.

    Redirects are not followed, so the status Traefik itself returns is what
    gets classified:

    * 200 -> OK
    * 301/302/303/307/308 -> redirect (target printed), not a failure
    * 401 -> forwardAuth rejected the request, not a failure
    * 404 and any 5xx -> failure
    * anything else -> printed as-is, not a failure

    Returns:
        True if no route produced a failure status or a request error.
    """
    tests = [
        ('Dashboard', f'http://{HYDRA_HOST}/dashboard'),
        ('VS Code', f'http://{HYDRA_HOST}/students/{TEST_USER}/vscode/'),
        ('Jupyter', f'http://{HYDRA_HOST}/students/{TEST_USER}/jupyter/'),
    ]
    redirect_codes = {301, 302, 303, 307, 308}
    failure_labels = {
        404: 'not found',
        502: 'bad gateway',
        503: 'service unavailable',
        504: 'gateway timeout',
    }

    print('Route tests through Traefik:')
    print('-' * 50)

    all_ok = True
    for name, url in tests:
        try:
            r = requests.get(url, timeout=10, allow_redirects=False)
        except Exception as e:
            print(f'{name}: [FAIL] {e}')
            all_ok = False
            continue

        code = r.status_code
        if code == 200:
            result = '[OK]'
        elif code in redirect_codes:
            result = f'[REDIRECT] -> {r.headers.get("Location", "?")}'
        elif code == 401:
            result = '[AUTH] forwardAuth working'
        elif code in failure_labels:
            result = f'[FAIL] {failure_labels[code]}'
            all_ok = False
        elif code >= 500:
            # Any other server-side error means the route is broken too.
            result = f'[FAIL] server error ({code})'
            all_ok = False
        else:
            result = f'[{code}]'
        print(f'{name}: {result}')

    return all_ok

test_full_route()

## Layer 9: ForwardAuth Flow Test

In [None]:
def test_forward_auth():
    """Call the auth verify endpoint without a session and check its answer.

    A 401 means unauthenticated requests are rejected. A 200 is accepted with
    a warning. Any other status is reported as unexpected.

    Returns:
        True for status 401 or 200; False for any other status or if the
        request raises.
    """
    print('ForwardAuth test:')
    print('-' * 50)

    verdicts = {
        401: '[OK] Rejecting unauthenticated',
        200: '[WARN] Passing without session (dev mode?)',
    }
    try:
        response = requests.get('http://localhost:6969/auth/verify', timeout=5)
        status = response.status_code
        print(f'No session: {status}')

        if status not in verdicts:
            print('[WARN] Unexpected status')
            return False

        print(verdicts[status])
        return True
    except Exception as err:
        print(f'[FAIL] {err}')
        return False

test_forward_auth()

## Layer 10: GPU Access Test (nvidia-smi)

In [None]:
def test_gpu_access():
    """Test GPU access on the host and in containers.

    Steps:
        1. ``nvidia-smi`` on the host (fatal if it fails or lists no GPU).
        2. NVIDIA runtime registered in Docker (informational).
        3. ``nvidia-smi -L`` inside a throwaway CUDA container (fatal if it
           fails or does not finish within 60 seconds).
        4. ``nvidia-smi -L`` inside ``student-{TEST_USER}`` (informational).

    A missing binary or a timeout is reported as a failed step instead of
    raising.

    Returns:
        True if steps 1 and 3 succeed, False otherwise.
    """
    def run(cmd, timeout=None):
        """Run cmd; return (CompletedProcess, None) or (None, reason) if it could not run."""
        try:
            return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout), None
        except FileNotFoundError:
            return None, f'{cmd[0]} not found on PATH'
        except subprocess.TimeoutExpired:
            return None, f'timed out after {timeout}s'
        except OSError as exc:
            return None, str(exc)

    def non_empty_lines(text):
        """Split text into lines, dropping blank ones so empty output counts as zero."""
        return [line for line in text.splitlines() if line.strip()]

    print('GPU Access Tests:')
    print('-' * 50)

    # Test 1: Host nvidia-smi
    print('\n1. Host nvidia-smi:')
    result, err = run(
        ['nvidia-smi', '--query-gpu=name,memory.total,memory.free,utilization.gpu', '--format=csv,noheader']
    )
    if result is None or result.returncode != 0:
        detail = err if result is None else result.stderr.strip()
        print(f'[FAIL] nvidia-smi not available: {detail}')
        return False
    gpus = non_empty_lines(result.stdout)
    if not gpus:
        print('[FAIL] nvidia-smi reported no GPUs')
        return False
    for i, gpu in enumerate(gpus):
        print(f'   GPU {i}: {gpu.strip()}')
    print(f'[OK] {len(gpus)} GPU(s) detected on host')
    host_gpus = len(gpus)

    # Test 2: Docker nvidia runtime (informational; test 3 is the real proof)
    print('\n2. Docker NVIDIA runtime:')
    result, err = run(['docker', 'info', '--format', '{{json .Runtimes}}'])
    runtimes = None
    if result is not None and result.returncode == 0:
        try:
            # "or {}" covers a JSON null when Docker reports no runtimes.
            runtimes = json.loads(result.stdout) or {}
        except json.JSONDecodeError:
            runtimes = None
    if runtimes is None:
        print('[FAIL] Could not query Docker runtimes')
    elif 'nvidia' in runtimes:
        print('[OK] NVIDIA runtime registered in Docker')
    else:
        print('[WARN] NVIDIA runtime not found in Docker')
        print(f'     Available runtimes: {list(runtimes.keys())}')

    # Test 3: GPU container test
    print('\n3. GPU container test (quick):')
    result, err = run(
        ['docker', 'run', '--rm', '--gpus', 'all',
         'nvidia/cuda:12.0-base-ubuntu22.04',
         'nvidia-smi', '-L'],
        timeout=60
    )
    if result is None or result.returncode != 0:
        detail = err if result is None else result.stderr.strip()
        print(f'[FAIL] GPU container test failed: {detail}')
        return False
    container_gpus = non_empty_lines(result.stdout)
    for gpu in container_gpus:
        print(f'   {gpu}')
    print(f'[OK] Container sees {len(container_gpus)} GPU(s)')
    if len(container_gpus) == host_gpus:
        print('[OK] Container GPU count matches host')
    else:
        print(f'[WARN] Container sees {len(container_gpus)} GPUs, host has {host_gpus}')

    # Test 4: Check student container GPU access (if exists)
    print(f'\n4. Student container GPU access (student-{TEST_USER}):')
    result, err = run(['docker', 'exec', f'student-{TEST_USER}', 'nvidia-smi', '-L'])
    if result is None:
        print(f'[INFO] Could not check student container: {err}')
    elif result.returncode == 0:
        for gpu in non_empty_lines(result.stdout):
            print(f'   {gpu}')
        print('[OK] Student container has GPU access')
    elif 'No such container' in result.stderr:
        print(f'[SKIP] Container student-{TEST_USER} not found')
    else:
        print('[INFO] No GPU access in student container (expected for non-GPU preset)')

    return True

test_gpu_access()

## Layer 11: Kubernetes Setup Verification

In [None]:
def test_k8s_setup():
    """Verify Kubernetes cluster setup for Hydra.

    Runs eight kubectl-based checks: cluster connectivity, node status and
    hydra.* labels, allocatable GPUs, the NVIDIA device plugin, the Hydra
    namespaces, the GPU ResourceQuota, student pods and Traefik IngressRoutes.

    Returns:
        False if any step printed [FAIL] (cluster unreachable, nodes cannot
        be listed, or a required namespace is missing); True otherwise.
        [WARN] and [INFO] findings do not fail the check.
    """
    print('Kubernetes Setup Tests:')
    print('-' * 50)

    kubectl_timeout = 30  # seconds; keeps an unreachable API server from hanging the notebook

    def kubectl(args, json_output=False):
        """Run kubectl; return (output, None) on success or (None, error text) on failure."""
        cmd = ['kubectl'] + args
        if json_output:
            cmd += ['-o', 'json']
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=kubectl_timeout)
        except subprocess.TimeoutExpired:
            return None, f'kubectl timed out after {kubectl_timeout}s'
        except OSError as exc:
            # Covers a missing or non-executable kubectl binary.
            return None, f'kubectl could not be executed: {exc}'
        if result.returncode != 0:
            return None, result.stderr
        if json_output:
            try:
                return json.loads(result.stdout), None
            except json.JSONDecodeError as exc:
                return None, f'invalid JSON from kubectl: {exc}'
        return result.stdout, None

    success = True

    # Test 1: kubectl connectivity
    print('\n1. Cluster connectivity:')
    result, err = kubectl(['cluster-info'])
    if result:
        print('[OK] kubectl connected to cluster')
        # Show the first lines, which carry the cluster endpoint
        for line in result.split('\n')[:2]:
            if line.strip():
                print(f'   {line.strip()}')
    else:
        print(f'[FAIL] Cannot connect to cluster: {err}')
        return False

    # Test 2: Check nodes and labels
    print('\n2. Node status and GPU labels:')
    nodes, err = kubectl(['get', 'nodes'], json_output=True)
    node_items = nodes.get('items', []) if nodes else []
    if nodes:
        for node in node_items:
            name = node['metadata']['name']
            labels = node['metadata'].get('labels', {})
            status = 'Ready' if any(
                c['type'] == 'Ready' and c['status'] == 'True'
                for c in node['status'].get('conditions', [])
            ) else 'NotReady'

            gpu_enabled = labels.get('hydra.gpu-enabled', 'false')
            node_role = labels.get('hydra.node-role', 'unknown')
            gpu_count = labels.get('hydra.gpu-count', '0')

            print(f'   {name}: {status}')
            print(f'     role={node_role}, gpu-enabled={gpu_enabled}, gpu-count={gpu_count}')
        print(f'[OK] {len(node_items)} node(s) found')
    else:
        print(f'[FAIL] Cannot list nodes: {err}')
        success = False

    # Test 3: Check GPU resources advertised (reuses the node list from test 2)
    print('\n3. GPU resources per node:')
    if nodes:
        total_gpus = 0
        for node in node_items:
            name = node['metadata']['name']
            allocatable = node['status'].get('allocatable', {})
            gpus = int(allocatable.get('nvidia.com/gpu', 0))
            total_gpus += gpus
            if gpus > 0:
                print(f'   {name}: {gpus} GPU(s) allocatable')
        if total_gpus > 0:
            print(f'[OK] Total {total_gpus} GPU(s) available in cluster')
        else:
            print('[INFO] No GPUs detected (NVIDIA device plugin may not be running)')

    # Test 4: Check NVIDIA device plugin
    print('\n4. NVIDIA device plugin:')
    pods, err = kubectl(['get', 'pods', '-n', 'kube-system', '-l', 'app.kubernetes.io/name=nvidia-device-plugin'], json_output=True)
    if pods and pods.get('items'):
        for pod in pods['items']:
            name = pod['metadata']['name']
            phase = pod['status'].get('phase', 'Unknown')
            node = pod['spec'].get('nodeName', 'unscheduled')
            print(f'   {name}: {phase} on {node}')
        print('[OK] NVIDIA device plugin deployed')
    else:
        print('[WARN] NVIDIA device plugin not found in kube-system')

    # Test 5: Check hydra namespaces
    print('\n5. Hydra namespaces:')
    namespaces, err = kubectl(['get', 'namespaces'], json_output=True)
    required_ns = ['hydra-system', 'hydra-students']
    found_ns = {ns['metadata']['name'] for ns in namespaces.get('items', [])} if namespaces else set()

    for ns in required_ns:
        if ns in found_ns:
            print(f'   [OK] {ns}')
        else:
            print(f'   [FAIL] {ns} not found')
            success = False

    # Test 6: Check ResourceQuota for GPU
    print('\n6. GPU ResourceQuota:')
    quota, err = kubectl(['get', 'resourcequota', '-n', 'hydra-students', 'gpu-quota'], json_output=True)
    if quota:
        hard = quota['status'].get('hard', {})
        used = quota['status'].get('used', {})
        gpu_limit = hard.get('limits.nvidia.com/gpu', '0')
        gpu_used = used.get('limits.nvidia.com/gpu', '0')
        print(f'   GPU limit: {gpu_used}/{gpu_limit}')
        print('[OK] GPU quota configured')
    else:
        print('[INFO] No GPU quota configured (optional)')

    # Test 7: Check student pods
    print('\n7. Student pods:')
    pods, err = kubectl(['get', 'pods', '-n', 'hydra-students', '-l', 'app.kubernetes.io/name=student-container'], json_output=True)
    if pods and pods.get('items'):
        for pod in pods['items']:
            name = pod['metadata']['name']
            phase = pod['status'].get('phase', 'Unknown')
            node = pod['spec'].get('nodeName', 'unscheduled')
            # Sum the GPU limits over all containers of the pod
            gpu_count = sum(
                int(c.get('resources', {}).get('limits', {}).get('nvidia.com/gpu', 0))
                for c in pod['spec'].get('containers', [])
            )
            gpu_str = f', {gpu_count} GPU' if gpu_count > 0 else ''
            print(f'   {name}: {phase} on {node}{gpu_str}')
        print(f'[OK] {len(pods["items"])} student pod(s) running')
    else:
        print('[INFO] No student pods running')

    # Test 8: Traefik IngressRoutes
    print('\n8. Traefik IngressRoutes:')
    routes, err = kubectl(['get', 'ingressroute', '-n', 'hydra-students'], json_output=True)
    if routes and routes.get('items'):
        for route in routes['items']:
            name = route['metadata']['name']
            print(f'   {name}')
        print(f'[OK] {len(routes["items"])} IngressRoute(s)')
    else:
        print('[INFO] No IngressRoutes in hydra-students namespace')

    return success

test_k8s_setup()

## Layer 12: K8s GPU Pod Test

In [None]:
def test_k8s_gpu_pod():
    """Test GPU access in a Kubernetes pod.

    Creates the pod 'gpu-test-pod' in the 'default' namespace with a limit of
    one nvidia.com/gpu, polls it every 5 seconds for up to 60 seconds, prints
    the pod's nvidia-smi output and deletes the pod again. Once the pod has
    been created it is always deleted, including on failure.

    Returns:
        True only if the pod reaches phase Succeeded and its logs can be
        read. False if kubectl cannot be run, the pod cannot be created, the
        pod fails or does not complete in time, or the logs are unavailable.
    """
    import time

    print('K8s GPU Pod Test:')
    print('-' * 50)

    kubectl_timeout = 60  # seconds per kubectl call

    def kubectl(args, json_output=False, stdin=None):
        """Run kubectl; return (output, None) on success or (None, error text) on failure."""
        cmd = ['kubectl'] + args
        if json_output:
            cmd += ['-o', 'json']
        try:
            result = subprocess.run(
                cmd, input=stdin, capture_output=True, text=True, timeout=kubectl_timeout
            )
        except subprocess.TimeoutExpired:
            return None, f'kubectl timed out after {kubectl_timeout}s'
        except OSError as exc:
            # Covers a missing or non-executable kubectl binary.
            return None, f'kubectl could not be executed: {exc}'
        if result.returncode != 0:
            return None, result.stderr
        if json_output:
            try:
                return json.loads(result.stdout), None
            except json.JSONDecodeError as exc:
                return None, f'invalid JSON from kubectl: {exc}'
        return result.stdout, None

    pod_name = 'gpu-test-pod'
    namespace = 'default'

    def show_recent_events():
        """Print the last five events recorded for the test pod, if any."""
        events, _ = kubectl(['get', 'events', '-n', namespace, '--field-selector', f'involvedObject.name={pod_name}'])
        if events:
            print('   Events:')
            for line in events.strip().split('\n')[-5:]:
                print(f'   {line}')

    # GPU test pod manifest
    pod_manifest = f'''
apiVersion: v1
kind: Pod
metadata:
  name: {pod_name}
  namespace: {namespace}
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvidia/cuda:12.0-base-ubuntu22.04
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1
'''

    # Clean up any existing test pod. With --ignore-not-found an error here
    # means kubectl or the cluster is unusable, so later steps cannot work.
    print('\n1. Cleaning up existing test pod...')
    _, err = kubectl(['delete', 'pod', pod_name, '-n', namespace, '--ignore-not-found'])
    if err is not None:
        print(f'[FAIL] Could not clean up existing pod: {err}')
        return False
    time.sleep(2)

    # Create test pod
    print('\n2. Creating GPU test pod...')
    _, err = kubectl(['apply', '-f', '-'], stdin=pod_manifest)
    if err is not None:
        print(f'[FAIL] Could not create pod: {err}')
        return False
    print('[OK] Pod created')

    try:
        # Wait for pod to complete
        print('\n3. Waiting for pod to run (max 60s)...')
        phase = 'Unknown'
        for _ in range(12):
            time.sleep(5)
            pod, _ = kubectl(['get', 'pod', pod_name, '-n', namespace], json_output=True)
            if not pod:
                continue
            phase = pod['status'].get('phase', 'Unknown')
            print(f'   Status: {phase}')
            if phase in ('Succeeded', 'Failed'):
                break

        if phase == 'Failed':
            print('[FAIL] Pod failed to run')
            show_recent_events()
            return False
        if phase != 'Succeeded':
            # Typically the pod is still Pending, e.g. no node has a free GPU.
            print(f'[FAIL] Pod did not complete within 60s (last status: {phase})')
            show_recent_events()
            return False

        # Get logs
        print('\n4. nvidia-smi output from pod:')
        logs, err = kubectl(['logs', pod_name, '-n', namespace])
        if not logs:
            print(f'[FAIL] Could not get logs: {err}')
            return False
        for line in logs.strip().split('\n'):
            print(f'   {line}')
        print('[OK] GPU accessible in K8s pod')
        return True
    finally:
        # Cleanup runs on every path after the pod was created.
        print('\n5. Cleaning up...')
        _, err = kubectl(['delete', 'pod', pod_name, '-n', namespace, '--ignore-not-found'])
        if err is None:
            print('[OK] Test pod deleted')
        else:
            print(f'[WARN] Could not delete test pod: {err}')

# Uncomment to run (creates temporary pod):
# test_k8s_gpu_pod()

## Summary: Run All Tests

In [None]:
def run_all_tests(include_gpu=False, include_k8s=False):
    """
    Run the routing checks in layer order and print a pass/fail summary.

    The nine routing checks always run. The GPU and Kubernetes checks are
    opt-in, and a tip is printed for each one that was left out.

    Args:
        include_gpu: Also run test_gpu_access (requires NVIDIA hardware)
        include_k8s: Also run test_k8s_setup (requires kubectl access)
    """
    results = {}
    results['DNS'] = test_dns()
    results['Traefik Entrypoint'] = test_traefik_entrypoint()
    results['Routers'] = test_traefik_routers()
    results['Middlewares'] = test_traefik_middlewares()
    results['Services'] = test_traefik_services()
    results['Docker Network'] = test_docker_network()
    results['Container Direct'] = test_container_direct()
    results['Full Route'] = test_full_route()
    results['ForwardAuth'] = test_forward_auth()

    if include_gpu:
        results['GPU Access'] = test_gpu_access()

    if include_k8s:
        results['K8s Setup'] = test_k8s_setup()

    divider = '=' * 50
    print('\n' + divider)
    print('SUMMARY')
    print(divider)
    for label, outcome in results.items():
        marker = '[OK]' if outcome else '[FAIL]'
        print(f"{marker} {label}")

    failed = len([outcome for outcome in results.values() if not outcome])
    if not failed:
        print('\nAll tests passed')
    else:
        print(f'\n{failed} test(s) failed')

    if not include_gpu:
        print('\n[TIP] Run with include_gpu=True to test GPU access')
    if not include_k8s:
        print('[TIP] Run with include_k8s=True to test Kubernetes setup')

# Run basic tests:
# run_all_tests()

# Run with GPU tests:
# run_all_tests(include_gpu=True)

# Run with K8s tests:
# run_all_tests(include_k8s=True)

# Run all:
# run_all_tests(include_gpu=True, include_k8s=True)