In [1]:
%%writefile ../../.devcontainer/.env.template
ENV_NAME=docker_dev_template

# GPU Configuration for RTX 4090
CUDA_TAG=12.4.0
PYTHON_VER=3.10

# Host Port Configuration
HOST_JUPYTER_PORT=8891
HOST_TENSORBOARD_PORT=6008
HOST_EXPLAINER_PORT=8050
HOST_STREAMLIT_PORT=8501
HOST_MLFLOW_PORT=5000

# JAX/GPU Configuration - CRITICAL: NO INLINE COMMENTS
# These environment variables are parsed directly by JAX and must be clean

# Memory fraction for GPU allocation (0.0 to 1.0)
# For RTX 4090 24GB VRAM, 0.4 provides good balance
XLA_PYTHON_CLIENT_MEM_FRACTION=0.4

# Disable memory preallocation for better memory management
XLA_PYTHON_CLIENT_PREALLOCATE=false

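# Use the platform allocator: memory is allocated on demand and released when no longer needed.
# This keeps the GPU footprint minimal but is slow; JAX does not recommend it for general use.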
XLA_PYTHON_CLIENT_ALLOCATOR=platform

# XLA compiler flags for CUDA
XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda

# JAX memory preallocation limit in bytes
# 16GB limit (17179869184 bytes) for RTX 4090
JAX_PREALLOCATION_SIZE_LIMIT_BYTES=17179869184

# JAX behavior configuration
JAX_DISABLE_JIT=false
JAX_ENABLE_X64=false

# TensorFlow GPU configuration (if using TensorFlow)
TF_FORCE_GPU_ALLOW_GROWTH=true


Overwriting ../../.devcontainer/.env.template


In [2]:
%%writefile ../../.devcontainer/.dockerignore
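# Reduce Docker build context
# NOTE(review): Docker reads .dockerignore from the build-context root (or a
# <Dockerfile-name>.dockerignore next to the Dockerfile). The compose build context is the
# project root, so confirm this file under .devcontainer/ is actually picked up.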
.git
.gitignore
.gitattributes
.gitmodules
.vscode
.idea
*.swp
*.swo
*~
.DS_Store
Thumbs.db
__pycache__
*.pyc
*.pyo
*.pyd
.Python
*.so
.coverage*
.cache
.pytest_cache
.mypy_cache
.tox
pip-log.txt
pip-delete-this-directory.txt
env
venv
ENV
env.bak
venv.bak
.ipynb_checkpoints
# Large data (adjust as needed)
data/raw
data/external
*.csv
*.parquet
*.h5
*.hdf5
# Models
*.pt
*.pth
*.pkl
*.joblib
models/
# Logs and temps
*.log
logs/
*.tmp
*.temp
.tmp
temp/
# Build artifacts
build/
dist/
*.egg-info/
.eggs/
# Node
node_modules
npm-debug.log*
yarn-*.log*
.npm
.eslintcache
.node_repl_history
*.tgz
*.tar.gz
# Archives
*.zip
*.tar
*.tar.bz2
*.rar
*.7z
# Docs (opt‑in if needed)
docs/
*.md
README*
LICENSE*
CHANGELOG*
# Tests (opt‑in if needed)
tests/
test_*
*_test.py
# CI
.github/
.gitlab-ci.yml
.travis.yml
.circleci/
azure-pipelines.yml
# Env
.env
.env.local
.env.*.local
.editorconfig
.prettierrc*
.eslintrc*
# Universal junk (de‑duped)
*.py[cod]

Overwriting ../../.devcontainer/.dockerignore


In [None]:
%%writefile ../../.devcontainer/devcontainer.json
{
  "name": "docker_dev_template_rtx4090",
  "dockerComposeFile": "docker-compose.yml",
  "service": "datascience",
  "workspaceFolder": "/workspace",
  "shutdownAction": "stopCompose",

  // Disable command override to use compose command
  "overrideCommand": false,
  
  // Container environment variables
  "containerEnv": {
    "CONTAINER_WORKSPACE_FOLDER": "/workspace",
    "UV_PROJECT_ENVIRONMENT": "/app/.venv",
    "VIRTUAL_ENV": "/app/.venv",
    "PYTHONPATH": "/workspace",
    "TERM": "xterm-256color",
    "DEBIAN_FRONTEND": "noninteractive"
  },

  // Additional run arguments for GPU support
  "runArgs": [
    "--gpus", "all",
    "--name", "${localEnv:ENV_NAME:docker_dev_template}_datascience",
    "--shm-size", "8g"
  ],

  // VS Code customizations
  "customizations": {
    "vscode": {
      "settings": {
        // Python interpreter settings
        "python.defaultInterpreterPath": "/app/.venv/bin/python",
        "python.pythonPath": "/app/.venv/bin/python",
        "python.terminal.activateEnvironment": true,
        "python.terminal.activateEnvInCurrentTerminal": true,
        
        // Terminal configuration
        "terminal.integrated.defaultProfile.linux": "bash",
        "terminal.integrated.profiles.linux": {
          "bash": {
            "path": "/bin/bash",
            "args": ["-l"],
            "env": {
              "VIRTUAL_ENV": "/app/.venv",
              "PATH": "/app/.venv/bin:${env:PATH}",
              "UV_PROJECT_ENVIRONMENT": "/app/.venv",
              "PYTHONPATH": "/workspace"
            }
          }
        },
        
        // Jupyter settings
        "jupyter.notebookFileRoot": "/workspace",
        "jupyter.kernels.filter": [
          {
            "path": "/app/.venv/bin/python",
            "type": "pythonEnvironment"
          }
        ],
        "jupyter.interactiveWindow.creationMode": "perFile",
        
        // File associations and workspace settings
        "files.watcherExclude": {
          "**/.git/**": true,
          "**/node_modules/**": true,
          "**/__pycache__/**": true,
          "**/.pytest_cache/**": true,
          "**/.venv/**": true
        },
        
        // Docker settings
        "docker.showStartPage": false
      },
      
      // Essential extensions
      "extensions": [
        "ms-python.python",
        "ms-python.flake8", 
        "ms-python.black-formatter",
        "ms-toolsai.jupyter",
        "ms-azuretools.vscode-docker",
        "ms-vscode.makefile-tools"
      ]
    }
  },

  // Lifecycle commands with better error handling
  "onCreateCommand": {
    "validate-environment": [
      "bash", "-lc", 
      "echo 'onCreate: Validating environment setup...'; ls -la /app/.venv/bin/ || echo 'Virtual env not ready'; which python || echo 'Python not found in PATH'; echo 'onCreate validation complete.'"
    ]
  },

  // The script is one JSON string on a single physical line: JSON strings cannot
  // contain raw newlines or backslash-newline continuations. The heredoc body uses
  // \n escapes, and its terminator (PY) is followed by a newline rather than ';'
  // because a line starting with ';' is a bash syntax error.
  "postCreateCommand": {
    "setup-jupyter-kernel": [
      "bash", "-lc",
      "set -e; echo 'Setting up Jupyter kernel...'; source /app/.venv/bin/activate; python -c \"import sys; print(f'Python executable: {sys.executable}')\"; python - <<'PY'\nimport importlib\nfor m in ('ipykernel','jupyter_client','psutil','debugpy'):\n    try:\n        mod=importlib.import_module(m)\n        print(f'ok: {m} {getattr(mod,\"__version__\",\"unknown\")}')\n    except Exception as e:\n        print(f'ERR: {m} -> {e}')\nPY\npython -m ipykernel install --user --name='uv_docker_dev_template' --display-name='Python (UV Environment)' || echo 'Kernel install failed'; jupyter kernelspec list || echo 'Cannot list kernels'; echo 'Running environment tests...'; python /app/tests/test_summary.py || echo 'Tests completed with warnings'"
    ]
  },

  "postStartCommand": {
    "validate-gpu-quick": [
      "bash", "-lc",
      "echo 'postStart: Running quick validation...'; source /app/.venv/bin/activate; python --version; python -c 'import torch; print(f\"PyTorch CUDA available: {torch.cuda.is_available()}\")' || echo 'PyTorch validation failed'; python /app/validate_gpu.py --quick || echo 'GPU validation completed with warnings'; echo 'Container ready for development!'"
    ]
  },

  // Port forwarding with better labeling
  "forwardPorts": [8888, 6008, 8050, 8501, 5000],
  "portsAttributes": {
    "8888": { 
      "label": "Jupyter Lab", 
      "onAutoForward": "notify",
      "protocol": "http"
    },
    "6008": { 
      "label": "TensorBoard", 
      "onAutoForward": "silent",
      "protocol": "http"
    },
    "8050": { 
      "label": "Explainer Dashboard", 
      "onAutoForward": "silent",
      "protocol": "http"
    },
    "8501": { 
      "label": "Streamlit", 
      "onAutoForward": "silent",
      "protocol": "http"
    },
    "5000": { 
      "label": "MLflow", 
      "onAutoForward": "silent",
      "protocol": "http"
    }
  },

  // Volume mounts for caching
  "mounts": [
    "source=docker_dev_template_uv_cache,target=/root/.cache/uv,type=volume"
  ],

  // Additional features and settings
  "features": {},
  
  // Wait for services to be ready
  "waitFor": "postCreateCommand",
  
  // Increase timeout for initial setup
  "postCreateCommand.timeout": 300,
  "postStartCommand.timeout": 120
}


Overwriting ../../.devcontainer/devcontainer.json


In [None]:
%%writefile ../../.devcontainer/Dockerfile
# Fixed Dockerfile: RTX 4090 devcontainer with UV, JAX, and PyTorch (CUDA 12.x)

ARG CUDA_TAG=12.4.0
FROM nvidia/cuda:${CUDA_TAG}-devel-ubuntu22.04

ARG PYTHON_VER=3.10
ARG ENV_NAME=docker_dev_template
ENV DEBIAN_FRONTEND=noninteractive

# System dependencies.
# The two cache mounts persist between builds and are never written into the image
# layer, so the package lists are NOT deleted at the end (that would only empty the
# shared cache). The base image's docker-clean hook purges downloaded .deb files
# after each install; it is removed so the /var/cache/apt mount is actually reused.
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,id=apt-lists,target=/var/lib/apt/lists,sharing=locked \
    rm -f /etc/apt/apt.conf.d/docker-clean && \
    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \
    apt-get update && apt-get install -y --no-install-recommends \
        bash curl ca-certificates git procps htop \
        python3 python3-venv python3-pip python3-dev \
        build-essential cmake pkg-config \
        libjemalloc2 libjemalloc-dev \
        iproute2 net-tools lsof wget

# UV package manager
COPY --from=ghcr.io/astral-sh/uv:0.7.12 /uv /uvx /bin/

WORKDIR /app

# Create venv managed by UV
RUN uv venv .venv --python "${PYTHON_VER}" --prompt "${ENV_NAME}"

ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:${PATH}" \
    UV_PROJECT_ENVIRONMENT=/app/.venv \
    PYTHONPATH="/workspace"

# Memory and allocator settings
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 \
    MALLOC_ARENA_MAX=2 \
    MALLOC_TCACHE_MAX=0 \
    PYTORCH_NO_CUDA_MEMORY_CACHING=1

# GPU-relevant environment
ENV XLA_PYTHON_CLIENT_PREALLOCATE=false \
    XLA_PYTHON_CLIENT_MEM_FRACTION=0.4 \
    XLA_PYTHON_CLIENT_ALLOCATOR=platform \
    PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:1024,expandable_segments:True \
    JAX_PREALLOCATION_SIZE_LIMIT_BYTES=17179869184

# Project files
COPY pyproject.toml /workspace/

# Devcontainer tests and validator
COPY .devcontainer/validate_gpu.py /app/validate_gpu.py
COPY .devcontainer/tests/ /app/tests/

# (IMPORTANT) Do NOT 'uv init' if pyproject exists. Just lock/sync.
# Optionally create a lock explicitly when it's missing (documented and safe)
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
    cd /workspace && \
    if [ ! -f uv.lock ]; then \
      echo "[uv] No uv.lock found; creating from existing pyproject.toml"; \
      uv lock --refresh; \
    else \
      echo "[uv] Using existing uv.lock"; \
    fi

# Resolve non-GPU project dependencies first (clean, deterministic)
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
    cd /workspace && \
    (uv sync --frozen --no-dev 2>/dev/null || \
     uv sync --no-dev 2>/dev/null || \
     (echo "[uv] Installing basic dependencies..." && uv add numpy pandas matplotlib scipy))

# --- GPU stack: install PyTorch (CUDA 12.4) and JAX (CUDA 12) ---
# Keep these out of pyproject to avoid conflicts; install explicitly here.

# PyTorch with CUDA 12.4 wheels
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
    uv pip install --no-cache-dir torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cu124 && \
    python - <<'PY'
import torch
print("Torch version:", torch.__version__)
print("CUDA available during build:", torch.cuda.is_available())
print("Note: CUDA will be available at runtime when container has GPU access")
PY

# JAX with CUDA 12 wheels (from official wheel index)
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
    uv pip install --no-cache-dir --upgrade \
        "jax[cuda12]" \
        -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html && \
    python - <<'PY'
import jax, jaxlib
print("JAX:", jax.__version__, "JAXLIB:", jaxlib.__version__)
print("JAX devices during build:", jax.devices())
print("Note: GPU devices will be available at runtime when container has GPU access")
PY

# Jupyter & kernel: pin versions and avoid corrupt wheels; include psutil/debugpy
RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
    uv pip install --no-cache-dir \
        psutil==5.9.8 \
        debugpy==1.8.7 \
        ipykernel==6.29.5 \
        jupyter-client==8.6.1 \
        jupyterlab==4.2.5

# CUDA libs in path - include both system and any packaged libs
ENV LD_LIBRARY_PATH="/app/.venv/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

# Shell activation helper
RUN echo '#!/bin/bash' > /app/activate_uv.sh && \
    echo 'export VIRTUAL_ENV="/app/.venv"' >> /app/activate_uv.sh && \
    echo 'export PATH="/app/.venv/bin:$PATH"' >> /app/activate_uv.sh && \
    echo 'export UV_PROJECT_ENVIRONMENT="/app/.venv"' >> /app/activate_uv.sh && \
    echo 'export PYTHONPATH="/workspace:$PYTHONPATH"' >> /app/activate_uv.sh && \
    echo 'export LD_LIBRARY_PATH="/app/.venv/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"' >> /app/activate_uv.sh && \
    echo 'cd /workspace' >> /app/activate_uv.sh && \
    chmod +x /app/activate_uv.sh && \
    echo 'source /app/activate_uv.sh' > /etc/profile.d/10-uv-activate.sh && \
    echo 'source /app/activate_uv.sh' >> /root/.bashrc && \
    chmod +x /etc/profile.d/10-uv-activate.sh

# Enhanced healthcheck script
RUN echo '#!/bin/bash' > /app/healthcheck.sh && \
    echo 'source /app/.venv/bin/activate' >> /app/healthcheck.sh && \
    echo 'python /app/validate_gpu.py --quick' >> /app/healthcheck.sh && \
    chmod +x /app/healthcheck.sh

WORKDIR /workspace
CMD ["bash", "-l"]


Overwriting ../../.devcontainer/Dockerfile


In [None]:
%%writefile ../../.devcontainer/docker-compose.yml
# Fixed .devcontainer/docker-compose.yml 
name: ${ENV_NAME:-docker_dev_template}

services:
  datascience:
    build:
      # Build context is parent directory (project root)
      context: ..
      dockerfile: .devcontainer/Dockerfile
      args:
        CUDA_TAG: ${CUDA_TAG:-12.4.0}
        PYTHON_VER: ${PYTHON_VER:-3.10}
        ENV_NAME: ${ENV_NAME:-docker_dev_template}
      cache_from:
        - nvidia/cuda:${CUDA_TAG:-12.4.0}-devel-ubuntu22.04
      # Additional build options for better Windows compatibility
      extra_hosts:
        - "host.docker.internal:host-gateway"

    container_name: ${ENV_NAME:-docker_dev_template}_datascience

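    # Runtime variables are loaded from .devcontainer/.env (copy .env.template to .env first;
    # this short form has no fallback, so Compose fails if the file is missing)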
    env_file:
      - .env

    restart: unless-stopped
    depends_on:
      mlflow:
        condition: service_started  # FIXED: Changed from service_healthy to service_started

    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Better process management
    init: true
    gpus: all
    shm_size: 8g
    ulimits:
      memlock: -1
      stack: 67108864

    environment:
      - PYTHON_VER=${PYTHON_VER:-3.10}
      - UV_PROJECT_ENVIRONMENT=/app/.venv
      - VIRTUAL_ENV=/app/.venv
      - PYTHONPATH=/workspace
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - CUDA_VISIBLE_DEVICES=0
      - LD_LIBRARY_PATH=/app/.venv/lib:/usr/local/cuda/lib64
      - LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
      - MALLOC_ARENA_MAX=2
      - MALLOC_TCACHE_MAX=0
      - PYTORCH_NO_CUDA_MEMORY_CACHING=1
      
      # CRITICAL FIX: Clean JAX environment variables (no inline comments)
      - XLA_PYTHON_CLIENT_PREALLOCATE=false
      - XLA_PYTHON_CLIENT_ALLOCATOR=platform
      - XLA_PYTHON_CLIENT_MEM_FRACTION=0.4
      - XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/local/cuda
      - JAX_PREALLOCATION_SIZE_LIMIT_BYTES=17179869184
      - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:1024,expandable_segments:True
      - JUPYTER_TOKEN=${JUPYTER_TOKEN:-jupyter}

    volumes:
      # Mount parent directory (project root) to workspace
      - ..:/workspace:delegated
      # MLflow volumes reference parent directory  
      - ../mlruns:/workspace/mlruns:delegated
      - ../mlflow_db:/workspace/mlflow_db:delegated
      # UV cache volume for faster builds
      - uv-cache:/root/.cache/uv

    ports:
      - "${HOST_JUPYTER_PORT:-8891}:8888"
      - "${HOST_TENSORBOARD_PORT:-6008}:6008"
      - "${HOST_EXPLAINER_PORT:-8050}:8050"
      - "${HOST_STREAMLIT_PORT:-8501}:8501"

    # Enhanced startup command with better error handling
    command: >
      bash -lc '
        set -e;
        echo "[boot] Starting container: ${ENV_NAME:-docker_dev_template}";
        echo "[boot] System info: $(uname -a)";
        echo "[boot] Working directory: $(pwd)";
        echo "[boot] Files in /workspace: $(ls -la /workspace/ | head -10)";
        echo "[boot] Activating uv environment...";
        source /app/.venv/bin/activate;
        echo "[boot] Environment activated - Python: $(which python)";
        echo "[boot] Python version: $(python --version)";
        echo "[boot] UV available: $(uv --version 2>/dev/null || echo "uv not found")";
        echo "[boot] Running GPU validation...";
        python /app/validate_gpu.py --quick || echo "GPU validation completed with warnings";
        echo "[boot] Starting Jupyter Lab on port 8888...";
        exec jupyter lab --ip=0.0.0.0 --port=8888 --allow-root \
        --ServerApp.token="${JUPYTER_TOKEN:-jupyter}" \
        --ServerApp.allow_origin="*" \
        --ServerApp.open_browser=false \
        --ServerApp.root_dir="/workspace"
      '

    # Healthy means both PyTorch and JAX can see a GPU.
    # The Python program is handed to `python -c` as ONE line: a folded (">-") scalar
    # joins equally indented lines with spaces, so the interpreter never sees leading
    # indentation (an indented first statement is an IndentationError).
    # JAX devices are matched on `platform` instead of their string form, which is
    # not guaranteed to contain "gpu".
    healthcheck:
      test:
        - CMD-SHELL
        - >-
          /app/.venv/bin/python -c "import torch, jax;
          assert torch.cuda.is_available(), 'PyTorch CUDA not available';
          assert any(d.platform in ('gpu', 'cuda') for d in jax.devices()), 'JAX GPU devices not found'"
          || (echo "GPU check failed" && exit 1)
      interval: 60s
      timeout: 30s
      retries: 3
      start_period: 180s  # Longer startup time for initial build

    labels:
      - "com.docker.compose.project=${ENV_NAME:-docker_dev_template}"
      - "com.docker.compose.service=datascience"
      - "description=RTX 4090 GPU Dev Environment (PyTorch+JAX) - CUDA 12.4"

  # FIXED MLflow service with robust configuration
  mlflow:
    container_name: ${ENV_NAME:-docker_dev_template}_mlflow
    image: ghcr.io/mlflow/mlflow:latest
    
    # Create the data directories, then replace the shell with the MLflow server.
    # In a folded (">") scalar, lines indented deeper than the first content line keep
    # their line breaks, so the script below reaches bash as separate lines. Each
    # option line therefore needs a trailing backslash; without it bash would exec a
    # bare `mlflow server` (default host/port/store) and never see the options.
    command: >
      bash -c '
        set -e;
        echo "[MLflow] Starting MLflow server...";
        mkdir -p /mlflow_artifacts /mlflow_db;
        echo "[MLflow] Created directories";
        echo "[MLflow] Database path: /mlflow_db/mlflow.db";
        echo "[MLflow] Artifacts path: /mlflow_artifacts";
        exec mlflow server \
        --host 0.0.0.0 \
        --port 5000 \
        --backend-store-uri sqlite:////mlflow_db/mlflow.db \
        --default-artifact-root /mlflow_artifacts \
        --serve-artifacts
      '
    
    environment:
      MLFLOW_EXPERIMENTS_DEFAULT_ARTIFACT_LOCATION: /mlflow_artifacts
    
    volumes:
      # Create host directories if they don't exist by mounting to parent directory paths
      - ../mlruns:/mlflow_artifacts:delegated
      - ../mlflow_db:/mlflow_db:delegated
    
    ports:
      - "${HOST_MLFLOW_PORT:-5000}:5000"
    
    restart: unless-stopped
    
    # FIXED: Simplified and more reliable healthcheck
    healthcheck:
      test: |
        timeout 10 bash -c '</dev/tcp/localhost/5000' || 
        curl -f -s http://localhost:5000 >/dev/null 2>&1 || 
        wget --quiet --tries=1 --timeout=5 --spider http://localhost:5000 || 
        exit 1
      interval: 30s
      timeout: 15s
      retries: 10  # More retries for initial startup
      start_period: 120s  # Longer startup period

    labels:
      - "com.docker.compose.project=${ENV_NAME:-docker_dev_template}"
      - "description=MLflow Experiment Tracking Server"

# Named volume for UV cache persistence
volumes:
  uv-cache:
    driver: local

Overwriting ../../.devcontainer/docker-compose.yml


In [6]:
%%writefile ../../pyproject.toml
[project]
name = "docker_dev_template"
version = "0.1.0"
description = "Hierarchical Bayesian modeling for baseball exit velocity data"
authors = [
  { name = "Marlins Data Science Team" },
]
license = "MIT"
readme = "README.md"

# ─── Restrict to Python 3.10–3.12 ──────────────────────────────
requires-python = ">=3.10,<3.13"

dependencies = [
  "pandas>=2.0",
  "numpy>=1.20,<2",
  "matplotlib>=3.4.0",
  "scikit-learn>=1.4.2",
  "pymc>=5.0.0",
  "arviz>=0.14.0",
  "statsmodels>=0.13.0",
  "jupyterlab>=3.0.0",
  "seaborn>=0.11.0",
  "tabulate>=0.9.0",
  "shap>=0.40.0",
  "xgboost>=1.5.0",
  "lightgbm>=3.3.0",
  "catboost>=1.0.0",
  "scipy>=1.7.0",
  "shapash[report]>=2.3.0",
  "shapiq>=1.3.0",
  "explainerdashboard>=0.3.0",
  "ipywidgets>=8.0.0",
  "nutpie>=0.7.1",
  "pytensor>=2.18.3",
  "aesara>=2.9.4",
  "tqdm>=4.67.0",
  "pyarrow>=12.0.0",
  "streamlit>=1.20.0",
  "sqlalchemy>=1.4",
  "mysql-connector-python>=8.0",
  "optuna>=4.3.0",
  "bayesian-optimization>=1.2.0",
  "pretty_errors>=1.2.0",
  "gdown>=4.0.0",
  "invoke>=2.2",
  "pytube @ git+https://github.com/pytube/pytube",
  "yt-dlp>=2024.12.0",
  "ffmpeg-python>=0.2.0",
  "ultralytics==8.3.158",
  "opencv-python-headless>=4.10.0",
  "roboflow>=1.0.0",
  "mlflow>=3.1.1,<4.0.0",
  "optuna-integration[mlflow]>=4.4.0,<5.0.0",
  "pydantic>=2.0.0",
  "pydantic-settings>=2.0.0",
]

[project.optional-dependencies]
dev = [
  "pytest>=7.0.0",
  "black>=23.0.0",
  "isort>=5.0.0",
  "flake8>=5.0.0",
  "mypy>=1.0.0",
  "pre-commit>=3.0.0",
]

cuda = [
  "cupy-cuda12x>=12.0.0",  # For CUDA 12.x
]

# ─── uv configuration ──────────────────────────────────────────
[tool.uv]                   # uv reads this block
index-strategy = "unsafe-best-match"

# Define named indexes for PyTorch CUDA variants
[[tool.uv.index]]
name = "pytorch-cu121"
url = "https://download.pytorch.org/whl/cu121"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu118"
url = "https://download.pytorch.org/whl/cu118"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
explicit = true

# Removed unsupported option: torch-backend requires uv ≥0.5.3
# To re-enable, first run: pip install -U "uv>=0.5.3"  (quoted so the shell does not treat ">" as a redirect)
[tool.uv.pip]
# (No unsupported keys here; configure only valid pip options.)

# Map only when explicitly used (not needed now; Torch installed in Dockerfile)
[tool.uv.sources]

[tool.pytensor]
device    = "cuda"
floatX    = "float32"
allow_gc  = true
optimizer = "fast_run"



Overwriting ../../pyproject.toml


In [7]:
%%writefile ../../.devcontainer/validate_gpu.py
#!/usr/bin/env python3
"""
Docker Build & GPU Validation Script (container-friendly)
- Adds --quick mode to skip Docker CLI checks
- Skips Docker checks automatically if docker is unavailable
- Keeps strict on GPU/Torch/JAX checks
"""
import os
import sys
import shutil
import subprocess
import argparse
from pathlib import Path
from typing import List, Tuple


def print_section(title: str) -> None:
    """Print ``title`` framed above and below by a 60-character ``=`` rule.

    A blank line precedes the banner and the title is indented by two spaces.
    """
    rule = "=" * 60
    banner_lines = ["", rule, f"  {title}", rule]
    print("\n".join(banner_lines))


def run_command(cmd: List[str], timeout: int = 60) -> Tuple[bool, str, str]:
    """Run ``cmd`` as a subprocess and report the outcome without raising.

    Args:
        cmd: Argument vector handed to ``subprocess.run``.
        timeout: Seconds to wait before the call is abandoned.

    Returns:
        ``(succeeded, stdout, stderr)``. ``succeeded`` is True only for a zero
        exit status. If launching or waiting fails for any reason (missing
        executable, timeout, ...), the result is ``(False, "", <error text>)``.
    """
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except Exception as exc:
        # Best-effort helper: callers get the failure as text instead of an exception.
        return False, "", str(exc)
    succeeded = proc.returncode == 0
    return succeeded, proc.stdout, proc.stderr


def docker_available() -> bool:
    """Report whether a ``docker`` executable can be located on ``PATH``."""
    docker_path = shutil.which("docker")
    return docker_path is not None


def ensure_project_structure() -> bool:
    """Create the directories and files the dev container expects, if missing.

    The project root is the current directory, unless the current directory is
    itself named ``.devcontainer`` (and has no nested ``.devcontainer``), in which
    case the parent is used. Ensures ``.devcontainer/tests`` exists and writes a
    minimal ``pyproject.toml`` when the root has none.

    Returns:
        Always True; filesystem errors propagate as exceptions.
    """
    print_section("ENSURING PROJECT STRUCTURE")
    here = Path.cwd()
    print(f"Current directory: {here}")

    has_dev_subdir = (here / ".devcontainer").exists()
    if has_dev_subdir or here.name != ".devcontainer":
        project_root = here
        if not has_dev_subdir:
            # Neither inside nor beside a .devcontainer directory: create one here.
            (project_root / ".devcontainer").mkdir(exist_ok=True)
    else:
        # Invoked from inside .devcontainer itself.
        project_root = here.parent

    dev_dir = project_root / ".devcontainer"
    print(f"Project root: {project_root}")
    print(f"DevContainer directory: {dev_dir}")

    tests_dir = dev_dir / "tests"
    tests_dir.mkdir(exist_ok=True)

    pyproject = project_root / "pyproject.toml"
    if pyproject.exists():
        return True

    print("Creating minimal pyproject.toml...")
    minimal_pyproject = """[project]
name = "docker_dev_template"
version = "0.1.0"
description = "Docker development environment"
requires-python = ">=3.10,<3.13"

dependencies = [
    "pandas>=2.0",
    "numpy>=1.20,<2",
    "matplotlib>=3.4.0",
    "scipy>=1.7.0",
    "jupyterlab>=3.0.0",
]

[tool.uv]
index-strategy = "unsafe-best-match"
"""
    pyproject.write_text(minimal_pyproject)
    print("✅ Created pyproject.toml")
    return True


def create_env_file() -> bool:
    """Make sure the dev container's ``.env`` file exists.

    An existing ``.env`` is left untouched. Otherwise it is copied from
    ``.env.template`` when that exists, or written with a minimal set of
    defaults.

    The devcontainer directory is resolved the same way as in
    ``ensure_project_structure``: when the working directory is itself named
    ``.devcontainer`` the files live there, otherwise in ``./.devcontainer``.
    The directory is created if needed, so the write cannot fail on a missing
    parent.

    Returns:
        Always True; filesystem errors propagate as exceptions.
    """
    print_section("CREATING ENVIRONMENT FILE")
    cwd = Path.cwd()
    if cwd.name == ".devcontainer" and not (cwd / ".devcontainer").exists():
        # Invoked from inside .devcontainer: do not look for a nested copy.
        dev_dir = cwd
    else:
        dev_dir = Path(".devcontainer")

    t = dev_dir / ".env.template"
    f = dev_dir / ".env"

    if f.exists():
        print("✅ .env file already exists")
        return True

    dev_dir.mkdir(parents=True, exist_ok=True)

    if t.exists():
        # Byte-for-byte copy so the template's formatting is preserved.
        f.write_bytes(t.read_bytes())
        print("✅ Created .env from template")
        return True

    f.write_text(
        """ENV_NAME=docker_dev_template
CUDA_TAG=12.4.0
PYTHON_VER=3.10
HOST_JUPYTER_PORT=8891
HOST_TENSORBOARD_PORT=6008
HOST_EXPLAINER_PORT=8050
HOST_STREAMLIT_PORT=8501
HOST_MLFLOW_PORT=5000
"""
    )
    print("✅ Created minimal .env file")
    return True


def fix_file_permissions() -> bool:
    """Set mode 0o755 on the devcontainer scripts when running on Windows or WSL.

    WSL is detected by the substring ``microsoft`` in the kernel release string;
    if ``os.uname`` is unavailable or fails, the host is treated as non-WSL.
    Missing files are skipped and chmod failures are reported, not raised.

    Returns:
        Always True.
    """
    print_section("FIXING FILE PERMISSIONS")
    try:
        release = os.uname().release
        is_wsl = "microsoft" in release.lower()
    except Exception:
        is_wsl = False

    if os.name != "nt" and not is_wsl:
        return True

    print("Detected Windows/WSL environment")
    script_paths = (
        ".devcontainer/validate_gpu.py",
        ".devcontainer/tests/test_summary.py",
        ".devcontainer/tests/test_pytorch.py",
        ".devcontainer/tests/test_pytorch_gpu.py",
        ".devcontainer/tests/test_uv.py",
    )
    for rel_path in script_paths:
        target = Path(rel_path)
        if not target.exists():
            continue
        try:
            os.chmod(target, 0o755)
            print(f"✅ Fixed permissions for {rel_path}")
        except Exception as exc:
            print(f"⚠️ Could not fix permissions for {rel_path}: {exc}")
    return True


def validate_docker_environment() -> bool:
    """Check that the Docker daemon and the Compose plugin are usable.

    Returns:
        True when both ``docker info`` and ``docker compose version`` succeed,
        or when no Docker CLI is installed (the checks are skipped, which is
        the expected situation inside a container). False as soon as either
        command fails.
    """
    print_section("VALIDATING DOCKER ENVIRONMENT")
    if not docker_available():
        print("ℹ️ Docker CLI not found in this environment; skipping Docker checks.")
        return True

    daemon_ok, _, daemon_err = run_command(["docker", "info"])
    if not daemon_ok:
        print(f"❌ Docker daemon not accessible: {daemon_err}")
        return False
    print("✅ Docker daemon is running")

    compose_ok, compose_out, compose_err = run_command(["docker", "compose", "version"])
    if compose_ok:
        print(f"✅ Docker Compose: {compose_out.strip()}")
        return True
    print(f"❌ Docker Compose not available: {compose_err}")
    return False


def stop_and_remove_containers() -> bool:
    """Tear down the compose stack and force-remove its two named containers.

    Runs ``docker compose ... down --volumes`` against the devcontainer compose
    file, then ``docker rm -f`` for each container name. A failing ``down`` is
    only reported as a warning and the ``rm`` results are ignored.

    Returns:
        Always True (also when the Docker CLI is absent and cleanup is skipped).
    """
    print_section("CLEANING EXISTING CONTAINERS")
    if not docker_available():
        print("ℹ️ Docker CLI not found; skipping container cleanup.")
        return True

    down_cmd = [
        "docker", "compose",
        "-f", ".devcontainer/docker-compose.yml",
        "down", "--volumes",
    ]
    down_ok, _, down_err = run_command(down_cmd)
    if not down_ok:
        print(f"⚠️ Could not stop containers (may not exist): {down_err}")

    container_names = ("docker_dev_template_datascience", "docker_dev_template_mlflow")
    for container in container_names:
        run_command(["docker", "rm", "-f", container])

    print("✅ Container cleanup complete")
    return True


def clean_docker_cache() -> bool:
    """Prune the entire Docker build cache (``docker builder prune --all --force``).

    Returns:
        True when the prune succeeds or the Docker CLI is absent (skipped);
        False when the prune command fails.
    """
    print_section("CLEANING DOCKER CACHE")
    if not docker_available():
        print("ℹ️ Docker CLI not found; skipping cache prune.")
        return True

    pruned, prune_out, prune_err = run_command(
        ["docker", "builder", "prune", "--all", "--force"]
    )
    if not pruned:
        print(f"❌ Failed to clean Docker cache: {prune_err}")
        return False

    print("✅ Docker build cache cleaned")
    if prune_out:
        print(prune_out)
    return True


def test_build() -> bool:
    """Run a ``--no-cache`` compose build of the devcontainer image.

    If the working directory is ``.devcontainer`` the process first changes to
    its parent, so the compose file path resolves from the project root. The
    build is given 600 seconds.

    Returns:
        True when the build succeeds or the Docker CLI is absent (skipped);
        False when the build fails, after printing stderr and the stdout tail.
    """
    print_section("TESTING DOCKER BUILD")
    if not docker_available():
        print("ℹ️ Docker CLI not found; skipping compose build test.")
        return True

    if Path.cwd().name == ".devcontainer":
        os.chdir("..")

    compose_file = ".devcontainer/docker-compose.yml"
    print(f"Using compose file: {Path(compose_file).absolute()}")
    print(f"Build context: {Path('.').absolute()}")

    build_cmd = ["docker", "compose", "-f", compose_file, "build", "--no-cache"]
    built, build_out, build_err = run_command(build_cmd, timeout=600)
    out_lines = build_out.splitlines()

    if not built:
        print("❌ Docker build failed")
        print("STDERR:\n", build_err)
        print("STDOUT (last 20 lines):\n", "\n".join(out_lines[-20:]))
        return False

    print("✅ Docker build successful!")
    print("\n".join(out_lines[-10:]))
    return True


def section_summary(struct_ok, uv_ok, pt_ok, jax_ok):
    """Print a one-line summary of the structure, uv, PyTorch and JAX results."""
    print_section("SUMMARY")
    labelled = (
        ("structure", struct_ok),
        ("uv", uv_ok),
        ("pytorch", pt_ok),
        ("jax", jax_ok),
    )
    print(" ".join(f"{label}: {outcome}" for label, outcome in labelled))


def test_uv() -> bool:
    """Print the output of ``uv --version`` and report whether the command succeeded.

    Prints stdout if non-empty, otherwise stderr; if the chosen text is blank
    after stripping, prints ``uv not in PATH``.
    """
    print_section("UV")
    succeeded, version_out, version_err = run_command(["uv", "--version"])
    report = version_out if version_out else version_err
    report = report.strip()
    print(report if report else "uv not in PATH")
    return succeeded


def test_pytorch() -> bool:
    """Check that PyTorch imports and can run a small matmul on ``cuda:0``.

    Returns:
        True when CUDA is available and the GPU operation completes; False when
        CUDA is unavailable or any exception occurs (the error is printed).
    """
    print_section("PYTORCH")
    try:
        import time

        import torch

        print("version:", torch.__version__)
        cuda_ready = torch.cuda.is_available()
        print("cuda:", cuda_ready)
        if not cuda_ready:
            return False

        gpu = torch.device("cuda:0")
        matrix = torch.randn((512, 512), device=gpu)
        started = time.time()
        total = (matrix @ matrix.T).sum()
        torch.cuda.synchronize()
        print("sum:", float(total))
        elapsed_ms = (time.time() - started) * 1000
        print(f"gpu op ms: {elapsed_ms:.2f}")
        return True
    except Exception as exc:
        print("error:", exc)
        return False


def test_jax() -> bool:
    """Check that JAX sees a GPU and can run a small reduction on it.

    Returns:
        True when a GPU device is found and the computation completes; False
        when no GPU device is detected or any exception occurs (printed).
    """
    print_section("JAX")
    try:
        import jax
        import jax.numpy as jnp

        devs = jax.devices()
        print("devices:", devs)

        # jax.devices("gpu") raises RuntimeError instead of returning an empty
        # list when no GPU backend is available, so it must be guarded for the
        # fallback scan (and the "no gpu devices" message) to be reachable.
        try:
            gpus = jax.devices("gpu")
        except RuntimeError:
            gpus = []
        if not gpus:
            gpus = [
                d for d in devs
                if getattr(d, "platform", "").lower() in {"gpu", "cuda"} or "cuda" in str(d).lower()
            ]
        if not gpus:
            print("no gpu devices detected by jax")
            return False

        x = jnp.ones((512, 512), dtype=jnp.float32)
        x = jax.device_put(x, gpus[0])
        # block_until_ready() waits for the asynchronously dispatched computation.
        s = jnp.sum(x).block_until_ready()
        print("sum:", float(s))
        return True
    except Exception as e:
        print("error:", e)
        return False


def parse_args() -> argparse.Namespace:
    """Parse the command line; the only option is the boolean ``--quick`` flag."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--quick",
        action="store_true",
        help="Skip Docker checks; run only structure/UV/Torch/JAX",
    )
    namespace = parser.parse_args()
    return namespace


def main() -> int:
    """Run the validation steps and turn their results into an exit status.

    Structure, ``.env`` and permission steps always run. The Docker steps
    (environment check, container cleanup, cache prune, build) run only
    without ``--quick``. The uv, PyTorch and JAX checks always run.

    Returns:
        0 when every check that counts passed, otherwise 1. With ``--quick``
        only the structure, uv, PyTorch and JAX results count.
    """
    args = parse_args()
    print("Docker DevContainer Build & GPU Validation")
    print(f"Working directory: {os.getcwd()}")

    # Preparation steps: run unconditionally.
    struct_ok = ensure_project_structure()
    env_ok = create_env_file()
    perm_ok = fix_file_permissions()

    # Docker steps run in this order; the list stays empty in quick mode.
    docker_results = []
    if not args.quick:
        docker_results = [
            validate_docker_environment(),
            stop_and_remove_containers(),
            clean_docker_cache(),
            test_build(),
        ]

    uv_ok = test_uv()
    pt_ok = test_pytorch()
    jax_ok = test_jax()

    section_summary(struct_ok, uv_ok, pt_ok, jax_ok)

    core_results = [struct_ok, uv_ok, pt_ok, jax_ok]
    if args.quick:
        # Quick mode ignores the Docker, env-file and permission outcomes.
        passed = all(core_results)
    else:
        passed = all(core_results + [env_ok, perm_ok] + docker_results)
    return 0 if passed else 1


# Script entry point: the process exit status is main()'s return value
# (0 when all counted checks pass, 1 otherwise).
if __name__ == "__main__":
    sys.exit(main())


Overwriting ../../.devcontainer/validate_gpu.py


In [8]:
%%writefile ../../.devcontainer/tests/test_pytorch_gpu.py
#!/usr/bin/env python3
"""Small PyTorch GPU benchmark."""
import time


def test_pytorch(force_cpu: bool = False) -> None:
    """Time one 1000x1000 matrix multiplication and print the result.

    Runs on ``cuda:0`` when CUDA is available and ``force_cpu`` is false,
    otherwise on the CPU. One untimed warm-up multiplication runs first.

    Args:
        force_cpu: When true, use the CPU even if CUDA is available.

    Returns:
        None. The device description and the elapsed time are printed.
    """
    import torch

    use_cuda = torch.cuda.is_available() and not force_cpu
    if use_cuda:
        name = torch.cuda.get_device_name(0)
        # Query the same device index the benchmark runs on.
        major, minor = torch.cuda.get_device_capability(0)
        # Architecture names concatenate major and minor without padding
        # (capability 8.9 is "sm_89"); zero-padding the minor gave "sm_809".
        print(f"device: {name} (sm_{major}{minor})")
        device = torch.device("cuda:0")
    else:
        print("falling back to cpu")
        device = torch.device("cpu")

    size = (1000, 1000)
    a, b = (torch.randn(size, device=device) for _ in range(2))

    # Warm-up run. CUDA kernels are launched asynchronously, so wait for it to
    # finish before starting the clock; otherwise its cost is timed as well.
    _ = a @ b
    if device.type == "cuda":
        torch.cuda.synchronize()

    # perf_counter is monotonic and intended for measuring short intervals.
    t0 = time.perf_counter()
    _ = (a @ b).sum().item()
    if device.type == "cuda":
        torch.cuda.synchronize()
    elapsed_ms = (time.perf_counter() - t0) * 1000
    print(f"matmul on {device} took {elapsed_ms:.2f} ms")


if __name__ == "__main__":
    # Script entry point: run the benchmark with the default force_cpu=False.
    test_pytorch()


Overwriting ../../.devcontainer/tests/test_pytorch_gpu.py


In [9]:
%%writefile ../../.devcontainer/tests/test_uv.py
"""UV and key package presence check."""
import importlib
import subprocess
import sys

# Report the uv version. A missing binary is reported rather than raised.
print("UV version:")
try:
    r = subprocess.run(["uv", "--version"], capture_output=True, text=True)
    print(r.stdout.strip() or r.stderr.strip())
except FileNotFoundError:
    print("uv not found")

print("\nPython:")
print(sys.executable)
print(sys.version)

# Import each package by name and print its version. Any failure (missing
# package or an error raised while importing) is printed and the loop goes on.
print("\nKey packages:")
for pkg in ["numpy", "pandas", "matplotlib", "scipy", "sklearn", "jupyterlab", "seaborn", "tqdm"]:
    try:
        # importlib.import_module is the supported programmatic import and
        # handles every name here, so no per-package special case is needed.
        module = importlib.import_module(pkg)
        print(pkg, getattr(module, "__version__", "unknown"))
    except Exception as e:
        print(pkg, "missing or error:", e)


Overwriting ../../.devcontainer/tests/test_uv.py


In [10]:
%%writefile ../../.devcontainer/tests/test_pytorch.py
# Quick PyTorch check: print version and CUDA status, list the GPUs, and run
# a tiny reduction on the first one. Any exception is printed, not raised.
print("PyTorch quick check")
try:
    import torch

    print("version:", torch.__version__)
    has_cuda = torch.cuda.is_available()
    print("cuda:", has_cuda)
    if has_cuda:
        n_devices = torch.cuda.device_count()
        print("devices:", n_devices)
        for idx in range(n_devices):
            print(idx, torch.cuda.get_device_name(idx))
        # Small allocation plus reduction on the first GPU.
        ones = torch.ones(100, 100, device='cuda:0')
        print("sum:", float(ones.sum()))
except Exception as err:
    print("error:", err)


Overwriting ../../.devcontainer/tests/test_pytorch.py


In [11]:
%%writefile ../../.devcontainer/tests/test_uv.py
# Test other critical packages
# NOTE(review): this cell writes to the same tests/test_uv.py path as an
# earlier %%writefile cell in this notebook, so on a top-to-bottom run this
# content replaces that earlier version — confirm that is intended.
import importlib

print("\n📦 Testing other critical packages...")

packages_to_test = [
    'numpy', 'pandas', 'matplotlib', 'scipy', 'sklearn',
    'jupyterlab', 'seaborn', 'tqdm'
]

# Import each package by name and report its version, or why it failed.
for package in packages_to_test:
    try:
        # importlib.import_module is the supported programmatic import and
        # handles every name here, so no per-package special case is needed.
        module = importlib.import_module(package)
        version = getattr(module, '__version__', 'unknown')
        print(f"   ✅ {package}: {version}")
    except ImportError:
        print(f"   ❌ {package}: Not installed")
    except Exception as e:
        print(f"   ⚠️  {package}: Error - {e}")


Overwriting ../../.devcontainer/tests/test_uv.py


In [12]:
%%writefile ../../.devcontainer/tests/test_summary.py
#!/usr/bin/env python3
"""Aggregated checks for the devcontainer layout and GPU readiness."""
import os
import sys
import time
import subprocess


def section(t):
    """Print ``t`` as a banner framed by two 60-character ``=`` rules.

    A blank line is emitted before the first rule.
    """
    rule = "=" * 60
    print("\n" + rule, t, rule, sep="\n")


def test_structure() -> bool:
    """Check that each expected file or directory path exists.

    Prints an ``ok:`` or ``missing:`` line for every path, in list order.

    Returns:
        True when every path exists, False when at least one is missing.
    """
    section("STRUCTURE")
    required_paths = (
        '/workspace/.devcontainer/docker-compose.yml',  # moved here from the root
        '/workspace/pyproject.toml',
        '/workspace/.devcontainer/devcontainer.json',
        '/workspace/.devcontainer/Dockerfile',
        '/workspace/.devcontainer/.env.template',
        '/workspace/.devcontainer/.dockerignore',
        '/app/validate_gpu.py',
        '/app/tests/',
    )
    all_present = True
    for path in required_paths:
        if not os.path.exists(path):
            print("missing:", path)
            all_present = False
            continue
        print("ok:", path)
    return all_present


def test_uv() -> bool:
    """Run ``uv --version`` and report whether it succeeded.

    Prints the command's stdout, or its stderr when stdout is empty.

    Returns:
        True when the command exits with status 0; False when it exits
        non-zero or the ``uv`` executable cannot be found.
    """
    section("UV")
    try:
        proc = subprocess.run(['uv', '--version'], capture_output=True, text=True)
    except FileNotFoundError:
        print('uv not in PATH')
        return False

    banner = proc.stdout.strip()
    if not banner:
        banner = proc.stderr.strip()
    print(banner)
    return proc.returncode == 0


def test_pytorch() -> bool:
    """Check that PyTorch can run a small computation on ``cuda:0``.

    Prints the torch version and CUDA availability, and, when CUDA is
    available, the sum of a 512x512 tensor of ones created on the GPU.

    Returns:
        True when the GPU computation ran; False when CUDA is unavailable or
        any exception was raised (the exception is printed).
    """
    section("PYTORCH")
    try:
        import torch

        print("version:", torch.__version__)
        cuda_available = torch.cuda.is_available()
        print("cuda:", cuda_available)
        if not cuda_available:
            return False

        gpu = torch.device('cuda:0')
        total = torch.ones(512, 512, device=gpu).sum()
        print("sum:", total.item())
        return True
    except Exception as err:
        print("error:", err)
        return False


def test_jax() -> bool:
    """Check that JAX sees a GPU and can execute a small computation on it.

    Prints all JAX devices, selects the GPU devices, and sums a 512x512
    float32 array of ones placed on the first GPU.

    Returns:
        True when the computation ran on a GPU; False when no GPU device was
        found or any exception was raised (the exception is printed).
    """
    section("JAX")
    try:
        import jax
        import jax.numpy as jnp

        # Show all devices for visibility.
        devs = jax.devices()
        print("devices:", devs)

        # Prefer the filtered query. jax.devices("gpu") raises RuntimeError
        # instead of returning an empty list when no GPU backend is present,
        # so treat that as "none found" and let the fallback below run.
        try:
            gpus = jax.devices("gpu")
        except RuntimeError:
            gpus = []

        # Fallback: match on the platform attribute or the device's string
        # form (e.g. "CudaDevice(id=0)").
        if not gpus:
            gpus = [
                d for d in devs
                if getattr(d, "platform", "").lower() in {"gpu", "cuda"} or "cuda" in str(d).lower()
            ]

        if not gpus:
            print("no gpu devices detected by jax")
            return False

        # Tiny compute on the first GPU. block_until_ready waits for the
        # result, so execution errors surface inside this try block.
        x = jnp.ones((512, 512), dtype=jnp.float32)
        x = jax.device_put(x, gpus[0])
        s = jnp.sum(x).block_until_ready()
        print("sum:", float(s))
        return True
    except Exception as e:
        print("error:", e)
        return False



def main() -> int:
    """Run every check in order, print a summary line, and return an exit code.

    Returns:
        0 when the structure, UV, PyTorch and JAX checks all returned a
        truthy result, otherwise 1.
    """
    checks = (
        ("structure:", test_structure),
        ("uv:", test_uv),
        ("pytorch:", test_pytorch),
        ("jax:", test_jax),
    )
    # Every check runs, in this order, before the summary is printed.
    outcomes = [(label, check()) for label, check in checks]

    section("SUMMARY")
    print(*(item for pair in outcomes for item in pair))
    return 0 if all(passed for _, passed in outcomes) else 1


if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())


Overwriting ../../.devcontainer/tests/test_summary.py
