In [85]:
%%writefile package.json
{
  "name": "fastapi-react-monorepo",
  "private": true,
  "scripts": {
    "install:all": "python -m venv .venv && .venv\\Scripts\\python.exe -m pip install uv && .venv\\Scripts\\uv.exe pip install -e api && .venv\\Scripts\\python.exe -m pip install bcrypt passlib[bcrypt] python-dotenv && (cd web && npm install)",
    "seed": ".venv\\Scripts\\python.exe api/scripts/seed_user.py",
    "dev": "concurrently -n \"API,WEB\" -c \"cyan,magenta\" \".venv\\Scripts\\python.exe -m uvicorn api.app.main:app --reload --env-file .env\" \"npm --prefix web run dev\"",
    "dev:backend": "python api/scripts/ensure_models.py && .venv\\Scripts\\python.exe -m uvicorn api.app.main:app --reload --env-file .env",
    "dev:full": "concurrently -n \"API,WEB\" -c \"cyan,magenta\" \"npm run dev:backend\" \"npm --prefix web run dev\"",
    "backend": ".venv\\Scripts\\python.exe -m uvicorn api.app.main:app --host 0.0.0.0 --port 8000 --env-file .env",
    "backend:dev": ".venv\\Scripts\\python.exe -m uvicorn api.app.main:app --reload --host 0.0.0.0 --port 8000 --env-file .env",
    "backend:fast": "set \"SKIP_BACKGROUND_TRAINING=1\" && .venv\\Scripts\\python.exe -m uvicorn api.app.main:app --host 0.0.0.0 --port 8000 --env-file .env",
    "frontend": "npm --prefix web run dev",
    "ensure:models": "python api/scripts/ensure_models.py",
    "test:self-healing": "python test_self_healing.py",
    "test:import": "python test_import.py",
    "build:web": "npm --prefix web run build",
    "debug": "timeout /T 3 && curl -s http://127.0.0.1:8000/api/health && echo. && curl -s -X POST -d \"username=alice&password=secret\" -H \"Content-Type: application/x-www-form-urlencoded\" http://127.0.0.1:8000/api/token"
  },
  "devDependencies": {
    "concurrently": "^8.2.2"
  }
}


Overwriting package.json


In [86]:
%%writefile invoke.yml
# invoke.yml
# Named task lists: each key under `tasks` maps to shell commands listed in the
# order they are meant to run.
# NOTE(review): stock `invoke` discovers tasks from a tasks.py module and uses
# invoke.yaml only for configuration — confirm which runner consumes this file.
tasks:
  # dev: create the uv virtualenv, sync dependencies, then start the API with auto-reload.
  dev:
    - uv venv
    - uv sync
    - uvicorn api.app.main:app --reload
  # test: install the test tooling, then run pytest in quiet mode.
  test:
    - uv pip install pytest coverage
    - pytest -q
  # lint: install the formatters/linter, then run each over the working tree.
  lint:
    - uv pip install black isort flake8
    - black .
    - isort .
    - flake8


Overwriting invoke.yml


In [87]:
%%writefile .gitignore
# Local environment files (hold secrets such as SECRET_KEY / RAILWAY_TOKEN)
.env
dev.env
.devcontainer/.env.runtime

# MLflow stores
mlruns/
mlflow_db/
mlruns_local/

# Node dependencies
node_modules/
frontend/node_modules/

# Archived material, local virtualenv and uv lock file
archive/
.venv
uv.lock

test_iris.json
# Rule below is intentionally disabled, so .env.template is NOT ignored.
#.env.template

# Railway CLI (never commit tokens)
.railway/config.json

Overwriting .gitignore


In [88]:
%%writefile .env.template
# Template for the local .env file. The real .env is git-ignored; values here are
# placeholders / local defaults and must not contain live secrets.
# Container build settings and host port mappings.
# NOTE(review): presumably consumed by the dev-container / compose setup — confirm.
ENV_NAME=react_fastapi_railway
CUDA_TAG=12.8.0
DOCKER_BUILDKIT=1
HOST_JUPYTER_PORT=8890
HOST_TENSORBOARD_PORT=6008
HOST_EXPLAINER_PORT=8050
HOST_STREAMLIT_PORT=8501
HOST_MLFLOW_PORT=5000
HOST_APP_PORT=5100
HOST_BACKEND_DEV_PORT=5002
# api/app/core/config.py reads MLFLOW_TRACKING_URI and falls back to file:./mlruns_local when unset.
MLFLOW_TRACKING_URI=http://mlflow:5000
MLFLOW_VERSION=2.12.2
PYTHON_VER=3.10
# JAX / XLA runtime tuning.
JAX_PLATFORM_NAME=gpu
XLA_PYTHON_CLIENT_PREALLOCATE=true
XLA_PYTHON_CLIENT_ALLOCATOR=platform
XLA_PYTHON_CLIENT_MEM_FRACTION=0.95
XLA_FLAGS=--xla_force_host_platform_device_count=1
JAX_DISABLE_JIT=false
JAX_ENABLE_X64=false
TF_FORCE_GPU_ALLOW_GROWTH=false
JAX_PREALLOCATION_SIZE_LIMIT_BYTES=8589934592

# Railway CLI token (leave empty here; never commit a real token) and service URLs.
RAILWAY_TOKEN=
RAILWAY_VITE_API_URL=https://fastapi-production-1d13.up.railway.app
VITE_API_URL=http://127.0.0.1:8000
REACT_APP_API_URL=https://react-frontend-production-2805.up.railway.app

# Auth and seeding: SECRET_KEY signs the JWTs (api/app/security.py);
# USERNAME_KEY / USER_PASSWORD are read by api/scripts/seed_user.py.
# Replace all three with real values outside local development.
SECRET_KEY=change-me-in-prod
USERNAME_KEY=alice
USER_PASSWORD=supersecretvalue
DATABASE_URL=sqlite+aiosqlite:///./app.db

# CORS
ALLOWED_ORIGINS=*

# Model training flags
# api/app/core/config.py treats each flag as enabled only when the value is exactly "1".
SKIP_BACKGROUND_TRAINING=0
AUTO_TRAIN_MISSING=1
UNIT_TESTING=0 


# NOTE(review): the values below look like identifiers of one specific Railway
# deployment — confirm they belong in a committed template rather than being
# provided by the platform at runtime.
RAILWAY_ENVIRONMENT=production
RAILWAY_ENVIRONMENT_ID=fa10dc06-75ec-4c11-93d4-a0fde17996d0
RAILWAY_ENVIRONMENT_NAME=production
RAILWAY_PRIVATE_DOMAIN=empowering-appreciation.railway.internal
RAILWAY_PROJECT_ID=fc9da558-31d6-4b28-9eda-2bbe56cc7390
RAILWAY_PROJECT_NAME=responsible-abundance
RAILWAY_SERVICE_ID=87c129ab-ba49-471a-88bb-853ace60180d
RAILWAY_SERVICE_NAME=empowering-appreciation


Overwriting .env.template


In [89]:
%%writefile logging.yaml
# Logging configuration in the logging.config.dictConfig schema.
version: 1
disable_existing_loggers: False
formatters:
  default:
    format: "[%(levelname).1s] %(asctime)s %(name)s ▶ %(message)s"
handlers:
  console:
    class: logging.StreamHandler
    formatter: default
  file:
    class: logging.FileHandler
    # The logs/ directory must exist before this configuration is loaded.
    filename: logs/backend.log
    formatter: default
    # The format string (▶) and many application messages are non-ASCII; without an
    # explicit encoding the platform default is used, which fails on e.g. cp1252.
    encoding: utf-8
loggers:
  # Every logger that attaches handlers itself disables propagation; otherwise
  # each record would be emitted a second time by the identical root handlers.
  uvicorn.error:
    level: INFO
    handlers: [console, file]
    propagate: False
  uvicorn.access:
    level: INFO
    handlers: [console, file]
    propagate: False
  app:
    level: DEBUG
    handlers: [console, file]
    propagate: False
  app.services.ml.model_service:
    level: DEBUG
    handlers: [console, file]
    propagate: False
root:
  level: INFO
  handlers: [console, file]

Overwriting logging.yaml


In [90]:
%%writefile pyproject.toml
[project]
name = "react_fastapi_railway"
version = "0.1.0"
description = "Pytorch and Jax GPU docker container"
authors = [
  { name = "Geoffrey Hadfield" },
]
license = "MIT"
readme = "README.md"

# ─── Restrict to Python 3.10–3.12 ──────────────────────────────
requires-python = ">=3.10,<3.13"

# Runtime dependencies installed for the repository-root environment.
dependencies = [
  # Core web framework
  "fastapi>=0.104.0",
  "uvicorn[standard]>=0.24.0",
  "python-dotenv>=1.0.0",

  # Settings and validation
  "pydantic>=2.0.0",
  "pydantic-settings>=2.0.0",

  # HTTP client and multipart parsing
  "httpx>=0.24.0",
  "python-multipart>=0.0.6",

  # Data & ML basics
  "numpy>=1.24.0",
  "pandas>=2.1.0",
  "scikit-learn>=1.3.0",
  "mlflow>=2.8.0",

  # (Your existing extras—keep if you still need them)
  "matplotlib>=3.4.0",
  "pymc>=5.0.0",
  "arviz>=0.14.0",
  "statsmodels>=0.13.0",
  "jupyterlab>=3.0.0",
  "seaborn>=0.11.0",
  "tabulate>=0.9.0",
  "shap>=0.40.0",
  "xgboost>=1.5.0",
  "lightgbm>=3.3.0",
  "catboost>=1.2.8,<1.3.0",
  "scipy>=1.7.0",
  "shapash[report]>=2.3.0",
  "shapiq>=0.1.0",
  "explainerdashboard==0.5.1",
  "ipywidgets>=8.0.0",
  "nutpie>=0.7.1",
  "numpyro>=0.18.0,<1.0.0",
  "jax==0.6.0",
  "jaxlib==0.6.0",
  "pytensor>=2.18.3",
  "aesara>=2.9.4",
  "tqdm>=4.67.0",
  "pyarrow>=12.0.0",
  "optuna>=3.0.0",
  "optuna-integration[mlflow]>=0.2.0",
  "omegaconf>=2.3.0,<2.4.0",
  "hydra-core>=1.3.2,<1.4.0",

  # Database and auth stack used by the API package (api/app).
  # sqlalchemy is imported by api/app/db.py, models.py and crud.py; it was only
  # declared in api/pyproject.toml, so an environment built from this file alone
  # could not import the app. Same lower bound as api/pyproject.toml.
  "sqlalchemy>=2.0.23",
  "aiosqlite>=0.19.0",
  "python-jose[cryptography]>=3.3.0",
  "passlib[bcrypt]>=1.7.4",
  "bcrypt==4.0.1",  # Pin bcrypt version to resolve warning
]

[project.optional-dependencies]
dev = [
  "pytest>=7.0.0",
  "black>=23.0.0",
  "isort>=5.0.0",
  "flake8>=5.0.0",
  "mypy>=1.0.0",
  "invoke>=2.2",
]

cuda = [
  "cupy-cuda12x>=12.0.0",
]

[tool.pytensor]
device    = "cuda"
floatX    = "float32"
allow_gc  = true
optimizer = "fast_run"



Overwriting pyproject.toml


In [91]:
%%writefile api/pyproject.toml
[project]
name = "api"
version = "1.0.0"
description = "FastAPI backend with React frontend"
# The package uses PEP 604 unions in evaluated annotations (e.g. `str | None` in
# app/ml/utils.py), which need Python 3.10 or newer.
requires-python = ">=3.10"
dependencies = [
    "fastapi>=0.104.0",
    "uvicorn>=0.24.0",
    "sqlalchemy>=2.0.23",
    "aiosqlite>=0.19.0",
    "python-jose[cryptography]>=3.3.0",
    "passlib[bcrypt]>=1.7.4",
    "python-multipart>=0.0.6",
    "pydantic>=2.4.2",
    "bcrypt==4.0.1",  # Pin bcrypt version to resolve warning
    # Imported by scripts/seed_user.py when a .env file exists, and needed by
    # uvicorn's --env-file option used in the npm scripts.
    "python-dotenv>=1.0.0",
    # ML dependencies
    "mlflow>=2.8.0",
    "scikit-learn>=1.3.0",
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "pymc>=5.7.0",
    "arviz>=0.15.0",
    "requests>=2.31.0",
    "jax>=0.4.23",
    "jaxlib>=0.4.23"
]

[project.optional-dependencies]
dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
    "httpx>=0.24.0"
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["app"]




Overwriting api/pyproject.toml


In [92]:
%%writefile api/railway.json
{
  "$schema": "https://railway.app/railway.schema.json",
  "build": { "builder": "NIXPACKS" },
  "deploy": {
    "startCommand": "bash ./start.sh",
    "healthcheckPath": "/api/v1/health",
    "healthcheckInterval": 10,
    "healthcheckTimeout": 300,
    "restartPolicyType": "ON_FAILURE",
    "restartPolicyMaxRetries": 10
  }
}


Overwriting api/railway.json


In [93]:
%%writefile api/start.sh
#!/usr/bin/env bash
# Entrypoint for the FastAPI backend service.
# Steps: load an optional local .env, validate required variables, print a
# redacted environment summary, seed the database, then exec uvicorn.
set -euo pipefail

# ── optional local .env ------------------------------------------------------
# Parsed line by line instead of `export $(grep … | xargs)`: values containing
# spaces stay intact, nothing in the file is executed, and a file that holds
# only comments no longer turns into a bare `export` that prints every exported
# variable. Runs BEFORE the sanity checks so a local .env can satisfy them.
# As before, values from .env override variables already in the environment.
load_dotenv() {
  local file="$1" line key value
  local dq='^"(.*)"$' sq="^'(.*)'\$" name_re='^[A-Za-z_][A-Za-z0-9_]*$'
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"                               # tolerate CRLF files
    if [[ -z "${line//[[:space:]]/}" || "$line" =~ ^[[:space:]]*# ]]; then
      continue                                         # blank line or comment
    fi
    if [[ "$line" != *=* ]]; then
      continue                                         # not KEY=VALUE
    fi
    key="${line%%=*}"
    key="${key//[[:space:]]/}"
    if [[ ! "$key" =~ $name_re ]]; then
      continue                                         # not a valid variable name
    fi
    value="${line#*=}"
    value="${value%"${value##*[![:space:]]}"}"         # trim trailing whitespace
    if [[ "$value" =~ $dq || "$value" =~ $sq ]]; then
      value="${BASH_REMATCH[1]}"                       # drop one pair of surrounding quotes
    fi
    export "$key=$value"
  done < "$file"
}

if [[ -f .env ]]; then
  load_dotenv .env
fi

# ── sanity ─────────────────────────────────────────────────────────
if [[ -z "${PORT:-}" ]]; then
  echo "❌  PORT not set – Railway always provides it." >&2
  exit 1
fi

if [[ -z "${SECRET_KEY:-}" ]]; then
  echo "❌  SECRET_KEY is not set for the backend service – aborting." >&2
  exit 1
fi

echo "🚀  FastAPI boot; PORT=$PORT  PY=$(python -V)"
# Environment summary with secrets redacted: any variable whose name contains
# TOKEN / SECRET / PASSWORD is masked, and the credentials part of DATABASE_URL
# (user:password@) is replaced. `|| true` keeps `pipefail` from aborting the
# boot if grep selects nothing.
env | grep -E 'RAILWAY_|PORT|DATABASE_URL' \
  | sed -E \
      -e 's/^([^=]*(TOKEN|SECRET|PASSWORD)[^=]*)=.*/\1=***/' \
      -e 's#^(DATABASE_URL=[^:/@]+://)[^@/]*@#\1***@#' \
  || true

# ── one-shot DB migrate + seed (blocks until done) ---------------------------
python -m scripts.seed_user

# ── run the API --------------------------------------------------------------
# exec replaces the shell so uvicorn receives signals directly.
exec uvicorn app.main:app \
  --host 0.0.0.0 --port "$PORT" \
  --proxy-headers --forwarded-allow-ips="*" --log-level info



Overwriting api/start.sh


In [94]:
%%writefile api/scripts/seed_user.py
"""Create the database tables and create or update the seed user.

Reads ``USERNAME_KEY`` / ``USER_PASSWORD`` for the credentials and
``DATABASE_URL`` for the target database (same variable and default as
``app/db.py``), optionally loading them from the repository-root ``.env``.
"""
from pathlib import Path
from passlib.context import CryptContext
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
from sqlalchemy import select
import os, asyncio

# ── optional .env load (UTF-8 only) ───────────────────────────────
ENV_PATH = Path(__file__).resolve().parents[2] / ".env"
if ENV_PATH.exists():
    try:
        from dotenv import load_dotenv
        load_dotenv(ENV_PATH, encoding="utf-8")
    except ImportError:
        # python-dotenv is optional here; fall back to the process environment.
        print("⚠️  python-dotenv not installed – .env skipped")
    except UnicodeDecodeError:
        print("⚠️  .env not UTF-8 – skipped")

# ── model import (kept same) ──────────────────────────────────────
# Make the api/ directory importable so `app.models` resolves when the script
# is run directly by file path.
import sys; sys.path.append(str(Path(__file__).resolve().parents[1]))
from app.models import Base, User

USERNAME = os.getenv("USERNAME_KEY", "alice")
PASSWORD = os.getenv("USER_PASSWORD", "supersecretvalue")
# Seed the same database the application connects to; a hard-coded SQLite URL
# would silently seed a different database whenever DATABASE_URL is set.
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///./app.db")

pwd = CryptContext(schemes=["bcrypt"], deprecated="auto")
engine = create_async_engine(DATABASE_URL)
session_factory = async_sessionmaker(engine, expire_on_commit=False)

async def main():
    """Ensure all tables exist, then insert the seed user or reset its password."""
    try:
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        async with session_factory() as db:
            result = await db.execute(select(User).where(User.username == USERNAME))
            user = result.scalar_one_or_none()
            hashed = pwd.hash(PASSWORD)

            if user:
                user.hashed_password = hashed
                action = "Updated"
            else:
                db.add(User(username=USERNAME, hashed_password=hashed))
                action = "Created"
            await db.commit()
            print(f"{action} user {USERNAME}")
    finally:
        # Release pooled connections before the event loop is torn down.
        await engine.dispose()

if __name__ == "__main__":
    asyncio.run(main())




Overwriting api/scripts/seed_user.py


In [95]:
%%writefile api/app/__init__.py
# FastAPI backend package 

Overwriting api/app/__init__.py


In [96]:
%%writefile api/app/core/config.py
"""
Core configuration settings for the FastAPI application.
Centralizes environment variables and provides sensible defaults.
"""

import os
from typing import Optional

class Settings:
    """Environment-driven application configuration.

    Every attribute is evaluated once, when the class body executes, from the
    process environment; unset variables fall back to the defaults shown here.
    Boolean flags are true only when the variable is exactly ``"1"``.
    """

    # -- persistence ---------------------------------------------------------
    DATABASE_URL: str = os.environ.get("DATABASE_URL", "sqlite+aiosqlite:///./app.db")

    # -- auth ----------------------------------------------------------------
    SECRET_KEY: Optional[str] = os.environ.get("SECRET_KEY")  # None when unset
    ACCESS_TOKEN_EXPIRE_MINUTES: int = int(
        os.environ.get("ACCESS_TOKEN_EXPIRE_MINUTES", "30")
    )

    # -- CORS ----------------------------------------------------------------
    ALLOWED_ORIGINS: str = os.environ.get("ALLOWED_ORIGINS", "*")

    # -- MLflow (local file store unless overridden) -------------------------
    MLFLOW_TRACKING_URI: str = os.environ.get("MLFLOW_TRACKING_URI", "file:./mlruns_local")
    # Registry falls back to whatever the tracking URI resolved to.
    MLFLOW_REGISTRY_URI: str = os.environ.get("MLFLOW_REGISTRY_URI", MLFLOW_TRACKING_URI)

    # -- model training flags ------------------------------------------------
    SKIP_BACKGROUND_TRAINING: bool = "1" == os.environ.get("SKIP_BACKGROUND_TRAINING", "0")
    AUTO_TRAIN_MISSING: bool = "1" == os.environ.get("AUTO_TRAIN_MISSING", "1")
    UNIT_TESTING: bool = "1" == os.environ.get("UNIT_TESTING", "0")


# Shared instance imported by the rest of the application.
settings = Settings()

Overwriting api/app/core/config.py


In [97]:
%%writefile api/app/crud.py
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from .models import User

async def get_user_by_username(db: AsyncSession, username: str):
    """Look up a single ``User`` by exact username.

    Args:
        db: Async session used to run the query.
        username: Value compared for equality against ``User.username``.

    Returns:
        The matching ``User`` instance, or ``None`` when no row matches.
    """
    query = select(User).where(User.username == username)
    return (await db.execute(query)).scalar_one_or_none()

Overwriting api/app/crud.py


In [98]:
%%writefile api/app/models.py
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class User(Base):
    """Account record stored in the ``users`` table."""
    __tablename__ = "users"
    # Integer primary key (indexed).
    id = Column(Integer, primary_key=True, index=True)
    # Login name; a unique constraint and an index are declared on it.
    username = Column(String, unique=True, index=True)
    # Password hash; the seed script fills it with a passlib bcrypt hash.
    hashed_password = Column(String) 

Overwriting api/app/models.py


In [99]:
%%writefile api/app/db.py
# api/app/db.py
from contextlib import asynccontextmanager
import os, logging, asyncio
from sqlalchemy.ext.asyncio import (
    AsyncSession,
    create_async_engine,
    async_sessionmaker,
)
from .models import Base
from .services.ml.model_service import model_service
from .core.config import settings

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Database engine & session factory (module-level singletons – cheap & safe)
# ---------------------------------------------------------------------------
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///./app.db")
engine = create_async_engine(DATABASE_URL, echo=False, future=True)
AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False)

# Global readiness flag
_app_ready: bool = False

def get_app_ready():
    """Return the module-level readiness flag maintained by ``lifespan``."""
    ready = _app_ready
    return ready

# ---------------------------------------------------------------------------
# FastAPI lifespan – runs ONCE at startup / shutdown
# ---------------------------------------------------------------------------
@asynccontextmanager
async def lifespan(app):
    """FastAPI lifespan handler: set up on startup, clean up on shutdown.

    Startup creates all tables, then (unless ``settings.UNIT_TESTING`` is set)
    initialises ``model_service`` and launches ``model_service.startup()`` as a
    background task. The readiness flag is set to True in every branch, even
    when model initialisation fails, so the API can still serve requests.

    Shutdown cancels the background task if it is still running and disposes
    the database engine.

    Args:
        app: The application instance passed by the framework (unused here).
    """
    global _app_ready

    # The event loop only keeps weak references to tasks, so the handle is held
    # here for the whole application lifetime and cancelled on shutdown.
    training_task = None

    # Render the URL with the password masked; the raw DATABASE_URL may embed credentials.
    logger.info(
        "🗄️  Initializing database…  URL=%s",
        engine.url.render_as_string(hide_password=True),
    )
    try:
        async with engine.begin() as conn:
            # DDL is safe here; it blocks startup until complete
            await conn.run_sync(Base.metadata.create_all)
        logger.info("✅ Database tables created/verified successfully")

        # Initialize application readiness
        logger.info("🚀 Startup event starting - _app_ready=%s", _app_ready)

        if settings.UNIT_TESTING:
            logger.info("🔒 UNIT_TESTING=1 – startup hooks bypassed")
            _app_ready = True
            logger.info("✅ _app_ready set to True (unit testing)")
        else:
            try:
                # Initialize ModelService first
                logger.info("🔧 Initializing ModelService")
                await model_service.initialize()
                logger.info("✅ ModelService initialized successfully")

                # Start background training tasks
                logger.info("🔄 Starting background training tasks")
                training_task = asyncio.create_task(model_service.startup())
                logger.info("✅ Background training tasks started")

                # Set ready to true after initialization (models will load in background)
                _app_ready = True
                logger.info("🚀 FastAPI ready – _app_ready=%s, health probes will pass immediately", _app_ready)

            except Exception as e:
                # logger.exception records the traceback alongside the message.
                logger.exception("❌ Startup event failed: %s", e)
                # Set ready to true anyway so the API can serve requests
                _app_ready = True
                logger.warning("⚠️  Setting _app_ready=True despite startup errors")

        logger.info("🎯 Lifespan startup complete - _app_ready=%s", _app_ready)
        yield
    finally:
        if training_task is not None:
            if not training_task.done():
                # Stop background training before the engine and loop go away.
                logger.info("🛑 Cancelling background training task…")
                training_task.cancel()
                try:
                    await training_task
                except asyncio.CancelledError:
                    pass
                except Exception as e:
                    logger.error("❌ Background training task failed during shutdown: %s", e)
            elif not training_task.cancelled() and training_task.exception() is not None:
                # Retrieve the exception so it is reported instead of being dropped.
                logger.error("❌ Background training task failed: %s", training_task.exception())

        logger.info("🔒 Disposing database engine…")
        await engine.dispose()

# ---------------------------------------------------------------------------
# Dependency injection helper
# ---------------------------------------------------------------------------
async def get_db() -> AsyncSession:
    """Yield a new DB session per request.

    Async-generator dependency: a session is opened from ``AsyncSessionLocal``
    and yielded to the caller; leaving the ``async with`` block once the
    generator is resumed or closed handles the session's cleanup.
    """
    # NOTE(review): the annotation names the yielded type; the function itself
    # is an async generator.
    async with AsyncSessionLocal() as session:
        yield session



Overwriting api/app/db.py


In [100]:
%%writefile api/app/security.py
from __future__ import annotations
import os, logging, secrets
from datetime import datetime, timedelta
from typing import Optional

from fastapi import Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from jose import jwt, JWTError
from passlib.context import CryptContext
from pydantic import BaseModel

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# 1.  SECRET_KEY ***must*** be provided in the environment in production.
# ---------------------------------------------------------------------------
SECRET_KEY = os.getenv("SECRET_KEY")
if not SECRET_KEY:
    # No key configured (unset or empty): log at CRITICAL and fall back to a
    # random key generated at import time so the module still loads. Tokens
    # signed with it cannot be verified by a later process.
    log.critical(
        "ENV variable SECRET_KEY is missing -- generating a temporary key. "
        "ALL issued JWTs will be invalid after a pod restart! "
        "Set it in Railway → Variables to disable this warning."
    )
    SECRET_KEY = secrets.token_urlsafe(32)   # fallback only for dev

# Signing algorithm handed to jwt.encode / jwt.decode below.
ALGORITHM = "HS256"
# Token lifetime in minutes; read from the environment, default 30.
ACCESS_TOKEN_EXPIRE_MINUTES = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES", 30))

# bcrypt hashing context used by verify_password / get_password_hash.
pwd_ctx = CryptContext(schemes=["bcrypt"], deprecated="auto")
# Bearer-token dependency consumed by get_current_user; tokenUrl names the login endpoint.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/token")

class TokenData(BaseModel):
    """Pydantic model carrying an optional username (defaults to ``None``)."""
    username: Optional[str] = None

def verify_password(raw: str, hashed: str) -> bool:
    """Check a plain-text password against a stored hash.

    Args:
        raw: Password as supplied by the user.
        hashed: Stored hash to compare against.

    Returns:
        Whatever ``pwd_ctx.verify`` reports for the pair.
    """
    matches = pwd_ctx.verify(raw, hashed)
    return matches

def get_password_hash(pw: str) -> str:
    """Hash a plain-text password with the module's ``pwd_ctx`` context.

    Args:
        pw: Password to hash.

    Returns:
        The hash string produced by ``pwd_ctx.hash``.
    """
    digest = pwd_ctx.hash(pw)
    return digest

def create_access_token(subject: str, expires_delta: Optional[timedelta] = None) -> str:
    """Issue a signed JWT for *subject*.

    Args:
        subject: Value stored in the token's ``sub`` claim (the username).
        expires_delta: Optional lifetime override. When omitted, the token
            lives for ``ACCESS_TOKEN_EXPIRE_MINUTES`` minutes.

    Returns:
        The encoded token, signed with ``SECRET_KEY`` using ``ALGORITHM``.
    """
    # Local import: the module header only brings in datetime and timedelta.
    from datetime import timezone

    lifetime = (
        expires_delta
        if expires_delta is not None
        else timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    )
    # Timezone-aware "now": datetime.utcnow() is deprecated and returns a naive
    # value that is easy to misread as local time.
    expire = datetime.now(timezone.utc) + lifetime
    return jwt.encode({"sub": subject, "exp": expire}, SECRET_KEY, algorithm=ALGORITHM)

async def get_current_user(token: str = Depends(oauth2_scheme)) -> str:
    """Resolve the username carried by a bearer token.

    Args:
        token: Bearer token supplied by the ``oauth2_scheme`` dependency.

    Returns:
        The non-empty string found in the token's ``sub`` claim.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Bearer`` header when the
            token cannot be decoded or verified, or when ``sub`` is missing,
            empty or not a string.
    """
    # Bearer-token endpoints answer 401 with a WWW-Authenticate challenge so
    # clients can tell which scheme to retry with.
    challenge = {"WWW-Authenticate": "Bearer"}

    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError as exc:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED, headers=challenge
        ) from exc

    username = payload.get("sub")
    # Reject a missing, empty or non-string subject rather than returning it.
    if not isinstance(username, str) or not username:
        raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, headers=challenge)
    return username


Overwriting api/app/security.py


# additional models

In [101]:
%%writefile api/app/schemas/cancer.py
from pydantic import BaseModel, Field
from typing import List, Optional

class CancerFeatures(BaseModel):
    """Breast cancer diagnostic features.

    Thirty required float fields in three groups of ten: ``mean_*``, ``se_*``
    (standard error) and ``worst_*``, each covering the same ten measurements
    (radius, texture, perimeter, area, smoothness, compactness, concavity,
    concave points, symmetry, fractal dimension).
    """
    # Mean features
    mean_radius: float = Field(..., description="Mean of distances from center to points on perimeter")
    mean_texture: float = Field(..., description="Standard deviation of gray-scale values")
    mean_perimeter: float = Field(..., description="Mean size of the core tumor")
    mean_area: float = Field(..., description="Mean area of the core tumor")
    mean_smoothness: float = Field(..., description="Mean of local variation in radius lengths")
    mean_compactness: float = Field(..., description="Mean of perimeter^2 / area - 1.0")
    mean_concavity: float = Field(..., description="Mean of severity of concave portions of the contour")
    mean_concave_points: float = Field(..., description="Mean for number of concave portions of the contour")
    mean_symmetry: float = Field(..., description="Mean symmetry")
    mean_fractal_dimension: float = Field(..., description="Mean for 'coastline approximation' - 1")
    
    # SE features (standard error)
    se_radius: float = Field(..., description="Standard error of radius")
    se_texture: float = Field(..., description="Standard error of texture")
    se_perimeter: float = Field(..., description="Standard error of perimeter")
    se_area: float = Field(..., description="Standard error of area")
    se_smoothness: float = Field(..., description="Standard error of smoothness")
    se_compactness: float = Field(..., description="Standard error of compactness")
    se_concavity: float = Field(..., description="Standard error of concavity")
    se_concave_points: float = Field(..., description="Standard error of concave points")
    se_symmetry: float = Field(..., description="Standard error of symmetry")
    se_fractal_dimension: float = Field(..., description="Standard error of fractal dimension")
    
    # Worst features
    worst_radius: float = Field(..., description="Worst radius")
    worst_texture: float = Field(..., description="Worst texture")
    worst_perimeter: float = Field(..., description="Worst perimeter")
    worst_area: float = Field(..., description="Worst area")
    worst_smoothness: float = Field(..., description="Worst smoothness")
    worst_compactness: float = Field(..., description="Worst compactness")
    worst_concavity: float = Field(..., description="Worst concavity")
    worst_concave_points: float = Field(..., description="Worst concave points")
    worst_symmetry: float = Field(..., description="Worst symmetry")
    worst_fractal_dimension: float = Field(..., description="Worst fractal dimension")

class CancerPredictRequest(BaseModel):
    """Cancer prediction request (allows 'rows' alias).

    The sample list may be sent either as ``samples`` or under its alias
    ``rows``; any other unknown key is rejected.
    """
    # Pydantic v2 configuration, replacing the deprecated nested `class Config`.
    #   populate_by_name     – accept the field name `samples` as well as the alias `rows`
    #   extra                – reject unknown keys in the payload
    #   protected_namespaces – cleared so the `model_type` field does not trigger
    #                          the "model_" protected-namespace warning
    model_config = {
        "populate_by_name": True,
        "extra": "forbid",
        "protected_namespaces": (),
    }

    model_type: str = Field("bayes", description="Model type: 'bayes', 'logreg', or 'rf'")
    samples: List[CancerFeatures] = Field(
        ...,
        description="Breast-cancer feature vectors",
        alias="rows",
    )
    # Optional; when provided it must lie between 10 and 10,000 inclusive.
    posterior_samples: Optional[int] = Field(
        None, ge=10, le=10_000, description="Posterior draws for uncertainty"
    )

class CancerPredictResponse(BaseModel):
    """Cancer prediction response.

    ``predictions``, ``probabilities`` and ``input_received`` are required;
    ``uncertainties`` is optional and defaults to ``None``.
    """
    predictions: List[str] = Field(..., description="Predicted diagnosis (M=malignant, B=benign)")
    probabilities: List[float] = Field(..., description="Probability of malignancy")
    uncertainties: Optional[List[float]] = Field(None, description="Uncertainty estimates (if requested)")
    input_received: List[CancerFeatures] = Field(..., description="Echo of input features") 

Overwriting api/app/schemas/cancer.py


In [102]:
%%writefile api/app/schemas/iris.py
from pydantic import BaseModel, Field
from typing import List, Optional

class IrisFeatures(BaseModel):
    """Iris measurement features.

    Four required floats, each validated to lie within 0 and 10 inclusive
    (``ge=0``, ``le=10``).
    """
    sepal_length: float = Field(..., description="Sepal length in cm", ge=0, le=10)
    sepal_width: float = Field(..., description="Sepal width in cm", ge=0, le=10)
    petal_length: float = Field(..., description="Petal length in cm", ge=0, le=10)
    petal_width: float = Field(..., description="Petal width in cm", ge=0, le=10)

class IrisPredictRequest(BaseModel):
    """Iris prediction request (accepts legacy 'rows' alias).

    The sample list may be sent either as ``samples`` or under its alias
    ``rows``; any other unknown key is rejected.
    """
    # Pydantic v2 configuration, replacing the deprecated nested `class Config`.
    #   populate_by_name     – accept the field name `samples` as well as the alias `rows`
    #   extra                – reject unknown keys in the payload
    #   protected_namespaces – cleared so the `model_type` field does not trigger
    #                          the "model_" protected-namespace warning
    model_config = {
        "populate_by_name": True,
        "extra": "forbid",
        "protected_namespaces": (),
    }

    model_type: str = Field("rf", description="Model type: 'rf' or 'logreg'")
    samples: List[IrisFeatures] = Field(
        ...,
        description="List of iris measurements",
        alias="rows",
    )

class IrisPredictResponse(BaseModel):
    """Iris prediction response.

    All three fields are required; ``probabilities`` holds one list of floats
    per entry.
    """
    predictions: List[str] = Field(..., description="Predicted iris species")
    probabilities: List[List[float]] = Field(..., description="Class probabilities")
    input_received: List[IrisFeatures] = Field(..., description="Echo of input features") 

Overwriting api/app/schemas/iris.py


In [103]:
%%writefile api/app/ml/__init__.py
"""
ML sub-package – exposes built-in trainers so the service can import
`app.ml.builtin_trainers` with an absolute import.
"""

from .builtin_trainers import (
    train_iris_random_forest,
    train_breast_cancer_bayes,
    train_breast_cancer_stub,
)

__all__ = [
    "train_iris_random_forest",
    "train_breast_cancer_bayes",
    "train_breast_cancer_stub",
] 


Overwriting api/app/ml/__init__.py


In [104]:
%%writefile api/app/ml/utils.py
"""
Utility functions for ML model training and compiler detection.
"""

import shutil
import logging
import os
import subprocess
import shlex
import platform
import glob

log = logging.getLogger(__name__)

def find_compiler() -> str | None:
    """
    Return absolute path to a working C/C++ compiler or None.

    Searches in order:
    1. Explicit override via PYTENSOR_CXX environment variable
    2. Common compiler names (g++, gcc, cl.exe, cl)
    3. Windows Visual Studio BuildTools typical location (last resort)

    Returns:
        str | None: Absolute path to compiler executable, or None if not found
    """
    # 1️⃣ explicit override via env
    override = os.getenv("PYTENSOR_CXX")
    if override and shutil.which(override):
        log.info(f"Using compiler from PYTENSOR_CXX: {override}")
        return override

    # 2️⃣ try common names
    for exe in ("g++", "gcc", "cl.exe", "cl"):
        path = shutil.which(exe)
        if path:
            log.info(f"Found compiler: {path}")
            return path

    # 3️⃣ Windows VS BuildTools typical location (last resort)
    if platform.system() == "Windows":
        vswhere = r"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe"
        if os.path.exists(vswhere):
            try:
                log.debug("Probing for Visual Studio BuildTools via vswhere...")
                out = subprocess.check_output(
                    [vswhere, "-latest", "-products", "*", "-requires", 
                     "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", 
                     "-property", "installationPath"],
                    text=True,
                    timeout=5,
                ).strip()

                if out:
                    # Look for cl.exe in the typical location
                    cand = rf"{out}\VC\Tools\MSVC\*\bin\Hostx64\x64\cl.exe"
                    matches = glob.glob(cand)
                    if matches:
                        log.info(f"Found Visual Studio compiler: {matches[0]}")
                        return matches[0]
                    else:
                        log.debug(f"VS installation found at {out} but cl.exe not found")
                else:
                    log.debug("vswhere found no Visual Studio installations")

            except subprocess.TimeoutExpired:
                log.debug("vswhere probe timed out")
            except subprocess.CalledProcessError as exc:
                log.debug(f"vswhere probe failed with return code {exc.returncode}")
            except Exception as exc:
                log.debug(f"vswhere probe failed: {exc}")

    log.warning("No C/C++ compiler found")
    return None

def configure_pytensor_compiler(compiler_path: str | None = None) -> bool:
    """
    Configure PyTensor to use a specific compiler with MSVC-safe flags.

    Args:
        compiler_path: Path to compiler executable. If None, will search for one.

    Returns:
        bool: True if compiler was configured successfully, False otherwise
    """
    try:
        import pytensor  # late import so function can be called very early
    except ImportError:
        log.warning("PyTensor not available – cannot configure compiler")
        return False

    # 1️⃣ Resolve the compiler path ------------------------------------------------
    if compiler_path is None:
        compiler_path = find_compiler()
    if compiler_path is None:
        log.warning("No compiler found – PyTensor will fall back to defaults")
        return False

    # 2️⃣ Write settings into PyTensor's global config -----------------------------
    system_is_windows = platform.system() == "Windows"
    basename = os.path.basename(compiler_path).lower()

    if system_is_windows:
        # Quote path so spaces in "Program Files (x86)" don't break the command line
        pytensor.config.cxx = f'"{compiler_path}"'

        # If this *is* MSVC, strip every GCC flag and substitute safe disables
        if "cl" in basename:
            # MSVC understands /wdXXXX but not -Wno-…  ➜  map the common ones
            pytensor.config.cxxflags = "/wd4100 /wd4244 /wd4267 /wd4996"
            log.info("✅ Configured MSVC with safe warning suppressions")
    else:
        pytensor.config.cxx = compiler_path  # GCC / Clang path is fine

    # 3️⃣ NUCLEAR OPTION: Blank ALL environment variables that PyTensor uses to inject flags
    # PyTensor checks these environment variables in multiple places
    flag_vars = [
        "PYTENSOR_FLAGS",
        "THEANO_FLAGS",  # Legacy but still checked
        "PYTENSOR_CXXFLAGS",
        "THEANO_CXXFLAGS",  # Legacy but still checked
    ]
    
    for var in flag_vars:
        os.environ[var] = "cxxflags="
    
    # 4️⃣ Additional PyTensor config overrides to prevent flag injection
    if system_is_windows and "cl" in basename:
        # Disable PyTensor's automatic flag injection
        pytensor.config.mode = "FAST_COMPILE"  # Avoid some optimizations that inject flags
        pytensor.config.optimizer = "fast_compile"  # Use simpler optimizer
        
        # Set additional config to prevent GCC flag injection
        pytensor.config.cmodule__compilation_warning = False
        pytensor.config.cmodule__warn_no_version = False
        
        # Force PyTensor to use our flags only
        pytensor.config.cxxflags = "/wd4100 /wd4244 /wd4267 /wd4996"
        
        log.info("🛡️ Applied nuclear option: disabled all GCC flag injection")

    # 5️⃣ Optional verbose diagnostics --------------------------------------------
    if os.getenv("DEBUG_COMPILER") == "1":
        log.debug("PyTensor.cxx      = %s", pytensor.config.cxx)
        log.debug("PyTensor.cxxflags = %s", getattr(pytensor.config, "cxxflags", ""))
        log.debug("PYTENSOR_FLAGS    = %s", os.getenv("PYTENSOR_FLAGS", "NOT_SET"))
        log.debug("THEANO_FLAGS      = %s", os.getenv("THEANO_FLAGS", "NOT_SET"))

    log.info("🛠 PyTensor now uses compiler: %s", compiler_path)
    return True

def test_compiler_availability() -> dict:
    """
    Test what compilers are available on the system.

    Each candidate (g++, gcc, cl.exe, cl) is launched with a 5-second timeout
    and counts as available when the process exits with status 0. A candidate
    that cannot be started, times out or exits non-zero is reported as
    unavailable.

    Returns:
        dict: Mapping of compiler names to availability status
    """
    compilers = ["g++", "gcc", "cl.exe", "cl"]
    available = {}

    for compiler in compilers:
        # MSVC's cl has no --version switch (it would be treated as an unknown
        # option followed by a "missing source file" error), so it is launched
        # bare, which is expected to print its banner and exit 0.
        if compiler in ("cl", "cl.exe"):
            command = [compiler]
        else:
            command = [compiler, "--version"]

        try:
            result = subprocess.run(command, capture_output=True, text=True, timeout=5)
        except FileNotFoundError:
            log.debug("❌ %s: Not found", compiler)
            available[compiler] = False
            continue
        except subprocess.TimeoutExpired:
            log.debug("⏰ %s: Timeout", compiler)
            available[compiler] = False
            continue
        except Exception as e:
            log.debug("❌ %s: Error - %s", compiler, e)
            available[compiler] = False
            continue

        available[compiler] = result.returncode == 0
        if available[compiler]:
            # Some compilers print their banner on stderr; report the first
            # non-empty line. Blank output must not turn a successful probe
            # into a failure.
            banner = (result.stdout or result.stderr or "").strip()
            version = banner.splitlines()[0] if banner else "Unknown"
            log.info("✅ %s: Available", compiler)
            log.debug("   Version: %s", version)
        else:
            log.debug("❌ %s: Not available (return code: %s)", compiler, result.returncode)

    return available


Overwriting api/app/ml/utils.py


In [105]:
%%writefile api/app/ml/builtin_trainers.py
# api/ml/builtin_trainers.py
"""
Built-in trainers for Iris RF and Breast-Cancer Bayesian LogReg.
Executed automatically by ModelService when a model is missing.
"""

import logging
logger = logging.getLogger(__name__)

from pathlib import Path
import mlflow, mlflow.sklearn, mlflow.pyfunc
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
import tempfile
import pickle
import warnings
import subprocess
import os
import platform

# Conditional imports for heavy dependencies.
# PyMC and ArviZ are imported only when neither UNIT_TESTING nor
# SKIP_BACKGROUND_TRAINING equals "1" in the environment; otherwise the names
# `pm` and `az` are bound to None so they still exist at module level.
if os.getenv("UNIT_TESTING") != "1" and os.getenv("SKIP_BACKGROUND_TRAINING") != "1":
    import pymc as pm
    import arviz as az
else:
    pm = None
    az = None

# ── MLflow configuration (runs at import time) ───────────────────────────────
# Default the tracking and registry URIs to a local file store. `setdefault`
# leaves any value that is already present in the environment untouched.
os.environ.setdefault("MLFLOW_TRACKING_URI", "file:./mlruns_local")
os.environ.setdefault("MLFLOW_REGISTRY_URI", "file:./mlruns_local")

# Point the MLflow client at the local file store immediately.
# NOTE(review): this call is unconditional, so it presumably wins over a
# MLFLOW_TRACKING_URI provided through the environment – confirm that is intended.
mlflow.set_tracking_uri("file:./mlruns_local")
# ──────────────────────────────────────────────────────────────────────────────

# Experiment name passed to mlflow.set_experiment() below.
MLFLOW_EXPERIMENT = "ml_fullstack_models"

# Select the experiment unless UNIT_TESTING == "1". Any failure is reduced to a
# warning so that importing this module does not raise.
# NOTE(review): the warning goes through the root `logging` module rather than
# the module-level `logger` defined above.
if os.getenv("UNIT_TESTING") != "1":
    try:
        mlflow.set_experiment(MLFLOW_EXPERIMENT)
    except Exception as e:
        logging.warning(f"Could not set MLflow experiment: {e}")

# -----------------------------------------------------------------------------
#  IRIS – point-estimate Random-Forest (enhanced with better parameters)
# -----------------------------------------------------------------------------
def train_iris_random_forest(
    n_estimators: int = 300,
    max_depth: int | None = None,
    random_state: int = 42
) -> str:
    """
    Fit a Random-Forest classifier on the Iris data, then log and register it.

    The data is split 75/25 with stratification on the target. Accuracy and
    macro-averaged F1 / precision / recall on the held-out part are logged to
    an MLflow run named "iris_random_forest", together with the three
    hyperparameters. The fitted forest is wrapped in a pyfunc model whose
    ``predict`` returns class probabilities and is registered under the name
    "iris_random_forest".

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth (None leaves it unlimited).
        random_state: Seed used for both the split and the forest.

    Returns:
        The run_id (string) of the MLflow run that holds the model.
    """
    dataset = load_iris(as_frame=True)
    X, y = dataset.data, dataset.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=random_state
    )

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,                # all available cores
        class_weight='balanced',  # reweight classes by inverse frequency
    )
    forest.fit(X_train, y_train)

    # Hold-out evaluation.
    y_hat = forest.predict(X_test)
    metrics = {
        "accuracy":  accuracy_score(y_test, y_hat),
        "f1_macro":  f1_score(y_test, y_hat, average="macro"),
        "precision_macro": precision_score(y_test, y_hat, average="macro"),
        "recall_macro":    recall_score(y_test, y_hat, average="macro"),
    }
    hyperparams = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state
    }

    class IrisRFWrapper(mlflow.pyfunc.PythonModel):
        """pyfunc adapter exposing probabilities and class predictions."""

        def __init__(self, model):
            self.model = model

        def predict(self, model_input, params=None):
            # pyfunc entry point: returns class probabilities. Inputs that
            # expose `.values` (e.g. DataFrames) are passed on as that attribute.
            features = getattr(model_input, 'values', model_input)
            return self.model.predict_proba(features)

        def predict_proba(self, X):
            # Direct access to the class probabilities.
            return self.model.predict_proba(getattr(X, 'values', X))

        def predict_classes(self, X):
            # Direct access to the predicted class indices.
            return self.model.predict(getattr(X, 'values', X))

    with mlflow.start_run(run_name="iris_random_forest") as run:
        mlflow.log_params(hyperparams)
        mlflow.log_metrics(metrics)

        iris_wrapper = IrisRFWrapper(forest)

        # The signature is inferred from the full feature frame and the
        # wrapper's own probability output.
        model_signature = mlflow.models.signature.infer_signature(
            X, iris_wrapper.predict(X)
        )
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=iris_wrapper,
            registered_model_name="iris_random_forest",
            input_example=X.head(),
            signature=model_signature
        )
        return run.info.run_id

# -----------------------------------------------------------------------------
#  BREAST-CANCER STUB – ultra-fast fallback model
# -----------------------------------------------------------------------------
def train_breast_cancer_stub(random_state: int = 42) -> str:
    """
    Fast fallback model for the breast-cancer endpoint.

    Trains a standardised LogisticRegression so the API can answer
    probability queries while the PyMC model is still being built, then logs
    it to an MLflow run named "breast_cancer_stub" and registers it under the
    same name.

    Args:
        random_state: Seed for the stratified 70/30 train/test split.

    Returns:
        The run_id (string) of the MLflow run that holds the model.
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    import mlflow
    import mlflow.sklearn  # imported explicitly; `import mlflow` alone does not guarantee the flavour

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3,
                                          stratify=y, random_state=random_state)

    # The raw features span several orders of magnitude, on which the solver
    # typically runs into the 200-iteration cap; standardising first avoids that.
    clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=200)).fit(Xtr, ytr)
    acc = accuracy_score(yte, clf.predict(Xte))

    with mlflow.start_run(run_name="breast_cancer_stub") as run:
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(
            clf, "model",
            registered_model_name="breast_cancer_stub",
            input_example=X.head(),
            # The service loads this model through mlflow.pyfunc, where only
            # `predict` is available; make it return class probabilities
            # (columns follow the sklearn target encoding) instead of labels.
            pyfunc_predict_fn="predict_proba",
        )
        return run.info.run_id

# -----------------------------------------------------------------------------
#  BREAST-CANCER – hierarchical Bayesian logistic regression
# -----------------------------------------------------------------------------
class _HierBayesLogReg(mlflow.pyfunc.PythonModel):
    """
    Hierarchical Bayesian Logistic Regression wrapper for MLflow serving.
    Implements varying intercepts by "mean texture" quintiles with global slopes.

    Predictions use the posterior medians of the parameters "α_group"
    (one intercept per group) and "β" (one slope per feature).
    """

    def __init__(self, trace, scaler, group_edges, feature_names):
        self.trace = trace                # ArviZ InferenceData for posterior samples
        self.scaler = scaler              # sklearn StandardScaler for feature normalization
        self.group_edges = group_edges    # interior bin edges (the trainer in this file passes edges[1:-1])
        self.feature_names = feature_names # column names for proper DataFrame handling

    def _group_index(self, X_df):
        """
        Map each row to its texture group index in [0, 4].

        The trainer in this file builds the groups with ``pd.qcut``, whose
        bins are closed on the right. ``right=True`` applies the same rule
        here, so a value that sits exactly on an edge lands in the same group
        at prediction time as it did at training time.
        """
        tex = X_df["mean texture"].to_numpy()
        return np.clip(np.digitize(tex, self.group_edges, right=True), 0, 4)

    def predict(self, model_input, params=None):
        """
        MLflow-required prediction method.

        Args:
            model_input: pandas.DataFrame with breast cancer features, or an
                array-like whose columns follow ``feature_names``
            params: Optional parameters (unused)

        Returns:
            np.array: Probability in [0, 1] that the target equals 1, for each
            sample. NOTE(review): the trainer in this file fits on the sklearn
            breast-cancer target, where 1 is documented as "benign" – so this
            is presumably P(benign), not P(malignant).
        """
        # Ensure we have a DataFrame with proper column names
        if isinstance(model_input, pd.DataFrame):
            X_df = model_input
        else:
            X_df = pd.DataFrame(model_input, columns=self.feature_names)

        # Standardize features using training scaler
        Xs = self.scaler.transform(X_df)

        # Get group indices for hierarchical structure
        g = self._group_index(X_df)

        # Extract posterior medians for prediction
        α = self.trace.posterior["α_group"].median(("chain", "draw")).values
        β = self.trace.posterior["β"].median(("chain", "draw")).values

        # Compute predictions: logit = α_group[g] + X @ β
        logits = α[g] + np.dot(Xs, β)

        # Convert to probabilities
        return 1 / (1 + np.exp(-logits))

def train_breast_cancer_bayes(
    draws: int = 800,
    tune: int = 400,
    target_accept: float = 0.90,
) -> str:
    """
    Train, log and register a hierarchical Bayesian logistic-regression model.

    The model has one intercept per "mean texture" quintile and a shared slope
    vector over the standardised features. Before sampling, a C/C++ compiler
    is located and PyTensor is configured to use it; if that fails a
    RuntimeError is raised so the caller can fall back to the stub model.

    Args:
        draws: Posterior draws per chain.
        tune: Tuning steps per chain.
        target_accept: Target acceptance rate passed to the sampler.

    Returns:
        The run_id (string) of the MLflow run that holds the model.

    Raises:
        RuntimeError: No usable compiler was found or configured.
    """
    # ------------------------------------------------------------------ compiler
    from app.ml.utils import find_compiler, configure_pytensor_compiler

    cxx = find_compiler()
    if cxx is None or not configure_pytensor_compiler(cxx):
        msg = (
            "No compatible C/C++ compiler (or mis-configuration) – "
            "skipping Bayesian build."
        )
        logger.warning("⚠️ %s", msg)
        raise RuntimeError(msg)

    # ------------------------------------------------------------ sampler choice
    # If we *still* end up on MSVC, sample with NumPyro to avoid C-thunks.
    # Everywhere else use PyMC's own NUTS: `pm.sample(nuts_sampler=...)`
    # accepts "pymc", "nutpie", "numpyro" or "blackjax" – "nuts" is not a
    # valid value and makes sampling fail.
    nuts_backend = (
        "numpyro"
        if platform.system() == "Windows" and "cl" in os.path.basename(cxx).lower()
        else "pymc"
    )

    # ------------------------------------------------------------------ modelling
    # Imported locally: the module-level `pm` is None when the heavy imports
    # were skipped at import time.
    import pymc as pm

    X_df, y = load_breast_cancer(as_frame=True, return_X_y=True)
    # Quintile index per row (0..4) plus the six bin edges.
    quint, edges = pd.qcut(X_df["mean texture"], 5, labels=False, retbins=True)
    g = quint.astype("int64").to_numpy()
    scaler = StandardScaler().fit(X_df)
    Xs = scaler.transform(X_df)

    logger.info("🧠 Building hierarchical Bayesian model (backend=%s)…", nuts_backend)
    with pm.Model() as mdl:
        α_group = pm.Normal("α_group", mu=0, sigma=1, shape=5)
        β = pm.Normal("β", mu=0, sigma=1, shape=Xs.shape[1])
        logits = α_group[g] + pm.math.dot(Xs, β)
        pm.Bernoulli("obs", logit_p=logits, observed=y)
        trace = pm.sample(
            draws=draws, tune=tune,
            target_accept=target_accept,
            nuts_sampler=nuts_backend,
            progressbar=False,
            chains=2, random_seed=123,
        )

    # ------------------------------------------------------------------ wrapping
    class _HierBayesLogReg(mlflow.pyfunc.PythonModel):
        """
        pyfunc wrapper predicting with posterior medians.

        Attribute and method names (`trace`, `scaler`, `group_edges`,
        `feature_names`, `_group_index`) match what the model service reads
        from the unwrapped python model when it computes credible intervals.
        """

        def __init__(self, trc, sc, edge, cols):
            self.trace, self.scaler = trc, sc
            self.group_edges, self.feature_names = edge, cols

        def _group_index(self, df):
            # Replicate the training quintiles. pd.qcut bins are closed on the
            # right, hence right=True so edge values get the same group.
            tex = df["mean texture"].to_numpy()
            return np.clip(np.digitize(tex, self.group_edges, right=True), 0, 4)

        def predict(self, X, params=None):
            # Returns P(target == 1) per row, using posterior medians.
            df = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X, columns=self.feature_names)
            Xs = self.scaler.transform(df)
            g = self._group_index(df)
            αg = self.trace.posterior["α_group"].median(("chain", "draw")).values
            β = self.trace.posterior["β"].median(("chain", "draw")).values
            lg = αg[g] + np.dot(Xs, β)
            return 1 / (1 + np.exp(-lg))

    # Only the four interior edges are needed to assign groups 0..4.
    wrapper = _HierBayesLogReg(trace, scaler, edges[1:-1], X_df.columns.tolist())
    # In-sample accuracy at a 0.5 threshold.
    preds = (wrapper.predict(X_df) > 0.5).astype(int)
    acc = float((preds == y).mean())

    # -------------------------------------------------------------------- MLflow
    with tempfile.TemporaryDirectory() as td, mlflow.start_run(run_name="breast_cancer_bayes") as run:
        # The fitted scaler is also stored as a separate artifact.
        scaler_path = Path(td) / "scaler.pkl"
        with open(scaler_path, "wb") as fh:
            pickle.dump(scaler, fh)

        mlflow.log_params({"draws": draws, "tune": tune, "target_accept": target_accept})
        mlflow.log_metric("accuracy", acc)
        mlflow.pyfunc.log_model(
            "model", python_model=wrapper,
            artifacts={"scaler": str(scaler_path)},
            registered_model_name="breast_cancer_bayes",
            input_example=X_df.head(),
            signature=mlflow.models.signature.infer_signature(X_df, wrapper.predict(X_df)),
        )
        logger.info("📦 Bayesian model logged – run_id=%s  acc=%.3f", run.info.run_id, acc)
        return run.info.run_id


Overwriting api/app/ml/builtin_trainers.py


In [106]:
%%writefile api/app/services/ml/model_service.py
"""
Model service – self-healing startup with background training.
"""

from __future__ import annotations
import asyncio, logging, os, time, socket
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Any, List, Tuple, Optional

import mlflow, pandas as pd, numpy as np
from mlflow.tracking import MlflowClient
from mlflow.exceptions import MlflowException

from app.core.config import settings
from app.ml.builtin_trainers import (
    train_iris_random_forest,
    train_breast_cancer_bayes,
    train_breast_cancer_stub,
)

logger = logging.getLogger(__name__)

# Trainer mapping for self-healing
TRAINERS = {
    "iris_random_forest": train_iris_random_forest,
    "breast_cancer_bayes": train_breast_cancer_bayes,
    "breast_cancer_stub": train_breast_cancer_stub,
}

class ModelService:
    """
    Self-healing model service that loads existing models and schedules
    background training for missing ones.
    """

    _EXECUTOR = ThreadPoolExecutor(max_workers=2)

    def __init__(self) -> None:
        self._unit_test_mode = settings.UNIT_TESTING
        self.initialized = False

        # 🚫 Heavy clients only when NOT unit-testing
        self.client = None if self._unit_test_mode else None  # Will be set in initialize()
        self.mlflow_client = None

        self.models: Dict[str, Any] = {}
        self.status: Dict[str, str] = {
            "iris_random_forest": "missing",
            "breast_cancer_bayes": "missing",
            "breast_cancer_stub": "missing",
        }

    async def initialize(self) -> None:
        """
        Connect to MLflow – fall back to local file store if the configured
        tracking URI is unreachable *or* the client is missing critical methods
        (e.g. when mlflow-skinny accidentally shadows the full package).
        """
        if self.initialized:
            return

        def _needs_fallback(client) -> bool:
            # any missing attr is a strong signal we are on mlflow-skinny
            return not callable(getattr(client, "list_experiments", None))

        try:
            mlflow.set_tracking_uri(settings.MLFLOW_TRACKING_URI)
            self.mlflow_client = MlflowClient(settings.MLFLOW_TRACKING_URI)

            if _needs_fallback(self.mlflow_client):
                raise AttributeError("list_experiments not implemented – skinny build detected")

            # minimal probe (cheap & always present)
            self.mlflow_client.search_experiments(max_results=1)
            logger.info("🟢  Connected to MLflow @ %s", settings.MLFLOW_TRACKING_URI)

        except (MlflowException, socket.gaierror, AttributeError) as exc:
            logger.warning("🔄  Falling back to local MLflow store – %s", exc)
            mlflow.set_tracking_uri("file:./mlruns_local")
            self.mlflow_client = MlflowClient("file:./mlruns_local")
            logger.info("📂  Using local file store ./mlruns_local")

        await self._load_models()
        self.initialized = True

    async def _load_models(self) -> None:
        """Load existing models from MLflow."""
        await self._try_load("iris_random_forest")
        await self._try_load("breast_cancer_bayes")
        await self._try_load("breast_cancer_stub")

    async def startup(self, auto_train: bool | None = None) -> None:
        """
        Faster: serve stub immediately; heavy Bayesian job in background.
        """
        if self._unit_test_mode:
            logger.info("🔒 UNIT_TESTING=1 – skipping model loading")
            return                      # 👉 nothing else runs

        # Initialize MLflow connection first
        await self.initialize()

        if settings.SKIP_BACKGROUND_TRAINING:
            logger.warning("⏩ SKIP_BACKGROUND_TRAINING=1 – models will load on-demand")
            # We still *try* to load existing artefacts so prod works
            await self._try_load("iris_random_forest")
            await self._try_load("breast_cancer_bayes")
            return

        auto = auto_train if auto_train is not None else settings.AUTO_TRAIN_MISSING
        logger.info("🔄 Model-service startup (auto_train=%s)", auto)

        # 1️⃣ try to load whatever already exists
        await self._try_load("iris_random_forest")

        # 2️⃣ Load bayes – if exists we're done
        if not await self._try_load("breast_cancer_bayes"):
            # 3️⃣ Ensure stub is *synchronously* available
            if not await self._try_load("breast_cancer_stub"):
                logger.info("Training stub cancer model …")
                await asyncio.get_running_loop().run_in_executor(
                    self._EXECUTOR, train_breast_cancer_stub
                )
                await self._try_load("breast_cancer_stub")

            # 4️⃣ Fire full PyMC build in background unless disabled
            if not settings.SKIP_BACKGROUND_TRAINING:
                logger.info("Scheduling full Bayesian retrain in background")
                asyncio.create_task(
                    self._train_and_reload("breast_cancer_bayes", train_breast_cancer_bayes)
                )

        # 5️⃣ Train iris if missing
        if not await self._try_load("iris_random_forest"):
            logger.info("Training iris model …")
            await asyncio.get_running_loop().run_in_executor(
                self._EXECUTOR, train_iris_random_forest
            )
            await self._try_load("iris_random_forest")

    async def _try_load(self, name: str) -> None:
        """Try to load a model and update status."""
        model = await self._load_production_model(name)
        if model:
            self.models[name] = model
            self.status[name] = "loaded"
            logger.info("✅ %s loaded", name)
            return True
        return False

    async def _train_and_reload(self, name: str, trainer) -> None:
        """Train a model in background and reload it, with verbose phase logs."""
        try:
            t0 = time.perf_counter()
            logger.info("🏗️  BEGIN training %s", name)
            self.status[name] = "training"

            loop = asyncio.get_running_loop()
            await loop.run_in_executor(self._EXECUTOR, trainer)

            logger.info("📦 Training %s complete in %.1fs – re-loading", name,
                        time.perf_counter() - t0)
            model = await self._load_production_model(name)
            if not model:
                raise RuntimeError(f"{name} trained but could not be re-loaded")

            self.models[name] = model
            self.status[name] = "loaded"
            logger.info("✅ %s trained & loaded", name)

        except Exception as exc:
            self.status[name] = "failed"
            logger.error("❌ %s failed: %s", name, exc, exc_info=True)  # ← keeps trace
            # NEW: persist last_error for UI / debug endpoint
            self.status[f"{name}_last_error"] = str(exc)

    async def _load_production_model(self, name: str) -> Optional[Any]:
        """
        1. Registry 'Production' stage → load.  
        2. Otherwise most recent run with runName == name.
        Returns None if not found.
        """
        try:
            versions = self.mlflow_client.search_model_versions(f"name='{name}'")
            prod = [v for v in versions if v.current_stage == "Production"]
            if prod:
                uri = f"models:/{name}/{prod[0].version}"
                logger.info("↪︎  Loading %s from registry:%s", name, prod[0].version)
                return mlflow.pyfunc.load_model(uri)
        except MlflowException:
            pass

        # Fallback – scan experiments for latest run
        runs = []
        for exp in self.mlflow_client.search_experiments():
            runs.extend(self.mlflow_client.search_runs(
                [exp.experiment_id],
                f"tags.mlflow.runName = '{name}'",
                order_by=["attributes.start_time DESC"],
                max_results=1))
        if runs:
            uri = f"runs:/{runs[0].info.run_id}/model"
            logger.info("↪︎  Loading %s from latest run:%s", name, runs[0].info.run_id)
            return mlflow.pyfunc.load_model(uri)
        return None

    # Manual training endpoints (for UI)
    async def train_iris(self) -> None:
        await self._train_and_reload("iris_random_forest", TRAINERS["iris_random_forest"])

    async def train_cancer(self) -> None:
        await self._train_and_reload("breast_cancer_bayes", TRAINERS["breast_cancer_bayes"])

    # Predict methods (unchanged from your previous version)
    async def predict_iris(
        self,
        features: List[Dict[str, float]],
        model_type: str = "rf",
    ) -> Tuple[List[str], List[List[float]]]:
        """
        Predict Iris species from measurements.

        Args:
            features: List of iris measurements as dictionaries
            model_type: Model type to use (only 'rf' supported)

        Returns:
            Tuple of (predicted_class_names, class_probabilities)
        """
        if model_type != "rf":
            raise ValueError("Only 'rf' supported for iris")

        model = self.models.get("iris_random_forest")
        if not model:
            raise RuntimeError("Iris model not loaded")

        # Convert to DataFrame with proper column names (matching training data)
        X_df = pd.DataFrame([{
            "sepal length (cm)": sample["sepal_length"],
            "sepal width (cm)": sample["sepal_width"], 
            "petal length (cm)": sample["petal_length"],
            "petal width (cm)": sample["petal_width"]
        } for sample in features])

        # The iris model wrapper returns probabilities via predict() method
        probs = model.predict(X_df)                  # shape (n, 3) - probabilities
        preds = probs.argmax(axis=1)                 # numerical class indices

        # Map numerical classes to species names
        class_names = ["setosa", "versicolor", "virginica"]
        pred_names = [class_names[i] for i in preds]

        return pred_names, probs.tolist()

    async def predict_cancer(
        self,
        features: List[Dict[str, float]],
        model_type: str = "bayes",
        posterior_samples: Optional[int] = None,
    ) -> Tuple[List[str], List[float], Optional[List[Tuple[float, float]]]]:
        """
        Predict breast cancer diagnosis from features using hierarchical Bayesian model.
        Falls back to stub model if Bayesian model is not available.

        Args:
            features: List of cancer measurements as dictionaries
            model_type: Model type to use ('bayes' or 'stub')
            posterior_samples: Number of posterior samples for uncertainty (Bayesian only)

        Returns:
            Tuple of (predicted_labels, probabilities, uncertainty_intervals)
        """
        # Determine which model to use
        if model_type == "bayes":
            model = self.models.get("breast_cancer_bayes")
            if not model:
                # Fall back to stub model
                model = self.models.get("breast_cancer_stub")
                if not model:
                    raise RuntimeError("No cancer model available")
                logger.info("Using stub cancer model (Bayesian model not ready)")
        elif model_type == "stub":
            model = self.models.get("breast_cancer_stub")
            if not model:
                raise RuntimeError("Stub cancer model not loaded")
        else:
            raise ValueError("model_type must be 'bayes' or 'stub'")

        # Convert to DataFrame with proper column names
        X_df = pd.DataFrame(features)

        # Get predictions
        if model_type == "bayes" and "breast_cancer_bayes" in self.models:
            # Use Bayesian model with uncertainty
            probs = model.predict(X_df)
            labels = ["malignant" if p > 0.5 else "benign" for p in probs]
        else:
            # Use stub model (sklearn LogisticRegression)
            probs = model.predict_proba(X_df)[:, 1]  # Probability of malignant
            labels = ["malignant" if p > 0.5 else "benign" for p in probs]

        # Compute uncertainty intervals if requested (Bayesian model only)
        ci = None
        if posterior_samples and model_type == "bayes" and "breast_cancer_bayes" in self.models:
            try:
                # Access the underlying python model to get the trace
                python_model = model.unwrap_python_model()

                # Access posterior samples for uncertainty quantification
                draws = python_model.trace.posterior
                αg = draws["α_group"].stack(samples=("chain", "draw"))
                β = draws["β"].stack(samples=("chain", "draw"))

                # Get group indices and standardized features
                g = python_model._group_index(X_df)
                Xs = python_model.scaler.transform(X_df)

                # Compute posterior predictive samples
                logits = αg.values[:, g] + np.dot(β.values.T, Xs.T)      # shape (S, N)
                pp = 1 / (1 + np.exp(-logits))

                # Compute 95% credible intervals
                lo, hi = np.percentile(pp, [2.5, 97.5], axis=0)
                ci = list(zip(lo.tolist(), hi.tolist()))

            except Exception as e:
                logger.warning(f"Failed to compute uncertainty intervals: {e}")
                ci = None

        return labels, probs.tolist(), ci


# Global singleton
model_service = ModelService()







Overwriting api/app/services/ml/model_service.py


In [107]:
%%writefile api/app/main.py
import logging
import os
import asyncio
from fastapi import FastAPI, Request, Depends, BackgroundTasks, status, HTTPException
from fastapi.security import OAuth2PasswordRequestForm
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.exc import SQLAlchemyError
import time

from pydantic import BaseModel

from .db import lifespan, get_db, get_app_ready
from .security import create_access_token, get_current_user, verify_password
from .crud import get_user_by_username
from .schemas.iris import IrisPredictRequest, IrisPredictResponse, IrisFeatures
from .schemas.cancer import CancerPredictRequest, CancerPredictResponse, CancerFeatures
from .services.ml.model_service import model_service
from .core.config import settings

# ── Guarantee the log directory exists ────────────────────────────
# Created relative to the current working directory; no error if it is
# already there.
# NOTE(review): nothing in this part of the file writes into "logs" –
# presumably a file handler is configured elsewhere; confirm.
os.makedirs("logs", exist_ok=True)
# ──────────────────────────────────────────────────────────────────

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pydantic models
# Innermost payload model: carries a single integer field, `count`.
# (Documented with comments rather than a class docstring so that the
# generated schema text stays unchanged.)
class Payload(BaseModel):
    count: int

# Request envelope that wraps a `Payload` under the key `data`.
class PredictionRequest(BaseModel):
    data: Payload

# Response model: a prediction string, a float confidence, and the payload
# that was received.
class PredictionResponse(BaseModel):
    prediction: str
    confidence: float
    input_received: Payload  # Echo back the input for verification

# Response model of the token endpoint: the issued access token and its type
# (the login handler in this file always sets the type to "bearer").
class Token(BaseModel):
    access_token: str
    token_type: str

# Application object. The interactive docs, ReDoc and the OpenAPI schema are
# all served under the /api/v1 prefix, matching the route paths below.
app = FastAPI(
    title="FastAPI + React ML App",
    version="1.0.0",
    docs_url="/api/v1/docs",
    redoc_url="/api/v1/redoc",
    openapi_url="/api/v1/openapi.json",
    # Swagger UI option; by its name it keeps the entered authorization
    # across page reloads.
    swagger_ui_parameters={"persistAuthorization": True},
    lifespan=lifespan,  # register startup/shutdown events (imported from .db)
)

# Configure CORS with environment-based origins.
# settings.ALLOWED_ORIGINS is either "*" or a comma-separated list of origins.
# Blank entries are dropped; if nothing usable remains, every origin is
# allowed, which is what the previously hard-coded value did.
origins_env = settings.ALLOWED_ORIGINS
origins: list[str] = (
    ["*"]
    if origins_env.strip() == "*"
    else [o.strip() for o in origins_env.split(",") if o.strip()] or ["*"]
)

# NOTE(review): browsers refuse credentialed requests when the allowed origin
# is the wildcard, so production deployments should configure explicit origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # the parsed setting was previously computed but never used
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    """Time the downstream handling of a request and report it in X-Process-Time."""
    started_at = time.perf_counter()
    response = await call_next(request)
    # Elapsed seconds, formatted with four decimal places.
    response.headers["X-Process-Time"] = f"{time.perf_counter() - started_at:.4f}"
    return response

# Health check endpoint
@app.get("/api/v1/health")
async def health_check():
    """Basic health check - always returns 200 if server is running."""
    # Liveness only: no dependency is consulted, just the current wall-clock time.
    now = time.time()
    return {"status": "healthy", "timestamp": now}

@app.get("/api/v1/hello")
async def hello(current_user: str = Depends(get_current_user)):
    """Simple endpoint for token validation."""
    # Reaching this point means the get_current_user dependency resolved.
    greeting = f"Hello {current_user}!"
    return {"message": greeting, "status": "authenticated"}

@app.get("/api/v1/ready")
async def ready():
    """Basic readiness check."""
    # Reports the flag exposed by the db module's get_app_ready().
    is_ready = get_app_ready()
    return {"ready": is_ready}

@app.post("/api/v1/token", response_model=Token)
async def login(
    form_data: OAuth2PasswordRequestForm = Depends(),
    db: AsyncSession = Depends(get_db),
):
    """Authenticate user and issue JWT."""
    # Refuse logins until the application reports ready; clients are told
    # when to retry.
    if not get_app_ready():
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Backend still loading models. Try again in a moment.",
            headers={"Retry-After": "10"}
        )

    user = await get_user_by_username(db, form_data.username)
    # Unknown user and wrong password produce the same response.
    if not user or not verify_password(form_data.password, user.hashed_password):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid credentials",
            # A 401 response is required to name the expected auth scheme.
            headers={"WWW-Authenticate": "Bearer"},
        )
    token = create_access_token(subject=user.username)
    return Token(access_token=token, token_type="bearer")

@app.get("/api/v1/ready/full")
async def ready_full() -> dict:
    """
    Extended readiness probe:
    - ready: API server is ready to accept requests (login allowed)
    - model_status: dict of {model_name: status} where status is 'loaded'|'training'|'failed'|'missing'
    - all_models_loaded: true if all models are in 'loaded' state
    """
    # Allow login if API is ready, regardless of model status
    ready_for_login = get_app_ready()

    # The models the application needs; the stub is only a stand-in for the
    # Bayesian model and is therefore not required here.
    expected = {"iris_random_forest", "breast_cancer_bayes"}
    loaded = set(model_service.models.keys())
    status_by_model = model_service.status

    # Only entries whose status really is "training". The status dict may also
    # hold "<name>_last_error" entries and models that are missing or failed.
    training = sorted(
        name for name, state in status_by_model.items() if state == "training"
    )

    response = {
        "ready": ready_for_login,  # Allow login immediately
        "model_status": status_by_model,
        # Judged on the expected models only, so an unused stub or a stored
        # error message cannot keep this False forever.
        "all_models_loaded": all(status_by_model.get(m) == "loaded" for m in expected),
        "models": {m: (m in loaded) for m in expected},
        "training": training
    }

    logger.debug("READY endpoint – _app_ready=%s, response=%s", get_app_ready(), response)
    return response

# ----- on-demand training endpoints ----------------------------------
@app.post("/api/v1/iris/train", status_code=202)
async def train_iris(background_tasks: BackgroundTasks,
                     current_user: str = Depends(get_current_user)):
    # Authenticated trigger: the retrain is queued as a background task and
    # the request is acknowledged with 202 straight away.
    background_tasks.add_task(model_service.train_iris)
    acknowledgement = {"status": "started"}
    return acknowledgement

@app.post("/api/v1/cancer/train", status_code=202)
async def train_cancer(background_tasks: BackgroundTasks,
                       current_user: str = Depends(get_current_user)):
    # Authenticated trigger: the Bayesian retrain is queued as a background
    # task and the request is acknowledged with 202 straight away.
    background_tasks.add_task(model_service.train_cancer)
    acknowledgement = {"status": "started"}
    return acknowledgement

@app.get("/api/v1/iris/ready")
async def iris_ready():
    """Check if Iris model is loaded and ready."""
    # "Loaded" means the model service holds an entry for the model name.
    is_loaded = "iris_random_forest" in model_service.models
    return {"loaded": is_loaded}

@app.get("/api/v1/cancer/ready")
async def cancer_ready():
    """Check if Cancer model is loaded and ready."""
    # Only the Bayesian model counts here; a loaded stub does not.
    is_loaded = "breast_cancer_bayes" in model_service.models
    return {"loaded": is_loaded}

@app.post(
    "/api/v1/iris/predict",
    response_model=IrisPredictResponse,
    status_code=status.HTTP_200_OK
)
async def predict_iris(
    request: IrisPredictRequest,
    background_tasks: BackgroundTasks,
    current_user: str = Depends(get_current_user),
):
    """
    Predict iris species from measurements.

    Example request:
        {
            "model_type": "rf",
            "samples": [
                {
                    "sepal_length": 5.1,
                    "sepal_width": 3.5,
                    "petal_length": 1.4,
                    "petal_width": 0.2
                }
            ]
        }
    """
    logger.info(f"User {current_user} called /iris/predict with {len(request.samples)} samples")
    logger.debug(f"→ Iris payload: {request.samples}")

    # The model service only implements 'rf'. Reject anything else as a client
    # error here; otherwise the readiness check below is skipped and the
    # service's ValueError would surface as a 500.
    if request.model_type != "rf":
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Only 'rf' supported for iris",
        )

    # Check if iris model is ready
    if "iris_random_forest" not in model_service.models:
        logger.warning("Iris model not ready - returning 503")
        raise HTTPException(
            status_code=503,
            detail="Iris model is still loading. Please try again in a few seconds.",
            headers={"Retry-After": "30"}
        )

    # Convert Pydantic models to dicts
    features = [sample.dict() for sample in request.samples]
    logger.debug(f"→ Iris features: {features}")

    # Get predictions
    predictions, probabilities = await model_service.predict_iris(
        features=features,
        model_type=request.model_type
    )
    logger.debug(f"← Iris predictions: {predictions}")
    logger.debug(f"← Iris probabilities: {probabilities}")

    result = {
        "predictions": predictions,
        "probabilities": probabilities,
        "input_received": request.samples
    }

    # Background task for audit logging (the message is formatted now,
    # emitted once the task runs)
    background_tasks.add_task(
        logger.info,
        f"[audit] user={current_user} endpoint=iris input={request.samples} output={predictions}"
    )

    return IrisPredictResponse(**result)

@app.post(
    "/api/v1/cancer/predict",
    response_model=CancerPredictResponse,
    status_code=status.HTTP_200_OK,
)
async def predict_cancer(
    request: CancerPredictRequest,
    background_tasks: BackgroundTasks,
    current_user: str = Depends(get_current_user),
):
    """
    Predict breast cancer diagnosis from features.

    Answers 503 with a ``Retry-After: 30`` header when ``model_type`` is
    ``"bayes"`` and ``breast_cancer_bayes`` is not yet in
    ``model_service.models``. An audit line is logged through a background
    task once the response is being sent.

    Example request:
        {
            "model_type": "bayes",
            "samples": [
                {
                    "mean_radius": 17.99,
                    "mean_texture": 10.38,
                    ...
                }
            ],
            "posterior_samples": 1000  # optional
        }
    """
    samples = request.samples
    logger.info(f"User {current_user} called /cancer/predict with {len(samples)} samples")
    logger.debug(f"→ Cancer payload: {samples}")

    # Readiness gate: only the Bayesian variant is checked here.
    if request.model_type == "bayes":
        if "breast_cancer_bayes" not in model_service.models:
            logger.warning("Cancer model not ready - returning 503")
            raise HTTPException(
                status_code=503,
                detail="Cancer model is still loading. Please try again in a few seconds.",
                headers={"Retry-After": "30"},
            )

    # The model service is fed plain dicts, one per sample.
    feature_rows = []
    for sample in samples:
        feature_rows.append(sample.dict())
    logger.debug(f"→ Cancer features: {feature_rows}")

    predictions, probabilities, uncertainties = await model_service.predict_cancer(
        features=feature_rows,
        model_type=request.model_type,
        posterior_samples=request.posterior_samples,
    )
    logger.debug(f"← Cancer predictions: {predictions}")
    logger.debug(f"← Cancer probabilities: {probabilities}")
    logger.debug(f"← Cancer uncertainties: {uncertainties}")

    # The audit line is formatted now and emitted by the background task.
    audit_line = f"[audit] user={current_user} endpoint=cancer input={samples} output={predictions}"
    background_tasks.add_task(logger.info, audit_line)

    return CancerPredictResponse(
        predictions=predictions,
        probabilities=probabilities,
        uncertainties=uncertainties,
        input_received=samples,
    )

@app.get("/api/v1/debug/ready")
async def debug_ready():
    """
    Debug endpoint exposing the app-ready flag and the model service state.

    The ``errors`` entry is the subset of ``model_service.status`` whose keys
    end in ``_last_error``.
    """
    snapshot = {"app_ready": get_app_ready()}
    snapshot["model_service_initialized"] = model_service.initialized
    snapshot["models"] = list(model_service.models.keys())
    snapshot["status"] = model_service.status

    last_errors = {}
    for key, value in model_service.status.items():
        if key.endswith("_last_error"):
            last_errors[key] = value
    snapshot["errors"] = last_errors

    return snapshot

@app.get("/api/v1/test/401")
async def test_401():
    """Always answer 401 so clients can exercise their session-expiry handling."""
    unauthorized = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Test 401 response",
    )
    raise unauthorized


Overwriting api/app/main.py


# Tests

In [108]:
%%writefile api/scripts/ensure_models.py
#!/usr/bin/env python3
"""
Ensure models script - pre-trains all models before starting the API.
This can be used in development or CI to ensure models are ready.
"""

import asyncio
import logging
import sys
from pathlib import Path

# Add the api directory to the Python path
sys.path.insert(0, str(Path(__file__).parent.parent))

from app.services.ml.model_service import TRAINERS, ModelService

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

async def main(max_wait: float = 300.0, poll_interval: float = 5.0):
    """
    Ensure all models are trained and loaded.

    Creates a ``ModelService``, starts it with ``auto_train=True`` and polls
    until it holds as many models as there are entries in ``TRAINERS``.

    Args:
        max_wait: Maximum number of seconds to wait (default: 5 minutes).
        poll_interval: Seconds to sleep between two checks.

    Returns:
        True once all models are loaded; False if any status entry equals
        ``"failed"`` or the deadline passes first.
    """
    logger.info("🚀 Starting model ensure script...")

    svc = ModelService()

    # Start the self-healing process
    await svc.startup(auto_train=True)

    # Inside a coroutine the running loop is the right clock source;
    # asyncio.get_event_loop() is meant for synchronous code.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + max_wait

    while len(svc.models) < len(TRAINERS):
        # Check failures first so a failed model is reported as a failure
        # rather than being masked by the timeout message.
        failed = [name for name, state in svc.status.items() if state == "failed"]
        if failed:
            logger.error(f"❌ Models failed to train: {failed}")
            return False

        if loop.time() > deadline:
            logger.error("❌ Timeout waiting for models to load")
            return False

        logger.info(f"⏳ Waiting for models... ({len(svc.models)}/{len(TRAINERS)} loaded)")
        await asyncio.sleep(poll_interval)

    logger.info("✅ All models loaded successfully!")
    return True

if __name__ == "__main__":
    # Script entry point: exit status is 0 only when main() reports success.
    try:
        success = asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("⏹️  Interrupted by user")
        sys.exit(1)
    except Exception as e:
        # logger.exception records the traceback as well; the bare message
        # alone is rarely enough to diagnose a training/startup failure.
        logger.exception(f"❌ Unexpected error: {e}")
        sys.exit(1)
    sys.exit(0 if success else 1)

Overwriting api/scripts/ensure_models.py


In [109]:
%%writefile api/__init__.py
# Importing this package makes sure a "logs" directory exists (relative to the
# current working directory), so it is already there when Uvicorn workers
# import the package.
import os

if not os.path.isdir("logs"):
    os.makedirs("logs", exist_ok=True)

Overwriting api/__init__.py


In [110]:
%%writefile test_self_healing.py
#!/usr/bin/env python3
"""
Test script for the self-healing model system.
This script tests the new startup pattern and status tracking.
"""

import asyncio
import requests
import time
import json
import os
import shutil
import subprocess
import sys
import pathlib
from typing import Dict, Any

def start_backend(startup_timeout: float = 60.0):
    """
    Launch uvicorn in a subprocess, stream its output in real time and
    fail fast if it crashes or if port 8000 is already taken.

    Args:
        startup_timeout: Seconds to wait for ``/api/v1/health`` to answer 200.

    Returns:
        The running ``subprocess.Popen`` handle; the caller must stop it.

    Raises:
        RuntimeError: if port 8000 is already in use, the process exits
            early, or the health endpoint does not answer 200 in time.
            The child process is stopped before any error propagates.
    """
    import socket
    import threading

    # ── quick port-availability probe ──────────────────────────────
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        if sock.connect_ex(("127.0.0.1", 8000)) == 0:
            raise RuntimeError("Port 8000 already in use – aborting tests")

    print("🚀  Spawning backend …")

    project_root = pathlib.Path(__file__).parent

    uvicorn_cmd = [
        sys.executable, "-m", "uvicorn",
        "api.app.main:app",
        "--port", "8000",
        "--env-file", ".env",
        "--log-level", "info",
    ]

    env = os.environ.copy()
    # Ensure the project root is on PYTHONPATH so 'api' is importable.
    # Empty parts are dropped so no trailing separator (an implicit "current
    # directory" entry) is produced when PYTHONPATH was not set.
    env["PYTHONPATH"] = os.pathsep.join(
        part for part in (str(project_root), env.get("PYTHONPATH")) if part
    )

    proc = subprocess.Popen(
        uvicorn_cmd,
        cwd=project_root,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",                # undecodable bytes must not kill the pump thread
        start_new_session=True,          # avoid Zombie children on ^C
        env=env,
    )

    # ── stream output in background thread ─────────────────────────
    def _pump(pipe):
        for ln in iter(pipe.readline, ""):
            print("[backend]", ln.rstrip())

    threading.Thread(target=_pump, args=(proc.stdout,), daemon=True).start()

    try:
        deadline = time.monotonic() + startup_timeout
        while time.monotonic() < deadline:
            if proc.poll() is not None:
                raise RuntimeError("Backend process exited early – see above log")

            try:
                r = requests.get("http://127.0.0.1:8000/api/v1/health", timeout=1)
            except requests.exceptions.RequestException:
                # Covers refused connections AND read timeouts: the socket may
                # already accept while the app is still starting up.
                pass
            else:
                if r.status_code == 200:
                    print("✅ Backend responded to /health")
                    return proc

            time.sleep(1)

        raise RuntimeError(
            f"Backend did not become healthy within {startup_timeout:g} seconds"
        )
    except BaseException:
        # Never leave an orphaned uvicorn behind, whatever went wrong
        # (including Ctrl-C while waiting).
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()
        raise

def cleanup_mlruns():
    """Delete the ``mlruns`` directory (relative to the cwd), if present, to simulate a fresh start."""
    target = "mlruns"
    if not os.path.exists(target):
        return
    print("🧹 Cleaning up mlruns directory...")
    shutil.rmtree(target)
    print("✅ Cleanup complete")

def test_backend_startup():
    """Test that the backend starts up immediately."""
    print("🔍 Testing backend startup...")

    # Wait for backend to be ready
    max_wait = 30
    start_time = time.time()

    while time.time() - start_time < max_wait:
        try:
            # Test basic health
            health_response = requests.get("http://localhost:8000/api/v1/health")
            if health_response.status_code == 200:
                print("✅ Backend health check passed")
                break
        except requests.exceptions.ConnectionError:
            print("⏳ Waiting for backend to start...")
            time.sleep(2)
    else:
        print("❌ Backend failed to start within 30 seconds")
        return False

    # Test readiness endpoint - should be ready immediately
    try:
        ready_response = requests.get("http://localhost:8000/api/v1/ready")
        if ready_response.status_code == 200:
            ready_data = ready_response.json()
            if ready_data.get("ready"):
                print("✅ Backend is ready for requests immediately")
            else:
                print("❌ Backend not ready")
                return False
        else:
            print(f"❌ Readiness endpoint failed: {ready_response.status_code}")
            return False
    except Exception as e:
        print(f"❌ Error testing readiness: {e}")
        return False

    return True

def test_model_status_evolution(max_polls: int = 30,
                                poll_interval: float = 2.0,
                                request_timeout: float = 5.0):
    """
    Test that model status evolves correctly over time.

    Polls ``/api/v1/ready/full`` up to ``max_polls`` times, printing the
    reported state each time, until ``all_models_loaded`` is truthy.

    Args:
        max_polls: Maximum number of polls (30 polls x 2 s = ~60 s by default).
        poll_interval: Seconds to sleep between polls.
        request_timeout: Per-request timeout in seconds.

    Returns:
        True as soon as all models are reported loaded, False otherwise.
    """
    print("\n🔍 Testing model status evolution...")

    for attempt in range(1, max_polls + 1):
        try:
            response = requests.get(
                "http://localhost:8000/api/v1/ready/full", timeout=request_timeout
            )
            if response.status_code == 200:
                status_data = response.json()

                print(f"Poll {attempt}: Ready={status_data.get('ready')}, "
                      f"All loaded={status_data.get('all_models_loaded')}")

                # Show individual model status
                for model, state in status_data.get('model_status', {}).items():
                    print(f"  {model}: {state}")

                # Check if all models are loaded
                if status_data.get('all_models_loaded'):
                    print("✅ All models loaded successfully!")
                    return True
            else:
                print(f"Poll {attempt}: status endpoint returned {response.status_code}")
        except Exception as e:
            print(f"❌ Error polling status: {e}")

        # No point in sleeping after the very last poll.
        if attempt < max_polls:
            time.sleep(poll_interval)

    print("❌ Models did not load within expected time")
    return False

def test_login_immediate(request_timeout: float = 10.0):
    """
    Test that login works immediately even when models are training.

    Posts the test credentials to ``/api/v1/token`` and requires a 200
    response whose JSON body carries an ``access_token``.

    Args:
        request_timeout: Per-request timeout in seconds.

    Returns:
        True when a token was issued, False otherwise.
    """
    print("\n🔍 Testing immediate login capability...")

    try:
        # Try to get a token immediately after startup
        token_response = requests.post(
            "http://localhost:8000/api/v1/token",
            data={"username": "alice", "password": "supersecretvalue"},
            timeout=request_timeout,
        )

        if token_response.status_code != 200:
            print(f"❌ Login failed: {token_response.status_code}")
            return False

        # A 200 without a token is not a usable login: the later
        # prediction test needs the token for its Authorization header.
        token_data = token_response.json()
        if not token_data.get("access_token"):
            print("❌ Login response did not include an access token")
            return False

        print("✅ Login successful immediately after startup")
        return True

    except Exception as e:
        print(f"❌ Error testing login: {e}")
        return False

def test_prediction_with_training_models(request_timeout: float = 30.0):
    """
    Test prediction behavior when models are still training.

    Logs in, then posts one iris sample to ``/api/v1/iris/predict``. Both a
    200 (prediction returned) and a 503 (model not ready yet) are accepted.

    Args:
        request_timeout: Per-request timeout in seconds.

    Returns:
        True when the endpoint behaved as expected, False otherwise.
    """
    print("\n🔍 Testing prediction behavior during training...")

    try:
        # Get a token
        token_response = requests.post(
            "http://localhost:8000/api/v1/token",
            data={"username": "alice", "password": "supersecretvalue"},
            timeout=request_timeout,
        )

        if token_response.status_code != 200:
            print("❌ Failed to get authentication token")
            return False

        token_data = token_response.json()
        headers = {"Authorization": f"Bearer {token_data['access_token']}"}

        # Try iris prediction (should work if model is loaded)
        iris_data = {
            "model_type": "rf",
            "samples": [{
                "sepal_length": 5.1,
                "sepal_width": 3.5,
                "petal_length": 1.4,
                "petal_width": 0.2
            }]
        }

        iris_response = requests.post(
            "http://localhost:8000/api/v1/iris/predict",
            json=iris_data,
            headers=headers,
            timeout=request_timeout,
        )

        if iris_response.status_code == 200:
            iris_result = iris_response.json()
            print(f"✅ Iris prediction successful: {iris_result['predictions']}")
        elif iris_response.status_code == 503:
            print("✅ Iris prediction correctly rejected (model still training)")
        else:
            print(f"❌ Unexpected iris response: {iris_response.status_code}")
            return False

        return True

    except Exception as e:
        print(f"❌ Error testing predictions: {e}")
        return False

def main():
    """
    Run all tests against a freshly spawned backend.

    Optionally wipes the ``mlruns`` directory, starts the backend in a
    subprocess, runs the four checks in order (stopping at the first
    failure) and always shuts the backend down afterwards.

    Returns:
        True when every check passed, False otherwise.
    """
    print("🚀 Starting self-healing system tests...\n")

    # Optional: Clean up for fresh start
    try:
        answer = input("Clean up mlruns directory for fresh start? (y/N): ")
    except EOFError:
        # Non-interactive run (stdin closed): keep the default "No".
        answer = ""
    if answer.strip().lower() == 'y':
        cleanup_mlruns()

    # Launch the backend as a child process
    backend = start_backend()
    try:
        checks = [
            # Test 1: Backend startup
            (test_backend_startup, "❌ Backend startup test failed"),
            # Test 2: Immediate login
            (test_login_immediate, "❌ Immediate login test failed"),
            # Test 3: Model status evolution
            (test_model_status_evolution, "❌ Model status evolution test failed"),
            # Test 4: Prediction behavior during training
            (test_prediction_with_training_models, "❌ Prediction behavior test failed"),
        ]
        for check, failure_message in checks:
            if not check():
                print(failure_message)
                return False

        print("\n🎉 All tests passed! The self-healing system is working correctly.")
        print("\n📋 Summary:")
        print("✅ Backend starts immediately")
        print("✅ Login works immediately")
        print("✅ Models train in background")
        print("✅ Status updates in real-time")
        print("✅ Predictions work when models are ready")
        return True

    finally:
        # Clean shutdown; escalate to kill() if terminate() is not honoured
        # so the finally block itself cannot raise and leave uvicorn running.
        print("\n🛑 Shutting down backend...")
        backend.terminate()
        try:
            backend.wait(timeout=10)
        except subprocess.TimeoutExpired:
            backend.kill()
            backend.wait()
        print("✅ Backend shutdown complete")

if __name__ == "__main__":
    # Propagate the outcome to the shell so `npm run test:self-healing`
    # (or CI) fails when a check fails.
    sys.exit(0 if main() else 1)


Overwriting test_self_healing.py


In [111]:
%%writefile test_manual.py
#!/usr/bin/env python3
"""
Manual test script for the self-healing system.
This script assumes the backend is already running (e.g., via npm run dev).
"""

import requests
import time
import json

def test_backend_status():
    """Test basic backend status."""
    print("🔍 Testing backend status...")
    
    try:
        # Test health endpoint
        health_response = requests.get("http://localhost:8000/api/v1/health")
        if health_response.status_code == 200:
            print("✅ Backend health check passed")
        else:
            print(f"❌ Backend health failed: {health_response.status_code}")
            return False
            
        # Test readiness endpoint
        ready_response = requests.get("http://localhost:8000/api/v1/ready")
        if ready_response.status_code == 200:
            ready_data = ready_response.json()
            print(f"✅ Backend ready: {ready_data}")
        else:
            print(f"❌ Readiness check failed: {ready_response.status_code}")
            return False
            
        return True
        
    except requests.exceptions.ConnectionError:
        print("❌ Backend not running. Start it with: npm run dev")
        return False
    except Exception as e:
        print(f"❌ Error: {e}")
        return False

def test_model_status(request_timeout: float = 10.0):
    """
    Test model status endpoint.

    Fetches ``/api/v1/ready/full`` once and pretty-prints the JSON payload.

    Args:
        request_timeout: Per-request timeout in seconds, so an unresponsive
            server cannot hang the script.

    Returns:
        True when the endpoint answered 200, False otherwise.
    """
    print("\n🔍 Testing model status...")

    try:
        response = requests.get(
            "http://localhost:8000/api/v1/ready/full", timeout=request_timeout
        )
        if response.status_code != 200:
            print(f"❌ Model status failed: {response.status_code}")
            return False

        status_data = response.json()
        print(f"✅ Model status: {json.dumps(status_data, indent=2)}")
        return True
    except Exception as e:
        print(f"❌ Error: {e}")
        return False

def test_login(request_timeout: float = 10.0):
    """
    Test login functionality.

    Posts the test credentials to ``/api/v1/token``.

    Args:
        request_timeout: Per-request timeout in seconds, so an unresponsive
            server cannot hang the script.

    Returns:
        The ``access_token`` string on success, otherwise None (non-200
        response, missing token, or any request/parsing error).
    """
    print("\n🔍 Testing login...")

    try:
        token_response = requests.post(
            "http://localhost:8000/api/v1/token",
            data={"username": "alice", "password": "supersecretvalue"},
            timeout=request_timeout,
        )

        if token_response.status_code != 200:
            print(f"❌ Login failed: {token_response.status_code}")
            return None

        # Report a missing token explicitly instead of announcing success
        # and then failing with an opaque KeyError message.
        access_token = token_response.json().get("access_token")
        if not access_token:
            print("❌ Login response did not include an access token")
            return None

        print("✅ Login successful")
        return access_token
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

def test_prediction(token):
    """Test prediction with authentication."""
    print("\n🔍 Testing prediction...")
    
    if not token:
        print("❌ No token available")
        return False
        
    headers = {"Authorization": f"Bearer {token}"}
    
    # Test iris prediction
    iris_data = {
        "model_type": "rf",
        "samples": [{
            "sepal_length": 5.1,
            "sepal_width": 3.5,
            "petal_length": 1.4,
            "petal_width": 0.2
        }]
    }
    
    try:
        iris_response = requests.post(
            "http://localhost:8000/api/v1/iris/predict",
            json=iris_data,
            headers=headers
        )
        
        if iris_response.status_code == 200:
            iris_result = iris_response.json()
            print(f"✅ Iris prediction: {iris_result['predictions']}")
        elif iris_response.status_code == 503:
            print("✅ Iris prediction rejected (model still training)")
        else:
            print(f"❌ Iris prediction failed: {iris_response.status_code}")
            return False
            
        return True
        
    except Exception as e:
        print(f"❌ Error: {e}")
        return False

def main():
    """
    Run manual tests against an already running backend.

    Runs the status, model-status, login and prediction checks in order and
    stops at the first failure.

    Returns:
        True when every check passed, False otherwise.
    """
    print("🚀 Manual self-healing system tests\n")
    print("Make sure the backend is running with: npm run dev\n")

    # Test 1: Backend status
    if not test_backend_status():
        print("\n❌ Backend status test failed")
        return False

    # Test 2: Model status
    if not test_model_status():
        print("\n❌ Model status test failed")
        return False

    # Test 3: Login
    token = test_login()
    if not token:
        print("\n❌ Login test failed")
        return False

    # Test 4: Prediction
    if not test_prediction(token):
        print("\n❌ Prediction test failed")
        return False

    print("\n🎉 All manual tests passed!")
    print("\n📋 Summary:")
    print("✅ Backend is running and responsive")
    print("✅ Model status is being tracked")
    print("✅ Login works with authentication")
    print("✅ Predictions work (or are properly rejected)")
    return True

if __name__ == "__main__":
    # Exit non-zero when a check failed so callers (shell, CI) can tell.
    raise SystemExit(0 if main() else 1)

Overwriting test_manual.py


In [112]:
%%writefile test_import.py
#!/usr/bin/env python3
"""
Enhanced test to verify that api.app.main can be imported without errors.
Captures detailed logs and implements unit test mode for fast imports.
"""

import importlib
import io
import logging
import os
import sys
import time
import traceback

def setup_test_environment():
    """
    Configure environment for fast, safe imports.

    Sets ``UNIT_TESTING=1``, provides a default ``MLFLOW_TRACKING_URI``
    pointing at ``./mlruns_tests`` (an existing value is left untouched) and
    makes sure the ``logs`` directory exists.
    """
    from pathlib import Path

    print("🔧 Setting up test environment...")

    # Tell the backend we are in unit-test-mode BEFORE we touch it
    os.environ["UNIT_TESTING"] = "1"

    # Build a well-formed absolute file URI. The literal "file://./mlruns_tests"
    # parses with "." as the authority and "/mlruns_tests" as the path, i.e. it
    # does not denote a directory relative to the working directory.
    default_tracking_uri = Path("mlruns_tests").resolve().as_uri()
    os.environ.setdefault("MLFLOW_TRACKING_URI", default_tracking_uri)

    # Ensure logs directory exists
    os.makedirs("logs", exist_ok=True)

    print("✅ Test environment configured")

def capture_import_logs():
    """
    Route all logging into an in-memory buffer (and stdout) for debugging.

    Root logging is reconfigured at DEBUG level with ``force=True``, which
    replaces any handlers that were installed before.

    Returns:
        The ``io.StringIO`` buffer that receives every log record.
    """
    print("📝 Setting up log capture...")

    buffer = io.StringIO()

    # One handler feeds the buffer, the other mirrors records to the console.
    sinks = [logging.StreamHandler(stream) for stream in (buffer, sys.stdout)]
    logging.basicConfig(level=logging.DEBUG, handlers=sinks, force=True)

    return buffer

def test_import_with_timing():
    """Test that the main module can be imported with timing and detailed logs."""
    print("🔍 Testing api.app.main import...")

    # Capture logs during import
    log_stream = capture_import_logs()

    # Time the import
    t0 = time.perf_counter()

    try:
        # Add the current directory to Python path
        sys.path.insert(0, os.getcwd())

        # Try to import the main module
        import api.app.main
        dt = time.perf_counter() - t0

        print(f"✅ api.app.main imported successfully in {dt:.3f}s")

        # Show captured logs if any
        log_content = log_stream.getvalue()
        if log_content.strip():
            print("📋 Import logs:")
            print(log_content)

        return True

    except Exception as e:
        dt = time.perf_counter() - t0
        print(f"❌ Import failed after {dt:.3f}s")
        print(f"❌ Error: {e}")

        # Show captured logs
        log_content = log_stream.getvalue()
        if log_content.strip():
            print("📋 Logs during failed import:")
            print(log_content)

        print("📋 Full traceback:")
        traceback.print_exc()
        return False

def test_logs_directory():
    """Check that a ``logs`` path exists relative to the working directory."""
    print("🔍 Testing logs directory...")

    logs_dir = "logs"
    if not os.path.exists(logs_dir):
        print(f"❌ Logs directory missing: {logs_dir}")
        return False

    print(f"✅ Logs directory exists: {logs_dir}")
    return True

def test_mlflow_config():
    """
    Print the MLflow-related environment settings.

    Purely informational: shows ``MLFLOW_TRACKING_URI`` and ``UNIT_TESTING``
    (or "not set") and always returns True.
    """
    print("🔍 Testing MLflow configuration...")

    env = os.environ
    tracking_uri = env.get("MLFLOW_TRACKING_URI", "not set")
    unit_testing = env.get("UNIT_TESTING", "not set")

    for line in (
        f"✅ MLFLOW_TRACKING_URI: {tracking_uri}",
        f"✅ UNIT_TESTING: {unit_testing}",
    ):
        print(line)

    return True

def test_compiler_probe():
    """
    Exercise the compiler detection helpers from ``api.app.ml.utils``.

    A missing compiler only produces a warning; the check fails (returns
    False) only when importing or calling the helpers raises.
    """
    print("🔍 Testing compiler probe...")

    try:
        from api.app.ml.utils import find_compiler, test_compiler_availability

        # Availability report: the values are summed to count usable compilers.
        availability = test_compiler_availability()
        usable = sum(availability.values())
        print(f"✅ Compiler test completed: {usable}/{len(availability)} available")

        # Lookup of a single compiler
        located = find_compiler()
        if not located:
            print("⚠️ No compiler found (expected on CI or dev machines without build tools)")
        else:
            print(f"✅ Found compiler: {located}")

    except Exception as e:
        print(f"❌ Compiler probe test failed: {e}")
        return False

    return True

def main():
    """
    Run comprehensive import tests.

    Prepares the environment, then runs every check (none is skipped after
    a failure). Exits the process with status 1 if any check failed.
    """
    print("🚀 Testing module imports with detailed diagnostics...\n")

    # Test 1: Setup environment
    setup_test_environment()

    # Tests 2-5, in order: logs directory, MLflow config, compiler probe,
    # main module import (with timing and logs).
    checks = (
        test_logs_directory,
        test_mlflow_config,
        test_compiler_probe,
        test_import_with_timing,
    )
    # A list (not a generator) so that every check runs even after a failure.
    outcomes = [check() for check in checks]

    if all(outcomes):
        print("\n🎉 All import tests passed!")
        return

    print("\n❌ Some import tests failed")
    sys.exit(1)

if __name__ == "__main__":
    main()


Overwriting test_import.py
