In [0]:
%python
%pip install "optuna>=3.4" scikit-learn lightgbm optuna-dashboard psycopg2-binary

In [0]:
%python
%pip install "optuna>=3.4" scikit-learn lightgbm optuna-dashboard psycopg2-binary

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Optuna HPO Distribué — Script unique

USAGE RAPIDE
============
1) Démo locale (1 process) :
   python optuna_dist_hpo.py --n-trials 50

2) Démo locale multi-process (8 workers / même machine) :
   python optuna_dist_hpo.py --role spawn --num-workers 8 --n-trials 50

3) Exécution distribuée (plusieurs machines/nœuds/pods pointant vers le même storage) :
   # Sur chaque machine (ou job), utilisez la même URL de storage + même study :
   export OPTUNA_STORAGE="postgresql+psycopg2://user:pwd@db-host:5432/optuna_db"
   export OPTUNA_STUDY="mon_study_hpo"
   python optuna_dist_hpo.py --role worker --n-trials 50

DÉPENDANCES CONSEILLÉES (pour l'objectif ML)
============================================
pip install "optuna>=3.4" scikit-learn lightgbm optuna-dashboard psycopg2-binary

NOTES
=====
- Par défaut, le script utilise SQLite (fichier local "optuna.db") pour la démo.
  Pour **vrai distribué**, utilisez PostgreSQL/MySQL (OPTUNA_STORAGE).
- Le sampler TPE est configuré en mode avancé (multivarié + constant liar).
- Le pruner Successive Halving est activé (report à chaque fold/étape).
- Les meilleurs paramètres/valeurs sont sauvegardés dans :
  - best_params.json
  - best_value.txt
"""

from __future__ import annotations
import argparse
import json
import multiprocessing as mp
import os
import sys
import time
from dataclasses import dataclass

# -------- Optional ML library detection --------
# _HAS_SK and _HAS_LGBM start as False and are set by the guarded imports
# below; the objectives check these flags before using the libraries.
_HAS_SK = False
_HAS_LGBM = False
try:
    import numpy as np
    _HAS_NUMPY = True
except Exception:
    # numpy is optional: record its absence instead of failing at import time.
    _HAS_NUMPY = False

# Optuna is the one hard requirement: print an install hint on stderr, then
# re-raise the original exception so the import error is not hidden.
try:
    import optuna
    from optuna.samplers import TPESampler
    from optuna.pruners import SuccessiveHalvingPruner
except Exception as e:
    print("[ERREUR] Optuna est requis. Installez-le : pip install optuna", file=sys.stderr)
    raise

# scikit-learn pieces used by the ML objective (dataset, CV split, metric,
# preprocessing pipeline). Any failure just leaves _HAS_SK False.
try:
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import KFold
    from sklearn.metrics import roc_auc_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    _HAS_SK = True
except Exception:
    _HAS_SK = False

# LightGBM classifier used by the ML objective. Any failure just leaves
# _HAS_LGBM False.
try:
    from lightgbm import LGBMClassifier
    _HAS_LGBM = True
except Exception:
    _HAS_LGBM = False


# -------- Config & utils --------
@dataclass
class Config:
    """Run-time settings for one invocation of the HPO script.

    ``study_name``, ``storage_url``, ``direction`` and ``seed`` are consumed
    by ``build_study``; the remaining fields are used outside the code
    visible alongside this class.
    """

    # Name of the Optuna study to create or attach to.
    study_name: str
    # Database URL handed to build_storage().
    storage_url: str
    # Optimization direction forwarded to optuna.create_study().
    direction: str
    # Trial budget -- presumably per worker; TODO confirm against the caller.
    n_trials: int
    role: str           # "worker" | "spawn" | "single"
    # Presumably the number of local processes in "spawn" mode; TODO confirm.
    num_workers: int
    objective_kind: str # "auto" | "ml" | "ackley"
    # Seed passed to the TPE sampler.
    seed: int


def build_storage(storage_url: str):
    """Create the Optuna RDB storage that backs a study.

    Every supported backend goes through ``optuna.storages.RDBStorage``;
    only the SQLAlchemy engine options differ between server-backed
    databases and the local SQLite demo file.

    Args:
        storage_url: SQLAlchemy-style database URL, for example
            ``postgresql+psycopg2://user:pwd@host:5432/db`` or
            ``sqlite:///optuna.db``.

    Returns:
        An ``optuna.storages.RDBStorage`` bound to ``storage_url``.
    """
    engine_kwargs = None
    if storage_url.startswith(("postgresql", "mysql")):
        # Workers in a distributed run hold pooled connections for a long
        # time; pre-ping makes SQLAlchemy test a connection before reuse so
        # a server-side timeout does not surface as a failed trial.
        engine_kwargs = {"pool_pre_ping": True}
    # Any other URL (the SQLite demo default) uses the stock engine settings.
    return optuna.storages.RDBStorage(url=storage_url, engine_kwargs=engine_kwargs)


def build_study(cfg: Config):
    """Create the study described by ``cfg``, or attach to it if it exists.

    The study uses a multivariate, grouped TPE sampler with constant-liar
    enabled and a Successive Halving pruner. ``load_if_exists=True`` lets
    several processes call this with the same name and storage and end up
    on the same study.

    Args:
        cfg: Settings providing ``seed``, ``storage_url``, ``study_name``
            and ``direction``.

    Returns:
        The created or loaded ``optuna.study.Study``.
    """
    tpe = TPESampler(
        seed=cfg.seed,
        n_startup_trials=20,
        constant_liar=True,  # important under heavy parallelism
        group=True,
        multivariate=True,
    )
    halving = SuccessiveHalvingPruner(reduction_factor=3, min_resource=1)
    backend = build_storage(cfg.storage_url)

    return optuna.create_study(
        study_name=cfg.study_name,
        storage=backend,
        sampler=tpe,
        pruner=halving,
        direction=cfg.direction,
        load_if_exists=True,
    )


# -------- Objectifs --------
def objective_ml(trial: optuna.trial.Trial) -> float:
    """ML objective: cross-validated LightGBM on the breast-cancer dataset.

    Samples a LightGBM configuration from ``trial``, evaluates it with a
    shuffled 3-fold split of the scikit-learn breast-cancer dataset (no
    network access needed), and reports ``1 - AUC`` after every fold so the
    study's pruner can stop the trial early.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate values.

    Returns:
        ``1 - mean AUC`` over the folds (lower is better).

    Raises:
        RuntimeError: If scikit-learn or lightgbm could not be imported.
        optuna.exceptions.TrialPruned: If the pruner stops the trial after
            a fold.
    """
    if not (_HAS_SK and _HAS_LGBM):
        raise RuntimeError("Objectif ML indisponible : installez scikit-learn et lightgbm, "
                           "ou utilisez --objective ackley (fallback sans dépendances).")

    # Search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 15, 255),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # LightGBM only applies ``subsample`` (bagging_fraction) when
        # ``subsample_freq`` (bagging_freq) is non-zero; with the default of
        # 0 the sampled value would be silently ignored.
        "subsample_freq": 1,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1e-1, log=True),
        "random_state": 42,
        "n_jobs": 1,  # one trial = one CPU worker
    }

    X, y = load_breast_cancer(return_X_y=True)

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X)):
        # Fresh pipeline per fold so the scaler is fitted on training rows only.
        model = Pipeline([
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("clf", LGBMClassifier(**params)),
        ])
        model.fit(X[tr_idx], y[tr_idx])
        prob = model.predict_proba(X[va_idx])[:, 1]
        score = roc_auc_score(y[va_idx], prob)  # AUC
        scores.append(score)

        # Report and prune: the value being minimized is 1 - AUC.
        trial.report(1.0 - float(score), step=fold)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return 1.0 - float(sum(scores) / len(scores))


def objective_ackley(trial: optuna.trial.Trial) -> float:
    """Fallback sans dépendances ML : minimise la fonction d'Ackley (d=5) avec 'epochs' simulés.
       Permet le pruning via report à chaque 'epoch'."""
    if not _HAS_NUMPY:
        # Implémentation minimale sans numpy (moins rapide)
        import math
        d = 5
        # Espace de recherche