# One-Class SVM (OC-SVM) — AIS Anomaly Detection (Galápagos)

**Objetivo:** Entrenar y evaluar un OC-SVM (RBF) usando **ventanas pre-generadas** del workspace externo (solo lectura):  
`/teamspace/studios/profound-silver-kn8tf/ais_anomaly/data`

**Principios clave**
- **Nunca** escribimos en el workspace externo.  
- Todos los artefactos nuevos del SVM se guardan en **`./data/ocsvm_runs`** del proyecto actual.  
- Pipeline **memory-safe**: muestreo por MMSI para train, imputación/escala sobre muestras, evaluación por **lotes** con **memmap**.


# OC-SVM AIS Anomaly Detection — robusto y memory-safe (Lee de /data)

- Lee **solo** desde `/data` (este Studio).
- Artefactos nuevos del SVM se guardan en **`./data/ocsvm_runs`** (no se toca `/data`).
- Evaluación por **lotes** con **memmap** para evitar picos de RAM (~30 GB límite).

In [1]:
# --- Diagnóstico del entorno ---
import os, sys, getpass, socket
from pathlib import Path

print("User:   ", getpass.getuser())
print("Host:   ", socket.gethostname())
print("Python: ", sys.executable)
print("CWD:    ", os.getcwd())
print("/data exists?:", Path("/data").exists())
if Path("/data").exists():
    print("#parquets en /data:", len(list(Path("/data").glob("*.parquet"))))

User:    erickdsuarez10
Host:    computeinstance-e00exnkvr257g0k5f5
Python:  /home/zeus/miniconda3/envs/cloudspace/bin/python
CWD:     /teamspace/studios/this_studio
/data exists?: False


In [2]:
# --- Setup & Config (auto-resuelve la fuente de lectura) ---
import os, json
from pathlib import Path

# Candidatas en orden de preferencia
CANDIDATES = [
    Path("/data").resolve(),                               # algunos entornos montan /data
    Path("/teamspace/studios/this_studio/data").resolve(), # data propia de este Studio
    Path("./data").resolve(),                              # por si copiamos aquí
]

def count_parquets(p: Path) -> int:
    try:
        return len(list(p.glob("*.parquet"))) if p.exists() else 0
    except Exception:
        return 0

external = None
for cand in CANDIDATES:
    if cand.exists() and count_parquets(cand) > 0:
        external = cand
        break

if external is None:
    # No hay /data montado ni parquets en las otras candidatas
    # Usa explícitamente la de este Studio (existe, aunque pueda tener 0 parquets)
    external = Path("/teamspace/studios/this_studio/data").resolve()
    print("⚠️ No se hallaron parquets aún. Usando ruta por defecto del Studio:", external)

EXTERNAL_DATA_DIR = external

# Carpeta local para salidas del OC-SVM (seguro)
OUT_DIR = Path("data/ocsvm_runs").resolve()
os.makedirs(OUT_DIR, exist_ok=True)

CFG = {
    "external_data_dir": str(EXTERNAL_DATA_DIR),
    "out_dir": str(OUT_DIR),
    "artifact_prefix": "ocsvm_rbf",
    "svm_nu_grid": [0.01, 0.05, 0.1],
    "svm_gamma_grid": ["scale", 0.01],
    "kernel": "rbf",
    "kfold_splits": 5,
    "max_train_samples": 500_000,
    "max_search_samples": 200_000,
    "eval_batch_size": 200_000,
}

print("LECTURA (read-only):", CFG["external_data_dir"])
print("SALIDAS (local):    ", CFG["out_dir"])
print("#parquets detectados en lectura:", count_parquets(Path(CFG["external_data_dir"])))
print("Config:", json.dumps({k:v for k,v in CFG.items() if k not in []}, indent=2))

LECTURA (read-only): /teamspace/studios/this_studio/data
SALIDAS (local):     /teamspace/studios/this_studio/data/ocsvm_runs
#parquets detectados en lectura: 17
Config: {
  "external_data_dir": "/teamspace/studios/this_studio/data",
  "out_dir": "/teamspace/studios/this_studio/data/ocsvm_runs",
  "artifact_prefix": "ocsvm_rbf",
  "svm_nu_grid": [
    0.01,
    0.05,
    0.1
  ],
  "svm_gamma_grid": [
    "scale",
    0.01
  ],
  "kernel": "rbf",
  "kfold_splits": 5,
  "max_train_samples": 500000,
  "max_search_samples": 200000,
  "eval_batch_size": 200000
}


In [3]:
# --- Carga robusta desde /data ---
# --- Carga robusta desde la ruta resuelta en CFG["external_data_dir"] ---
import os, gc, numpy as np, pandas as pd
from pathlib import Path

DATA_DIR = Path(CFG["external_data_dir"])
print("DATA_DIR:", DATA_DIR)

# Listado informativo (ayuda si algo falla)
parquets = sorted(DATA_DIR.glob("*.parquet"), key=lambda p: p.name.lower())
print("Parquets disponibles:")
for p in parquets:
    try:
        print(f" - {p.name}  ({p.stat().st_size/1e6:.1f} MB)")
    except Exception:
        print(f" - {p.name}")

def read_parquet_min(path: Path):
    df = pd.read_parquet(path, engine="pyarrow")
    for c in df.columns:
        if pd.api.types.is_float_dtype(df[c]): df[c] = df[c].astype(np.float32)
        elif pd.api.types.is_integer_dtype(df[c]) and df[c].max() <= np.iinfo(np.int32).max:
            df[c] = df[c].astype(np.int32)
    return df

def detect_label_col(df):
    for k in ["y","label","is_suspicious","target"]:
        if k in df.columns: return k
    return None

def detect_group_col(df):
    for k in ["mmsi","group","ship_id"]:
        if k in df.columns: return k
    return None

def pick_first_existing(base: Path, *names_or_patterns):
    # 1) exactos
    for n in names_or_patterns:
        if "*" not in n and "?" not in n and "[" not in n:
            p = base / n
            if p.exists(): return p
    # 2) patrones -> el más grande
    for patt in names_or_patterns:
        if any(ch in patt for ch in "*?[]"):
            matches = list(base.glob(patt))
            if matches:
                matches = sorted(matches, key=lambda x: x.stat().st_size if x.exists() else 0, reverse=True)
                return matches[0]
    return None

# --- TRAIN (solo normales) ---
train_path = pick_first_existing(DATA_DIR,
    "windows_aligned_normal.parquet", "norm_windows_flat.parquet", "ais_norm_windows.parquet",
    "*windows_aligned_normal*.parquet", "*norm*windows*.parquet", "*ais_norm_windows*.parquet"
)
if train_path is None:
    raise FileNotFoundError("No encontré TRAIN normal (p.ej. windows_aligned_normal.parquet) en /data.")
df_tr = read_parquet_min(train_path)

ycol_tr = detect_label_col(df_tr)     # no debería importar; se excluye de X
gcol_tr = detect_group_col(df_tr)
drop_common = {"lat","lon","idx","idx_end","window_id"}
drop_train = set([c for c in [ycol_tr, gcol_tr] if c]) | drop_common
feat_tr = [c for c in df_tr.columns if c not in drop_train]
X_train = df_tr[feat_tr].to_numpy(dtype=np.float32)
groups_train = df_tr[gcol_tr].to_numpy() if gcol_tr else None
print("TRAIN ->", train_path.name, "| X_train:", X_train.shape)

# --- EVAL (varios esquemas soportados) ---
eval_single = pick_first_existing(DATA_DIR,
    "windows_with_labels_aligned.parquet", "*windows_with_labels_aligned*.parquet",
    "eval_windows_aligned.parquet", "*eval_windows_aligned*.parquet",
    "windows_with_labels.parquet", "*windows_with_labels*.parquet"
)
eval_wl_norm = pick_first_existing(DATA_DIR, "windows_with_labels_aligned_normal.parquet", "*windows_with_labels_aligned_normal*.parquet")
eval_wl_anom = pick_first_existing(DATA_DIR, "windows_with_labels_aligned_anom.parquet",   "*windows_with_labels_aligned_anom*.parquet")
eval_norm    = pick_first_existing(DATA_DIR, "eval_windows_aligned_normal.parquet",        "*eval_windows_aligned_normal*.parquet")
eval_anom    = pick_first_existing(DATA_DIR, "eval_windows_aligned_anom.parquet",          "*eval_windows_aligned_anom*.parquet")
labels_any   = pick_first_existing(DATA_DIR, "eval_labels_aligned.parquet", "*eval_labels_aligned*.parquet", "labels.parquet", "*labels*.parquet")

if eval_single is not None:
    df_ev = read_parquet_min(eval_single)
    ycol_ev = detect_label_col(df_ev)
    gcol_ev = detect_group_col(df_ev)
    if ycol_ev is None:
        if labels_any is None:
            raise FileNotFoundError("Eval único sin etiquetas embebidas y no hay archivo de labels en /data.")
        df_y = read_parquet_min(labels_any)
        ycol_y = detect_label_col(df_y) or df_y.select_dtypes(include=["int32","int16","int8"]).columns[-1]
        if len(df_y) != len(df_ev): raise ValueError(f"Desalineación eval vs labels: {len[df_ev]} vs {len[df_y]}")
        drop_eval = set([gcol_ev]) | drop_common
        feat_ev = [c for c in df_ev.columns if c not in drop_eval]
        X_eval = df_ev[feat_ev].to_numpy(dtype=np.float32)
        y_eval = df_y[ycol_y].astype(np.int8).to_numpy()
    else:
        drop_eval = set([ycol_ev, gcol_ev]) | drop_common
        feat_ev = [c for c in df_ev.columns if c not in drop_eval]
        X_eval = df_ev[feat_ev].to_numpy(dtype=np.float32)
        y_eval = df_ev[ycol_ev].astype(np.int8).to_numpy()
    groups_eval = df_ev[gcol_ev].to_numpy() if gcol_ev else None
    print("EVAL ->", eval_single.name, "| X_eval:", X_eval.shape, "| y_eval:", y_eval.shape)

elif eval_wl_norm is not None and eval_wl_anom is not None:
    df_n = read_parquet_min(eval_wl_norm); df_a = read_parquet_min(eval_wl_anom)
    common = [c for c in df_n.columns if c in df_a.columns]
    df_n = df_n[common].copy(); df_a = df_a[common].copy()
    ycol_ev = detect_label_col(df_n); gcol_ev = detect_group_col(df_n)
    drop_eval = set([ycol_ev, gcol_ev]) | drop_common
    feat_ev = [c for c in common if c not in drop_eval]
    X_eval = pd.concat([df_n[feat_ev], df_a[feat_ev]], ignore_index=True).to_numpy(dtype=np.float32)
    y_eval = pd.concat([df_n[ycol_ev], df_a[ycol_ev]], ignore_index=True).astype(np.int8).to_numpy()
    groups_eval = (pd.concat([df_n[gcol_ev], df_a[gcol_ev]], ignore_index=True).to_numpy() if gcol_ev else None)
    print("EVAL ->", eval_wl_norm.name, "+", eval_wl_anom.name, "| X_eval:", X_eval.shape, "| y_eval:", y_eval.shape)

elif eval_norm is not None and eval_anom is not None:
    if labels_any is None:
        raise FileNotFoundError("Eval normal/anom sin etiquetas embebidas y no hay archivo de labels en /data.")
    df_n = read_parquet_min(eval_norm); df_a = read_parquet_min(eval_anom); df_y = read_parquet_min(labels_any)
    common = [c for c in df_n.columns if c in df_a.columns]
    df_n = df_n[common].copy(); df_a = df_a[common].copy()
    gcol_ev = detect_group_col(df_n)
    drop_eval = set([gcol_ev]) | drop_common
    feat_ev = [c for c in common if c not in drop_eval]
    df_concat = pd.concat([df_n[feat_ev], df_a[feat_ev]], ignore_index=True)
    X_eval = df_concat.to_numpy(dtype=np.float32)
    ycol_y = detect_label_col(df_y) or df_y.select_dtypes(include=["int32","int16","int8"]).columns[-1]
    if len(df_y) != len(df_concat): raise ValueError(f"Desalineación eval concat vs labels: {len(df_concat)} vs {len(df_y)}")
    y_eval = df_y[ycol_y].astype(np.int8).to_numpy()
    groups_eval = (pd.concat([df_n[gcol_ev], df_a[gcol_ev]], ignore_index=True).to_numpy() if gcol_ev else None)
    print("EVAL ->", eval_norm.name, "+", eval_anom.name, "| X_eval:", X_eval.shape, "| y_eval:", y_eval.shape)

else:
    raise FileNotFoundError("No se pudo resolver un set de EVAL válido en /data.")

# Limpieza y reporte
del df_tr; gc.collect()
print("Train -> X:", X_train.shape, "| groups:", None if groups_train is None else len(groups_train))
print("Eval  -> X:", X_eval.shape,  "| y:", y_eval.shape, "| groups:", None if groups_eval is None else len(groups_eval))
print("N feats: train", X_train.shape[1], "| eval", X_eval.shape[1])

DATA_DIR: /teamspace/studios/this_studio/data
Parquets disponibles:
 - ais_anom_enriched.parquet  (13.4 MB)
 - ais_anom_windows.parquet  (8.2 MB)
 - ais_norm_enriched.parquet  (2199.8 MB)
 - ais_norm_windows.parquet  (1264.8 MB)
 - anom_windows_flat.parquet  (8.2 MB)
 - eval_labels_aligned.parquet  (35.7 MB)
 - eval_windows_aligned.parquet  (399.1 MB)
 - eval_windows_aligned_norm.parquet  (15.8 MB)
 - labels.parquet  (118.7 MB)
 - labels_anom.parquet  (0.1 MB)
 - norm_windows_flat.parquet  (934.6 MB)
 - windows.parquet  (859.8 MB)
 - windows_aligned_anom.parquet  (8.2 MB)
 - windows_aligned_normal.parquet  (1268.4 MB)
 - windows_with_labels.parquet  (859.8 MB)
 - windows_with_labels_aligned.parquet  (1268.4 MB)
 - windows_with_labels_aligned_norm.parquet  (766.1 MB)
TRAIN -> windows_aligned_normal.parquet | X_train: (27789660, 19)
EVAL -> windows_with_labels_aligned.parquet | X_eval: (27789660, 19) | y_eval: (27789660,)
Train -> X: (27789660, 19) | groups: 27789660
Eval  -> X: (2778966

In [4]:
# --- Imputación + Escalado + Muestreo (memory-safe) ---
import os, numpy as np, gc

CFG.setdefault("max_train_samples", 500_000)
CFG.setdefault("max_search_samples", 200_000)
CFG.setdefault("eval_batch_size", 200_000)
os.makedirs(CFG["out_dir"], exist_ok=True)

def sample_by_group(n_max, X, groups):
    if (n_max is None) or (X.shape[0] <= n_max):
        idx = np.arange(X.shape[0]); return X, (groups if groups is not None else None), idx
    rng = np.random.default_rng(42)
    if groups is None:
        idx = rng.choice(X.shape[0], n_max, replace=False); return X[idx], None, idx
    uniq = np.unique(groups); per_g = max(1, n_max // len(uniq)); take = []
    for g in uniq:
        g_idx = np.where(groups == g)[0]
        take.extend(rng.choice(g_idx, min(per_g, g_idx.size), replace=False).tolist())
    take = np.array(take)
    if take.size > n_max: take = rng.choice(take, n_max, replace=False)
    return X[take], groups[take], take

def colwise_nanmedian(X):
    Xc = X.copy(); Xc[~np.isfinite(Xc)] = np.nan
    med = np.nanmedian(Xc, axis=0)
    med = np.where(np.isfinite(med), med, 0.0).astype(np.float32)
    return med

def impute_inplace(X, medians):
    bad = ~np.isfinite(X)
    if bad.any():
        cols = np.where(bad)[1]
        X[bad] = medians[cols]

def fit_standardizer(X):
    mean = X.mean(axis=0).astype(np.float32)
    var  = X.var(axis=0).astype(np.float32)
    std  = np.sqrt(var, dtype=np.float32); std[std == 0.0] = 1.0
    return mean, std

def apply_standardizer_inplace(X, mean, std):
    X -= mean; X /= std

# 1) Muestrear TRAIN
X_train_s, groups_train_s, _ = sample_by_group(CFG["max_train_samples"], X_train, groups_train)
print("Train sampled:", X_train_s.shape)

# 2) Imputación + Escalado
X_train_s = X_train_s.astype(np.float32, copy=False)
X_train_s[~np.isfinite(X_train_s)] = np.nan
train_medians = colwise_nanmedian(X_train_s)
impute_inplace(X_train_s, train_medians)
train_mean, train_std = fit_standardizer(X_train_s)
apply_standardizer_inplace(X_train_s, train_mean, train_std)

X_train_sc = X_train_s
groups_train = groups_train_s
print("Scaled train:", X_train_sc.shape)

# 3) Guardar parámetros para reproducibilidad
np.save(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_imputer_medians.npy"), train_medians)
np.savez(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_scaler_params.npz"), mean=train_mean, std=train_std)

# 4) Generador para transformar EVAL por lotes (sin construir X_eval_sc completo)
def transform_eval_in_batches(X, batch_size=CFG["eval_batch_size"]):
    n = X.shape[0]
    for s in range(0, n, batch_size):
        e = min(s + batch_size, n)
        Xe = X[s:e].astype(np.float32, copy=False)
        Xe[~np.isfinite(Xe)] = np.nan
        impute_inplace(Xe, train_medians)
        apply_standardizer_inplace(Xe, train_mean, train_std)
        yield s, e, Xe

gc.collect()

Train sampled: (499950, 19)
Scaled train: (499950, 19)


0

In [5]:
# --- Búsqueda de hiperparámetros (subset; objetivo = outlier-rate ~ 5%) ---
import numpy as np, pandas as pd
from sklearn.model_selection import GroupKFold, KFold, ParameterGrid
from sklearn.svm import OneClassSVM

param_grid = list(ParameterGrid({"nu": CFG["svm_nu_grid"], "gamma": CFG["svm_gamma_grid"]}))
target_outlier_rate = 0.05

def build_search_subset(X, groups, n_max):
    if (n_max is None) or (X.shape[0] <= n_max): return X, groups
    rng = np.random.default_rng(123)
    if groups is None:
        idx = rng.choice(X.shape[0], n_max, replace=False); return X[idx], None
    uniq = np.unique(groups); per_g = max(1, n_max // len(uniq)); take = []
    for g in uniq:
        g_idx = np.where(groups == g)[0]
        take.extend(rng.choice(g_idx, min(per_g, g_idx.size), replace=False).tolist())
    take = np.array(take)
    if take.size > n_max: take = rng.choice(take, n_max, replace=False)
    return X[take], groups[take]

X_search, groups_search = build_search_subset(X_train_sc, groups_train, CFG["max_search_samples"])

if (groups_search is not None) and (len(np.unique(groups_search)) >= 2):
    n_splits = min(CFG["kfold_splits"], len(np.unique(groups_search)))
    splitter = GroupKFold(n_splits=n_splits); split_args = dict(X=X_search, y=None, groups=groups_search)
else:
    n_splits = max(2, CFG["kfold_splits"])
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42); split_args = dict(X=X_search, y=None)

def outlier_rate(pred): return float((pred == -1).mean())

best_cfg, best_obj, rows = None, None, []
for p in param_grid:
    fold_rates = []
    for tr_idx, va_idx in splitter.split(**split_args):
        Xtr, Xva = X_search[tr_idx], X_search[va_idx]
        m = OneClassSVM(kernel=CFG["kernel"], nu=p["nu"], gamma=p["gamma"])
        m.fit(Xtr)
        pred = m.predict(Xva)   # +1 normal, -1 outlier
        fold_rates.append(outlier_rate(pred))
    rate_mean, rate_std = float(np.mean(fold_rates)), float(np.std(fold_rates))
    obj = abs(rate_mean - target_outlier_rate) + rate_std
    rows.append({"params": p, "rate_mean": rate_mean, "rate_std": rate_std, "obj": obj})
    if (best_obj is None) or (obj < best_obj):
        best_obj, best_cfg = obj, p

res_df = pd.DataFrame(rows).sort_values("obj")
display(res_df.head(5))
print("Best params:", best_cfg, "| splits:", n_splits, "| search_subset:", X_search.shape)

Unnamed: 0,params,rate_mean,rate_std,obj
4,"{'gamma': 0.01, 'nu': 0.05}",0.05185,0.00874,0.01059
1,"{'gamma': 'scale', 'nu': 0.05}",0.056581,0.015771,0.022351
3,"{'gamma': 0.01, 'nu': 0.01}",0.010326,0.001866,0.041539
0,"{'gamma': 'scale', 'nu': 0.01}",0.015872,0.009458,0.043586
5,"{'gamma': 0.01, 'nu': 0.1}",0.10327,0.018643,0.071913


Best params: {'gamma': 0.01, 'nu': 0.05} | splits: 5 | search_subset: (199980, 19)


In [6]:
# --- Entrenamiento final ---
from sklearn.svm import OneClassSVM
final_model = OneClassSVM(kernel=CFG["kernel"], nu=best_cfg["nu"], gamma=best_cfg["gamma"])
final_model.fit(X_train_sc)
print("Final model trained.")

Final model trained.


In [5]:
# --- Evaluación por lotes con REANUDACIÓN (auto-setup si el kernel se reinició) ---
import os, json, time, pickle, numpy as np
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

# --------- AUTO-SETUP (por si el kernel se reinició) ---------
# CFG por defecto si no existe
if 'CFG' not in globals():
    CFG = {
        "out_dir": "data/ocsvm_runs",
        "artifact_prefix": "ocsvm_rbf",
        "eval_batch_size": 200_000,
    }

OUT = Path(CFG["out_dir"]); OUT.mkdir(parents=True, exist_ok=True)

# Cargar modelo si no está en memoria
if 'final_model' not in globals():
    with open(OUT / f"{CFG['artifact_prefix']}_model.pkl", "rb") as f:
        final_model = pickle.load(f)

# Necesitamos X_eval e y_eval en memoria
assert 'X_eval' in globals() and 'y_eval' in globals(), \
    "Falta X_eval/y_eval en memoria. Re-ejecuta la celda de CARGA (la que arma X_train, X_eval, y_eval)."

# Cargar preprocesamiento (medianas + scaler) por si tampoco está en memoria
try:
    _medians = globals().get('train_medians', None)
    _scaler_params = globals().get('train_mean', None), globals().get('train_std', None)
    if _medians is None or _scaler_params[0] is None or _scaler_params[1] is None:
        raise KeyError
    medians, train_mean, train_std = _medians, _scaler_params[0], _scaler_params[1]
except Exception:
    medians = np.load(OUT / f"{CFG['artifact_prefix']}_imputer_medians.npy")
    sp = np.load(OUT / f"{CFG['artifact_prefix']}_scaler_params.npz")
    train_mean, train_std = sp["mean"], sp["std"]
train_std = train_std.copy(); train_std[train_std == 0] = 1.0

def impute_inplace(X, med):
    bad = ~np.isfinite(X)
    if bad.any():
        X[bad] = med[np.where(bad)[1]]

def standardize_inplace(X, m, s):
    X -= m; X /= s

# --------- REANUDACIÓN CON MEMMAP ---------
n_eval = X_eval.shape[0]
scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"
progress_path = OUT / f"{CFG['artifact_prefix']}_progress.json"
expected_bytes = n_eval * 4  # float32

# si hay memmap de tamaño incorrecto, recrear
if scores_path.exists() and scores_path.stat().st_size != expected_bytes:
    print(f"[WARN] Memmap incorrecto ({scores_path.stat().st_size} vs {expected_bytes}) -> recreando.")
    scores_path.unlink()

mode = "r+" if scores_path.exists() else "w+"
scores_mm = np.memmap(scores_path, dtype=np.float32, mode=mode, shape=(n_eval,))
if mode == "w+":
    scores_mm[:] = np.nan
    scores_mm.flush()

# punto de reanudación
start = 0
if progress_path.exists():
    try:
        start = int(json.loads(progress_path.read_text()).get("next_start", 0))
    except Exception:
        start = 0
if start <= 0:
    # buscar primer NaN
    nan_mask = np.isnan(scores_mm)
    start = int(np.argmax(nan_mask)) if nan_mask.any() else n_eval

print(f"[RESUME] next_start = {start:,}/{n_eval:,}")

bs = int(CFG.get("eval_batch_size", 200_000))
t0 = time.time(); last = t0

try:
    for s in range(start, n_eval, bs):
        e = min(s + bs, n_eval)
        Xe = X_eval[s:e].astype(np.float32, copy=False)
        Xe[~np.isfinite(Xe)] = np.nan
        impute_inplace(Xe, medians)
        standardize_inplace(Xe, train_mean, train_std)

        scores_mm[s:e] = -final_model.decision_function(Xe)
        scores_mm.flush()
        progress_path.write_text(json.dumps({"next_start": e}))

        now = time.time()
        if now - last > 10:
            rate = (e - start) / max(1e-6, (now - t0))
            print(f"Progress: {100*e/n_eval:5.2f}% | {e:,}/{n_eval:,} | ~{rate:,.0f} rows/s")
            last = now

except KeyboardInterrupt:
    scores_mm.flush()
    progress_path.write_text(json.dumps({"next_start": e}))
    print(f"\n[INTERRUPTED] Progreso guardado. Reanuda desde idx={e}.")
    raise
except Exception as ex:
    scores_mm.flush()
    progress_path.write_text(json.dumps({"next_start": s}))
    print(f"\n[ERROR] Guardado progreso hasta idx={s}. Detalle: {ex}")
    raise

# marcar completo
progress_path.write_text(json.dumps({"next_start": n_eval}))
scores = scores_mm

# --------- MÉTRICAS + GUARDADOS ---------
yb = y_eval.astype(int)
if len(np.unique(yb)) > 1:
    roc = roc_auc_score(yb, scores)
    prec, rec, _ = precision_recall_curve(yb, scores); pr_auc = auc(rec, prec)
    ap = average_precision_score(yb, scores)
else:
    roc = pr_auc = ap = np.nan

print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | AP: {ap:.4f}")
print("Scores memmap:", str(scores_path))

# guardados idempotentes
with open(OUT / f"{CFG['artifact_prefix']}_model.pkl", "wb") as f:
    pickle.dump(final_model, f)
with open(OUT / f"{CFG['artifact_prefix']}_config.json", "w") as f:
    json.dump(CFG | {"best_params": globals().get("best_cfg", {}),
                     "metrics": {"roc_auc": float(roc), "pr_auc": float(pr_auc), "ap": float(ap)}},
              f, indent=2)

FileNotFoundError: [Errno 2] No such file or directory: 'data/ocsvm_runs/ocsvm_rbf_model.pkl'

In [1]:
# --- Evaluación por lotes (memmap) + métricas + guardados ---
import numpy as np, os, json, pickle, time
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

n_eval = X_eval.shape[0]
scores_path_mm = os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_eval_scores_mm.dat")

# Si existe pero no coincide tamaño esperado, recrear
if os.path.exists(scores_path_mm):
    expected = n_eval * 4  # float32
    if os.stat(scores_path_mm).st_size != expected:
        print("[WARN] memmap incompleto -> recreando.")
        os.remove(scores_path_mm)

scores_mm = np.memmap(scores_path_mm, dtype=np.float32, mode="w+", shape=(n_eval,))

t0 = time.time(); last = t0; done = 0
for s, e, Xe_sc in transform_eval_in_batches(X_eval):
    scores_mm[s:e] = -final_model.decision_function(Xe_sc)
    done = e
    now = time.time()
    if now - last > 10:
        rate = done / max(1e-6, (now - t0))
        print(f"Progress: {100*done/n_eval:5.2f}% | {done:,}/{n_eval:,} | ~{rate:,.0f} rows/s")
        last = now

scores = scores_mm
yb = y_eval.astype(int)
if len(np.unique(yb)) > 1:
    roc = roc_auc_score(yb, scores)
    prec, rec, _ = precision_recall_curve(yb, scores); pr_auc = auc(rec, prec)
    ap = average_precision_score(yb, scores)
else:
    roc = pr_auc = ap = np.nan

print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | AP: {ap:.4f}")
print("Scores memmap:", scores_path_mm)

# Guardar modelo + config + métricas
with open(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_model.pkl"), "wb") as f:
    pickle.dump(final_model, f)
with open(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_config.json"), "w") as f:
    json.dump(CFG | {"best_params": best_cfg,
                     "metrics": {"roc_auc": float(roc), "pr_auc": float(pr_auc), "ap": float(ap)}},
              f, indent=2)

NameError: name 'X_eval' is not defined

In [None]:
# --- Top-K y agregados por MMSI ---
import numpy as np, pandas as pd, os

scores = np.memmap(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_eval_scores_mm.dat"),
                   dtype=np.float32, mode="r", shape=(X_eval.shape[0],))

k_rate = 0.01  # 1%
k = max(1, int(len(scores) * k_rate))
thr_k = np.partition(scores, -k)[-k]
pred_topk = (scores >= thr_k).astype(np.int8)

# Detalle TOP-K
topk_idx = np.where(pred_topk == 1)[0]
topk_df = pd.DataFrame({
    "idx": topk_idx.astype(np.int64),
    "anomaly_score": scores[topk_idx].astype(np.float32),
    "y_eval": y_eval[topk_idx].astype(np.int8)
})
if 'groups_eval' in globals() and groups_eval is not None:
    topk_df["mmsi"] = groups_eval[topk_idx].astype(np.int64)

topk_path = os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_topk_{int(k_rate*100)}pct.parquet")
topk_df.to_parquet(topk_path, index=False)
print("Saved TOP-K:", topk_path, "| rows:", len(topk_df))

# Métricas @k
yb = y_eval.astype(int)
tp = int(((pred_topk==1) & (yb==1)).sum())
fp = int(((pred_topk==1) & (yb==0)).sum())
fn = int(((pred_topk==0) & (yb==1)).sum())
prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0
print(f"@k={k_rate*100:.1f}% -> P: {prec_k:.3f} | R: {rec_k:.3f} | F1: {f1_k:.3f}  (k={k})")

# Agregado por MMSI
if 'groups_eval' in globals() and groups_eval is not None:
    mmsi_all, n_by_mmsi = np.unique(groups_eval, return_counts=True)
    mmsi_top, n_top_by_mmsi = np.unique(groups_eval[pred_topk==1], return_counts=True)
    top_map = dict(zip(mmsi_top.tolist(), n_top_by_mmsi.tolist()))
    anom_win = np.array([top_map.get(m, 0) for m in mmsi_all], dtype=np.int32)
    anom_rate = anom_win / n_by_mmsi
    agg_df = pd.DataFrame({"mmsi": mmsi_all, "n_win": n_by_mmsi, "anom_win": anom_win, "anom_rate": anom_rate})
    agg_path = os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_mmsi_agg.parquet")
    agg_df.to_parquet(agg_path, index=False)
    print("Saved MMSI agg:", agg_path)
    display(agg_df.sort_values("anom_rate", ascending=False).head(10))

In [2]:
# --- Recuperación: re-scorear sin re-entrenar ---
import os, numpy as np, pickle, json
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

OUT = Path(CFG["out_dir"])
scores_mm_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"

assert 'X_eval' in globals() and 'y_eval' in globals(), "Re-ejecuta la Celda 4 (Carga) antes de recuperar."

# Cargar modelo + preprocesamiento
with open(OUT / f"{CFG['artifact_prefix']}_model.pkl", "rb") as f:
    final_model = pickle.load(f)
medians = np.load(OUT / f"{CFG['artifact_prefix']}_imputer_medians.npy")
scaler = np.load(OUT / f"{CFG['artifact_prefix']}_scaler_params.npz")
mean, std = scaler["mean"], scaler["std"]; std = std.copy(); std[std==0]=1.0

def impute_inplace(X, med):
    bad = ~np.isfinite(X)
    if bad.any(): X[bad] = np.take(med, np.where(bad)[1])
def standardize_inplace(X, m, s):
    X -= m; X /= s
def transform_eval_in_batches_recover(X, batch_size=CFG["eval_batch_size"]):
    n = X.shape[0]
    for s in range(0, n, batch_size):
        e = min(s + batch_size, n)
        Xe = X[s:e].astype(np.float32, copy=False)
        Xe[~np.isfinite(Xe)] = np.nan
        impute_inplace(Xe, medians); standardize_inplace(Xe, mean, std)
        yield s, e, Xe

# Memmap limpio si quedó incompleto
if scores_mm_path.exists():
    exp_bytes = X_eval.shape[0] * 4
    if scores_mm_path.stat().st_size != exp_bytes:
        print("[WARN] memmap incompleto -> borrando.")
        scores_mm_path.unlink()

scores_mm = np.memmap(scores_mm_path, dtype=np.float32, mode="w+", shape=(X_eval.shape[0],))
for s, e, Xe_sc in transform_eval_in_batches_recover(X_eval):
    scores_mm[s:e] = -final_model.decision_function(Xe_sc)

scores = scores_mm; yb = y_eval.astype(int)
if len(np.unique(yb)) > 1:
    roc = roc_auc_score(yb, scores)
    from sklearn.metrics import precision_recall_curve, auc, average_precision_score
    prec, rec, _ = precision_recall_curve(yb, scores); pr_auc = auc(rec, prec)
    ap = average_precision_score(yb, scores)
else:
    roc = pr_auc = ap = np.nan
print(f"[RECOVER] ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | AP: {ap:.4f}")
print("OK ->", scores_mm_path)

NameError: name 'CFG' is not defined