# OC-SVM · AIS Anomaly Detection (Lightning, 110 GB RAM)
- Lectura **robusta** de parquets (auto-resuelve ruta; valida formato Parquet).
- Preprocesamiento **memory-safe** (imputación + estándar in-place).
- HP search **paralela** (joblib) y entrenamiento OC-SVM con **cache grande**.
- Evaluación **reanudable** con memmap + progreso incremental.
- Artefactos y resultados en `./data/ocsvm_runs` (seguro en este proyecto).

In [1]:
# --- Diagnóstico del entorno ---
import os, sys, getpass, socket
from pathlib import Path

print("User:   ", getpass.getuser())
print("Host:   ", socket.gethostname())
print("Python: ", sys.executable)
print("CWD:    ", os.getcwd())
print("/data exists?:", Path("/data").exists())
if Path("/data").exists():
    print("#parquets en /data:", len(list(Path("/data").glob("*.parquet"))))

User:    erickdsuarez10
Host:    computeinstance-e00exnkvr257g0k5f5
Python:  /home/zeus/miniconda3/envs/cloudspace/bin/python
CWD:     /teamspace/studios/this_studio
/data exists?: False


In [2]:
# Config + auto-resolución de fuente de lectura (solo-lectura) y salidas locales
import os, json
from pathlib import Path

def count_parquets(p: Path) -> int:
    try:
        return len(list(p.glob("*.parquet"))) if p.exists() else 0
    except Exception:
        return 0

CANDIDATES = [
    Path("/data").resolve(),
    Path("/teamspace/studios/this_studio/data").resolve(),
    Path("./data").resolve(),
]

EXTERNAL_DATA_DIR = None
for cand in CANDIDATES:
    if cand.exists() and count_parquets(cand) > 0:
        EXTERNAL_DATA_DIR = cand
        break
if EXTERNAL_DATA_DIR is None:
    # fallback: usa data local aunque esté vacía (para que no falle la carga y puedas copiar ahí)
    EXTERNAL_DATA_DIR = Path("/teamspace/studios/this_studio/data").resolve()
    print("⚠️ No hallé parquets; usando ruta local del Studio por defecto:", EXTERNAL_DATA_DIR)

OUT_DIR = Path("data/ocsvm_runs").resolve()
OUT_DIR.mkdir(parents=True, exist_ok=True)

CFG = {
    "external_data_dir": str(EXTERNAL_DATA_DIR),
    "out_dir": str(OUT_DIR),
    "artifact_prefix": "ocsvm_rbf",
    # HP + límites
    "svm_nu_grid": [0.01, 0.05, 0.1],
    "svm_gamma_grid": ["scale", 0.01],
    "kernel": "rbf",
    "kfold_splits": 5,
    "max_train_samples": 800_000,     # ajustados para 110 GB RAM
    "max_search_samples": 400_000,
    "eval_batch_size": 2_000_000,     # lotes grandes para acelerar evaluación
}

print("LECTURA  :", CFG["external_data_dir"])
print("SALIDAS  :", CFG["out_dir"])
print("#parquets:", count_parquets(Path(CFG["external_data_dir"])))
print("CFG:", json.dumps({k: CFG[k] for k in CFG}, indent=2))

LECTURA  : /teamspace/studios/this_studio/data
SALIDAS  : /teamspace/studios/this_studio/data/ocsvm_runs
#parquets: 17
CFG: {
  "external_data_dir": "/teamspace/studios/this_studio/data",
  "out_dir": "/teamspace/studios/this_studio/data/ocsvm_runs",
  "artifact_prefix": "ocsvm_rbf",
  "svm_nu_grid": [
    0.01,
    0.05,
    0.1
  ],
  "svm_gamma_grid": [
    "scale",
    0.01
  ],
  "kernel": "rbf",
  "kfold_splits": 5,
  "max_train_samples": 800000,
  "max_search_samples": 400000,
  "eval_batch_size": 2000000
}


In [3]:
# Performance pack (usa todos los cores menos 1)
import os, psutil

N_JOBS = max(1, os.cpu_count() - 1)
os.environ["OMP_NUM_THREADS"] = str(N_JOBS)
os.environ["OPENBLAS_NUM_THREADS"] = str(N_JOBS)
os.environ["MKL_NUM_THREADS"] = str(N_JOBS)
os.environ["NUMEXPR_NUM_THREADS"] = str(N_JOBS)

print(f"Cores: {os.cpu_count()} -> N_JOBS={N_JOBS}")
print(f"RAM disponible: {psutil.virtual_memory().available/1e9:.1f} GB")

Cores: 32 -> N_JOBS=31
RAM disponible: 129.0 GB


In [4]:
# --- Performance pack (RAM 110 GB + todos los cores) ---
import os, multiprocessing, psutil

# Usa (n_cores - 1) para no saturar el sistema
N_JOBS = max(1, os.cpu_count() - 1)
os.environ["OMP_NUM_THREADS"] = str(N_JOBS)
os.environ["OPENBLAS_NUM_THREADS"] = str(N_JOBS)
os.environ["MKL_NUM_THREADS"] = str(N_JOBS)
os.environ["NUMEXPR_NUM_THREADS"] = str(N_JOBS)

print(f"Cores visibles: {os.cpu_count()} | N_JOBS={N_JOBS}")
print(f"RAM disponible aprox: {psutil.virtual_memory().available/1e9:.1f} GB")

# Recomendaciones de tamaño con 110 GB RAM (float32 => ~76 bytes/fila si F=19)
# Ajusta tu CFG después de ejecutar esta celda:
CFG["max_train_samples"]  = 800_000     # si tu entrenamiento tarda demasiado, bájalo a 500k
CFG["max_search_samples"] = 400_000
CFG["eval_batch_size"]    = 2_000_000   # sube el lote de evaluación (menos overhead)

# Guardar
print("CFG actualizado: ",
      {k: CFG[k] for k in ["max_train_samples","max_search_samples","eval_batch_size"]})

Cores visibles: 32 | N_JOBS=31
RAM disponible aprox: 129.0 GB
CFG actualizado:  {'max_train_samples': 800000, 'max_search_samples': 400000, 'eval_batch_size': 2000000}


In [5]:
# Carga robusta desde CFG["external_data_dir"] (con validación Parquet)
import numpy as np, pandas as pd, gc
from pathlib import Path
import pyarrow.parquet as pq

DATA_DIR = Path(CFG["external_data_dir"])
print("DATA_DIR:", DATA_DIR)

def is_valid_parquet(path: Path) -> bool:
    try:
        pq.ParquetFile(path); return True
    except Exception as e:
        print(f"[WARN] No Parquet válido -> {path.name} :: {type(e).__name__}")
        return False

def read_parquet_min(path: Path) -> pd.DataFrame:
    df = pd.read_parquet(path, engine="pyarrow")
    for c in df.columns:
        if pd.api.types.is_float_dtype(df[c]): df[c] = df[c].astype(np.float32)
        elif pd.api.types.is_integer_dtype(df[c]) and df[c].max() <= np.iinfo(np.int32).max:
            df[c] = df[c].astype(np.int32)
    return df

def detect_label_col(df):
    for k in ["y","label","is_suspicious","target"]:
        if k in df.columns: return k
    return None

def detect_group_col(df):
    for k in ["mmsi","group","ship_id"]:
        if k in df.columns: return k
    return None

def pick_valid(base: Path, *candidates):
    # exactos primero
    for n in candidates:
        if "*" not in n and "?" not in n and "[" not in n:
            p = base / n
            if p.exists() and is_valid_parquet(p): return p
    # luego patrones (elige el mayor por tamaño que sea válido)
    for patt in candidates:
        if any(ch in patt for ch in "*?[]"):
            matches = list(base.glob(patt))
            matches.sort(key=lambda x: x.stat().st_size if x.exists() else 0, reverse=True)
            for m in matches:
                if is_valid_parquet(m): 
                    print(f"[pick_valid] → {m.name} (pattern {patt})"); 
                    return m
    return None

# --- TRAIN (ventanas normales) ---
train_path = pick_valid(
    DATA_DIR,
    "windows_aligned_normal.parquet",
    "norm_windows_flat.parquet",
    "ais_norm_windows.parquet",
    "*windows_aligned_norm*.parquet",
    "*norm*windows*.parquet",
    "*ais_norm_windows*.parquet",
)
if train_path is None:
    raise FileNotFoundError("No encontré TRAIN normal válido en la ruta de datos.")
df_tr = read_parquet_min(train_path)

ycol_tr = detect_label_col(df_tr)
gcol_tr = detect_group_col(df_tr)
drop_common = {"lat","lon","idx","idx_end","window_id"}
drop_train = set([c for c in [ycol_tr, gcol_tr] if c]) | drop_common
feat_tr = [c for c in df_tr.columns if c not in drop_train]
X_train = df_tr[feat_tr].to_numpy(dtype=np.float32)
groups_train = df_tr[gcol_tr].to_numpy() if gcol_tr else None
print("TRAIN ->", train_path.name, "| X_train:", X_train.shape)

# --- EVAL (único o split) ---
eval_single = pick_valid(DATA_DIR,
    "windows_with_labels_aligned.parquet", "*windows_with_labels_aligned*.parquet",
    "eval_windows_aligned.parquet", "*eval_windows_aligned*.parquet",
    "windows_with_labels.parquet", "*windows_with_labels*.parquet",
)
eval_wl_norm = pick_valid(DATA_DIR, "windows_with_labels_aligned_normal.parquet",
                          "*windows_with_labels_aligned_norm*.parquet", "*windows_with_labels_aligned_normal*.parquet")
eval_wl_anom = pick_valid(DATA_DIR, "windows_with_labels_aligned_anom.parquet",
                          "*windows_with_labels_aligned_anom*.parquet")
eval_norm    = pick_valid(DATA_DIR, "eval_windows_aligned_normal.parquet",
                          "*eval_windows_aligned_norm*.parquet", "*eval_windows_aligned_normal*.parquet")
eval_anom    = pick_valid(DATA_DIR, "eval_windows_aligned_anom.parquet",
                          "*eval_windows_aligned_anom*.parquet")
labels_any   = pick_valid(DATA_DIR, "eval_labels_aligned.parquet", "*eval_labels_aligned*.parquet",
                          "labels.parquet", "labels_anom.parquet", "*labels*.parquet")

if eval_single is not None:
    df_ev = read_parquet_min(eval_single)
    ycol_ev = detect_label_col(df_ev)
    gcol_ev = detect_group_col(df_ev)
    if ycol_ev is None:
        if labels_any is None: raise FileNotFoundError("Eval único sin etiquetas y no hay archivo de labels.")
        df_y = read_parquet_min(labels_any)
        ycol_y = detect_label_col(df_y) or df_y.select_dtypes(include=["int8","int16","int32"]).columns[-1]
        if len(df_y) != len(df_ev): raise ValueError(f"Desalineación eval vs labels: {len(df_ev)} vs {len(df_y)}")
        drop_eval = set([gcol_ev]) | drop_common
        feat_ev = [c for c in df_ev.columns if c not in drop_eval]
        X_eval = df_ev[feat_ev].to_numpy(dtype=np.float32)
        y_eval = df_y[ycol_y].astype(np.int8).to_numpy()
    else:
        drop_eval = set([ycol_ev, gcol_ev]) | drop_common
        feat_ev = [c for c in df_ev.columns if c not in drop_eval]
        X_eval = df_ev[feat_ev].to_numpy(dtype=np.float32)
        y_eval = df_ev[ycol_ev].astype(np.int8).to_numpy()
    groups_eval = df_ev[gcol_ev].to_numpy() if gcol_ev else None
    print("EVAL ->", eval_single.name, "| X_eval:", X_eval.shape, "| y_eval:", y_eval.shape)

elif eval_wl_norm is not None and eval_wl_anom is not None:
    dn, da = read_parquet_min(eval_wl_norm), read_parquet_min(eval_wl_anom)
    common = [c for c in dn.columns if c in da.columns]
    dn, da = dn[common], da[common]
    ycol_ev, gcol_ev = detect_label_col(dn), detect_group_col(dn)
    drop_eval = set([ycol_ev, gcol_ev]) | drop_common
    feat_ev = [c for c in common if c not in drop_eval]
    X_eval = pd.concat([dn[feat_ev], da[feat_ev]], ignore_index=True).to_numpy(np.float32)
    y_eval = pd.concat([dn[ycol_ev], da[ycol_ev]], ignore_index=True).astype(np.int8).to_numpy()
    groups_eval = (pd.concat([dn[gcol_ev], da[gcol_ev]], ignore_index=True).to_numpy() if gcol_ev else None)
    print("EVAL ->", eval_wl_norm.name, "+", eval_wl_anom.name, "| X_eval:", X_eval.shape, "| y_eval:", y_eval.shape)

elif eval_norm is not None and eval_anom is not None:
    if labels_any is None: raise FileNotFoundError("Eval split sin labels embebidas y no hay archivo de labels.")
    dn, da, dy = read_parquet_min(eval_norm), read_parquet_min(eval_anom), read_parquet_min(labels_any)
    common = [c for c in dn.columns if c in da.columns]
    dn, da = dn[common], da[common]
    gcol_ev = detect_group_col(dn)
    drop_eval = set([gcol_ev]) | drop_common
    feat_ev = [c for c in common if c not in drop_eval]
    df_concat = pd.concat([dn[feat_ev], da[feat_ev]], ignore_index=True)
    X_eval = df_concat.to_numpy(np.float32)
    ycol_y = detect_label_col(dy) or dy.select_dtypes(include=["int8","int16","int32"]).columns[-1]
    if len(dy) != len(df_concat): raise ValueError(f"Desalineación eval concat vs labels: {len(df_concat)} vs {len(dy)}")
    y_eval = dy[ycol_y].astype(np.int8).to_numpy()
    groups_eval = (pd.concat([dn[gcol_ev], da[gcol_ev]], ignore_index=True).to_numpy() if gcol_ev else None)
    print("EVAL ->", eval_norm.name, "+", eval_anom.name, "| X_eval:", X_eval.shape, "| y_eval:", y_eval.shape)

else:
    raise FileNotFoundError("No se pudo resolver un set de EVAL válido.")

del df_tr; gc.collect()
print("Train -> X:", X_train.shape, "| groups:", None if groups_train is None else len(groups_train))
print("Eval  -> X:", X_eval.shape,  "| y:", y_eval.shape, "| groups:", None if groups_eval is None else len(groups_eval))
print("N feats: train", X_train.shape[1], "| eval", X_eval.shape[1])

DATA_DIR: /teamspace/studios/this_studio/data
TRAIN -> windows_aligned_normal.parquet | X_train: (27789660, 19)
[pick_valid] → windows_with_labels_aligned_norm.parquet (pattern *windows_with_labels_aligned_norm*.parquet)
[pick_valid] → eval_windows_aligned_norm.parquet (pattern *eval_windows_aligned_norm*.parquet)
EVAL -> windows_with_labels_aligned.parquet | X_eval: (27789660, 19) | y_eval: (27789660,)
Train -> X: (27789660, 19) | groups: 27789660
Eval  -> X: (27789660, 19) | y: (27789660,) | groups: 27789660
N feats: train 19 | eval 19


In [6]:
# Imputación + estándar + muestreo (in-place, memory-safe)
import numpy as np, os, gc

def sample_by_group(n_max, X, groups):
    if (n_max is None) or (X.shape[0] <= n_max):
        idx = np.arange(X.shape[0]); return X, (groups if groups is not None else None), idx
    rng = np.random.default_rng(42)
    if groups is None:
        idx = rng.choice(X.shape[0], n_max, replace=False); return X[idx], None, idx
    uniq = np.unique(groups); per_g = max(1, n_max // len(uniq)); take = []
    for g in uniq:
        g_idx = np.where(groups == g)[0]
        take.extend(rng.choice(g_idx, min(per_g, g_idx.size), replace=False).tolist())
    take = np.array(take)
    if take.size > n_max: take = rng.choice(take, n_max, replace=False)
    return X[take], groups[take], take

def colwise_nanmedian(X):
    Xc = X.copy(); Xc[~np.isfinite(Xc)] = np.nan
    med = np.nanmedian(Xc, axis=0)
    med = np.where(np.isfinite(med), med, 0.0).astype(np.float32)
    return med

def impute_inplace(X, medians):
    bad = ~np.isfinite(X)
    if bad.any():
        cols = np.where(bad)[1]
        X[bad] = medians[cols]

def fit_standardizer(X):
    mean = X.mean(axis=0).astype(np.float32)
    var  = X.var(axis=0).astype(np.float32)
    std  = np.sqrt(var, dtype=np.float32); std[std == 0.0] = 1.0
    return mean, std

def apply_standardizer_inplace(X, mean, std):
    X -= mean; X /= std

X_train_s, groups_train_s, _ = sample_by_group(CFG["max_train_samples"], X_train, groups_train)
print("Train sampled:", X_train_s.shape)

X_train_s = X_train_s.astype(np.float32, copy=False)
X_train_s[~np.isfinite(X_train_s)] = np.nan
train_medians = colwise_nanmedian(X_train_s)
impute_inplace(X_train_s, train_medians)
train_mean, train_std = fit_standardizer(X_train_s)
apply_standardizer_inplace(X_train_s, train_mean, train_std)

X_train_sc = X_train_s
groups_train = groups_train_s

# Guardar preprocesamiento
np.save(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_imputer_medians.npy"), train_medians)
np.savez(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_scaler_params.npz"), mean=train_mean, std=train_std)

# Transformador por lotes para eval
def transform_eval_in_batches(X, batch_size=CFG["eval_batch_size"]):
    n = X.shape[0]
    for s in range(0, n, batch_size):
        e = min(s + batch_size, n)
        Xe = X[s:e].astype(np.float32, copy=False)
        Xe[~np.isfinite(Xe)] = np.nan
        impute_inplace(Xe, train_medians)
        apply_standardizer_inplace(Xe, train_mean, train_std)
        yield s, e, Xe

gc.collect()
print("Scaled train:", X_train_sc.shape)

Train sampled: (799920, 19)
Scaled train: (799920, 19)


In [7]:
# HP search paralela (nu, gamma) minimizando |outlier_rate - 5%|
import numpy as np, pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import GroupKFold, KFold, ParameterGrid
from sklearn.svm import OneClassSVM

param_grid = list(ParameterGrid({"nu": CFG["svm_nu_grid"], "gamma": CFG["svm_gamma_grid"]}))
target_outlier_rate = 0.05

def build_search_subset(X, groups, n_max):
    if (n_max is None) or (X.shape[0] <= n_max): return X, groups
    rng = np.random.default_rng(123)
    if groups is None:
        idx = rng.choice(X.shape[0], n_max, replace=False); return X[idx], None
    uniq = np.unique(groups); per_g = max(1, n_max // len(uniq)); take = []
    for g in uniq:
        g_idx = np.where(groups == g)[0]
        take.extend(rng.choice(g_idx, min(per_g, g_idx.size), replace=False).tolist())
    take = np.array(take)
    if take.size > n_max: take = rng.choice(take, n_max, replace=False)
    return X[take], groups[take]

X_search, groups_search = build_search_subset(X_train_sc, groups_train, CFG["max_search_samples"])

if (groups_search is not None) and (len(np.unique(groups_search)) >= 2):
    n_splits = min(CFG["kfold_splits"], len(np.unique(groups_search)))
    splitter = GroupKFold(n_splits=n_splits); split_args = dict(X=X_search, y=None, groups=groups_search)
else:
    n_splits = max(2, CFG["kfold_splits"])
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42); split_args = dict(X=X_search, y=None)

def outlier_rate(pred): return float((pred == -1).mean())

def eval_param(p):
    rates = []
    for tr_idx, va_idx in splitter.split(**split_args):
        Xtr, Xva = X_search[tr_idx], X_search[va_idx]
        m = OneClassSVM(kernel=CFG["kernel"], nu=p["nu"], gamma=p["gamma"],
                        cache_size=2048, tol=1e-3, shrinking=True)
        m.fit(Xtr)
        rates.append(outlier_rate(m.predict(Xva)))
    rate_mean, rate_std = float(np.mean(rates)), float(np.std(rates))
    obj = abs(rate_mean - target_outlier_rate) + rate_std
    return {"params": p, "rate_mean": rate_mean, "rate_std": rate_std, "obj": obj}

rows = Parallel(n_jobs=N_JOBS, prefer="processes", verbose=5)(
    delayed(eval_param)(p) for p in param_grid
)

res_df = pd.DataFrame(rows).sort_values("obj")
best_cfg = res_df.iloc[0]["params"]
display(res_df.head(10))
print("Best params:", best_cfg, "| splits:", n_splits, "| search_subset:", X_search.shape)

[Parallel(n_jobs=31)]: Using backend LokyBackend with 31 concurrent workers.
[Parallel(n_jobs=31)]: Done   3 out of   6 | elapsed: 72.9min remaining: 72.9min
[Parallel(n_jobs=31)]: Done   6 out of   6 | elapsed: 124.8min finished


Unnamed: 0,params,rate_mean,rate_std,obj
4,"{'gamma': 0.01, 'nu': 0.05}",0.051925,0.007809,0.009734
1,"{'gamma': 'scale', 'nu': 0.05}",0.056986,0.015195,0.022181
3,"{'gamma': 0.01, 'nu': 0.01}",0.010304,0.001774,0.04147
0,"{'gamma': 'scale', 'nu': 0.01}",0.015864,0.010365,0.044501
5,"{'gamma': 0.01, 'nu': 0.1}",0.103243,0.018722,0.071965
2,"{'gamma': 'scale', 'nu': 0.1}",0.106106,0.020934,0.077039


Best params: {'gamma': 0.01, 'nu': 0.05} | splits: 5 | search_subset: (399960, 19)


In [8]:
# Entrenamiento final OC-SVM (cache grande)
from sklearn.svm import OneClassSVM

if "best_cfg" not in globals() or not best_cfg:  # fallback
    best_cfg = {"nu": 0.05, "gamma": "scale"}

final_model = OneClassSVM(
    kernel=CFG.get("kernel","rbf"),
    nu=best_cfg["nu"],
    gamma=best_cfg["gamma"],
    cache_size=8192,   # 8 GB cache para libSVM
    tol=1e-3,
    shrinking=True
)
final_model.fit(X_train_sc)
print("Final model trained. Params:", best_cfg)

Final model trained. Params: {'gamma': 0.01, 'nu': 0.05}


In [9]:
# Evaluación por lotes con REANUDACIÓN + memmap (rápida)
import os, json, time, pickle, numpy as np
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

OUT = Path(CFG["out_dir"]); OUT.mkdir(parents=True, exist_ok=True)
n_eval = X_eval.shape[0]
scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"
progress_path = OUT / f"{CFG['artifact_prefix']}_progress.json"
expected_bytes = n_eval * 4

# memmap correcto o recrea
if scores_path.exists() and scores_path.stat().st_size != expected_bytes:
    print(f"[WARN] Memmap {scores_path.stat().st_size} != {expected_bytes} -> recreando.")
    scores_path.unlink()

mode = "r+" if scores_path.exists() else "w+"
scores_mm = np.memmap(scores_path, dtype=np.float32, mode=mode, shape=(n_eval,))
if mode == "w+":
    scores_mm[:] = np.nan
    scores_mm.flush()

# cargar preprocesamiento por si el kernel cambió
medians = np.load(OUT / f"{CFG['artifact_prefix']}_imputer_medians.npy")
sp = np.load(OUT / f"{CFG['artifact_prefix']}_scaler_params.npz")
train_mean, train_std = sp["mean"], sp["std"]
train_std = train_std.copy(); train_std[train_std==0]=1.0

def impute_inplace(X, med):
    bad = ~np.isfinite(X)
    if bad.any(): X[bad] = med[np.where(bad)[1]]
def standardize_inplace(X, m, s):
    X -= m; X /= s

# reanudación
start = 0
if progress_path.exists():
    try: start = int(json.loads(progress_path.read_text()).get("next_start", 0))
    except: start = 0
if start <= 0:
    nan_mask = np.isnan(scores_mm)
    start = int(np.argmax(nan_mask)) if nan_mask.any() else n_eval
print(f"[RESUME] next_start = {start:,}/{n_eval:,}")

bs = int(CFG.get("eval_batch_size", 2_000_000))
t0 = time.time(); last = t0

try:
    for s in range(start, n_eval, bs):
        e = min(s + bs, n_eval)
        Xe = X_eval[s:e].astype(np.float32, copy=False)
        Xe[~np.isfinite(Xe)] = np.nan
        impute_inplace(Xe, medians)
        standardize_inplace(Xe, train_mean, train_std)

        scores_mm[s:e] = -final_model.decision_function(Xe)
        scores_mm.flush()
        progress_path.write_text(json.dumps({"next_start": e}))

        now = time.time()
        if now - last > 10:
            rate = (e - start) / max(1e-6, (now - t0))
            print(f"Progress: {100*e/n_eval:5.2f}% | {e:,}/{n_eval:,} | ~{rate:,.0f} rows/s")
            last = now
except KeyboardInterrupt:
    scores_mm.flush(); progress_path.write_text(json.dumps({"next_start": e})); print("\n[INTERRUPTED] Guardado.")
    raise

scores = scores_mm
yb = y_eval.astype(int)
if len(np.unique(yb)) > 1:
    roc = roc_auc_score(yb, scores)
    prec, rec, _ = precision_recall_curve(yb, scores); pr_auc = auc(rec, prec)
    ap = average_precision_score(yb, scores)
else:
    roc = pr_auc = ap = np.nan

print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | AP: {ap:.4f}")
print("Scores memmap:", str(scores_path))

# Guardar modelo + config + métricas
with open(OUT / f"{CFG['artifact_prefix']}_model.pkl", "wb") as f:
    pickle.dump(final_model, f)
with open(OUT / f"{CFG['artifact_prefix']}_config.json", "w") as f:
    json.dump(CFG | {"best_params": best_cfg,
                     "metrics": {"roc_auc": float(roc), "pr_auc": float(pr_auc), "ap": float(ap)}},
              f, indent=2)

[RESUME] next_start = 27,789,660/27,789,660
ROC-AUC: nan | PR-AUC: nan | AP: nan
Scores memmap: /teamspace/studios/this_studio/data/ocsvm_runs/ocsvm_rbf_eval_scores_mm.dat


In [10]:
# Top-K y agregados por MMSI
import numpy as np, pandas as pd, os

scores = np.memmap(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_eval_scores_mm.dat"),
                   dtype=np.float32, mode="r", shape=(X_eval.shape[0],))

k_rate = 0.01
k = max(1, int(len(scores) * k_rate))
thr_k = np.partition(scores, -k)[-k]
pred_topk = (scores >= thr_k).astype(np.int8)

topk_idx = np.where(pred_topk == 1)[0]
topk_df = pd.DataFrame({
    "idx": topk_idx.astype(np.int64),
    "anomaly_score": scores[topk_idx].astype(np.float32),
    "y_eval": y_eval[topk_idx].astype(np.int8)
})
if 'groups_eval' in globals() and groups_eval is not None:
    topk_df["mmsi"] = groups_eval[topk_idx].astype(np.int64)

topk_path = os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_topk_{int(k_rate*100)}pct.parquet")
topk_df.to_parquet(topk_path, index=False)
print("Saved TOP-K:", topk_path, "| rows:", len(topk_df))

# Métricas @k
yb = y_eval.astype(int)
tp = int(((pred_topk==1) & (yb==1)).sum())
fp = int(((pred_topk==1) & (yb==0)).sum())
fn = int(((pred_topk==0) & (yb==1)).sum())
prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0
print(f"@k={k_rate*100:.1f}% -> P:{prec_k:.3f} | R:{rec_k:.3f} | F1:{f1_k:.3f}  (k={k})")

# Agregado por MMSI
if 'groups_eval' in globals() and groups_eval is not None:
    mmsi_all, n_by_mmsi = np.unique(groups_eval, return_counts=True)
    mmsi_top, n_top_by_mmsi = np.unique(groups_eval[pred_topk==1], return_counts=True)
    top_map = dict(zip(mmsi_top.tolist(), n_top_by_mmsi.tolist()))
    anom_win = np.array([top_map.get(m, 0) for m in mmsi_all], dtype=np.int32)
    anom_rate = anom_win / n_by_mmsi
    agg_df = pd.DataFrame({"mmsi": mmsi_all, "n_win": n_by_mmsi, "anom_win": anom_win, "anom_rate": anom_rate})
    agg_path = os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_mmsi_agg.parquet")
    agg_df.to_parquet(agg_path, index=False)
    print("Saved MMSI agg:", agg_path)
    display(agg_df.sort_values("anom_rate", ascending=False).head(10))

Saved TOP-K: /teamspace/studios/this_studio/data/ocsvm_runs/ocsvm_rbf_topk_1pct.parquet | rows: 277896
@k=1.0% -> P:0.000 | R:0.000 | F1:0.000  (k=277896)
Saved MMSI agg: /teamspace/studios/this_studio/data/ocsvm_runs/ocsvm_rbf_mmsi_agg.parquet


Unnamed: 0,mmsi,n_win,anom_win,anom_rate
12,33266086194351,428760,34116,0.079569
19,49534994750419,374280,18958,0.050652
28,77832927010710,423960,20276,0.047825
33,87919276942456,567520,25836,0.045524
40,100771710683634,32440,1381,0.042571
1,12639560807591,23520,924,0.039286
42,103576446797335,148920,5618,0.037725
27,77182424306278,169660,5875,0.034628
7,23770783250938,473600,14403,0.030412
37,95062718521348,169480,4468,0.026363


In [None]:
# --- Métricas finales resumen (para informe) ---
import json, numpy as np, os, pandas as pd
from pathlib import Path

OUT = Path(CFG["out_dir"])
cfg_path = OUT / f"{CFG['artifact_prefix']}_config.json"
scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"

if not cfg_path.exists():
    raise FileNotFoundError("No se encontró el archivo de configuración con métricas guardadas.")

# Cargar configuración y métricas
with open(cfg_path, "r") as f:
    cfg_data = json.load(f)

metrics = cfg_data.get("metrics", {})
roc_auc = metrics.get("roc_auc", np.nan)
pr_auc  = metrics.get("pr_auc", np.nan)
ap      = metrics.get("ap", np.nan)
best_params = cfg_data.get("best_params", {})

print("📊 === Métricas finales OC-SVM ===")
print(f"ROC-AUC : {roc_auc:.4f}")
print(f"PR-AUC  : {pr_auc:.4f}")
print(f"AP Score: {ap:.4f}")
print()
print("🔧 Mejor configuración encontrada:")
print(json.dumps(best_params, indent=2))

# Resumen para tabla del informe
summary_df = pd.DataFrame([{
    "Modelo": "One-Class SVM (RBF)",
    "Kernel": cfg_data.get("kernel", "rbf"),
    "nu": best_params.get("nu"),
    "gamma": best_params.get("gamma"),
    "ROC-AUC": roc_auc,
    "PR-AUC": pr_auc,
    "AP": ap,
    "N ventanas (train)": int(cfg_data.get("max_train_samples", 0)),
    "N ventanas (eval)": int(cfg_data.get("max_search_samples", 0))
}])
display(summary_df.style.format({"ROC-AUC": "{:.4f}", "PR-AUC": "{:.4f}", "AP": "{:.4f}"}))

📊 === Métricas finales OC-SVM ===
ROC-AUC : nan
PR-AUC  : nan
AP Score: nan

🔧 Mejor configuración encontrada:
{
  "gamma": 0.01,
  "nu": 0.05
}


Unnamed: 0,Modelo,Kernel,nu,gamma,ROC-AUC,PR-AUC,AP,N ventanas (train),N ventanas (eval)
0,One-Class SVM (RBF),rbf,0.05,0.01,,,,800000,400000


In [1]:
# --- Diagnóstico de etiquetas y recálculo de métricas @k con mapeo robusto ---

import numpy as np, pandas as pd, os, json
from pathlib import Path

def map_labels_to_binary(y):
    """Devuelve (yb, info) con yb en {0,1} donde 1=anómalo."""
    vals = np.unique(y)
    s = set(vals.tolist())
    info = {"original_values": vals.tolist(), "mapping": None}
    # Caso estándar
    if s == {0, 1}:
        info["mapping"] = "0=normal, 1=anómalo (sin cambio)"
        return y.astype(int), info
    # Muy común en detección: -1 anómalo, +1 normal
    if s == {-1, 1}:
        info["mapping"] = "-1=anómalo, +1=normal -> mapeado a {0,1}"
        return (y == -1).astype(int), info
    # A veces hay {0,1,-1}; asumimos 1=anómalo, 0=normal, -1=desconocido -> lo tratamos como 0 (conservador)
    if s == {0, 1, -1}:
        info["mapping"] = "1=anómalo, 0=normal, -1=desconocido -> mapeado con -1->0"
        y2 = y.copy()
        y2[y2 == -1] = 0
        return y2.astype(int), info
    # Fallback conservador: cualquier valor >0 lo consideramos anómalo
    info["mapping"] = f"Fallback: valores {sorted(s)} -> (y>0) como anómalo"
    return (y > 0).astype(int), info

# 1) Mapear etiquetas a binario (1=anómalo)
yb, map_info = map_labels_to_binary(y_eval)
print("Distribución original:", np.unique(y_eval, return_counts=True))
print("Distribución mapeada :", np.unique(yb, return_counts=True))
print("Mapping usado:", map_info["mapping"])

# 2) Recalcular métricas @k con el mapeo correcto
scores = np.memmap(os.path.join(CFG["out_dir"], f"{CFG['artifact_prefix']}_eval_scores_mm.dat"),
                   dtype=np.float32, mode="r", shape=(X_eval.shape[0],))

k_rate = 0.01
k = max(1, int(len(scores) * k_rate))
thr_k = np.partition(scores, -k)[-k]
pred_topk = (scores >= thr_k).astype(np.int8)

tp = int(((pred_topk==1) & (yb==1)).sum())
fp = int(((pred_topk==1) & (yb==0)).sum())
fn = int(((pred_topk==0) & (yb==1)).sum())
prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0

print(f"@k={k_rate*100:.1f}% -> P:{prec_k:.4f} | R:{rec_k:.4f} | F1:{f1_k:.4f}  (k={k})")

# 3) Guardar métricas @k junto a las globales para el informe
cfg_path = Path(CFG["out_dir"]) / f"{CFG['artifact_prefix']}_config.json"
if cfg_path.exists():
    cfg = json.loads(cfg_path.read_text())
else:
    cfg = {"metrics": {}}
cfg.setdefault("metrics_at_k", {})[str(k_rate)] = {
    "k": int(k),
    "precision": float(prec_k),
    "recall": float(rec_k),
    "f1": float(f1_k),
    "label_mapping": map_info["mapping"],
    "label_values_original": map_info["original_values"],
}
cfg_path.write_text(json.dumps(cfg, indent=2))
print("📁 Métricas @k añadidas a:", cfg_path)

NameError: name 'y_eval' is not defined

In [2]:
# === RECUPERAR MÉTRICAS DESDE ARTEFACTOS (sin re-entrenar) ===
import os, json, numpy as np, pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

OUT = Path("data/ocsvm_runs")
cfg_path = OUT / "ocsvm_rbf_config.json"
assert cfg_path.exists(), "No encuentro data/ocsvm_runs/ocsvm_rbf_config.json"

# 1) Cargar CFG guardado y paths básicos
CFG = json.loads(cfg_path.read_text())
DATA_DIR = Path(CFG.get("external_data_dir", "/teamspace/studios/this_studio/data")).resolve()
scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"
assert scores_path.exists(), f"No existe memmap de scores: {scores_path}"

# 2) Cargar scores desde memmap (inferimos N por tamaño)
n_eval = scores_path.stat().st_size // 4  # float32
scores = np.memmap(scores_path, dtype=np.float32, mode="r", shape=(n_eval,))
print(f"Scores cargados: {scores.shape}")

# 3) Encontrar archivo(s) de etiquetas y leer SOLO la columna de label
def first_valid_label_col(pqfile: pq.ParquetFile, prefer=("is_suspicious","label","y","target")):
    cols = [name for name in pqfile.schema.names]
    for c in prefer:
        if c in cols: return c
    # fallback: primera columna entera corta
    for c in cols:
        t = pqfile.schema.field(c).type
        if str(t).startswith(("int8","int16","int32")): return c
    # si no, última columna
    return cols[-1]

def read_label_series(path: Path) -> pd.Series:
    pf = pq.ParquetFile(path)
    col = first_valid_label_col(pf)
    tbl = pf.read(columns=[col])
    s = tbl.to_pandas()[col]
    return s

# candidatos de eval
single = DATA_DIR / "windows_with_labels_aligned.parquet"
wl_norm = DATA_DIR / "windows_with_labels_aligned_normal.parquet"
wl_anom = DATA_DIR / "windows_with_labels_aligned_anom.parquet"
eval_norm = DATA_DIR / "eval_windows_aligned_normal.parquet"
eval_anom = DATA_DIR / "eval_windows_aligned_anom.parquet"
labels_a = DATA_DIR / "eval_labels_aligned.parquet"
labels_b = DATA_DIR / "labels.parquet"

y = None
if single.exists():
    y = read_label_series(single)
elif wl_norm.exists() and wl_anom.exists():
    y = pd.concat([read_label_series(wl_norm), read_label_series(wl_anom)], ignore_index=True)
elif eval_norm.exists() and eval_anom.exists():
    # etiquetas por archivo aparte
    lab_path = labels_a if labels_a.exists() else (labels_b if labels_b.exists() else None)
    assert lab_path is not None, "No encontré archivo de labels para eval split."
    y = read_label_series(lab_path)
else:
    # buscar genérico por patrón
    pats = ["*with_labels*aligned*.parquet", "*eval*labels*aligned*.parquet", "*labels*.parquet"]
    for patt in pats:
        cands = sorted(DATA_DIR.glob(patt))
        if cands:
            y = read_label_series(cands[0]); break

assert y is not None, f"No encontré etiquetas en {DATA_DIR}"
y = y.reset_index(drop=True)

# 4) Alinear longitudes y mapear etiquetas a {0,1} (1 = anómalo)
assert len(y) == n_eval, f"Desalineación: labels={len(y)} vs scores={n_eval}"

def map_labels_to_binary(y_arr):
    vals = np.unique(y_arr)
    s = set(vals.tolist())
    if s == {0,1}:
        return y_arr.astype(int), "0=normal, 1=anómalo"
    if s == {-1,1}:
        return (y_arr == -1).astype(int), "-1=anómalo, +1=normal"
    if s == {0,1,-1}:
        y2 = y_arr.copy(); y2[y2==-1]=0
        return y2.astype(int), "1=anómalo, 0=normal, -1→0"
    # fallback: todo >0 es anómalo
    return (y_arr>0).astype(int), f"fallback>0 anómalo (vals={sorted(s)})"

yb, mapping_info = map_labels_to_binary(y.to_numpy())
print("Distribución etiquetas mapeadas:", dict(zip(*np.unique(yb, return_counts=True))))

# 5) Métricas globales
if len(np.unique(yb)) > 1:
    roc = roc_auc_score(yb, scores)
    prec, rec, _ = precision_recall_curve(yb, scores); pr = auc(rec, prec)
    ap = average_precision_score(yb, scores)
else:
    roc = pr = ap = float("nan")

# 6) Métricas @k (1%)
k_rate = 0.01
k = max(1, int(n_eval * k_rate))
thr = np.partition(scores, -k)[-k]
pred_topk = (scores >= thr).astype(np.int8)

tp = int(((pred_topk==1) & (yb==1)).sum())
fp = int(((pred_topk==1) & (yb==0)).sum())
fn = int(((pred_topk==0) & (yb==1)).sum())
prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0

print(f"ROC-AUC: {roc:.4f} | PR-AUC: {pr:.4f} | AP: {ap:.4f}")
print(f"@k={k_rate*100:.1f}% -> P:{prec_k:.4f} | R:{rec_k:.4f} | F1:{f1_k:.4f}  (k={k})")
print("Mapping etiquetas:", mapping_info)

# 7) Persistir métricas actualizadas al JSON (idempotente)
cfg = CFG.copy()
cfg["metrics"] = {"roc_auc": float(roc), "pr_auc": float(pr), "ap": float(ap)}
cfg.setdefault("metrics_at_k", {})[str(k_rate)] = {
    "k": int(k), "precision": float(prec_k), "recall": float(rec_k), "f1": float(f1_k),
    "label_mapping": mapping_info
}
cfg_path.write_text(json.dumps(cfg, indent=2))
print("✅ Métricas actualizadas en:", cfg_path)

Scores cargados: (27789660,)
Distribución etiquetas mapeadas: {0: 27789660}
ROC-AUC: nan | PR-AUC: nan | AP: nan
@k=1.0% -> P:0.0000 | R:0.0000 | F1:0.0000  (k=277896)
Mapping etiquetas: fallback>0 anómalo (vals=[0])
✅ Métricas actualizadas en: data/ocsvm_runs/ocsvm_rbf_config.json


In [6]:
# Auditoría de etiquetas en /data (incluye floats ~binarios)
import pyarrow.parquet as pq
import numpy as np, pandas as pd
from pathlib import Path

DATA_DIR = Path(CFG["external_data_dir"])
cands = [
    "windows_with_labels_aligned.parquet",
    "windows_with_labels.parquet",
    "eval_windows_aligned.parquet",
    "eval_windows_aligned_normal.parquet",
    "eval_windows_aligned_anom.parquet",
    "eval_labels_aligned.parquet",
    "labels.parquet",
    "labels_anom.parquet",
]

def try_counts(p: Path, max_rows=500_000, bin_tol=1e-6):
    try:
        pf = pq.ParquetFile(p)
    except Exception as e:
        return {"file": p.name, "error": f"no parquet ({type(e).__name__})"}
    schema = pf.schema_arrow
    cols = schema.names

    labelish = []
    for field in schema:
        t = str(field.type).lower()
        if any(x in t for x in ["int8","int16","int32","int64","bool","float16","float32","float64"]):
            labelish.append(field.name)

    out = []
    for c in labelish:
        try:
            tbl = pf.read(columns=[c], max_rows=max_rows)
            s = pd.to_numeric(tbl.to_pandas()[c], errors="coerce")
            s = s.dropna()
            if s.empty: 
                continue
            # ¿binario exacto?
            uniq = np.unique(s.values)
            uniq_set = set(np.round(uniq, 6))
            is_binary_exact = uniq_set <= {0,1} or uniq_set <= {-1,1} or uniq_set <= {0,1,-1}
            # ¿casi binario? (tolerancia)
            is_binary_close = np.all((np.abs(s - 0) < bin_tol) | (np.abs(s - 1) < bin_tol) | (np.abs(s + 1) < bin_tol))
            if is_binary_exact or is_binary_close:
                vc = s.value_counts().head(5).to_dict()
                out.append((c, vc, str(schema.field(c).type)))
        except Exception:
            continue

    return {
        "file": p.name,
        "candidates": out,
        "n_rows_scanned": int(min(max_rows, pf.metadata.num_rows))
    }

rows = []
for name in cands:
    paths = []
    if (DATA_DIR / name).exists():
        paths.append(DATA_DIR / name)
    else:
        paths += sorted(DATA_DIR.glob(f"*{name.replace('.parquet','')}*.parquet"))
    for p in paths:
        rows.append(try_counts(p))

df = pd.DataFrame(rows)
pd.set_option("display.max_colwidth", 180)
display(df.fillna(""))
print("👉 Elige (archivo, columna) con 0/1 o -1/1 (aunque sea float). Si nada aparece, usamos labels_anom como índices.")

Unnamed: 0,file,candidates,n_rows_scanned
0,windows_with_labels_aligned.parquet,[],500000
1,windows_with_labels.parquet,[],500000
2,eval_windows_aligned.parquet,[],500000
3,eval_labels_aligned.parquet,[],500000
4,labels.parquet,[],500000
5,labels_anom.parquet,[],131420


👉 Elige (archivo, columna) con 0/1 o -1/1 (aunque sea float). Si nada aparece, usamos labels_anom como índices.


In [7]:
# Recalcular métricas desde scores con (archivo,columna) o con labels_anom como índices
import os, json, numpy as np, pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

OUT = Path(CFG["out_dir"])
DATA_DIR = Path(CFG["external_data_dir"])

scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"
assert scores_path.exists(), "No existe memmap de scores."
n_eval = scores_path.stat().st_size // 4
scores = np.memmap(scores_path, dtype=np.float32, mode="r", shape=(n_eval,))

# === 1) EDITA si ya identificaste archivo/columna de etiqueta binaria ===
LABEL_FILE = None          # ej "eval_labels_aligned.parquet" o "windows_with_labels_aligned.parquet"
LABEL_COL  = None          # ej "is_suspicious" (None = autodetectar)

def read_binary_labels_from_file(path: Path, col: str|None):
    pf = pq.ParquetFile(path)
    if col is None:
        # auto: preferidas luego cualquiera ~binaria
        prefer = ["is_suspicious","label","y","target"]
        cols = pf.schema_arrow.names
        for c in prefer:
            if c in cols:
                col = c; break
        if col is None:
            for f in pf.schema_arrow:
                if str(f.type).lower() in ("int8","int16","int32","int64","bool","float16","float32","float64"):
                    col = f.name; break
    tbl = pf.read(columns=[col])
    y = pd.to_numeric(tbl.to_pandas()[col], errors="coerce").fillna(0).to_numpy()
    # mapear a {0,1}
    vals = set(np.round(np.unique(y), 6).tolist())
    if vals <= {0,1}:
        yb = (np.abs(y - 1.0) < 1e-6).astype(int)
        mapping = "float/int 0/1 -> 1=anomalo"
    elif vals <= {-1,1} or vals <= {0,1,-1}:
        yb = (np.abs(y + 1.0) < 1e-6).astype(int)  # -1 -> 1
        mapping = "float/int -1/1 -> -1=anomalo"
    else:
        # fallback: >0 anómalo
        yb = (y > 0).astype(int)
        mapping = f"fallback (>0 anómalo), vals={sorted(vals)}"
    return yb, mapping, col

def metrics_from_y(scores, yb, k_rate=0.01):
    if len(np.unique(yb)) > 1:
        roc = roc_auc_score(yb, scores)
        prec, rec, _ = precision_recall_curve(yb, scores); pr = auc(rec, prec)
        ap = average_precision_score(yb, scores)
    else:
        roc = pr = ap = float("nan")
    k = max(1, int(len(scores) * k_rate))
    thr = np.partition(scores, -k)[-k]
    pred_topk = (scores >= thr).astype(np.int8)
    tp = int(((pred_topk==1) & (yb==1)).sum())
    fp = int(((pred_topk==1) & (yb==0)).sum())
    fn = int(((pred_topk==0) & (yb==1)).sum())
    prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
    rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
    f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0
    return roc, pr, ap, k, prec_k, rec_k, f1_k

def persist_metrics(cfg_extra: dict, metrics, k_rate=0.01):
    roc, pr, ap, k, pk, rk, f1k = metrics
    cfg_path = OUT / f"{CFG['artifact_prefix']}_config.json"
    cfg = json.loads(cfg_path.read_text()) if cfg_path.exists() else {}
    cfg.update(cfg_extra)
    cfg["metrics"] = {"roc_auc": float(roc), "pr_auc": float(pr), "ap": float(ap)}
    cfg.setdefault("metrics_at_k", {})[str(k_rate)] = {
        "k": int(k), "precision": float(pk), "recall": float(rk), "f1": float(f1k)
    }
    cfg_path.write_text(json.dumps(cfg, indent=2))
    print("✅ Métricas guardadas en:", cfg_path)

used = False
if LABEL_FILE is not None:
    lab_path = DATA_DIR / LABEL_FILE
    assert lab_path.exists(), f"No existe {lab_path}"
    yb, mapping, used_col = read_binary_labels_from_file(lab_path, LABEL_COL)
    assert len(yb) == n_eval, f"Desalineación: labels={len(yb)} vs scores={n_eval}"
    print(f"Usando {LABEL_FILE} :: {used_col}  ({mapping})")
    M = metrics_from_y(scores, yb, k_rate=0.01)
    print(f"ROC-AUC:{M[0]:.4f} | PR-AUC:{M[1]:.4f} | AP:{M[2]:.4f}")
    print(f"@1% -> P:{M[4]:.4f} | R:{M[5]:.4f} | F1:{M[6]:.4f} (k={M[3]})")
    persist_metrics({"label_file": LABEL_FILE, "label_col": used_col, "label_mapping": mapping}, M)
    used = True

# === 2) Si no se definió LABEL_FILE o no trae positivos, intentar construir y desde labels_anom como ÍNDICES
if not used:
    la = None
    for name in ["labels_anom.parquet", *list(DATA_DIR.glob("*labels_anom*.parquet"))]:
        p = name if isinstance(name, Path) else (DATA_DIR / name)
        if Path(p).exists():
            la = Path(p); break
    assert la is not None, "No encontré labels_anom.parquet para reconstrucción por índices."

    pf = pq.ParquetFile(la)
    cols = pf.schema_arrow.names
    # heurística: elegir columna índice
    key_candidates = [c for c in ["idx","window_id","idx_end","row","row_id"] if c in cols]
    if not key_candidates:
        # si solo hay una columna numérica, usarla como índices
        num_cols = [f.name for f in pf.schema_arrow if "int" in str(f.type).lower()]
        assert num_cols, f"No hallé columnas numéricas en {la}"
        key = num_cols[0]
    else:
        key = key_candidates[0]

    s = pf.read(columns=[key]).to_pandas()[key]
    idx = pd.to_numeric(s, errors="coerce").dropna().astype(int).to_numpy()

    # Ajuste 0/1-based: si el máximo es == n_eval y el mínimo es 1, usamos 1-based
    if idx.min() >= 1 and idx.max() <= n_eval and (1 in idx):
        idx0 = idx - 1
    else:
        idx0 = idx
    idx0 = idx0[(idx0 >= 0) & (idx0 < n_eval)]

    yb = np.zeros(n_eval, dtype=int)
    yb[idx0] = 1
    print(f"Reconstruido y desde {la.name} usando columna '{key}' | positivos: {yb.sum()} / {n_eval}")

    M = metrics_from_y(scores, yb, k_rate=0.01)
    print(f"ROC-AUC:{M[0]:.4f} | PR-AUC:{M[1]:.4f} | AP:{M[2]:.4f}")
    print(f"@1% -> P:{M[4]:.4f} | R:{M[5]:.4f} | F1:{M[6]:.4f} (k={M[3]})")
    persist_metrics({"label_file": la.name, "label_col": key, "label_mapping": "indices (1-based auto-ajustado)"}, M)

Reconstruido y desde labels_anom.parquet usando columna 'window_id' | positivos: 9120 / 27789660


ROC-AUC:0.1978 | PR-AUC:0.0004 | AP:0.0004
@1% -> P:0.0032 | R:0.0967 | F1:0.0061 (k=277896)
✅ Métricas guardadas en: /teamspace/studios/this_studio/data/ocsvm_runs/ocsvm_rbf_config.json


In [9]:
# --- Test de polaridad (auto, sin depender de X_eval) ---
import os, json, numpy as np, pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

# 0) Cargar CFG desde JSON si no está en memoria
if 'CFG' not in globals():
    cfg_path_guess = Path("data/ocsvm_runs/ocsvm_rbf_config.json")
    assert cfg_path_guess.exists(), "No encontré data/ocsvm_runs/ocsvm_rbf_config.json"
    CFG = json.loads(cfg_path_guess.read_text())

OUT = Path(CFG["out_dir"])
DATA_DIR = Path(CFG.get("external_data_dir", "/teamspace/studios/this_studio/data"))
scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"
assert scores_path.exists(), f"No existe memmap de scores: {scores_path}"

# 1) Cargar scores y n_eval a partir del tamaño de archivo
n_eval = scores_path.stat().st_size // 4  # float32
scores = np.memmap(scores_path, dtype=np.float32, mode="r", shape=(n_eval,))
print("Scores:", scores.shape)

# -------- Helpers --------
def metrics_from(scores, yb):
    if len(np.unique(yb)) > 1:
        roc = roc_auc_score(yb, scores)
        prec, rec, _ = precision_recall_curve(yb, scores); pr_auc = auc(rec, prec)
        ap = average_precision_score(yb, scores)
    else:
        roc = pr_auc = ap = float('nan')
    return roc, pr_auc, ap

def build_y_from_label_file(label_file:str, label_col:str|None):
    p = DATA_DIR / label_file
    assert p.exists(), f"No existe {p}"
    pf = pq.ParquetFile(p)
    col = label_col
    if col is None:
        # autodetectar columna "binaria" (int/bool/float 0/1/-1)
        pref = ["is_suspicious","label","y","target"]
        cols = pf.schema_arrow.names
        for c in pref:
            if c in cols: col = c; break
        if col is None:
            # como fallback, primera numérica
            for f in pf.schema_arrow:
                t = str(f.type).lower()
                if any(x in t for x in ["int","bool","float"]):
                    col = f.name; break
    tbl = pf.read(columns=[col])
    y = pd.to_numeric(tbl.to_pandas()[col], errors="coerce").fillna(0).to_numpy()
    assert len(y) == n_eval, f"Desalineación labels={len(y)} vs scores={n_eval}"
    vals = set(np.round(np.unique(y),6).tolist())
    if vals <= {0,1}:
        yb = (np.abs(y - 1.0) < 1e-6).astype(int); mapping = "0/1 -> 1=anómalo"
    elif vals <= {-1,1} or vals <= {0,1,-1}:
        yb = (np.abs(y + 1.0) < 1e-6).astype(int); mapping = "-1/1 -> -1=anómalo"
    else:
        yb = (y > 0).astype(int); mapping = f"fallback (>0 anómalo), vals={sorted(vals)}"
    return yb, mapping

def build_y_from_labels_anom_join():
    # localizar parquet de eval para tomar el ORDEN real
    eval_candidates = [
        "windows_with_labels_aligned.parquet",
        "eval_windows_aligned.parquet",
        "windows_with_labels.parquet",
        "windows_with_labels_aligned_normal.parquet",  # si es split, al menos nos da el orden de una mitad
    ]
    EVAL_PATH = None
    for name in eval_candidates:
        p = DATA_DIR / name
        if p.exists(): EVAL_PATH = p; break
    if EVAL_PATH is None:
        cands = sorted(DATA_DIR.glob("*windows*aligned*.parquet"),
                       key=lambda x: x.stat().st_size if x.exists() else 0, reverse=True)
        assert cands, "No hallé parquet de EVAL para inferir orden"
        EVAL_PATH = cands[0]

    pf_eval = pq.ParquetFile(EVAL_PATH)
    key_eval = next((k for k in ["window_id","idx","idx_end","row","row_id"] if k in pf_eval.schema_arrow.names), None)
    assert key_eval is not None, f"No encontré columna clave en {EVAL_PATH.name}"
    eval_key = pf_eval.read(columns=[key_eval]).to_pandas()[key_eval].astype(np.int64).reset_index(drop=True)
    assert len(eval_key) == n_eval, f"Desalineación eval_key={len(eval_key)} vs scores={n_eval}"

    # cargar labels_anom
    lab_anom = None
    for p in [DATA_DIR / "labels_anom.parquet", *DATA_DIR.glob("*labels_anom*.parquet")]:
        if p.exists(): lab_anom = p; break
    assert lab_anom is not None, "No encontré labels_anom.parquet"
    pf_lab = pq.ParquetFile(lab_anom)
    key_lab = key_eval if key_eval in pf_lab.schema_arrow.names else (
        next((k for k in ["window_id","idx","idx_end","row","row_id"] if k in pf_lab.schema_arrow.names), None)
    )
    assert key_lab is not None, f"labels_anom no tiene clave compatible (busqué window_id/idx/idx_end/row/row_id)"
    anom_keys = pf_lab.read(columns=[key_lab]).to_pandas()[key_lab].astype(np.int64).to_numpy()
    anom_set = set(anom_keys.tolist())
    yb = eval_key.isin(anom_set).astype(int).to_numpy()
    return yb, f"JOIN on {key_eval} from {lab_anom.name}"

# 2) Construir yb según lo que tengamos en config
cfg_path = OUT / f"{CFG['artifact_prefix']}_config.json"
cfg_json = json.loads(cfg_path.read_text()) if cfg_path.exists() else {}

yb = None; mapping_src = None
lab_file = cfg_json.get("label_file")
lab_col  = cfg_json.get("label_col")
lab_mapping_hint = cfg_json.get("label_mapping", "")

try:
    if lab_file and lab_file != "labels_anom.parquet":
        # usar archivo+columna si se registró en config
        yb, mapping_src = build_y_from_label_file(lab_file, lab_col)
    else:
        # por defecto (o si era labels_anom), usar JOIN por clave
        yb, mapping_src = build_y_from_labels_anom_join()
except Exception as e:
    # fallback: intentar archivo estándar de labels
    for candidate in ["windows_with_labels_aligned.parquet", "eval_labels_aligned.parquet"]:
        p = DATA_DIR / candidate
        if p.exists():
            yb, mapping_src = build_y_from_label_file(candidate, None)
            break
    if yb is None:
        # último recurso: indices desde labels_anom como posicional (puede desalinear)
        la = DATA_DIR / "labels_anom.parquet"
        assert la.exists(), "No encontré labels_anom.parquet para fallback."
        pf = pq.ParquetFile(la)
        key = next((k for k in ["window_id","idx","idx_end","row","row_id"] if k in pf.schema_arrow.names), pf.schema_arrow.names[0])
        idx = pf.read(columns=[key]).to_pandas()[key].astype(int).to_numpy()
        if idx.min()>=1 and idx.max()<=n_eval: idx = idx - 1
        idx = idx[(idx>=0)&(idx<n_eval)]
        yb = np.zeros(n_eval, dtype=int); yb[idx]=1
        mapping_src = f"indices from {la.name} (positional fallback)"

print(f"Etiquetas construidas: positivos={yb.sum()} / {len(yb)}  | fuente={mapping_src}")

# 3) Métricas con scores directos e invertidos
roc1, pr1, ap1 = metrics_from(scores, yb)
roc2, pr2, ap2 = metrics_from(-scores, yb)

print(f"Scores tal cual   -> ROC:{roc1:.4f} | PR-AUC:{pr1:.4f} | AP:{ap1:.4f}")
print(f"Scores invertidos -> ROC:{roc2:.4f} | PR-AUC:{pr2:.4f} | AP:{ap2:.4f}")
print("👉 Usa la versión con mejores métricas.")

Scores: (27789660,)


Etiquetas construidas: positivos=1003200 / 27789660  | fuente=JOIN on window_id from labels_anom.parquet
Scores tal cual   -> ROC:0.5069 | PR-AUC:0.0389 | AP:0.0367
Scores invertidos -> ROC:0.4931 | PR-AUC:0.0357 | AP:0.0369
👉 Usa la versión con mejores métricas.


In [10]:
# === Persistir métricas finales (polarity correcta + @k) ===
import os, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc
import pyarrow.parquet as pq

OUT = Path(CFG["out_dir"])
DATA_DIR = Path(CFG["external_data_dir"])
scores_path = OUT / f"{CFG['artifact_prefix']}_eval_scores_mm.dat"
cfg_path = OUT / f"{CFG['artifact_prefix']}_config.json"
assert scores_path.exists(), "No hay scores memmap."

# 1) Cargar scores y construir y por JOIN con window_id (lo que ya validaste)
n_eval = scores_path.stat().st_size // 4
scores = np.memmap(scores_path, dtype=np.float32, mode="r", shape=(n_eval,))

# encontrar parquet de eval para tomar el ORDEN exacto
eval_candidates = [
    "windows_with_labels_aligned.parquet",
    "eval_windows_aligned.parquet",
    "windows_with_labels.parquet",
    "windows_with_labels_aligned_normal.parquet",
]
EVAL_PATH = None
for name in eval_candidates:
    p = DATA_DIR / name
    if p.exists():
        EVAL_PATH = p; break
if EVAL_PATH is None:
    cands = sorted(DATA_DIR.glob("*windows*aligned*.parquet"), key=lambda x: x.stat().st_size if x.exists() else 0, reverse=True)
    assert cands, "No encontré parquet de EVAL para inferir orden."
    EVAL_PATH = cands[0]

pf_eval = pq.ParquetFile(EVAL_PATH)
key_eval = next((k for k in ["window_id","idx","idx_end","row","row_id"] if k in pf_eval.schema_arrow.names), None)
assert key_eval is not None, f"{EVAL_PATH.name} no tiene columna clave esperada."
eval_key = pf_eval.read(columns=[key_eval]).to_pandas()[key_eval].astype(np.int64).reset_index(drop=True)
assert len(eval_key) == n_eval, f"Desalineación: eval_key={len(eval_key)} vs scores={n_eval}"

# labels_anom como conjunto de claves
lab_anom = None
for p in [DATA_DIR / "labels_anom.parquet", *DATA_DIR.glob("*labels_anom*.parquet")]:
    if p.exists(): lab_anom = p; break
assert lab_anom is not None, "No hallé labels_anom.parquet."
pf_la = pq.ParquetFile(lab_anom)
key_lab = key_eval if key_eval in pf_la.schema_arrow.names else (
    next((k for k in ["window_id","idx","idx_end","row","row_id"] if k in pf_la.schema_arrow.names), None)
)
assert key_lab is not None, "labels_anom no tiene una clave compatible."
anom_keys = pf_la.read(columns=[key_lab]).to_pandas()[key_lab].astype(np.int64).to_numpy()
anom_set = set(anom_keys.tolist())

yb = eval_key.isin(anom_set).astype(int).to_numpy()
print("Positivos (JOIN):", yb.sum(), "/", len(yb))

# 2) Métricas con scores tal cual (polarity validada)
def metrics(scores, yb, k_rate=0.01):
    if len(np.unique(yb)) > 1:
        roc = roc_auc_score(yb, scores)
        prec, rec, _ = precision_recall_curve(yb, scores); pr_auc = auc(rec, prec)
        ap = average_precision_score(yb, scores)
    else:
        roc = pr_auc = ap = float('nan')
    k = max(1, int(len(scores) * k_rate))
    thr = np.partition(scores, -k)[-k]
    pred_topk = (scores >= thr).astype(np.int8)
    tp = int(((pred_topk==1) & (yb==1)).sum())
    fp = int(((pred_topk==1) & (yb==0)).sum())
    fn = int(((pred_topk==0) & (yb==1)).sum())
    prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
    rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
    f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0
    return {"roc_auc": float(roc), "pr_auc": float(pr_auc), "ap": float(ap),
            "k": int(k), "precision_k": float(prec_k), "recall_k": float(rec_k), "f1_k": float(f1_k)}

M = metrics(scores, yb, k_rate=0.01)
print(f"ROC-AUC:{M['roc_auc']:.4f} | PR-AUC:{M['pr_auc']:.4f} | AP:{M['ap']:.4f}")
print(f"@1% -> P:{M['precision_k']:.4f} | R:{M['recall_k']:.4f} | F1:{M['f1_k']:.4f} (k={M['k']})")

# 3) Guardar en config.json (idempotente)
cfg = json.loads(cfg_path.read_text()) if cfg_path.exists() else {}
cfg["external_data_dir"] = CFG["external_data_dir"]
cfg["kernel"] = CFG.get("kernel","rbf")
cfg["best_params"] = cfg.get("best_params", {})
cfg["metrics"] = {"roc_auc": M["roc_auc"], "pr_auc": M["pr_auc"], "ap": M["ap"]}
cfg.setdefault("metrics_at_k", {})["0.01"] = {
    "k": M["k"], "precision": M["precision_k"], "recall": M["recall_k"], "f1": M["f1_k"]
}
cfg.update({"label_file": lab_anom.name, "label_col": key_lab, "label_mapping": f"JOIN on {key_eval}"})
cfg_path.write_text(json.dumps(cfg, indent=2))
print("✅ Actualizado:", cfg_path)

Positivos (JOIN): 1003200 / 27789660
ROC-AUC:0.5069 | PR-AUC:0.0389 | AP:0.0367
@1% -> P:0.0423 | R:0.0117 | F1:0.0183 (k=277896)
✅ Actualizado: /teamspace/studios/this_studio/data/ocsvm_runs/ocsvm_rbf_config.json


In [11]:
# === Resumen final (listo para el informe) ===
import json, numpy as np, pandas as pd
from pathlib import Path

cfg_path = Path(CFG["out_dir"]) / f"{CFG['artifact_prefix']}_config.json"
cfg = json.loads(cfg_path.read_text())

metrics = cfg.get("metrics", {})
metrics_k = cfg.get("metrics_at_k", {}).get("0.01", {})
best_params = cfg.get("best_params", {})

row = {
    "Modelo": "One-Class SVM (RBF)",
    "nu": best_params.get("nu", CFG.get("svm_nu_grid",[None])[0]),
    "gamma": best_params.get("gamma", CFG.get("svm_gamma_grid",[None])[0]),
    "ROC-AUC": metrics.get("roc_auc"),
    "PR-AUC": metrics.get("pr_auc"),
    "AP": metrics.get("ap"),
    "P@1%": metrics_k.get("precision"),
    "R@1%": metrics_k.get("recall"),
    "F1@1%": metrics_k.get("f1"),
    "Labels fuente": f"{cfg.get('label_file','?')} · {cfg.get('label_col','?')} ({cfg.get('label_mapping','?')})",
}

df = pd.DataFrame([row])
display(df.style.format({
    "ROC-AUC":"{:.4f}", "PR-AUC":"{:.4f}", "AP":"{:.4f}",
    "P@1%":"{:.4f}", "R@1%":"{:.4f}", "F1@1%":"{:.4f}"
}))

Unnamed: 0,Modelo,nu,gamma,ROC-AUC,PR-AUC,AP,P@1%,R@1%,F1@1%,Labels fuente
0,One-Class SVM (RBF),0.05,0.01,0.5069,0.0389,0.0367,0.0423,0.0117,0.0183,labels_anom.parquet · window_id (JOIN on window_id)


In [12]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
idx = np.random.choice(len(scores), 20000, replace=False)
X2d = PCA(2).fit_transform(X_eval[idx])
plt.scatter(X2d[:,0], X2d[:,1], c=yb[idx], cmap='coolwarm', s=2)
plt.title("Distribución de anomalías (labels_anom)")

NameError: name 'X_eval' is not defined