# One-Class SVM (OC-SVM) — AIS Anomaly Detection (Galápagos)

**Objetivo:** Entrenar un modelo no supervisado (OC-SVM con kernel RBF) usando **ventanas pre-generadas**: normales (train) y etiquetadas (eval). La evaluación se hace por **lotes**, guardando los scores en un memmap, para evitar picos de memoria; los artefactos se guardan en `./data/ocsvm_runs`.

In [7]:
import os, json, numpy as np
# Root folder for this notebook's outputs; created if it does not exist yet.
SAVE_ROOT = 'data'
os.makedirs(SAVE_ROOT, exist_ok=True)
print('Output root:', os.path.abspath(SAVE_ROOT))

Output root: /teamspace/studios/this_studio/data


In [8]:
import os, gc, json, pickle, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.svm import OneClassSVM
from sklearn.model_selection import GroupKFold, KFold, ParameterGrid
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc

import random
# Seed the stdlib and NumPy global random generators.
SEED = 42
random.seed(SEED); np.random.seed(SEED)

# Initial notebook configuration.
# NOTE: a later cell of this notebook rebuilds CFG from scratch with a different key set.
CFG = {
  'input_parquet': '/teamspace/studios/profound-silver-kn8tf/ais_anomaly/data/ais_enriched.parquet',  # only used by the output-dir safety check
  'train_filter_col': 'is_suspicious',  # NOTE(review): not read anywhere in the visible notebook
  'mmsi_col': 'mmsi',                   # NOTE(review): not read anywhere in the visible notebook
  'timestamp_col': 'timestamp',         # NOTE(review): not read anywhere in the visible notebook
  'svm_nu_grid': [0.01, 0.05, 0.1],     # candidate `nu` values for the grid search
  'svm_gamma_grid': ['scale', 0.01],    # candidate `gamma` values for the grid search
  'kernel': 'rbf',
  'kfold_splits': 5,                    # requested number of CV folds
  'max_train_samples': 500_000,         # row cap for the final training set
  'max_search_samples': 200_000,        # row cap for the hyper-parameter search subset
  'eval_batch_size': 200_000,           # rows per batch when transforming/scoring eval
  'out_dir': 'data/ocsvm_runs',         # where artifacts are written
  'artifact_prefix': 'ocsvm_rbf'        # filename prefix shared by all artifacts
}
os.makedirs(CFG['out_dir'], exist_ok=True)
print('Config:', json.dumps(CFG, indent=2))

Config: {
  "input_parquet": "/teamspace/studios/profound-silver-kn8tf/ais_anomaly/data/ais_enriched.parquet",
  "train_filter_col": "is_suspicious",
  "mmsi_col": "mmsi",
  "timestamp_col": "timestamp",
  "svm_nu_grid": [
    0.01,
    0.05,
    0.1
  ],
  "svm_gamma_grid": [
    "scale",
    0.01
  ],
  "kernel": "rbf",
  "kfold_splits": 5,
  "max_train_samples": 500000,
  "max_search_samples": 200000,
  "eval_batch_size": 200000,
  "out_dir": "data/ocsvm_runs",
  "artifact_prefix": "ocsvm_rbf"
}


In [9]:
from pathlib import Path
# Safety guard: never write artifacts inside the folder that holds the input data.
inp = Path(CFG['input_parquet']).resolve()
out = Path(CFG['out_dir']).resolve()
data_root = inp.parent
# Compare whole path components. A plain string-prefix test would also flag
# unrelated sibling folders whose name merely starts with the data folder's
# name (e.g. ".../data_backup" vs ".../data").
if out == data_root or data_root in out.parents:
    print('[SAFETY] Rerouting outputs to ./data/ocsvm_runs')
    # NOTE(review): the fallback is a path relative to the current working
    # directory; if the notebook is run from inside the data folder this still
    # resolves under it — confirm that is acceptable.
    CFG['out_dir'] = 'data/ocsvm_runs'
os.makedirs(CFG['out_dir'], exist_ok=True)
print('Output dir:', CFG['out_dir'])

Output dir: data/ocsvm_runs


In [10]:
# --- Setup & Config (reemplazo) ---
import os, json, numpy as np
from pathlib import Path

# Path to the OTHER workspace (read-only)
EXTERNAL_DATA_DIR = Path("/teamspace/studios/profound-silver-kn8tf/ais_anomaly/data").resolve()

# Local folder of the current project (safe write location for OC-SVM artifacts)
LOCAL_OUT_DIR = Path("data/ocsvm_runs").resolve()
os.makedirs(LOCAL_OUT_DIR, exist_ok=True)

# Replaces any CFG defined by an earlier cell.
CFG = {
    # Inputs (kept for traceability only; nothing is written there)
    "external_data_dir": str(EXTERNAL_DATA_DIR),

    # SVM hyper-parameters
    "svm_nu_grid": [0.01, 0.05, 0.1],
    "svm_gamma_grid": ["scale", 0.01],
    "kernel": "rbf",
    "kfold_splits": 5,

    # Sampling caps and batch sizes
    "max_train_samples": 500_000,    # training row cap
    "max_search_samples": 200_000,   # row cap for the hyper-parameter search
    "eval_batch_size": 200_000,      # rows per eval batch

    # Outputs (always inside the current project)
    "out_dir": str(LOCAL_OUT_DIR),
    "artifact_prefix": "ocsvm_rbf"
}

print("EXTERNAL_DATA_DIR (read-only):", CFG["external_data_dir"])
print("LOCAL OUT_DIR (safe writes):  ", CFG["out_dir"])
# The external path is printed above, so it is left out of the JSON dump.
print("Config:", json.dumps({k: v for k, v in CFG.items() if k not in ["external_data_dir"]}, indent=2))

EXTERNAL_DATA_DIR (read-only): /teamspace/studios/profound-silver-kn8tf/ais_anomaly/data
LOCAL OUT_DIR (safe writes):   /teamspace/studios/this_studio/data/ocsvm_runs
Config: {
  "svm_nu_grid": [
    0.01,
    0.05,
    0.1
  ],
  "svm_gamma_grid": [
    "scale",
    0.01
  ],
  "kernel": "rbf",
  "kfold_splits": 5,
  "max_train_samples": 500000,
  "max_search_samples": 200000,
  "eval_batch_size": 200000,
  "out_dir": "/teamspace/studios/this_studio/data/ocsvm_runs",
  "artifact_prefix": "ocsvm_rbf"
}


In [11]:
# --- Carga ROBUSTA de artefactos de ventanas (del otro workspace) ---
import os, gc, re, numpy as np, pandas as pd
from pathlib import Path

DATA_DIR = Path(CFG["external_data_dir"])
print("DATA_DIR:", DATA_DIR)
if not DATA_DIR.exists():
    raise FileNotFoundError(f"La ruta externa no existe: {DATA_DIR}")

# Informational listing of the parquet files found (handy when debugging)
parquets = sorted(DATA_DIR.glob("*.parquet"), key=lambda p: p.name.lower())
print("Parquets disponibles:")
for p in parquets:
    # Only the stat() call can reasonably fail (file removed, permissions);
    # catch just that instead of swallowing every exception.
    try:
        sz = p.stat().st_size/1e6
    except OSError:
        print(f" - {p.name}")
    else:
        print(f" - {p.name}  ({sz:.1f} MB)")

# Candidate file names in priority order (exact names first, glob patterns as fallback)
# Exact names for the normal-only training windows.
TRAIN_EXACT = [
    "windows_aligned_normal.parquet",
    "norm_windows_flat.parquet",
    "ais_norm_windows.parquet",
]
# Exact names for the evaluation windows.
EVAL_EXACT  = [
    "windows_with_labels_aligned.parquet",
    "eval_windows_aligned.parquet",
    "windows_with_labels.parquet",
]
# Exact names for a separate evaluation-label file.
LABEL_EXACT = [
    "eval_labels_aligned.parquet",
    "labels.parquet",
    "labels_anom.parquet",
]

# Glob patterns tried, in order, when no exact training name exists.
TRAIN_PATTERNS = [
    "*windows_aligned_normal*.parquet",
    "*norm*_windows*.parquet",
    "*ais_norm_windows*.parquet",
]
# Glob patterns tried, in order, when no exact eval name exists.
EVAL_PATTERNS = [
    "*windows_with_labels_aligned*.parquet",
    "*eval_windows_aligned*.parquet",
    "*windows_with_labels*.parquet",
]
# Glob patterns tried, in order, when no exact label name exists.
LABEL_PATTERNS = [
    "*eval_labels_aligned*.parquet",
    "labels*.parquet",
]

def pick_file(base: Path, exact_list, pattern_list, purpose):
    """Locate one input file under *base*.

    Exact names are tried first, in the order given. If none exists, each
    glob pattern is tried in order and the largest match of the first
    pattern that matches anything is returned (its name is printed).

    Args:
        base: Directory to search.
        exact_list: File names to look for verbatim, highest priority first.
        pattern_list: Glob patterns used as a fallback, highest priority first.
        purpose: Short tag used only in the log line.

    Returns:
        The selected path, or None when nothing matches.
    """
    def _size_on_disk(candidate):
        # Entries that no longer exist count as empty.
        return candidate.stat().st_size if candidate.exists() else 0

    # 1) Exact names, in priority order
    exact_hit = next(
        (base / name for name in exact_list if (base / name).exists()),
        None,
    )
    if exact_hit is not None:
        return exact_hit

    # 2) Patterns, in priority order; the biggest file wins within a pattern
    for patt in pattern_list:
        found = list(base.glob(patt))
        if not found:
            continue
        biggest = max(found, key=_size_on_disk)
        print(f"[pick_file:{purpose}] Elegido por patrón '{patt}':", biggest.name)
        return biggest
    return None

# Resolve the three inputs in one pass: train windows, eval windows, eval labels.
train_path, eval_path, y_path = [
    pick_file(DATA_DIR, exact_names, glob_patterns, purpose)
    for exact_names, glob_patterns, purpose in (
        (TRAIN_EXACT, TRAIN_PATTERNS, "train"),
        (EVAL_EXACT, EVAL_PATTERNS, "eval"),
        (LABEL_EXACT, LABEL_PATTERNS, "labels"),
    )
]

# Train and eval windows are mandatory; the train check comes first.
if train_path is None:
    raise FileNotFoundError(
        "No encontré parquet de TRAIN (normal). "
        f"Busqué exactos {TRAIN_EXACT} y patrones {TRAIN_PATTERNS} en {DATA_DIR}"
    )
if eval_path is None:
    raise FileNotFoundError(
        "No encontré parquet de EVAL (ventanas). "
        f"Busqué exactos {EVAL_EXACT} y patrones {EVAL_PATTERNS} en {DATA_DIR}"
    )

# A missing label file is not an error at this point.
print("TRAIN windows ->", train_path.name)
print("EVAL windows  ->", eval_path.name)
if y_path:
    print("EVAL labels   ->", y_path.name)
else:
    print("EVAL labels   ->", "(embebidas)")

def read_parquet_min(path: Path):
    """Read a parquet file and shrink numeric columns where it is lossless.

    Float columns are cast to float32. Integer columns are cast to int32 only
    when every value fits in the int32 range; other columns are left as-is.

    Args:
        path: Location of the parquet file (read with the pyarrow engine).

    Returns:
        The loaded DataFrame with down-cast numeric columns.
    """
    df = pd.read_parquet(path, engine="pyarrow")
    int32_range = np.iinfo(np.int32)
    for c in df.columns:
        col = df[c]
        if pd.api.types.is_float_dtype(col):
            df[c] = col.astype(np.float32)
        elif pd.api.types.is_integer_dtype(col):
            # Check BOTH ends of the range: testing only the maximum lets large
            # negative values wrap around silently during the cast.
            if int32_range.min <= col.min() and col.max() <= int32_range.max:
                df[c] = col.astype(np.int32)
    return df

# Load the selected train / eval window tables with down-cast numeric dtypes.
df_tr = read_parquet_min(train_path)
df_ev = read_parquet_min(eval_path)

def detect_label_col(df):
    """Return the first known label column name present in *df*, else None.

    Names are checked in this priority order: y, label, is_suspicious, target.
    """
    known_names = ("y", "label", "is_suspicious", "target")
    return next((name for name in known_names if name in df.columns), None)

def detect_group_col(df):
    """Return the first known grouping column name present in *df*, else None.

    Names are checked in this priority order: mmsi, group, ship_id.
    """
    known_names = ("mmsi", "group", "ship_id")
    return next((name for name in known_names if name in df.columns), None)

# Detect label / group columns by name on both tables.
ycol_train = detect_label_col(df_tr)   # the normal-only train table should have none; dropped if present
ycol_eval  = detect_label_col(df_ev)
gcol_train = detect_group_col(df_tr)
gcol_eval  = detect_group_col(df_ev)

# Columns that never go into X (raw coordinates, helper indices, window ids)
drop_common = {"lat","lon","idx","idx_end","window_id"}

# ---- TRAIN (normal windows only) ----
drop_train = {c for c in (ycol_train, gcol_train) if c} | drop_common
feat_tr = [c for c in df_tr.columns if c not in drop_train]
X_train = df_tr[feat_tr].to_numpy(dtype=np.float32)
groups_train = df_tr[gcol_train].to_numpy() if gcol_train else None

# ---- EVAL labels (embedded in the eval table, or read from a separate file) ----
if ycol_eval is not None:
    y_eval = df_ev[ycol_eval].astype(np.int8).to_numpy()
else:
    if y_path is None:
        raise FileNotFoundError(
            "No hay etiquetas para eval: esperaba windows_with_labels_aligned*.parquet "
            "o eval_windows_aligned*.parquet + eval_labels_aligned*.parquet"
        )
    df_y = read_parquet_min(y_path)
    # Detect the label column by name first
    ycol_y = detect_label_col(df_y)
    if ycol_y is None:
        # Fallback: last small-integer column of the label table
        ints = df_y.select_dtypes(include=["int32","int16","int8"]).columns
        if len(ints) == 0:
            raise ValueError("No se detecta columna de etiqueta en el archivo de labels.")
        ycol_y = ints[-1]
    # Labels are matched to eval rows by position, so the lengths must agree.
    if len(df_y) != len(df_ev):
        raise ValueError(f"Desalineación: len(eval)={len(df_ev)} vs len(labels)={len(df_y)}")
    y_eval = df_y[ycol_y].astype(np.int8).to_numpy()

# ---- EVAL features ----
# Columns excluded on the eval side (kept for traceability).
drop_eval = {c for c in (ycol_eval, gcol_eval) if c} | drop_common
# The model is fitted on `feat_tr`, so eval must expose exactly those columns
# in exactly that order. Selecting by eval's own column order/set could feed
# the model mis-ordered or extra features without any error.
missing_in_eval = [c for c in feat_tr if c not in df_ev.columns]
if missing_in_eval:
    raise ValueError(f"Faltan en eval columnas usadas en train: {missing_in_eval}")
feat_ev = list(feat_tr)
X_eval = df_ev[feat_ev].to_numpy(dtype=np.float32)
groups_eval = df_ev[gcol_eval].to_numpy() if gcol_eval else None

# The train frame is no longer needed once X_train / groups_train exist.
del df_tr; gc.collect()
print("Train -> X:", X_train.shape, "| groups:", None if groups_train is None else len(groups_train))
print("Eval  -> X:", X_eval.shape,  "| y:", y_eval.shape, "| groups:", None if groups_eval is None else len(groups_eval))
print("N feats (train):", X_train.shape[1], " | N feats (eval):", X_eval.shape[1])

DATA_DIR: /teamspace/studios/profound-silver-kn8tf/ais_anomaly/data


FileNotFoundError: La ruta externa no existe: /teamspace/studios/profound-silver-kn8tf/ais_anomaly/data

In [None]:
import numpy as np, os

def sample_by_group(n_max, X, groups, seed=42):
    """Sub-sample rows of *X* to at most *n_max*, balancing across groups.

    When *X* already has at most *n_max* rows (or *n_max* is None) the inputs
    are returned untouched — *X* itself, not a copy. Otherwise each group
    contributes at most ``max(1, n_max // n_groups)`` rows, and the result is
    trimmed at random if it still exceeds *n_max*.

    Args:
        n_max: Row cap, or None for no cap.
        X: 2-D array of samples (rows).
        groups: 1-D array of group ids aligned with the rows of X, or None for
            plain uniform sampling.
        seed: Seed for the random generator. The default reproduces the
            previously hard-coded value.

    Returns:
        (X_sampled, groups_sampled_or_None, selected_row_indices)
    """
    n_rows = X.shape[0]
    if n_max is None or n_rows <= n_max:
        return X, groups, np.arange(n_rows)

    rng = np.random.default_rng(seed)
    if groups is None:
        idx = rng.choice(n_rows, n_max, replace=False)
        return X[idx], None, idx

    groups = np.asarray(groups)
    # One stable sort puts the rows of each group next to each other (ascending
    # row index inside a group), replacing a full-array scan per group.
    _, inverse = np.unique(groups, return_inverse=True)
    inverse = inverse.reshape(-1)
    order = np.argsort(inverse, kind="stable")
    boundaries = np.flatnonzero(np.diff(inverse[order])) + 1
    per_group = max(1, n_max // (boundaries.size + 1))

    picked = []
    for members in np.split(order, boundaries):
        if members.size > per_group:
            members = rng.choice(members, per_group, replace=False)
        picked.append(members)
    take = np.concatenate(picked)

    # With more groups than n_max the per-group floor of 1 can overshoot the cap.
    if take.size > n_max:
        take = rng.choice(take, n_max, replace=False)
    return X[take], groups[take], take

def colwise_nanmedian(X):
    """Return the per-column median of *X*, ignoring non-finite entries.

    Works on a copy, so *X* is not modified. Columns without any finite value
    get a median of 0.0. The result is float32.
    """
    work = X.copy()
    non_finite = np.logical_not(np.isfinite(work))
    work[non_finite] = np.nan
    medians = np.nanmedian(work, axis=0)
    # A column made only of NaNs yields NaN; fall back to zero there.
    medians = np.where(np.isfinite(medians), medians, 0.0)
    return medians.astype(np.float32)

def impute_inplace(X, medians):
    """Overwrite every non-finite entry of 2-D *X* with its column's median.

    Modifies *X* in place and returns None.
    """
    missing = ~np.isfinite(X)
    if not missing.any():
        return
    # Column index of each bad cell, in the same row-major order that the
    # boolean-mask assignment below uses.
    bad_cols = np.where(missing)[1]
    X[missing] = np.take(medians, bad_cols)

def fit_standardizer(X):
    """Compute per-column mean and standard deviation of *X*.

    The statistics are accumulated in float64 and cast to float32 at the end.
    Accumulating hundreds of thousands of float32 values in float32 loses
    precision, which can turn the std of a constant column into a tiny
    non-zero number that the zero guard below would miss.

    Args:
        X: 2-D array of finite values (rows are samples).

    Returns:
        (mean, std) as float32 arrays; zero standard deviations are replaced
        by 1.0 so that a later division is a no-op for those columns.
    """
    mean = X.mean(axis=0, dtype=np.float64).astype(np.float32)
    var = X.var(axis=0, dtype=np.float64)
    std = np.sqrt(var).astype(np.float32)
    std[std == 0.0] = 1.0
    return mean, std

def apply_standardizer_inplace(X, mean, std):
    """Standardize *X* in place: subtract *mean*, then divide by *std*.

    Both operands broadcast against X. Returns None.
    """
    np.subtract(X, mean, out=X)
    np.divide(X, std, out=X)

# 1) Sample train first
X_train_sampled, groups_train_sampled, train_sel_idx = sample_by_group(CFG['max_train_samples'], X_train, groups_train)
print('Train sampled:', X_train_sampled.shape, '| groups:', None if groups_train_sampled is None else len(groups_train_sampled))

# 2) Impute & scale on sampled train
# When no sub-sampling was needed, sample_by_group returns X_train itself.
# Everything below works in place, so take a private copy first; otherwise
# X_train would be standardized through the alias and re-running this cell
# would fit the imputer/scaler on already-scaled data.
if np.may_share_memory(X_train_sampled, X_train):
    X_train_sampled = X_train_sampled.copy()
X_train_sampled = X_train_sampled.astype(np.float32, copy=False)
X_train_sampled[~np.isfinite(X_train_sampled)] = np.nan
train_medians = colwise_nanmedian(X_train_sampled)
impute_inplace(X_train_sampled, train_medians)
train_mean, train_std = fit_standardizer(X_train_sampled)
apply_standardizer_inplace(X_train_sampled, train_mean, train_std)

X_train_sc = X_train_sampled
# NOTE(review): this rebinds groups_train to the sampled groups, so when
# sub-sampling did happen, re-running this cell pairs the full X_train with
# the shorter group array. Re-run the loading cell first in that case.
groups_train = groups_train_sampled
print('Scaled train shape:', X_train_sc.shape)

# Save the imputation / scaling parameters alongside the other artifacts
os.makedirs(CFG['out_dir'], exist_ok=True)
with open(os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_imputer_medians.npy"), 'wb') as f:
    np.save(f, train_medians)
with open(os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_scaler_params.npz"), 'wb') as f:
    np.savez(f, mean=train_mean, std=train_std)

def transform_eval_in_batches(X, batch_size=None):
    """Yield imputed and standardized float32 batches of *X*.

    Each batch is a fresh copy, so *X* is never modified. Slicing a float32
    array yields a view, so transforming that view in place would scale *X*
    itself and make a second pass over *X* return doubly-scaled data.

    Uses the module-level ``train_medians``, ``train_mean`` and ``train_std``.

    Args:
        X: 2-D array of evaluation samples.
        batch_size: Rows per batch. Defaults to ``CFG['eval_batch_size']``,
            read at call time so a later change to CFG is honoured.

    Yields:
        (start, end, X_batch_scaled) with ``X_batch_scaled`` covering rows
        ``start:end`` of *X*.
    """
    if batch_size is None:
        batch_size = CFG['eval_batch_size']
    n = X.shape[0]
    for s in range(0, n, batch_size):
        e = min(s + batch_size, n)
        # np.array copies by default, which detaches the batch from X.
        Xe = np.array(X[s:e], dtype=np.float32)
        # impute_inplace replaces every non-finite value (NaN and +/-inf).
        impute_inplace(Xe, train_medians)
        apply_standardizer_inplace(Xe, train_mean, train_std)
        yield s, e, Xe

In [None]:
import numpy as np, pandas as pd

# Every (nu, gamma) combination built from the candidate lists in CFG.
param_grid = list(ParameterGrid({'nu': CFG['svm_nu_grid'], 'gamma': CFG['svm_gamma_grid']}))
# Desired fraction of held-out windows flagged as outliers; the search
# objective further down measures the distance to this value.
target_outlier_rate = 0.05

def build_search_subset(X, groups, n_max, seed=123):
    """Return a group-balanced subset of at most *n_max* rows for the search.

    When *X* already has at most *n_max* rows (or *n_max* is None) the inputs
    are returned untouched. Otherwise each group contributes at most
    ``max(1, n_max // n_groups)`` rows, and the result is trimmed at random
    if it still exceeds *n_max*.

    Args:
        X: 2-D array of samples (rows).
        groups: 1-D array of group ids aligned with the rows of X, or None for
            plain uniform sampling.
        n_max: Row cap, or None for no cap.
        seed: Seed for the random generator. The default reproduces the
            previously hard-coded value.

    Returns:
        (X_subset, groups_subset_or_None)
    """
    n_rows = X.shape[0]
    if n_max is None or n_rows <= n_max:
        return X, groups

    rng = np.random.default_rng(seed)
    if groups is None:
        idx = rng.choice(n_rows, n_max, replace=False)
        return X[idx], None

    groups = np.asarray(groups)
    # One stable sort puts the rows of each group next to each other (ascending
    # row index inside a group), replacing a full-array scan per group.
    _, inverse = np.unique(groups, return_inverse=True)
    inverse = inverse.reshape(-1)
    order = np.argsort(inverse, kind="stable")
    boundaries = np.flatnonzero(np.diff(inverse[order])) + 1
    per_group = max(1, n_max // (boundaries.size + 1))

    picked = []
    for members in np.split(order, boundaries):
        if members.size > per_group:
            members = rng.choice(members, per_group, replace=False)
        picked.append(members)
    take = np.concatenate(picked)

    # With more groups than n_max the per-group floor of 1 can overshoot the cap.
    if take.size > n_max:
        take = rng.choice(take, n_max, replace=False)
    return X[take], groups[take]

# Subset used for the hyper-parameter search.
X_search, groups_search = build_search_subset(X_train_sc, groups_train, CFG['max_search_samples'])

# Group-aware folds need at least two distinct groups; otherwise fall back to
# a shuffled plain K-fold.
n_search_groups = 0 if groups_search is None else len(np.unique(groups_search))
if n_search_groups >= 2:
    n_splits = min(CFG['kfold_splits'], n_search_groups)
    splitter = GroupKFold(n_splits=n_splits)
    split_args = {'X': X_search, 'y': None, 'groups': groups_search}
else:
    n_splits = max(2, CFG['kfold_splits'])
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    split_args = {'X': X_search, 'y': None}

def outlier_rate(pred):
    """Return the fraction of entries in *pred* equal to -1, as a float."""
    flagged = (pred == -1)
    return float(flagged.mean())

# Unsupervised grid search: for every (nu, gamma) pair, fit on the training
# folds and measure the share of held-out windows predicted as outliers (-1).
best_cfg, best_obj, results = None, None, []
for p in param_grid:
    fold_rates = []
    for tr_idx, va_idx in splitter.split(**split_args):
        Xtr, Xva = X_search[tr_idx], X_search[va_idx]
        m = OneClassSVM(kernel=CFG['kernel'], nu=p['nu'], gamma=p['gamma'])
        m.fit(Xtr)
        pred = m.predict(Xva)
        fold_rates.append(outlier_rate(pred))
    rate_mean, rate_std = float(np.mean(fold_rates)), float(np.std(fold_rates))
    # Objective (lower is better): distance of the mean held-out outlier rate
    # from the target, plus its spread across folds.
    obj = abs(rate_mean - target_outlier_rate) + rate_std
    results.append({'params': p, 'rate_mean': rate_mean, 'rate_std': rate_std, 'obj': obj})
    # Strict '<' keeps the earliest grid entry when objectives tie.
    if best_obj is None or obj < best_obj:
        best_obj, best_cfg = obj, p

# Show the five best configurations, lowest objective first.
res_df = pd.DataFrame(results).sort_values('obj')
display(res_df.head(5))
print('Best params:', best_cfg, '| splits:', n_splits, '| search_subset:', X_search.shape)

In [None]:
# Refit on the whole scaled training matrix with the parameters selected by the search.
final_model = OneClassSVM(kernel=CFG['kernel'], nu=best_cfg['nu'], gamma=best_cfg['gamma'])
final_model.fit(X_train_sc)
print('Final model trained.')

In [None]:
import numpy as np, os, json, pickle

# Score the eval set batch by batch into a disk-backed float32 array.
n_eval = X_eval.shape[0]
scores_path_mm = os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_eval_scores_mm.dat")
anomaly_scores_mm = np.memmap(scores_path_mm, dtype=np.float32, mode='w+', shape=(n_eval,))

for s, e, Xe_sc in transform_eval_in_batches(X_eval):
    # decision_function is negated so that a higher score means more anomalous.
    anomaly_scores_mm[s:e] = -final_model.decision_function(Xe_sc)
# Push the scores to disk before the file path is reported and reopened.
anomaly_scores_mm.flush()

scores = anomaly_scores_mm
yb = y_eval.astype(int)
# Ranking metrics need both classes present; otherwise they are reported as NaN.
if len(np.unique(yb)) > 1:
    roc = roc_auc_score(yb, scores)
    prec, rec, thr = precision_recall_curve(yb, scores)
    pr_auc = auc(rec, prec)
    ap = average_precision_score(yb, scores)
else:
    roc = pr_auc = ap = np.nan

print(f'ROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f} | AP: {ap:.4f}')

# Persist the fitted model and the run configuration together with the metrics.
with open(os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_model.pkl"), 'wb') as f:
    pickle.dump(final_model, f)
# NOTE: undefined metrics are written as the non-standard JSON token NaN.
with open(os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_config.json"), 'w') as f:
    json.dump(CFG | {'best_params': best_cfg, 'metrics': {'roc_auc': float(roc), 'pr_auc': float(pr_auc), 'ap': float(ap)}}, f, indent=2)

print('Memmap scores stored at:', scores_path_mm)

In [None]:
import numpy as np, pandas as pd, os

# Reopen the stored eval scores read-only.
scores = np.memmap(os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_eval_scores_mm.dat"),
                   dtype=np.float32, mode='r', shape=(X_eval.shape[0],))

# Flag the top `k_rate` fraction of windows by anomaly score (at least one window).
k_rate = 0.01
k = max(1, int(len(scores) * k_rate))
# k-th largest score; ties at the threshold can flag more than k windows.
thr_k = np.partition(scores, -k)[-k]
pred_topk = (scores >= thr_k).astype(np.int8)

# Detailed table of the flagged windows.
topk_idx = np.where(pred_topk == 1)[0]
topk_df = pd.DataFrame({'idx': topk_idx.astype(np.int64),
                        'anomaly_score': scores[topk_idx].astype(np.float32),
                        'y_eval': y_eval[topk_idx].astype(np.int8)})
if 'groups_eval' in globals() and groups_eval is not None:
    # NOTE(review): the int64 cast assumes numeric group ids — confirm.
    topk_df['mmsi'] = groups_eval[topk_idx].astype(np.int64)
topk_path = os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_topk_{int(k_rate*100)}pct.parquet")
topk_df.to_parquet(topk_path, index=False)
print('Saved TOP-K detailed:', topk_path, '| rows:', len(topk_df))

# Precision / recall / F1 of the top-k flags against the eval labels.
# NOTE(review): y_eval is already used unconditionally above, so this guard
# cannot be what protects against a missing y_eval.
if 'y_eval' in globals():
    yb = y_eval.astype(int)
    tp = int(((pred_topk==1) & (yb==1)).sum())
    fp = int(((pred_topk==1) & (yb==0)).sum())
    fn = int(((pred_topk==0) & (yb==1)).sum())
    prec_k = tp / (tp + fp) if (tp+fp)>0 else 0.0
    rec_k  = tp / (tp + fn) if (tp+fn)>0 else 0.0
    f1_k   = 2*prec_k*rec_k/(prec_k+rec_k) if (prec_k+rec_k)>0 else 0.0
    print(f"@k={k_rate*100:.1f}% -> P: {prec_k:.3f} | R: {rec_k:.3f} | F1: {f1_k:.3f}  (k={k})")

# Per-group aggregation: windows per group and how many of them were flagged.
if 'groups_eval' in globals() and groups_eval is not None:
    mmsi_all, n_by_mmsi = np.unique(groups_eval, return_counts=True)
    mmsi_top, n_top_by_mmsi = np.unique(groups_eval[pred_topk==1], return_counts=True)
    top_map = dict(zip(mmsi_top.tolist(), n_top_by_mmsi.tolist()))
    # Groups with no flagged window get a count of 0.
    anom_win = np.array([top_map.get(m, 0) for m in mmsi_all], dtype=np.int32)
    anom_rate = anom_win / n_by_mmsi
    agg_df = pd.DataFrame({'mmsi': mmsi_all, 'n_win': n_by_mmsi, 'anom_win': anom_win, 'anom_rate': anom_rate})
    agg_path = os.path.join(CFG['out_dir'], f"{CFG['artifact_prefix']}_mmsi_agg.parquet")
    agg_df.to_parquet(agg_path, index=False)
    print('Saved MMSI agg:', agg_path)
    display(agg_df.sort_values('anom_rate', ascending=False).head(10))

**Listo.** Corre las celdas en orden. Este cuaderno transforma y puntúa el conjunto de eval por lotes (sin construir una segunda copia gigante de la matriz), usa un memmap para los scores y guarda los artefactos en `./data/ocsvm_runs`.