In [1]:
# === Crypto Anomaly Detector — CoinGecko-first, single cell ===
# First run (optional):
# !pip -q install yfinance pandas numpy scikit-learn scipy requests plotly gradio

import sys, platform, datetime as dt, warnings
import numpy as np
import pandas as pd
import requests
import plotly.graph_objects as go
from scipy.stats import median_abs_deviation
from sklearn.ensemble import IsolationForest
from IPython.display import display

print("Python:", sys.version)
print("Platform:", platform.platform())
print("Run at:", dt.datetime.now())

# ----------------------------
# 1) DATA LAYER (CoinGecko -> yfinance fallback)
# ----------------------------

# Symbol -> CoinGecko ID
COINGECKO_IDS = {
    "BTCUSDT": "bitcoin",
    "ETHUSDT": "ethereum",
    "BNBUSDT": "binancecoin",
    "SOLUSDT": "solana",
    "XRPUSDT": "ripple",
    "ADAUSDT": "cardano",
    "DOGEUSDT": "dogecoin",
    "DOTUSDT": "polkadot",
    "MATICUSDT": "matic-network",
}

def _cg_days_from_params(interval: str, limit: int):
    """Approximate CoinGecko OHLC 'days' parameter from interval/limit."""
    if interval == "1h":
        days = max(int(limit / 24) + 2, 1)
    elif interval == "1d":
        days = max(limit + 2, 2)
    else:
        days = max(int(limit / 24) + 2, 2)
    # CoinGecko OHLC accepts days in {1,7,14,30,90,180,365}
    cutoffs = [1, 7, 14, 30, 90, 180, 365]
    for c in cutoffs:
        if days <= c:
            return c
    return 365

def fetch_ohlcv_coingecko(symbol="BTCUSDT", interval="1h", limit=800):
    """Fetch OHLC from CoinGecko. Interval is approximated via 'days'."""
    coin = COINGECKO_IDS.get(symbol, None)
    if coin is None:
        raise RuntimeError(f"No CoinGecko ID for symbol: {symbol}")
    days = _cg_days_from_params(interval, limit)
    url = f"https://api.coingecko.com/api/v3/coins/{coin}/ohlc"
    r = requests.get(url, params={"vs_currency": "usd", "days": days}, timeout=30)
    r.raise_for_status()
    data = r.json()
    if not isinstance(data, list) or len(data) == 0:
        raise RuntimeError("CoinGecko OHLC returned empty payload.")
    # data: [[timestamp_ms, open, high, low, close], ...]
    df = pd.DataFrame(data, columns=["t", "open", "high", "low", "close"])
    df["open_time"] = pd.to_datetime(df["t"], unit="ms", utc=True)
    df["volume"] = np.nan  # OHLC endpoint doesn’t include volume
    df["close_time"] = df["open_time"]
    df["num_trades"] = np.nan
    df = df[["open_time", "open", "high", "low", "close", "volume", "close_time", "num_trades"]]
    df = df.sort_values("open_time").tail(limit).reset_index(drop=True)
    if df.empty:
        raise RuntimeError("CoinGecko data is empty after trimming to limit.")
    return df

def _yf_symbol(symbol):
    return symbol.replace("USDT", "-USD")

def fetch_ohlcv_yfinance_daily(symbol="BTCUSDT", limit=365):
    import yfinance as yf
    tkr = _yf_symbol(symbol)
    period = f"{max(limit + 5, 30)}d"
    df = yf.download(tkr, period=period, interval="1d", progress=False, auto_adjust=True)
    if df is None or df.empty:
        raise RuntimeError("yfinance daily data is empty.")
    df = df.reset_index()
    time_col = "Date" if "Date" in df.columns else ("Datetime" if "Datetime" in df.columns else None)
    if time_col is None:
        df["open_time"] = pd.to_datetime(df.index, utc=True)
    else:
        df = df.rename(columns={time_col: "open_time"})
        df["open_time"] = pd.to_datetime(df["open_time"], utc=True, errors="coerce")
    close_col = "Adj Close" if "Adj Close" in df.columns else ("Close" if "Close" in df.columns else None)
    open_col  = "Open" if "Open" in df.columns else close_col
    high_col  = "High" if "High" in df.columns else close_col
    low_col   = "Low"  if "Low"  in df.columns else close_col
    vol_col   = "Volume" if "Volume" in df.columns else None
    if close_col is None:
        raise RuntimeError(f"Unexpected yfinance columns: {list(df.columns)}")
    out = pd.DataFrame({
        "open_time": df["open_time"],
        "open":  pd.to_numeric(df[open_col], errors="coerce"),
        "high":  pd.to_numeric(df[high_col], errors="coerce"),
        "low":   pd.to_numeric(df[low_col], errors="coerce"),
        "close": pd.to_numeric(df[close_col], errors="coerce"),
        "volume": pd.to_numeric(df[vol_col], errors="coerce") if vol_col else np.nan,
    })
    out["close_time"] = out["open_time"]
    out["num_trades"] = np.nan
    out = out.sort_values("open_time").dropna(subset=["close"]).tail(limit).reset_index(drop=True)
    return out

def fetch_ohlcv(symbol="BTCUSDT", interval="1h", limit=800):
    try:
        return fetch_ohlcv_coingecko(symbol, interval, limit)
    except Exception as e:
        print(f"[INFO] CoinGecko failed ({type(e).__name__}): {e}. Falling back to yfinance daily…")
    # fallback: daily data
    return fetch_ohlcv_yfinance_daily(symbol, limit=max(limit // 24, 200) if interval == "1h" else limit)

# ----------------------------
# 2) FEATURES
# ----------------------------
def add_basic_features(df: pd.DataFrame):
    df = df.copy().sort_values("open_time")
    df["return"] = df["close"].pct_change(fill_method=None)
    df["log_return"] = np.log(df["close"]).diff()
    df["volatility_24"] = df["return"].rolling(24, min_periods=4).std()
    df["volume"] = df.get("volume", np.nan).astype(float)
    df["volume_change"] = df["volume"].pct_change(fill_method=None)
    # SMA(24) & deviation
    df["ma_24"] = df["close"].rolling(24, min_periods=4).mean()
    df["ma_24_dev"] = (df["close"] - df["ma_24"]) / df["ma_24"]
    # RSI(14)
    delta = df["close"].diff()
    up = delta.clip(lower=0).rolling(14, min_periods=4).mean()
    down = (-delta.clip(upper=0)).rolling(14, min_periods=4).mean()
    rs = up / (down.replace(0, np.nan))
    df["rsi_14"] = 100 - (100 / (1 + rs))
    # MACD (12,26) + signal(9)
    ema12 = df["close"].ewm(span=12, adjust=False, min_periods=4).mean()
    ema26 = df["close"].ewm(span=26, adjust=False, min_periods=4).mean()
    df["macd"] = ema12 - ema26
    df["macd_signal"] = df["macd"].ewm(span=9, adjust=False, min_periods=4).mean()
    df["macd_hist"] = df["macd"] - df["macd_signal"]
    return df

def add_extra_indicators(df: pd.DataFrame):
    """ATR(14), Bollinger(20), OBV."""
    df = df.copy()
    prev_close = df["close"].shift()
    tr = np.maximum(df["high"] - df["low"],
                    np.maximum((df["high"] - prev_close).abs(), (df["low"] - prev_close).abs()))
    df["atr_14"] = tr.rolling(14, min_periods=7).mean()
    ma20 = df["close"].rolling(20, min_periods=10).mean()
    std20 = df["close"].rolling(20, min_periods=10).std()
    df["bb_upper"] = ma20 + 2 * std20
    df["bb_lower"] = ma20 - 2 * std20
    df["bb_width"] = (df["bb_upper"] - df["bb_lower"]) / (ma20.replace(0, np.nan))
    vol = df["volume"].fillna(0.0)
    direction = np.sign(df["close"].diff()).fillna(0)
    df["obv"] = (vol * direction).cumsum()
    return df

def build_feature_matrix(df: pd.DataFrame, cols=None):
    if cols is None:
        cols = [
            "return", "log_return", "volatility_24", "volume_change",
            "ma_24_dev", "rsi_14", "macd", "macd_signal", "macd_hist",
            "atr_14", "bb_width", "obv"
        ]
    X = df[cols].replace([np.inf, -np.inf], np.nan).fillna(0.0).values
    return X, cols

# ----------------------------
# 3) DETECTORS
# ----------------------------
def zscore_mad_detector(series: pd.Series, z=3.0):
    x = series.values
    med = np.nanmedian(x)
    mad = median_abs_deviation(x, nan_policy="omit")
    denom = (mad * 1.4826) if (mad is not None and mad > 0) else np.nanstd(x)
    rzs = (x - med) / (denom if denom else 1e-8)
    score = np.abs(rzs)
    flags = (np.abs(rzs) >= z).astype(int)
    return score, flags

def isolation_forest_scores(X, random_state=42, contamination="auto"):
    clf = IsolationForest(
        n_estimators=300, max_samples="auto",
        contamination=contamination, n_jobs=-1, random_state=random_state
    )
    clf.fit(X)
    return -clf.decision_function(X)  # larger = more anomalous

def lof_scores(X, n_neighbors=35, contamination='auto'):
    from sklearn.neighbors import LocalOutlierFactor
    n = X.shape[0]
    if n < 10:
        return np.zeros(n)
    n_neighbors = min(n_neighbors, max(5, n // 20), n - 1)
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination, novelty=False)
    _ = lof.fit_predict(X)
    raw = -lof.negative_outlier_factor_  # larger = more anomalous
    return (raw - raw.min()) / (raw.max() - raw.min() + 1e-8)

def ocs_svm_scores(X, nu=0.05, gamma='scale', kernel='rbf'):
    from sklearn.svm import OneClassSVM
    n = X.shape[0]
    if n < 10:
        return np.zeros(n)
    ocs = OneClassSVM(kernel=kernel, nu=min(max(nu, 0.01), 0.5), gamma=gamma)
    ocs.fit(X)
    s = -ocs.decision_function(X)
    return (s - s.min()) / (s.max() - s.min() + 1e-8)

def combine_scores(*arrays, method="rank_sum"):
    M = np.column_stack(arrays)
    ranks = np.argsort(np.argsort(M, axis=0), axis=0)   # ascending
    rev = (M.shape[0] - 1) - ranks                      # higher = more anomalous
    combo = rev.sum(axis=1)
    return (combo - combo.min()) / (combo.max() - combo.min() + 1e-8)

# ----------------------------
# 4) VISUALIZATION (+ optional event overlay later)
# ----------------------------
def timeseries_with_anomalies(df: pd.DataFrame, score_col="anomaly_score", threshold=0.95):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df["open_time"], y=df["close"], mode="lines", name="Close"))
    anoms = df[df[score_col] >= threshold]
    fig.add_trace(go.Scatter(
        x=anoms["open_time"], y=anoms["close"],
        mode="markers", name="Anomalies", marker=dict(size=8, symbol="x")
    ))
    fig.update_layout(height=500, title=f"Anomaly threshold ≥ {threshold}")
    return fig

def overlay_events(fig, events):
    """events: [{'date': 'YYYY-MM-DD' or ISO string, 'label': '...'}, ...]"""
    for e in events:
        t = pd.to_datetime(e.get("date"), utc=True)
        label = e.get("label", "")
        fig.add_vline(x=t, line_width=1, line_dash="dot")
        fig.add_annotation(x=t, y=1.02, xref="x", yref="paper",
                           text=label, showarrow=False, yanchor="bottom", align="center")
    return fig

# ----------------------------
# 5) OPTIONAL: LSTM Autoencoder (PyTorch)
# ----------------------------
try:
    import torch, torch.nn as nn
    TORCH_OK = True
except Exception:
    TORCH_OK = False
    print("[INFO] PyTorch not available; AE score will be disabled.")

class LSTMAutoencoder(nn.Module if TORCH_OK else object):
    def __init__(self, input_size=1, hidden_size=64, num_layers=1):
        if not TORCH_OK: return
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, input_size, num_layers=num_layers, batch_first=True)
    def forward(self, x):
        z, _ = self.encoder(x)
        out, _ = self.decoder(z)
        return out

def _make_sequences(x1d: np.ndarray, seq_len=24):
    x1d = np.asarray(x1d, dtype=float)
    if len(x1d) < seq_len: return np.zeros((0, seq_len, 1), dtype=float)
    seqs = [x1d[i:i+seq_len] for i in range(len(x1d)-seq_len+1)]
    return np.stack(seqs)[:, :, None]

def lstm_ae_scores_from_close(df, seq_len=24, epochs=6, lr=1e-3, hidden=64, layers=1):
    if not TORCH_OK:
        return np.zeros(len(df))
    s = df["close"].astype(float).interpolate().ffill().bfill().values
    mu, sg = np.nanmean(s), np.nanstd(s) + 1e-8
    z = (s - mu) / sg
    S = _make_sequences(z, seq_len=seq_len)
    if S.shape[0] == 0: return np.zeros(len(df))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = LSTMAutoencoder(1, hidden, layers).to(device)
    X = torch.tensor(S, dtype=torch.float32).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.MSELoss()
    model.train()
    for _ in range(epochs):
        opt.zero_grad(); out = model(X); loss = crit(out, X); loss.backward(); opt.step()
    model.eval()
    with torch.no_grad():
        recon = model(X)
        err = ((recon - X)**2).mean(dim=(1,2)).cpu().numpy()
    full = np.zeros(len(z)); cnt = np.zeros(len(z))
    for i, e in enumerate(err): full[i:i+seq_len] += e; cnt[i:i+seq_len] += 1
    full = np.divide(full, cnt, out=np.zeros_like(full), where=cnt>0)
    return (full - np.nanmin(full)) / (np.nanmax(full) - np.nanmin(full) + 1e-8)

# ----------------------------
# 6) PIPELINES (V1/V2/V3/V4)
# ----------------------------

# V1 (basic IF + z-score)
def run_pipeline(symbol="BTCUSDT", interval="1h", limit=800, threshold_quantile=0.97, contamination="auto"):
    df = fetch_ohlcv(symbol, interval, limit)
    if df is None or df.empty:
        raise RuntimeError("No data (CoinGecko & yfinance both failed).")
    # Note: fallback may be daily. It still works but with daily resolution.
    df = add_basic_features(df)
    zscore, _ = zscore_mad_detector(df["return"].fillna(0.0), z=3.0)
    X, _ = build_feature_matrix(df, cols=["return","log_return","volatility_24","volume_change","ma_24_dev","rsi_14","macd","macd_signal","macd_hist"])
    if X.shape[0] >= 10:
        iso_score = isolation_forest_scores(X, contamination=contamination)
        combo = combine_scores(zscore, iso_score)
        df["iso_score"] = iso_score
    else:
        warnings.warn(f"Too few samples ({X.shape[0]}). Using z-score only.")
        zmin, zmax = np.nanmin(zscore), np.nanmax(zscore)
        combo = (zscore - zmin) / (zmax - zmin + 1e-8)
    df["zscore_score"] = zscore
    df["anomaly_score"] = combo
    cutoff = np.nanquantile(df["anomaly_score"], float(threshold_quantile))
    df["is_anomaly"] = (df["anomaly_score"] >= cutoff).astype(int)
    cols = ["open_time","close","anomaly_score","zscore_score"] + (["iso_score"] if "iso_score" in df.columns else [])
    topk = df.nlargest(20, "anomaly_score")[cols]
    fig = timeseries_with_anomalies(df, "anomaly_score", cutoff)
    return df, fig, topk, cutoff

# V2 (adds LOF + OCSVM)
def run_pipeline_v2(symbol="BTCUSDT", interval="1h", limit=800, q=0.97, contam="auto"):
    df = fetch_ohlcv(symbol, interval, limit)
    df = add_basic_features(df)
    df = add_extra_indicators(df)
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    X, _ = build_feature_matrix(df, cols=num_cols)
    zscore, _ = zscore_mad_detector(df["return"].fillna(0.0), z=3.0)
    iso = isolation_forest_scores(X, contamination=contam) if X.shape[0] >= 10 else np.zeros(len(df))
    lof = lof_scores(X) if X.shape[0] >= 10 else np.zeros(len(df))
    ocs = ocs_svm_scores(X) if X.shape[0] >= 10 else np.zeros(len(df))
    df["zscore_score"] = zscore
    df["iso_score"] = iso
    df["lof_score"] = lof
    df["ocs_score"] = ocs
    combo = combine_scores(zscore, iso, lof, ocs)
    df["anomaly_score"] = combo
    cutoff = np.nanquantile(df["anomaly_score"], float(q))
    df["is_anomaly"] = (df["anomaly_score"] >= cutoff).astype(int)
    compare = df.nlargest(20, "anomaly_score")[["open_time","close","anomaly_score","zscore_score","iso_score","lof_score","ocs_score"]]
    fig = timeseries_with_anomalies(df, "anomaly_score", cutoff)
    return df, fig, compare, cutoff

# V3 (adds LSTM-AE and re-combines)
def run_pipeline_v3(symbol="BTCUSDT", interval="1h", limit=800, q=0.97, contam="auto",
                    use_v2=True, ae_seq=24, ae_epochs=6):
    if use_v2:
        df, _, _, _ = run_pipeline_v2(symbol, interval, limit, q=min(q, 0.95), contam=contam)
    else:
        df = fetch_ohlcv(symbol, interval, limit)
        df = add_basic_features(df)
        df = add_extra_indicators(df)
    # AE score
    ae = lstm_ae_scores_from_close(df, seq_len=ae_seq, epochs=ae_epochs)
    if len(ae) != len(df):
        m = min(len(ae), len(df))
        tmp = np.zeros(len(df)); tmp[-m:] = ae[-m:]; ae = tmp
    df["ae_score"] = ae
    parts = [df[c].values for c in ["zscore_score","iso_score","lof_score","ocs_score","ae_score"] if c in df.columns]
    combo = combine_scores(*parts) if len(parts) > 1 else parts[0]
    df["anomaly_score_v3"] = combo
    cutoff = np.nanquantile(df["anomaly_score_v3"], float(q))
    df["is_anomaly_v3"] = (df["anomaly_score_v3"] >= cutoff).astype(int)
    cols = ["open_time","close","anomaly_score_v3","ae_score"] + [c for c in ["zscore_score","iso_score","lof_score","ocs_score"] if c in df.columns]
    topk = df.nlargest(20, "anomaly_score_v3")[cols]
    fig = timeseries_with_anomalies(df, score_col="anomaly_score_v3", threshold=cutoff)
    return df, fig, topk, cutoff

# V4 (event overlay on top of V3)
def run_pipeline_v4(symbol="BTCUSDT", interval="1h", limit=800, q=0.97, contam="auto",
                    ae_seq=24, ae_epochs=6, events=None):
    if events is None:
        events = []
    df, fig_v3, topk, cutoff = run_pipeline_v3(symbol, interval, limit, q, contam, use_v2=True, ae_seq=ae_seq, ae_epochs=ae_epochs)
    fig_over = overlay_events(fig_v3, events)
    fig_over.update_layout(title="Anomalies + Event Overlay (V4)")
    return df, fig_over, topk, cutoff

# ----------------------------
# 7) EXAMPLE RUN (V4)
# ----------------------------
SYMBOL   = "BTCUSDT"   # e.g. "ETHUSDT", "SOLUSDT", ...
INTERVAL = "1h"        # "1h" or "1d"
LIMIT    = 800
Q        = 0.97
CONTAM   = "auto"
AE_SEQ   = 24
AE_EPOCH = 6

events = [
    {"date": "2025-06-13", "label": "Event A"},
    {"date": "2025-07-08", "label": "Event B"},
    {"date": "2025-08-12", "label": "Event C"},
]

df_v4, fig_v4, topk_v4, cut_v4 = run_pipeline_v4(SYMBOL, INTERVAL, LIMIT, Q, CONTAM, AE_SEQ, AE_EPOCH, events)
fig_v4.show()
display(topk_v4.head(20))
print("V4 Cutoff:", cut_v4)

# CSV export
out_path = f"results_{SYMBOL}_{INTERVAL}_v4.csv"
df_v4.to_csv(out_path, index=False)
print("Saved:", out_path)

# ----------------------------
# 8) DEV PATCH: reproducibility + scaling + walk-forward + improved AE + V5
# ----------------------------
# Reproducibility + metadata + scaling + walk-forward + improved AE trainer
import os, json, random
from sklearn.preprocessing import StandardScaler

# Seed
SEED = 42
random.seed(SEED); np.random.seed(SEED)
try:
    import torch
    if TORCH_OK:
        torch.manual_seed(SEED)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
except Exception:
    pass

CONFIG = dict(SYMBOL=SYMBOL, INTERVAL=INTERVAL, LIMIT=LIMIT, Q=Q, CONTAM=CONTAM, AE_SEQ=AE_SEQ, AE_EPOCH=AE_EPOCH, SEED=SEED)
def save_metadata(path="metadata.json", config=CONFIG, extra=None):
    meta = dict(config)
    meta["run_time"] = pd.Timestamp.now(tz="UTC").isoformat()
    if extra: meta.update(extra)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2, ensure_ascii=False)
    print(f"[INFO] metadata saved → {path}")

scaler = StandardScaler()
def build_feature_matrix_scaled(df: pd.DataFrame, cols=None):
    if cols is None:
        cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    X = df[cols].replace([np.inf, -np.inf], np.nan).fillna(0.0).values
    Xs = scaler.fit_transform(X)
    return Xs, cols

# Walk-forward (simple)
def walk_forward_validate(series, window=200, step=50, model_fn=None):
    results = []
    for start in range(0, len(series)-window, step):
        train = series.iloc[start:start+int(window*0.8)]
        test  = series.iloc[start+int(window*0.8):start+window]
        if len(test)==0: break
        score = model_fn(train, test)
        results.append(dict(start=start, end=start+window, score=score))
    return results

# Improved LSTM-AE trainer (mini-batch + early stopping)
if TORCH_OK:
    import torch.nn as nn
    class LSTMAutoencoderImproved(nn.Module):
        def __init__(self, input_size=1, hidden_size=64, num_layers=1):
            super().__init__()
            self.encoder = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
            self.decoder = nn.LSTM(hidden_size, input_size, num_layers=num_layers, batch_first=True)
        def forward(self, x):
            z, _ = self.encoder(x)
            out, _ = self.decoder(z)
            return out

    def train_lstm_ae(X, seq_len=24, epochs=30, batch_size=32, patience=5, lr=1e-3, hidden=64):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = LSTMAutoencoderImproved(1, hidden).to(device)
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        crit = nn.MSELoss()
        best_loss = np.inf; patience_counter = 0
        X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
        dataset = torch.utils.data.TensorDataset(X_tensor)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
        for ep in range(epochs):
            model.train(); losses=[]
            for (batch,) in loader:
                opt.zero_grad()
                out = model(batch)
                loss = crit(out, batch)
                loss.backward(); opt.step()
                losses.append(loss.item())
            mean_loss = float(np.mean(losses))
            if mean_loss < best_loss - 1e-4:
                best_loss = mean_loss; patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"[INFO] Early stopping at epoch {ep}")
                    break
        return model

# Metadata now
extra = {"rows": int(len(df_v4)), "cols": list(map(str, df_v4.columns)),
         "cutoff_v4": float(df_v4["anomaly_score_v3"].quantile(CONFIG["Q"])) if "anomaly_score_v3" in df_v4 else None}
save_metadata("metadata.json", CONFIG, extra)

# Scaled IF score
num_cols = [c for c in df_v4.columns if pd.api.types.is_numeric_dtype(df_v4[c])]
Xs, feat_cols = build_feature_matrix_scaled(df_v4, cols=num_cols)
clf = IsolationForest(random_state=SEED, contamination="auto")
clf.fit(Xs)
iso_scaled = -clf.decision_function(Xs)
df_v4["iso_scaled"] = (iso_scaled - iso_scaled.min()) / (iso_scaled.max() - iso_scaled.min() + 1e-8)
print("iso_scaled added. features:", len(feat_cols))

# Walk-forward with IF
def wf_if(train_close: pd.Series, test_close: pd.Series):
    idx_train = train_close.index
    idx_test  = test_close.index
    Xtr, _ = build_feature_matrix_scaled(df_v4.loc[idx_train], cols=num_cols)
    Xte, _ = build_feature_matrix_scaled(df_v4.loc[idx_test],  cols=num_cols)
    clf = IsolationForest(random_state=SEED, contamination="auto").fit(Xtr)
    s = -clf.decision_function(Xte)
    return float(np.mean(s))

wf_res = walk_forward_validate(df_v4["close"], window=240, step=48, model_fn=wf_if)
wf_df = pd.DataFrame(wf_res)
display(wf_df.head(10))
print("walk-forward mean score:", float(wf_df["score"].mean()) if not wf_df.empty else "n/a")

# Improved AE score
seq_len = CONFIG["AE_SEQ"]
series = df_v4["close"].astype(float).interpolate().ffill().bfill().values
mu, sg = np.mean(series), np.std(series) + 1e-8
z = (series - mu) / sg
S = np.stack([z[i:i+seq_len] for i in range(len(z)-seq_len+1)], axis=0)[:, :, None] if len(z) >= seq_len else np.zeros((0, seq_len, 1))
if TORCH_OK and S.shape[0] >= 16:
    model = train_lstm_ae(S, seq_len=seq_len, epochs=30, batch_size=32, patience=5, hidden=64)
    with torch.no_grad():
        X = torch.tensor(S, dtype=torch.float32)
        recon = model(X).cpu().numpy()
        err = ((recon - S)**2).mean(axis=(1,2))
    full = np.zeros(len(z)); cnt = np.zeros(len(z))
    for i, e in enumerate(err):
        full[i:i+seq_len] += e; cnt[i:i+seq_len] += 1
    ae_imp = np.divide(full, cnt, out=np.zeros_like(full), where=cnt>0)
    df_v4["ae_score_improved"] = (ae_imp - ae_imp.min()) / (ae_imp.max() - ae_imp.min() + 1e-8)
    print("ae_score_improved added.")
else:
    df_v4["ae_score_improved"] = 0.0
    print("[INFO] PyTorch not available or data too short; ae_score_improved set to 0.")

# V5 combined score
parts_cols = [c for c in ["zscore_score","iso_score","lof_score","ocs_score","iso_scaled","ae_score_improved"]
              if c in df_v4.columns and df_v4[c].notna().any()]
assert len(parts_cols) >= 1, "No usable score columns."
combo_v5 = combine_scores(*[df_v4[c].values for c in parts_cols]) if len(parts_cols) > 1 else df_v4[parts_cols[0]].values
df_v4["anomaly_score_v5"] = combo_v5
cut_v5 = float(df_v4["anomaly_score_v5"].quantile(CONFIG["Q"]))
fig_v5 = timeseries_with_anomalies(df_v4, score_col="anomaly_score_v5", threshold=cut_v5)
fig_v5.update_layout(title="Anomalies — V5 (Scaled IF + Improved AE)")
fig_v5.show()
df_v4.to_csv("results_v5.csv", index=False)
save_metadata("metadata.json", CONFIG, {"parts_used": parts_cols, "cutoff_v5": cut_v5})
print("Saved: results_v5.csv | cutoff_v5:", cut_v5)

# ----------------------------
# 9) OPTIONAL: Gradio UI (V5)
# ----------------------------
# !pip -q install gradio
import gradio as gr
import json as _json

def ui_analyze(symbol, interval, limit, q, use_if, use_lof, use_ocs, use_iso_scaled, use_ae_imp, events_json):
    df, _, _, _ = run_pipeline_v4(symbol, interval, int(limit), float(q), "auto", AE_SEQ, AE_EPOCH, [])
    parts = []
    if use_if and "iso_score" in df: parts.append("iso_score")
    if use_lof and "lof_score" in df: parts.append("lof_score")
    if use_ocs and "ocs_score" in df: parts.append("ocs_score")
    if use_iso_scaled and "iso_scaled" in df: parts.append("iso_scaled")
    if use_ae_imp and "ae_score_improved" in df: parts.append("ae_score_improved")
    if not parts:
        parts = [c for c in ["zscore_score","iso_score","lof_score","ocs_score","iso_scaled","ae_score_improved"] if c in df.columns]
    combo = combine_scores(*[df[c].values for c in parts]) if len(parts) > 1 else df[parts[0]].values
    df["anomaly_score_v5"] = combo
    cut = float(df["anomaly_score_v5"].quantile(float(q)))
    fig2 = timeseries_with_anomalies(df, "anomaly_score_v5", cut)
    try:
        ev = _json.loads(events_json) if events_json.strip() else []
        if isinstance(ev, dict): ev = [ev]
        fig2 = overlay_events(fig2, ev) if ev else fig2
    except Exception:
        pass
    return fig2, df.nlargest(20,"anomaly_score_v5")[["open_time","close","anomaly_score_v5"] + parts], cut, _json.dumps(parts)

with gr.Blocks() as app:
    gr.Markdown("## Crypto Anomaly Detector — V5 UI")
    with gr.Row():
        symbol = gr.Dropdown(list(COINGECKO_IDS.keys()), value="BTCUSDT", label="Symbol")
        interval = gr.Dropdown(["1h","1d"], value="1h", label="Interval")
        limit = gr.Slider(200, 1200, value=800, step=50, label="Limit")
        q = gr.Slider(0.90, 0.99, value=0.97, step=0.001, label="Quantile (Q)")
    gr.Markdown("**Score components**")
    with gr.Row():
        use_if = gr.Checkbox(True, label="IsolationForest")
        use_lof = gr.Checkbox(True, label="LOF")
        use_ocs = gr.Checkbox(True, label="One-Class SVM")
        use_iso_scaled = gr.Checkbox(True, label="IF (Scaled)")
        use_ae_imp = gr.Checkbox(False, label="AE Improved (PyTorch required)")
    events_json = gr.Textbox(value='[{"date":"2025-06-13","label":"Event A"}]', lines=3, label="Events JSON")
    btn = gr.Button("Analyze")
    plot = gr.Plot(label="Anomaly Plot (V5)")
    table = gr.Dataframe(label="Top 20 (V5)")
    cut = gr.Number(label="Cutoff (Q)")
    parts = gr.Textbox(label="Parts Used (JSON)")
    btn.click(ui_analyze, [symbol,interval,limit,q,use_if,use_lof,use_ocs,use_iso_scaled,use_ae_imp,events_json],
              [plot,table,cut,parts])

# Uncomment when you want to run the UI:
# app.launch(share=True)


Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
Platform: Linux-6.1.123+-x86_64-with-glibc2.35
Run at: 2025-08-27 10:13:24.269684


Unnamed: 0,open_time,close,anomaly_score_v3,ae_score,zscore_score,iso_score,lof_score,ocs_score
4,2025-06-13 00:00:00+00:00,105979.0,1.0,0.0,0.03778,-0.025055,0.002242,0.914364
16,2025-07-31 00:00:00+00:00,117833.0,0.766667,0.0,0.201043,-0.044227,0.010027,0.712694
15,2025-07-27 00:00:00+00:00,117960.0,0.716667,0.0,0.871077,-0.022669,0.0,0.651443
5,2025-06-17 00:00:00+00:00,106951.0,0.7,0.0,0.240182,-0.021716,0.010158,0.937651
8,2025-06-29 00:00:00+00:00,107332.0,0.7,0.0,0.396196,-0.017446,0.00909,0.91
13,2025-07-19 00:00:00+00:00,117989.0,0.7,0.0,0.817556,-0.01143,0.003412,0.444344
14,2025-07-23 00:00:00+00:00,119956.0,0.7,0.0,0.563059,-0.018861,0.008072,0.578107
19,2025-08-12 00:00:00+00:00,118774.0,0.7,0.0,0.32583,-0.042237,0.003872,0.885695
10,2025-07-07 00:00:00+00:00,109215.0,0.65,0.0,0.0,0.017575,0.155909,0.007762
9,2025-07-03 00:00:00+00:00,108824.0,0.566667,0.0,0.443789,0.012012,0.168899,0.0


V4 Cutoff: 0.845999999859
Saved: results_BTCUSDT_1h_v4.csv
[INFO] metadata saved → metadata.json
iso_scaled added. features: 30


walk-forward mean score: n/a
[INFO] PyTorch not available or data too short; ae_score_improved set to 0.


[INFO] metadata saved → metadata.json
Saved: results_v5.csv | cutoff_v5: 0.9793749998469727


In [2]:
# === V5 block — scaled IF, walk-forward, improved AE, combined score, SAVE TO REPO DIR ===
import os, json, random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# ---------- Output directory (your repo) ----------
OUTPUT_DIR = os.path.expanduser("~/Documents/crypto-anomaly-detector")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Safety: get df_v4 / CONFIG from previous cells or create sane defaults ----------
try:
    _ = df_v4
except NameError:
    # Try to run V4 if available
    if "run_pipeline_v4" in globals():
        SYMBOL   = globals().get("SYMBOL", "BTCUSDT")
        INTERVAL = globals().get("INTERVAL", "1h")
        LIMIT    = globals().get("LIMIT", 800)
        Q        = globals().get("Q", 0.97)
        CONTAM   = globals().get("CONTAM", "auto")
        AE_SEQ   = globals().get("AE_SEQ", 24)
        AE_EPOCH = globals().get("AE_EPOCH", 6)
        df_v4, _, _, _ = run_pipeline_v4(SYMBOL, INTERVAL, LIMIT, Q, CONTAM, AE_SEQ, AE_EPOCH, events=[])
    else:
        raise RuntimeError("df_v4 not found and run_pipeline_v4 is not defined. Please run V4 first.")

SEED     = 42
SYMBOL   = globals().get("SYMBOL", "BTCUSDT")
INTERVAL = globals().get("INTERVAL", "1h")
LIMIT    = globals().get("LIMIT", 800)
Q        = globals().get("Q", 0.97)
CONTAM   = globals().get("CONTAM", "auto")
AE_SEQ   = globals().get("AE_SEQ", 24)
AE_EPOCH = globals().get("AE_EPOCH", 6)

CONFIG = dict(SYMBOL=SYMBOL, INTERVAL=INTERVAL, LIMIT=LIMIT, Q=Q, CONTAM=CONTAM, AE_SEQ=AE_SEQ, AE_EPOCH=AE_EPOCH, SEED=SEED)

# Reproducibility
random.seed(SEED); np.random.seed(SEED)
try:
    import torch
    TORCH_OK = True
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
except Exception:
    TORCH_OK = False

# ---------- Utilities ----------
def save_metadata(path="metadata.json", config=CONFIG, extra=None):
    meta = dict(config)
    meta["run_time"] = pd.Timestamp.now(tz="UTC").isoformat()
    if extra: meta.update(extra)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2, ensure_ascii=False)
    print(f"[INFO] metadata saved → {path}")

scaler = StandardScaler()
def build_feature_matrix_scaled(df: pd.DataFrame, cols=None):
    if cols is None:
        cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    X = df[cols].replace([np.inf, -np.inf], np.nan).fillna(0.0).values
    Xs = scaler.fit_transform(X)
    return Xs, cols

def walk_forward_validate(series, window=200, step=50, model_fn=None):
    results = []
    for start in range(0, len(series)-window, step):
        train = series.iloc[start:start+int(window*0.8)]
        test  = series.iloc[start+int(window*0.8):start+window]
        if len(test)==0: break
        score = model_fn(train, test)
        results.append(dict(start=start, end=start+window, score=score))
    return results

def combine_scores(*arrays):
    M = np.column_stack(arrays)
    ranks = np.argsort(np.argsort(M, axis=0), axis=0)
    rev = (M.shape[0] - 1) - ranks
    combo = rev.sum(axis=1)
    return (combo - combo.min()) / (combo.max() - combo.min() + 1e-8)

# ---------- Scaled IsolationForest ----------
num_cols = [c for c in df_v4.columns if pd.api.types.is_numeric_dtype(df_v4[c])]
Xs, feat_cols = build_feature_matrix_scaled(df_v4, cols=num_cols)
clf = IsolationForest(random_state=SEED, contamination="auto")
clf.fit(Xs)
iso_scaled = -clf.decision_function(Xs)
df_v4["iso_scaled"] = (iso_scaled - iso_scaled.min()) / (iso_scaled.max() - iso_scaled.min() + 1e-8)
print(f"[INFO] iso_scaled added (features={len(feat_cols)})")

# ---------- Walk-forward (IF on rolling windows) ----------
def wf_if(train_close: pd.Series, test_close: pd.Series):
    idx_train = train_close.index
    idx_test  = test_close.index
    Xtr, _ = build_feature_matrix_scaled(df_v4.loc[idx_train], cols=num_cols)
    Xte, _ = build_feature_matrix_scaled(df_v4.loc[idx_test],  cols=num_cols)
    _clf = IsolationForest(random_state=SEED, contamination="auto").fit(Xtr)
    s = -_clf.decision_function(Xte)
    return float(np.mean(s))

wf_res = walk_forward_validate(df_v4["close"], window=240, step=48, model_fn=wf_if)
wf_df = pd.DataFrame(wf_res)
display(wf_df.head(10))
print("walk-forward mean score:", float(wf_df["score"].mean()) if not wf_df.empty else "n/a")

# ---------- Improved AE (optional, PyTorch) ----------
def _make_sequences(x1d: np.ndarray, seq_len=24):
    x1d = np.asarray(x1d, dtype=float)
    if len(x1d) < seq_len: return np.zeros((0, seq_len, 1), dtype=float)
    seqs = [x1d[i:i+seq_len] for i in range(len(x1d)-seq_len+1)]
    return np.stack(seqs)[:, :, None]

if TORCH_OK:
    import torch.nn as nn
    class LSTMAutoencoderImproved(nn.Module):
        def __init__(self, input_size=1, hidden_size=64, num_layers=1):
            super().__init__()
            self.encoder = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
            self.decoder = nn.LSTM(hidden_size, input_size, num_layers=num_layers, batch_first=True)
        def forward(self, x):
            z, _ = self.encoder(x)
            out, _ = self.decoder(z)
            return out

    def train_lstm_ae(X, epochs=30, batch_size=32, patience=5, lr=1e-3, hidden=64):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = LSTMAutoencoderImproved(1, hidden).to(device)
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        crit = nn.MSELoss()
        best = np.inf; wait = 0
        X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
        ds = torch.utils.data.TensorDataset(X_tensor)
        dl = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)
        for ep in range(epochs):
            model.train(); losses=[]
            for (batch,) in dl:
                opt.zero_grad(); out = model(batch); loss = crit(out, batch)
                loss.backward(); opt.step(); losses.append(loss.item())
            m = float(np.mean(losses))
            if m < best - 1e-4: best, wait = m, 0
            else:
                wait += 1
                if wait >= patience:
                    print(f"[INFO] AE early stop @ epoch {ep}")
                    break
        return model

# Build normalized series and sequences
seq_len = AE_SEQ
series = df_v4["close"].astype(float).interpolate().ffill().bfill().values
mu, sg = np.mean(series), np.std(series) + 1e-8
z = (series - mu) / sg
S = np.stack([z[i:i+seq_len] for i in range(len(z)-seq_len+1)], axis=0)[:, :, None] if len(z) >= seq_len else np.zeros((0, seq_len, 1))

if TORCH_OK and S.shape[0] >= 16:
    model = train_lstm_ae(S, epochs=30, batch_size=32, patience=5, hidden=64)
    with torch.no_grad():
        X = torch.tensor(S, dtype=torch.float32)
        recon = model(X).cpu().numpy()
        err = ((recon - S)**2).mean(axis=(1,2))
    full = np.zeros(len(z)); cnt = np.zeros(len(z))
    for i, e in enumerate(err):
        full[i:i+seq_len] += e; cnt[i:i+seq_len] += 1
    ae_imp = np.divide(full, cnt, out=np.zeros_like(full), where=cnt>0)
    df_v4["ae_score_improved"] = (ae_imp - ae_imp.min()) / (ae_imp.max() - ae_imp.min() + 1e-8)
    print("[INFO] ae_score_improved added.")
else:
    df_v4["ae_score_improved"] = 0.0
    print("[INFO] PyTorch not available or data too short; ae_score_improved set to 0.")

# ---------- V5 combined score ----------
parts_cols = [c for c in ["zscore_score","iso_score","lof_score","ocs_score","iso_scaled","ae_score_improved"]
              if c in df_v4.columns and df_v4[c].notna().any()]
assert len(parts_cols) >= 1, "No usable score columns were found."
combo_v5 = combine_scores(*[df_v4[c].values for c in parts_cols]) if len(parts_cols) > 1 else df_v4[parts_cols[0]].values
df_v4["anomaly_score_v5"] = combo_v5
cut_v5 = float(df_v4["anomaly_score_v5"].quantile(Q))

# Plot
fig_v5 = timeseries_with_anomalies(df_v4, score_col="anomaly_score_v5", threshold=cut_v5)
fig_v5.update_layout(title="Anomalies — V5 (Scaled IF + Improved AE)")
fig_v5.show()

# ---------- SAVE to your repo folder ----------
csv_path  = os.path.join(OUTPUT_DIR, "results_v5.csv")
meta_path = os.path.join(OUTPUT_DIR, "metadata.json")
df_v4.to_csv(csv_path, index=False)
save_metadata(meta_path, CONFIG, {"parts_used": parts_cols, "cutoff_v5": cut_v5})

print("Saved CSV :", csv_path)
print("Saved JSON:", meta_path)


[INFO] iso_scaled added (features=33)


walk-forward mean score: n/a
[INFO] PyTorch not available or data too short; ae_score_improved set to 0.


[INFO] metadata saved → /root/Documents/crypto-anomaly-detector/metadata.json
Saved CSV : /root/Documents/crypto-anomaly-detector/results_v5.csv
Saved JSON: /root/Documents/crypto-anomaly-detector/metadata.json
