In [1]:
# Import the class from the Python file (module)
import pandas as pd
import matplotlib.pyplot as plt
import os
# from dotenv import load_dotenv
# from pathlib import Path
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from BinanceClient import BinanceClient
import numpy as np
from typing import Final
import joblib
from BatchFeatures import BatchFeatures
from datetime import datetime, timedelta
%matplotlib widget

In [3]:
from pathlib import Path

def pairs_from_db_dir(db_dir="db", strict_ascii=True):
    db_dir = Path(db_dir)   # <-- FIX
    pairs = []

    for p in db_dir.glob("*.db"):
        sym = p.name.split("_", 1)[0]
        if not sym.endswith("USDT"):
            continue
        if strict_ascii and not sym.isascii():
            continue
        pairs.append(sym)

    return sorted(set(pairs))


pairs = pairs_from_db_dir("db", strict_ascii=True)
print(len(pairs))


122


In [5]:
import os
from datetime import datetime, timedelta, timezone

def interval_slug(s: str) -> str:
    return s.strip().replace(" ", "").replace("/", "").lower()

def make_db_name(pair: str, interval: str, weeks: int) -> str:
    return f"{pair}_{interval_slug(interval)}_{weeks}weeks.db"

def load_or_fetch_pair_df(pair: str, interval: str, weeks: int) -> tuple[str, "pd.DataFrame"]:
    db_name = make_db_name(pair, interval, weeks)
    db_path = "./db/" + db_name

    print(f"[{pair}] DB: {db_path}")

    binance_client = BinanceClient(db_path)
    binance_client.set_interval(interval)

    df = None

    if os.path.exists(db_path):
        df = binance_client.fetch_data_from_db(pair)
        if df is not None and not df.empty:
            print(f"[{pair}] Loaded {len(df):,} rows from DB.")
        else:
            df = None

    if df is None:
        print(f"[{pair}] No usable DB data found -> fetching from Binance...")

        api_secret = os.getenv("BINANCE_SECRET_KEY")
        api_key = os.getenv("BINANCE_API_KEY")
        binance_client.make(api_key, api_secret)

        server_time = binance_client.get_server_time()
        end_dt = datetime.fromtimestamp(server_time["serverTime"] / 1000, tz=timezone.utc)
        start_dt = end_dt - timedelta(weeks=weeks)

        start_ms = int(start_dt.timestamp() * 1000)
        end_ms = int(end_dt.timestamp() * 1000)

        data = binance_client.fetch_data(pair, start_ms, end_ms)
        if data is None or data.empty:
            raise RuntimeError(f"[{pair}] No data returned from Binance for the requested range.")

        binance_client.store_data_to_db(pair, data)

        df = binance_client.fetch_data_from_db(pair)
        if df is None or df.empty:
            raise RuntimeError(f"[{pair}] Data fetched/stored but DB load returned empty.")

        print(f"[{pair}] Fetched + stored + loaded {len(df):,} rows.")

    df = df.sort_index()
    return db_path, df


In [6]:
interval = "5m"
weeks = 52

dfs = {}
paths = {}

for sym in pairs:
    db_path, df = load_or_fetch_pair_df(sym, interval, weeks)
    dfs[sym] = df
    paths[sym] = db_path


[0GUSDT] DB: ./db/0GUSDT_5m_52weeks.db
[0GUSDT] Loaded 36,249 rows from DB.
[1INCHUSDT] DB: ./db/1INCHUSDT_5m_52weeks.db
[1INCHUSDT] Loaded 104,832 rows from DB.
[2ZUSDT] DB: ./db/2ZUSDT_5m_52weeks.db
[2ZUSDT] Loaded 33,383 rows from DB.
[AAVEUSDT] DB: ./db/AAVEUSDT_5m_52weeks.db
[AAVEUSDT] Loaded 104,832 rows from DB.
[ADAUSDT] DB: ./db/ADAUSDT_5m_52weeks.db
[ADAUSDT] Loaded 104,832 rows from DB.
[AGLDUSDT] DB: ./db/AGLDUSDT_5m_52weeks.db
[AGLDUSDT] Loaded 104,832 rows from DB.
[ALGOUSDT] DB: ./db/ALGOUSDT_5m_52weeks.db
[ALGOUSDT] Loaded 104,832 rows from DB.
[ALLOUSDT] DB: ./db/ALLOUSDT_5m_52weeks.db
[ALLOUSDT] Loaded 21,807 rows from DB.
[APTUSDT] DB: ./db/APTUSDT_5m_52weeks.db
[APTUSDT] Loaded 104,832 rows from DB.
[ARBUSDT] DB: ./db/ARBUSDT_5m_52weeks.db
[ARBUSDT] Loaded 104,832 rows from DB.
[ASTERUSDT] DB: ./db/ASTERUSDT_5m_52weeks.db
[ASTERUSDT] Loaded 32,185 rows from DB.
[ASTRUSDT] DB: ./db/ASTRUSDT_5m_52weeks.db
[ASTRUSDT] Loaded 104,832 rows from DB.
[AUCTIONUSDT] DB: ./db/

In [7]:
# Align timestamps across all dfs
common_index = None

for df in dfs.values():
    idx = df.index
    common_index = idx if common_index is None else common_index.intersection(idx)

for sym in dfs:
    dfs[sym] = dfs[sym].loc[common_index]


In [8]:
def detect_volume_events(
    df,
    vol_win=144,
    impulse_k=12,
    rvol_thresh=6.0,
    impulse_thresh=0.04,
    lookahead=24,
    cooldown=12,
):
    d = df[["open","high","low","close","volume"]].dropna()
    vol_med = d["volume"].rolling(vol_win).median()
    rvol = d["volume"] / vol_med
    impulse = d["close"] / d["close"].shift(impulse_k) - 1

    out = []
    i = 0
    while i < len(d) - lookahead:
        if rvol.iloc[i] >= rvol_thresh and impulse.iloc[i] >= impulse_thresh:
            out.append({
                "event_ts": d.index[i],
                "rvol": float(rvol.iloc[i]),
                "impulse": float(impulse.iloc[i]),
            })
            i += cooldown
        else:
            i += 1

    return pd.DataFrame(out)


In [9]:
events = []
for sym, df in dfs.items():
    ev = detect_volume_events(df)
    ev["symbol"] = sym
    events.append(ev)

events = pd.concat(events, ignore_index=True)


In [10]:
def compute_instability_features_over_window(
    d: pd.DataFrame,
    win: int = 36,  # 180 min
):
    out = pd.DataFrame(index=d.index)

    rng = (d["high"] - d["low"]) / d["close"]
    atr = rng.rolling(14).mean()
    vol_med = d["volume"].rolling(144).median()
    rvol = d["volume"] / vol_med

    body = (d["close"] - d["open"]).abs()
    wick = (d["high"] - d["low"]) - body

    out["range_pct"] = rng.rolling(win).mean()
    out["atr_mean_ratio"] = atr.rolling(win).mean() / atr.rolling(200).mean()
    out["atr_slope"] = atr.rolling(win).apply(lambda x: pd.Series(x).diff().mean())
    out["rvol_mean"] = rvol.rolling(win).mean()
    out["rvol_slope"] = rvol.rolling(win).apply(lambda x: pd.Series(x).diff().mean())
    out["overlap_ratio"] = (
        ((d["low"].shift(1) <= d["high"]) &
         (d["high"].shift(1) >= d["low"]))
        .rolling(win)
        .mean()
    )
    out["wick_to_body"] = (wick / (body + 1e-6)).rolling(win).mean()

    return out


In [11]:
def classify_pre_shock_instability_v1(df_feat):
    return (
        (df_feat["range_pct"] >= 0.036) &
        (df_feat["rvol_slope"] >= 0.03) &
        (df_feat["atr_mean_ratio"] >= 1.1)
    )


In [12]:
instability_cache = {}

for sym, df in dfs.items():
    feat = compute_instability_features_over_window(df)
    feat["is_instability"] = classify_pre_shock_instability_v1(feat)
    instability_cache[sym] = feat


In [13]:
def mark_future_shock_fast(df_feat, ev, horizon_bars=24):
    shock_times = ev["event_ts"].values
    idx = df_feat.index.values

    future_shock = np.zeros(len(idx), dtype=bool)
    j = 0

    for i in range(len(idx)):
        while j < len(shock_times) and shock_times[j] <= idx[i]:
            j += 1
        if j < len(shock_times) and shock_times[j] <= idx[i + horizon_bars]:
            future_shock[i] = True

    df_feat = df_feat.copy()
    df_feat["future_shock"] = future_shock
    return df_feat


In [15]:
rows = []

for sym, df_feat in instability_cache.items():
    ev = events.query("symbol == @sym")
    if ev.empty:
        continue

    df2 = mark_future_shock_fast(df_feat, ev, horizon_bars=24)

    n_inst = df2["is_instability"].sum()
    if n_inst < 5:
        continue

    p_base = df2["future_shock"].mean()
    p_cond = df2.loc[df2["is_instability"], "future_shock"].mean()

    rows.append({
        "symbol": sym,
        "base_prob": p_base,
        "cond_prob": p_cond,
        "lift": p_cond / p_base if p_base > 0 else None,
        "instability_rate": df2["is_instability"].mean(),
    })

prob_df = pd.DataFrame(rows)

if prob_df.empty:
    print("⚠️ No symbols passed the filters (rows empty).")
else:
    prob_df = prob_df.sort_values("lift", ascending=False)

prob_df



⚠️ No symbols passed the filters (rows empty).


In [16]:
print("Rows:", len(rows))
if rows:
    print("Keys:", rows[0].keys())
dict_keys(['symbol','base_prob','cond_prob','lift','instability_rate'])


Rows: 0


In [17]:
dict_keys(['symbol','base_prob','cond_prob','lift','instability_rate'])


NameError: name 'dict_keys' is not defined

In [18]:
print(sym, "n_inst:", n_inst, "n_shocks:", len(ev))


ZROUSDT n_inst: 0 n_shocks: 0
