In [1]:
# Import the class from the Python file (module)
import pandas as pd
import matplotlib.pyplot as plt
import os
# from dotenv import load_dotenv
# from pathlib import Path
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from BinanceClient import BinanceClient
import numpy as np
from typing import Final
import joblib
from BatchFeatures import BatchFeatures
from datetime import datetime, timedelta
%matplotlib widget

## Load pair df

In [6]:
import os
from datetime import datetime, timedelta, timezone

def interval_slug(s: str) -> str:
    return s.strip().replace(" ", "").replace("/", "").lower()

def make_db_name(pair: str, interval: str, weeks: int) -> str:
    return f"{pair}_{interval_slug(interval)}_{weeks}weeks.db"

def load_or_fetch_pair_df(pair: str, interval: str, weeks: int) -> tuple[str, "pd.DataFrame"]:
    db_name = make_db_name(pair, interval, weeks)
    db_path = "./db/" + db_name

    print(f"[{pair}] DB: {db_path}")

    binance_client = BinanceClient(db_path)
    binance_client.set_interval(interval)

    df = None

    if os.path.exists(db_path):
        df = binance_client.fetch_data_from_db(pair)
        if df is not None and not df.empty:
            print(f"[{pair}] Loaded {len(df):,} rows from DB.")
        else:
            df = None

    if df is None:
        print(f"[{pair}] No usable DB data found -> fetching from Binance...")

        api_secret = os.getenv("BINANCE_SECRET_KEY")
        api_key = os.getenv("BINANCE_API_KEY")
        binance_client.make(api_key, api_secret)

        server_time = binance_client.get_server_time()
        end_dt = datetime.fromtimestamp(server_time["serverTime"] / 1000, tz=timezone.utc)
        start_dt = end_dt - timedelta(weeks=weeks)

        start_ms = int(start_dt.timestamp() * 1000)
        end_ms = int(end_dt.timestamp() * 1000)

        data = binance_client.fetch_data(pair, start_ms, end_ms)
        if data is None or data.empty:
            raise RuntimeError(f"[{pair}] No data returned from Binance for the requested range.")

        binance_client.store_data_to_db(pair, data)

        df = binance_client.fetch_data_from_db(pair)
        if df is None or df.empty:
            raise RuntimeError(f"[{pair}] Data fetched/stored but DB load returned empty.")

        print(f"[{pair}] Fetched + stored + loaded {len(df):,} rows.")

    df = df.sort_index()
    return db_path, df


## Load COINS, then align timestamps

In [7]:
import pandas as pd
import numpy as np

def detect_volume_events(
    df: pd.DataFrame,
    symbol: str,
    vol_win: int = 144,          # 12 hours on 5m
    impulse_k: int = 12,         # 60 min impulse
    rvol_thresh: float = 6.0,    # strict
    impulse_thresh: float = 0.04,# +4% over impulse_k
    lookahead: int = 24,         # 2 hours forward path
    cooldown: int = 12,          # avoid logging same burst repeatedly (60 min)
):
    """
    Logs candidate 'flow shock' events:
      - RVOL spike relative to rolling median
      - Positive impulse over last impulse_k bars
    Then measures forward path stats over lookahead bars.
    """
    d = df.copy().sort_index()
    d = d[["open","high","low","close","volume"]].dropna()

    vol_med = d["volume"].rolling(vol_win).median()
    rvol = d["volume"] / vol_med
    impulse = d["close"] / d["close"].shift(impulse_k) - 1.0

    out = []
    i = 0
    n = len(d)

    while i < n - lookahead:
        if (rvol.iloc[i] >= rvol_thresh) and (impulse.iloc[i] >= impulse_thresh):
            px0 = float(d["close"].iloc[i])
            ts0 = d.index[i]

            future = d["close"].iloc[i+1:i+1+lookahead]
            fmax = float(future.max())
            fmin = float(future.min())
            max_fwd_return = fmax / px0 - 1.0
            max_drawdown = fmin / px0 - 1.0

            # retrace from the peak within the lookahead window
            # find peak time then worst after that peak
            peak_idx = future.values.argmax()
            peak_px = float(future.iloc[peak_idx])
            after_peak = future.iloc[peak_idx:]  # includes peak bar
            trough_after_peak = float(after_peak.min())
            max_retrace = trough_after_peak / peak_px - 1.0  # negative means retrace

            # time to max retrace (bars after event)
            trough_idx = after_peak.values.argmin()
            time_to_max_retrace_bars = int(peak_idx + trough_idx + 1)

            out.append({
                "symbol": symbol,
                "event_ts": ts0,
                "close_event": px0,
                "rvol": float(rvol.iloc[i]),
                "impulse": float(impulse.iloc[i]),
                "max_fwd_return": max_fwd_return,
                "max_drawdown": max_drawdown,
                "max_retrace": max_retrace,
                "time_to_max_retrace_bars": time_to_max_retrace_bars,
            })

            i += cooldown  # skip ahead so we don't log every bar of the same burst
        else:
            i += 1

    return pd.DataFrame(out)


## Get all Binance coin pairs

In [3]:
import requests
import pandas as pd

BINANCE_REST = "https://api.binance.com"

def get_spot_usdt_symbols():
    """All Spot symbols that trade against USDT and are currently TRADING."""
    info = requests.get(f"{BINANCE_REST}/api/v3/exchangeInfo", timeout=20).json()
    syms = []
    for s in info["symbols"]:
        if s.get("status") != "TRADING":
            continue
        if s.get("isSpotTradingAllowed") is not True:
            continue
        if s.get("quoteAsset") != "USDT":
            continue

        sym = s["symbol"]

        # Exclude leveraged tokens & some common non-spot-like tickers
        bad_substrings = ["UPUSDT", "DOWNUSDT", "BULLUSDT", "BEARUSDT", "3LUSDT", "3SUSDT", "5LUSDT", "5SUSDT"]
        if any(sym.endswith(x) for x in bad_substrings):
            continue

        syms.append(sym)
    return sorted(set(syms))

def rank_symbols_by_quote_volume(symbols):
    """Return DataFrame of symbols with 24h quoteVolume (USDT) sorted desc."""
    tickers = requests.get(f"{BINANCE_REST}/api/v3/ticker/24hr", timeout=20).json()
    # Build a map for fast lookup
    wanted = set(symbols)

    rows = []
    for t in tickers:
        sym = t.get("symbol")
        if sym not in wanted:
            continue
        # quoteVolume is in quoteAsset units, here USDT
        qv = float(t.get("quoteVolume", 0.0))
        rows.append({
            "symbol": sym,
            "quoteVolumeUSDT_24h": qv,
            "lastPrice": float(t.get("lastPrice", 0.0)),
            "priceChangePercent": float(t.get("priceChangePercent", 0.0)),
            "count": int(t.get("count", 0)),  # trade count 24h
        })

    df = pd.DataFrame(rows)
    df = df.sort_values("quoteVolumeUSDT_24h", ascending=False).reset_index(drop=True)
    return df

def get_top_usdt_pairs(n=100, min_quote_vol_usdt=None):
    """Top-N by 24h quote volume; optionally filter by minimum quote volume."""
    syms = get_spot_usdt_symbols()
    ranked = rank_symbols_by_quote_volume(syms)

    if min_quote_vol_usdt is not None:
        ranked = ranked[ranked["quoteVolumeUSDT_24h"] >= float(min_quote_vol_usdt)].copy()

    top = ranked.head(n).copy()
    return top, ranked


In [4]:
top100, ranked_all = get_top_usdt_pairs(n=100)
pairs = top100["symbol"].tolist()

len(pairs)


100

In [8]:
interval = "5m"
weeks = 52

paths = {}
dfs = {}

for sym in pairs:
    db_path, df = load_or_fetch_pair_df(sym, interval, weeks)
    paths[sym] = db_path
    dfs[sym] = df
    dfs[sym] = df


[USDCUSDT] DB: ./db/USDCUSDT_5m_52weeks.db
[USDCUSDT] Loaded 104,832 rows from DB.
[ETHUSDT] DB: ./db/ETHUSDT_5m_52weeks.db
[ETHUSDT] Loaded 104,832 rows from DB.
[BTCUSDT] DB: ./db/BTCUSDT_5m_52weeks.db
[BTCUSDT] Loaded 104,832 rows from DB.
[USD1USDT] DB: ./db/USD1USDT_5m_52weeks.db
[USD1USDT] Loaded 71,475 rows from DB.
[SOLUSDT] DB: ./db/SOLUSDT_5m_52weeks.db
[SOLUSDT] Loaded 104,832 rows from DB.
[XRPUSDT] DB: ./db/XRPUSDT_5m_52weeks.db
[XRPUSDT] Loaded 104,832 rows from DB.
[FOGOUSDT] DB: ./db/FOGOUSDT_5m_52weeks.db
[FOGOUSDT] Loaded 2,907 rows from DB.
[FDUSDUSDT] DB: ./db/FDUSDUSDT_5m_52weeks.db
[FDUSDUSDT] Loaded 104,832 rows from DB.
[PAXGUSDT] DB: ./db/PAXGUSDT_5m_52weeks.db
[PAXGUSDT] Loaded 104,832 rows from DB.
[ZKPUSDT] DB: ./db/ZKPUSDT_5m_52weeks.db
[ZKPUSDT] Loaded 5,212 rows from DB.
[BNBUSDT] DB: ./db/BNBUSDT_5m_52weeks.db
[BNBUSDT] Loaded 104,832 rows from DB.
[AXSUSDT] DB: ./db/AXSUSDT_5m_52weeks.db
[AXSUSDT] Loaded 104,832 rows from DB.
[ZECUSDT] DB: ./db/ZECUSDT_

RuntimeError: [RVNUSDT] No data returned from Binance for the requested range.

## Detect Comporession state

In [11]:
import pandas as pd
import numpy as np

def detect_compression_state(
    df: pd.DataFrame,
    *,
    atr_short: int = 20,          # ~100 min on 5m
    atr_long: int = 100,          # ~8 hours on 5m
    vol_win: int = 144,           # volume median window (12 hours)
    vol_ratio_thresh: float = 0.6,
    rvol_thresh: float = 0.6,
    min_duration: int = 12        # bars of sustained compression (60 min)
):
    """
    Detects pre-shock compression state.

    Returns df with added columns:
      - atr
      - atr_med
      - vol_compression
      - rvol
      - volu_compression
      - compression_raw
      - compression_duration
      - is_compressed
    """

    d = df.copy().sort_index()
    d = d[["open","high","low","close","volume"]].dropna()

    # -----------------------
    # 1) Volatility (ATR)
    # -----------------------
    high = d["high"]
    low  = d["low"]
    close = d["close"]

    tr = pd.concat([
        high - low,
        (high - close.shift()).abs(),
        (low - close.shift()).abs()
    ], axis=1).max(axis=1)

    d["atr"] = tr.rolling(atr_short).mean()
    d["atr_med"] = d["atr"].rolling(atr_long).median()

    d["vol_compression"] = d["atr"] / d["atr_med"]
    d["is_vol_compressed"] = d["vol_compression"] <= vol_ratio_thresh

    # -----------------------
    # 2) Volume compression
    # -----------------------
    vol_med = d["volume"].rolling(vol_win).median()
    d["rvol"] = d["volume"] / vol_med
    d["is_volume_compressed"] = d["rvol"] <= rvol_thresh

    # -----------------------
    # 3) Raw compression flag
    # -----------------------
    d["compression_raw"] = (
        d["is_vol_compressed"] &
        d["is_volume_compressed"]
    )

    # -----------------------
    # 4) Duration counter
    # -----------------------
    duration = np.zeros(len(d), dtype=int)

    for i in range(1, len(d)):
        if d["compression_raw"].iloc[i]:
            duration[i] = duration[i-1] + 1
        else:
            duration[i] = 0

    d["compression_duration"] = duration

    # -----------------------
    # 5) Final state
    # -----------------------
    d["is_compressed"] = d["compression_duration"] >= min_duration

    return d


In [12]:
df = dfs["BTCUSDT"]
df_c = detect_compression_state(df)

df_c[["vol_compression","rvol","compression_duration","is_compressed"]].tail(30)


Unnamed: 0_level_0,vol_compression,rvol,compression_duration,is_compressed
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2026-01-24 12:50:00,0.868259,1.387158,0,False
2026-01-24 12:55:00,0.837331,1.140229,0,False
2026-01-24 13:00:00,0.797691,0.936581,0,False
2026-01-24 13:05:00,0.779997,0.651361,0,False
2026-01-24 13:10:00,0.775064,0.883699,0,False
2026-01-24 13:15:00,0.7865,2.852841,0,False
2026-01-24 13:20:00,0.698898,1.816785,0,False
2026-01-24 13:25:00,0.675804,0.651221,0,False
2026-01-24 13:30:00,0.65734,0.881418,0,False
2026-01-24 13:35:00,0.694816,0.81219,0,False


In [15]:
df_c["is_compressed"].mean()

np.float64(0.0025660103785103785)