In [None]:
# =============================================================================
# CELL 1: SETUP -- Multi-Asset Crash Regime LightGBM Training
# =============================================================================
# Train crash-regime LightGBM models for ETH, SOL, XRP
# Same 51-feature approach as BTC v3, with BTC as cross-asset lead
# Training config: RECENT_ONLY (2021 crash + 2025 crash)
# =============================================================================

# Mount Google Drive at /content/drive; the cache and model paths defined
# further down in this cell live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Install the third-party packages imported below. pip is invoked through the
# running interpreter (sys.executable) so the packages land in the environment
# this kernel actually imports from. Output is captured to keep the cell quiet,
# but a failed install is reported instead of being silently discarded.
import subprocess
import sys
_pip_result = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'lightgbm', 'yfinance'],
    capture_output=True,
    text=True,
)
if _pip_result.returncode != 0:
    # Best-effort: the packages may already be installed, so only warn here and
    # let the imports below decide whether the notebook can continue.
    print(f'[WARN] pip install exited with code {_pip_result.returncode}')
    print(_pip_result.stderr[-2000:])

import pandas as pd
import numpy as np
import requests
import time
import os
import pickle
import json
import lightgbm as lgb
from datetime import datetime, timedelta
from sklearn.metrics import accuracy_score, roc_auc_score

# Drive folders: DRIVE_SAVE is this notebook's output/cache directory,
# DRIVE_V3 is a sibling directory that later cells read cached inputs from.
_TRAINING_ROOT = '/content/drive/MyDrive/renaissance-bot-training'
DRIVE_SAVE = f'{_TRAINING_ROOT}/crash_models_multi'
DRIVE_V3 = f'{_TRAINING_ROOT}/crash_models_v3'

# Create the output directory and the local scratch directories up front.
for _directory in (DRIVE_SAVE, '/content/data', '/content/models'):
    os.makedirs(_directory, exist_ok=True)

# Crash windows as (start date, end date, label); several assets share them.
_ICO_BUST = ('2018-01-07', '2018-12-15', 'ICO Bust')
_TERRA_FTX_FED = ('2021-11-10', '2022-11-21', 'Terra/FTX/Fed')
_CURRENT = ('2025-10-06', '2026-02-28', 'Current')

# Per-asset configuration: Binance symbol, first date to download, the crash
# windows used for labelling, and the asset used as the cross-asset lead.
ASSETS = {
    'ETH': dict(
        symbol='ETHUSDT',
        start_date='2017-09-01',
        crash_periods=[_ICO_BUST, _TERRA_FTX_FED, _CURRENT],
        cross_asset='BTC',
    ),
    'SOL': dict(
        symbol='SOLUSDT',
        start_date='2020-08-01',
        crash_periods=[_TERRA_FTX_FED, _CURRENT],
        cross_asset='BTC',
    ),
    'XRP': dict(
        symbol='XRPUSDT',
        start_date='2018-01-01',
        crash_periods=[_ICO_BUST, _TERRA_FTX_FED, _CURRENT],
        cross_asset='BTC',
    ),
}

TRAINING_CONFIG = 'RECENT_ONLY'

# Feature names organised by group; FEATURE_COLS is the groups flattened in
# this exact order.
_FEATURE_GROUPS = (
    # Group 1: Target asset Price/Volume (15)
    [
        'return_1bar', 'return_6bar', 'return_12bar', 'return_48bar',
        'return_288bar',
        'vol_12bar', 'vol_48bar', 'vol_ratio', 'volume_surge', 'volume_trend',
        'consecutive_red', 'drawdown_24h', 'rsi_14_norm', 'bb_pct_b',
        'vwap_distance',
    ],
    # Group 2: Daily Macro (10)
    [
        'spx_return_1d', 'spx_vs_sma', 'vix_norm', 'vix_change', 'vix_extreme',
        'dxy_return_1d', 'dxy_trend', 'yield_level', 'yield_change', 'fng_norm',
    ],
    # Group 3: Derivatives (9)
    [
        'funding_z', 'funding_extreme_long', 'funding_extreme_short',
        'oi_change_1h', 'oi_change_4h', 'oi_spike',
        'ls_ratio_norm', 'ls_extreme_long', 'taker_imbalance',
    ],
    # Group 4: Intraday Macro (11) -- placeholders
    [
        'spx_return_5m', 'spx_return_15m', 'spx_return_1h',
        'spx_momentum_5m', 'spx_direction_5m',
        'vix_return_5m', 'vix_return_1h', 'vix_spike_5m',
        'ndx_return_5m', 'ndx_return_1h',
        'has_intraday_macro',
    ],
    # Group 5: Cross-Asset -- BTC as lead (6)
    [
        'btc_return_1bar', 'btc_return_6bar', 'btc_asset_ratio_change',
        'btc_lead_1', 'btc_lead_2', 'btc_lead_3',
    ],
)

FEATURE_COLS = [feature for group in _FEATURE_GROUPS for feature in group]

# Guard against accidentally adding or dropping a feature name.
assert len(FEATURE_COLS) == 51, f'Expected 51 features, got {len(FEATURE_COLS)}'

# Echo the effective configuration so the cell output records what will run.
_setup_summary = (
    '[DONE] Setup complete',
    f'Assets to train: {list(ASSETS)}',
    f'Training config: {TRAINING_CONFIG}',
    f'Feature count: {len(FEATURE_COLS)}',
)
for _summary_line in _setup_summary:
    print(_summary_line)

In [None]:
# =============================================================================
# CELL 2: Download price data for all assets
# =============================================================================

def download_binance_klines(symbol, start_date, interval='5m', max_retries=5):
    """Download klines for *symbol* from the Binance data-api endpoint.

    Pages forward from *start_date* to "now" in batches of up to 1000 candles.

    Args:
        symbol: Binance symbol, e.g. ``'ETHUSDT'``.
        start_date: Anything ``pd.Timestamp`` accepts; first candle to request.
        interval: Kline interval string passed to the API.
        max_retries: Number of consecutive failed requests (non-200 status or
            request/JSON error) tolerated before giving up.

    Returns:
        DataFrame sorted by ``timestamp`` with the raw kline columns plus a
        ``timestamp`` column, or an empty DataFrame when nothing was fetched.
        If the download is aborted early, the candles collected so far are
        returned and a warning is printed.
    """
    base_url = "https://data-api.binance.vision/api/v3/klines"
    all_candles = []
    start_ms = int(pd.Timestamp(start_date).timestamp() * 1000)
    # time.time() is already epoch-based; datetime.utcnow().timestamp() would
    # interpret the naive UTC value as local time on a non-UTC machine.
    end_ms = int(time.time() * 1000)
    current = start_ms
    failures = 0  # consecutive failures; reset after every successful page

    while current < end_ms:
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': current,
            'limit': 1000,
        }
        try:
            resp = requests.get(base_url, params=params, timeout=30)
            if resp.status_code in (451, 403, 418):
                # Same policy as the futures downloaders: treat these statuses
                # as a block and stop instead of retrying.
                print(f'  [WARN] {symbol} klines blocked (status {resp.status_code})')
                break
            if resp.status_code != 200:
                failures += 1
                if failures > max_retries:
                    print(f'  [WARN] {symbol} status {resp.status_code}, '
                          f'giving up after {max_retries} retries')
                    break
                print(f'  [WARN] {symbol} status {resp.status_code}, retrying...')
                time.sleep(5)
                continue
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            failures += 1
            if failures > max_retries:
                print(f'  [WARN] {symbol} error: {e}, '
                      f'giving up after {max_retries} retries')
                break
            print(f'  [WARN] {symbol} error: {e}, retrying...')
            time.sleep(5)
            continue

        failures = 0
        if not data:
            break
        all_candles.extend(data)
        # Element 0 of a kline is its open time (ms); resume just after it.
        current = data[-1][0] + 1
        if len(all_candles) % 100000 == 0:
            print(f'  {symbol}: {len(all_candles):,} candles...')
        time.sleep(0.1)

    if not all_candles:
        return pd.DataFrame()

    df = pd.DataFrame(all_candles, columns=[
        'open_time', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_volume', 'trades', 'taker_buy_base',
        'taker_buy_quote', 'ignore'
    ])
    for col in ['open', 'high', 'low', 'close', 'volume', 'quote_volume']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['timestamp'] = pd.to_datetime(df['open_time'], unit='ms')
    df = df.drop_duplicates('timestamp').sort_values('timestamp').reset_index(drop=True)
    return df


# 1. Load BTC data (needed as cross-asset for all models)
# Cache candidates are tried in order: the v3 directory first, then this
# notebook's own cache. A file with 100,000 rows or fewer is rejected as too small.
btc_path = f'{DRIVE_V3}/btc_5m_full.csv'
btc_local = f'{DRIVE_SAVE}/btc_5m.csv'
btc = pd.DataFrame()
btc_source = None  # path the accepted cache was read from, if any

for p in [btc_path, btc_local]:
    if not os.path.exists(p):
        continue
    candidate = pd.read_csv(p)
    candidate['timestamp'] = pd.to_datetime(candidate['timestamp'])
    if len(candidate) > 100000:
        btc = candidate
        btc_source = p
        print(f'[OK] BTC loaded from cache: {len(btc):,} candles')
        break
    print(f'[WARN] BTC file at {p} too small ({len(candidate)}), trying next...')

if btc.empty:
    print('BTC not found in cache, downloading fresh...')
    btc = download_binance_klines('BTCUSDT', '2017-09-01')

if btc.empty:
    raise RuntimeError('BTC download failed -- cannot continue without BTC cross-asset data')

# Only write the local cache when the data did not come from that very file;
# re-serialising the full BTC history to Drive on every run is wasted I/O.
if btc_source != btc_local:
    btc.to_csv(btc_local, index=False)
print(f'BTC: {len(btc):,} candles ({btc["timestamp"].min()} to {btc["timestamp"].max()})')

# 2. Download each target asset
# asset_data maps asset name -> 5m candle DataFrame; BTC is seeded from above.
asset_data = {'BTC': btc}
for name, cfg in ASSETS.items():
    cache_path = f'{DRIVE_SAVE}/{name.lower()}_5m.csv'

    # Use the cached CSV when it exists and holds more than 50,000 rows;
    # otherwise fall through to a fresh download.
    cached = pd.read_csv(cache_path) if os.path.exists(cache_path) else None
    if cached is not None:
        cached['timestamp'] = pd.to_datetime(cached['timestamp'])
        if len(cached) > 50000:
            print(f'[OK] {name} loaded from cache: {len(cached):,} candles')
            asset_data[name] = cached
            continue

    ticker = cfg['symbol']
    print(f'Downloading {name} ({ticker})...')
    fresh = download_binance_klines(ticker, cfg['start_date'])
    # The (possibly empty) result is stored either way.
    asset_data[name] = fresh
    if len(fresh) == 0:
        print(f'[ERROR] {name} download failed')
        continue

    fresh.to_csv(cache_path, index=False)
    first_ts = fresh['timestamp'].min()
    last_ts = fresh['timestamp'].max()
    print(f'[OK] {name}: {len(fresh):,} candles ({first_ts} to {last_ts})')

print(f'\n[DONE] Price data ready for {list(asset_data)}')
for asset, frame in asset_data.items():
    print(f'  {asset}: {len(frame):,} candles')

In [None]:
# =============================================================================
# CELL 3: Download macro data (shared across all assets)
# =============================================================================

# Check if macro data exists in v3 or multi cache
# A cached file is accepted only when it has more than 500 rows.
macro_df = pd.DataFrame()
for p in [f'{DRIVE_V3}/macro_daily.csv', f'{DRIVE_SAVE}/macro_daily.csv']:
    if os.path.exists(p):
        macro_df = pd.read_csv(p, index_col=0, parse_dates=True)
        if len(macro_df) > 500:
            # Normalise to one tz-naive midnight timestamp per calendar day so
            # the index can be matched against tz-naive candle dates later on.
            # A cache written with tz offsets (e.g. by an earlier run) is
            # converted via UTC first.
            # NOTE(review): going through UTC keeps the calendar date only for
            # zero or negative UTC offsets (US exchanges) -- confirm if other
            # tickers are ever added.
            macro_df.index = (
                pd.to_datetime(macro_df.index, utc=True).tz_localize(None).normalize()
            )
            macro_df = macro_df[~macro_df.index.duplicated(keep='last')].sort_index()
            print(f'[OK] Macro data loaded from cache: {len(macro_df)} days')
            break
        else:
            macro_df = pd.DataFrame()

if len(macro_df) == 0:
    print('Downloading macro data via yfinance...')
    try:
        import yfinance as yf
        tickers = {
            'spx': '^GSPC',
            'vix': '^VIX',
            'dxy': 'DX-Y.NYB',
            'ndx': '^IXIC',
            'us10y': '^TNX',
        }
        macro_frames = {}
        for name, symbol in tickers.items():
            try:
                tk = yf.Ticker(symbol)
                hist = tk.history(period='max', interval='1d')
                if len(hist) > 0:
                    closes = hist['Close'].rename(name)
                    # Drop any exchange timezone (keeping the local calendar
                    # date) so all tickers share one tz-naive daily index and
                    # line up row-for-row when combined below.
                    closes.index = (
                        pd.DatetimeIndex(closes.index).tz_localize(None).normalize()
                    )
                    closes = closes[~closes.index.duplicated(keep='last')]
                    macro_frames[name] = closes
                    print(f'  {name}: {len(hist)} days')
            except Exception as e:
                # Best-effort per ticker: a failed series is skipped.
                print(f'  [WARN] {name} failed: {e}')

        if macro_frames:
            macro_df = pd.DataFrame(macro_frames)
            macro_df.index = pd.to_datetime(macro_df.index)
            # Forward-fill days on which only some tickers have a value.
            macro_df = macro_df.sort_index().ffill()
            print(f'[OK] Macro data: {len(macro_df)} days, cols={list(macro_df.columns)}')
    except ImportError:
        print('[ERROR] yfinance not available')

# Fear & Greed Index
# Same cache-then-download pattern; a cache needs more than 100 rows.
fng_df = pd.DataFrame()
fng_path = f'{DRIVE_V3}/fng_daily.csv'
fng_path2 = f'{DRIVE_SAVE}/fng_daily.csv'

for p in [fng_path, fng_path2]:
    if os.path.exists(p):
        fng_df = pd.read_csv(p, parse_dates=['date'])
        if len(fng_df) > 100:
            print(f'[OK] Fear & Greed loaded from cache: {len(fng_df)} days')
            break
        fng_df = pd.DataFrame()

if len(fng_df) == 0:
    print('Downloading Fear & Greed Index...')
    try:
        resp = requests.get('https://api.alternative.me/fng/?limit=0&format=json', timeout=30)
        if resp.status_code == 200:
            fng_data = resp.json().get('data', [])
            fng_df = pd.DataFrame(fng_data)
            # The API's 'timestamp' field is parsed as epoch seconds.
            fng_df['date'] = pd.to_datetime(fng_df['timestamp'].astype(int), unit='s')
            fng_df['fng_value'] = pd.to_numeric(fng_df['value'], errors='coerce')
            fng_df = fng_df[['date', 'fng_value']].sort_values('date').reset_index(drop=True)
            print(f'[OK] Fear & Greed: {len(fng_df)} days')
    except Exception as e:
        print(f'[WARN] FNG download failed: {e}')

# Save
# Persist whatever was loaded or downloaded into this notebook's own cache.
if len(macro_df) > 0:
    macro_df.to_csv(f'{DRIVE_SAVE}/macro_daily.csv')
if len(fng_df) > 0:
    fng_df.to_csv(f'{DRIVE_SAVE}/fng_daily.csv', index=False)

print(f'\n[DONE] Macro: {len(macro_df)} days, FNG: {len(fng_df)} days')

In [None]:
# =============================================================================
# CELL 4: Download derivatives for each asset
# =============================================================================

FAPI_BASE = 'https://fapi.binance.com'

def download_funding_rates(symbol, start_date, max_retries=5):
    """Download funding rate history from Binance Futures.

    Pages forward from *start_date* to "now" in batches of up to 1000 rows.

    Args:
        symbol: Futures symbol, e.g. ``'ETHUSDT'``.
        start_date: Anything ``pd.Timestamp`` accepts; first funding time.
        max_retries: Number of consecutive failed requests (non-200 status or
            request/JSON error) tolerated before giving up.

    Returns:
        DataFrame with ``timestamp`` and ``fundingRate`` columns sorted by
        time, or an empty DataFrame when nothing was fetched. If the download
        stops early, the rows collected so far are returned.
    """
    url = f'{FAPI_BASE}/fapi/v1/fundingRate'
    all_rows = []
    current = int(pd.Timestamp(start_date).timestamp() * 1000)
    # time.time() is already epoch-based; datetime.utcnow().timestamp() would
    # interpret the naive UTC value as local time on a non-UTC machine.
    end_ms = int(time.time() * 1000)
    failures = 0  # consecutive failures; reset after every successful page

    while current < end_ms:
        params = {'symbol': symbol, 'startTime': current, 'limit': 1000}
        try:
            resp = requests.get(url, params=params, timeout=30)
            if resp.status_code in (451, 403, 418):
                print(f'  [WARN] {symbol} funding rate blocked (status {resp.status_code})')
                break
            if resp.status_code != 200:
                # Bounded retry: an unbounded loop here never terminates when
                # the API keeps rejecting the same request.
                failures += 1
                if failures > max_retries:
                    print(f'  [WARN] {symbol} funding rate status {resp.status_code}, '
                          f'giving up after {max_retries} retries')
                    break
                time.sleep(2)
                continue
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f'  [WARN] funding rate error: {e}')
            failures += 1
            if failures > max_retries:
                break
            time.sleep(3)
            continue

        failures = 0
        if not data:
            break
        all_rows.extend(data)
        # Resume just after the last funding time already received.
        current = data[-1]['fundingTime'] + 1
        time.sleep(0.2)

    if not all_rows:
        return pd.DataFrame()
    df = pd.DataFrame(all_rows)
    df['timestamp'] = pd.to_datetime(df['fundingTime'], unit='ms')
    df['fundingRate'] = pd.to_numeric(df['fundingRate'], errors='coerce')
    return df[['timestamp', 'fundingRate']].sort_values('timestamp').reset_index(drop=True)


def download_futures_stat(symbol, stat_type, start_date, period='4h',
                          max_retries=5, max_lookback_days=30):
    """Download OI, long/short ratio, or taker volume from Binance Futures.

    Args:
        symbol: Futures symbol, e.g. ``'ETHUSDT'``.
        stat_type: One of ``'oi'``, ``'ls'`` or ``'taker'``.
        start_date: Anything ``pd.Timestamp`` accepts; earliest row wanted.
        period: Aggregation period string passed to the API.
        max_retries: Number of consecutive failed requests (non-200 status or
            request/JSON error) tolerated before giving up.
        max_lookback_days: Requests never start earlier than this many days
            before now; pass ``None`` to disable the clamp.

    Returns:
        DataFrame sorted by ``timestamp`` with the API's other columns
        converted to numeric (``symbol`` is left as returned), or an empty
        DataFrame when nothing was fetched.

    Raises:
        KeyError: If *stat_type* is not one of the supported keys.
    """
    endpoint_map = {
        'oi': '/futures/data/openInterestHist',
        'ls': '/futures/data/globalLongShortAccountRatio',
        'taker': '/futures/data/takeBuySellVol',
    }
    url = FAPI_BASE + endpoint_map[stat_type]
    all_rows = []
    # time.time() is already epoch-based; datetime.utcnow().timestamp() would
    # interpret the naive UTC value as local time on a non-UTC machine.
    end_ms = int(time.time() * 1000)
    current = int(pd.Timestamp(start_date).timestamp() * 1000)

    if max_lookback_days is not None:
        # NOTE(review): Binance documents these /futures/data endpoints as
        # serving only roughly the latest 30 days; an older startTime is
        # presumably rejected outright -- confirm against the API docs.
        # The one-hour margin keeps the clamped start inside the window.
        earliest_ms = end_ms - max_lookback_days * 24 * 60 * 60 * 1000 + 60 * 60 * 1000
        if current < earliest_ms:
            print(f'  [INFO] {symbol} {stat_type}: start clamped to the last '
                  f'{max_lookback_days} days')
            current = earliest_ms

    failures = 0  # consecutive failures; reset after every successful page

    while current < end_ms:
        params = {
            'symbol': symbol,
            'period': period,
            'startTime': current,
            'limit': 500,
        }
        try:
            resp = requests.get(url, params=params, timeout=30)
            if resp.status_code in (451, 403, 418):
                print(f'  [WARN] {symbol} {stat_type} blocked (status {resp.status_code})')
                break
            if resp.status_code != 200:
                # Bounded retry: an unbounded loop here never terminates when
                # the API keeps rejecting the same request.
                failures += 1
                if failures > max_retries:
                    print(f'  [WARN] {symbol} {stat_type} status {resp.status_code}, '
                          f'giving up after {max_retries} retries')
                    break
                time.sleep(2)
                continue
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f'  [WARN] {stat_type} error: {e}')
            failures += 1
            if failures > max_retries:
                break
            time.sleep(3)
            continue

        failures = 0
        if not data:
            break
        all_rows.extend(data)
        # The row time is read from 'timestamp', falling back to 'createTime';
        # it may arrive as a string, so coerce before advancing the cursor.
        last_ts = data[-1].get('timestamp', data[-1].get('createTime', 0))
        if isinstance(last_ts, str):
            last_ts = int(last_ts)
        current = last_ts + 1
        time.sleep(0.3)

    if not all_rows:
        return pd.DataFrame()
    df = pd.DataFrame(all_rows)
    ts_col = 'timestamp' if 'timestamp' in df.columns else 'createTime'
    df['timestamp'] = pd.to_datetime(pd.to_numeric(df[ts_col], errors='coerce'), unit='ms')
    for col in df.columns:
        # 'symbol' is text; coercing it to numeric would turn it into NaN.
        if col not in ('timestamp', 'symbol'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df.sort_values('timestamp').reset_index(drop=True)


# Download derivatives for each asset
# deriv_data maps asset name -> {'funding', 'oi', 'ls', 'taker'} DataFrames.
deriv_data = {}
for name, cfg in ASSETS.items():
    symbol = cfg['symbol']
    start = cfg['start_date']
    print(f'\n--- {name} ({symbol}) derivatives ---')

    cache_prefix = f'{DRIVE_SAVE}/{name.lower()}'

    # One entry per series: (result key / cache-file suffix, log label, fetcher).
    # Funding uses the asset's own start date; the other three request data
    # from 2020-01-01.
    sources = (
        ('funding', 'Funding rates',
         lambda: download_funding_rates(symbol, start)),
        ('oi', 'Open Interest',
         lambda: download_futures_stat(symbol, 'oi', '2020-01-01')),
        ('ls', 'Long/Short ratio',
         lambda: download_futures_stat(symbol, 'ls', '2020-01-01')),
        ('taker', 'Taker volume',
         lambda: download_futures_stat(symbol, 'taker', '2020-01-01')),
    )

    frames = {}
    for key, label, fetch in sources:
        csv_path = f'{cache_prefix}_{key}.csv'
        if os.path.exists(csv_path):
            # Any existing cache file is used as-is.
            frame = pd.read_csv(csv_path, parse_dates=['timestamp'])
            print(f'  {label} from cache: {len(frame)} rows')
        else:
            frame = fetch()
            # Empty results are not cached, so the next run retries.
            if len(frame) > 0:
                frame.to_csv(csv_path, index=False)
            print(f'  {label} downloaded: {len(frame)} rows')
        frames[key] = frame

    deriv_data[name] = frames

print('\n[DONE] Derivatives data downloaded for all assets')
for asset, series_by_key in deriv_data.items():
    row_counts = {key: len(frame) for key, frame in series_by_key.items()}
    print(f'  {asset}: {row_counts}')

In [None]:
# =============================================================================
# CELL 5: Build crash datasets for each asset
# =============================================================================

# BTC candles, time-ordered; merged into every asset as the cross-asset series.
btc_df = asset_data['BTC'].copy()
btc_df = btc_df.sort_values('timestamp').reset_index(drop=True)

# Maps asset name -> crash-period candles joined with macro, FNG,
# derivatives and BTC columns.
crash_datasets = {}

for name, cfg in ASSETS.items():
    print(f'\n{"=" * 60}')
    print(f'Building crash dataset for {name}')
    print(f'{"=" * 60}')

    target_df = asset_data[name].copy()
    target_df = target_df.sort_values('timestamp').reset_index(drop=True)

    # 1. Label crash periods
    # Both bounds are inclusive and parsed as midnight timestamps, so on the
    # end date only the 00:00 candle falls inside the window.
    target_df['is_crash'] = False
    for start_str, end_str, period_name in cfg['crash_periods']:
        start = pd.Timestamp(start_str)
        end = pd.Timestamp(end_str)
        mask = (target_df['timestamp'] >= start) & (target_df['timestamp'] <= end)
        target_df.loc[mask, 'is_crash'] = True
        count = mask.sum()
        print(f'  Crash {start_str} to {end_str} ({period_name}): {count:,} candles')

    # Only crash-period rows are kept; separate periods end up back-to-back.
    crash = target_df[target_df['is_crash']].copy().reset_index(drop=True)
    print(f'  Total crash candles: {len(crash):,}')

    # 2. Create date column for daily merges
    crash['date'] = crash['timestamp'].dt.date
    crash['date'] = pd.to_datetime(crash['date'])

    # 3. Merge daily macro
    if len(macro_df) > 0:
        macro_merge = macro_df.copy()
        # Bring the macro index to tz-naive midnight timestamps so it is
        # comparable with crash['date']; a tz-aware or mixed-offset index
        # cannot be unioned/sorted together with the naive candle dates.
        macro_merge.index = (
            pd.to_datetime(macro_merge.index, utc=True).tz_localize(None).normalize()
        )
        # reindex() needs unique labels.
        macro_merge = macro_merge[~macro_merge.index.duplicated(keep='last')]
        all_dates = pd.DatetimeIndex(crash['date'].unique())
        # Extend the macro calendar with every candle date, then fill the
        # resulting gaps forward (and backward for dates before the first row).
        macro_reindexed = macro_merge.reindex(
            macro_merge.index.union(all_dates)
        ).sort_index().ffill().bfill()
        macro_reindexed = macro_reindexed.loc[all_dates]
        # Name the index explicitly instead of relying on reset_index()
        # producing a column called 'index'.
        macro_reindexed = macro_reindexed.rename_axis('date').reset_index()
        crash = crash.merge(macro_reindexed, on='date', how='left')
        macro_coverage = crash['spx'].notna().mean() if 'spx' in crash.columns else 0
        print(f'  Macro coverage: {macro_coverage:.1%}')

    # 4. Merge Fear & Greed
    if len(fng_df) > 0:
        fng_merge = fng_df.copy()
        fng_merge['date'] = pd.to_datetime(fng_merge['date'])
        crash = crash.merge(fng_merge[['date', 'fng_value']], on='date', how='left')
        # Remaining gaps default to 50.
        crash['fng_value'] = crash['fng_value'].ffill().bfill().fillna(50)
        fng_coverage = crash['fng_value'].notna().mean()
        print(f'  FNG coverage: {fng_coverage:.1%}')

    # 5. Merge derivatives via merge_asof
    # direction='backward' attaches the most recent derivative row at or
    # before each candle's timestamp.
    dd = deriv_data.get(name, {})

    if len(dd.get('funding', pd.DataFrame())) > 0:
        fr = dd['funding'].copy()
        fr = fr.sort_values('timestamp')
        crash = crash.sort_values('timestamp')
        crash = pd.merge_asof(crash, fr, on='timestamp', direction='backward')
        print(f'  Funding rate coverage: {crash["fundingRate"].notna().mean():.1%}')

    if len(dd.get('oi', pd.DataFrame())) > 0:
        oi = dd['oi'].copy()
        oi = oi.sort_values('timestamp')
        oi_cols = [c for c in oi.columns if c not in ['timestamp', 'symbol']]
        crash = pd.merge_asof(crash, oi[['timestamp'] + oi_cols], on='timestamp', direction='backward')
        oi_coverage = crash['sumOpenInterest'].notna().mean() if 'sumOpenInterest' in crash.columns else 0
        print(f'  OI coverage: {oi_coverage:.1%}')

    if len(dd.get('ls', pd.DataFrame())) > 0:
        ls = dd['ls'].copy()
        ls = ls.sort_values('timestamp')
        ls_cols = [c for c in ls.columns if c not in ['timestamp', 'symbol']]
        # Prefix columns that would collide with ones already in crash.
        for col in ls_cols:
            if col in crash.columns:
                ls = ls.rename(columns={col: f'ls_{col}'})
        ls_cols = [c for c in ls.columns if c not in ['timestamp', 'symbol']]
        crash = pd.merge_asof(crash, ls[['timestamp'] + ls_cols], on='timestamp', direction='backward')
        # Expose the ratio under its plain name whichever way it was merged.
        ls_col = 'longShortRatio' if 'longShortRatio' in crash.columns else 'ls_longShortRatio'
        if ls_col in crash.columns:
            if ls_col != 'longShortRatio':
                crash = crash.rename(columns={ls_col: 'longShortRatio'})
            print(f'  L/S ratio coverage: {crash["longShortRatio"].notna().mean():.1%}')

    if len(dd.get('taker', pd.DataFrame())) > 0:
        tk = dd['taker'].copy()
        tk = tk.sort_values('timestamp')
        tk_cols = [c for c in tk.columns if c not in ['timestamp', 'symbol']]
        # Prefix columns that would collide with ones already in crash.
        for col in tk_cols:
            if col in crash.columns and col != 'timestamp':
                tk = tk.rename(columns={col: f'tk_{col}'})
        tk_cols = [c for c in tk.columns if c not in ['timestamp', 'symbol']]
        crash = pd.merge_asof(crash, tk[['timestamp'] + tk_cols], on='timestamp', direction='backward')
        # Expose buy/sell volume under their plain names.
        buy_col = 'buyVol' if 'buyVol' in crash.columns else 'tk_buyVol'
        sell_col = 'sellVol' if 'sellVol' in crash.columns else 'tk_sellVol'
        if buy_col in crash.columns:
            if buy_col != 'buyVol':
                crash = crash.rename(columns={buy_col: 'buyVol'})
            if sell_col != 'sellVol':
                crash = crash.rename(columns={sell_col: 'sellVol'})

    # 6. Merge BTC as cross-asset (same candle, or the previous one if missing)
    btc_cross = btc_df[['timestamp', 'close', 'volume']].copy()
    btc_cross = btc_cross.rename(columns={'close': 'btc_close', 'volume': 'btc_volume'})
    btc_cross = btc_cross.sort_values('timestamp')
    crash = crash.sort_values('timestamp')
    # direction='backward' never looks ahead: when the matching BTC candle is
    # absent, 'nearest' could pick the candle 5 minutes in the future and leak
    # a not-yet-known close into the features.
    crash = pd.merge_asof(crash, btc_cross, on='timestamp', direction='backward',
                          tolerance=pd.Timedelta('5min'))
    btc_coverage = crash['btc_close'].notna().mean()
    print(f'  BTC cross-asset coverage: {btc_coverage:.1%}')

    crash_datasets[name] = crash
    save_path = f'{DRIVE_SAVE}/{name.lower()}_crash_raw.csv'
    crash.to_csv(save_path, index=False)
    print(f'  Saved: {save_path} ({len(crash):,} rows)')

print('\n[DONE] Crash datasets built for all assets')
for name, ds in crash_datasets.items():
    print(f'  {name}: {len(ds):,} rows, {len(ds.columns)} columns')

In [None]:
# =============================================================================
# CELL 6: Feature engineering (51 features per asset)
# =============================================================================

# Maps asset name -> DataFrame holding the 51 feature columns plus labels.
feature_datasets = {}

# NOTE(review): each crash dataset concatenates separate crash periods, so the
# rolling / pct_change / shift calculations below span the gap between periods
# for the first rows of every later period. Consider computing features per
# contiguous segment.

for name in ASSETS.keys():
    print(f'\n{"=" * 60}')
    print(f'Feature engineering for {name}')
    print(f'{"=" * 60}')

    df = crash_datasets[name].copy()
    df = df.sort_values('timestamp').reset_index(drop=True)

    # ---- GROUP 1: Target Asset Price/Volume (15 features) ----
    # Window lengths are in bars (rows).
    print('  Group 1: Target asset Price/Volume (15 features)...')

    df['return_1bar'] = df['close'].pct_change(1)
    df['return_6bar'] = df['close'].pct_change(6)
    df['return_12bar'] = df['close'].pct_change(12)
    df['return_48bar'] = df['close'].pct_change(48)
    df['return_288bar'] = df['close'].pct_change(288)

    df['vol_12bar'] = df['return_1bar'].rolling(12).std()
    df['vol_48bar'] = df['return_1bar'].rolling(48).std()
    # Zero denominators become NaN here and are zero-filled in the clean-up.
    df['vol_ratio'] = df['vol_12bar'] / df['vol_48bar'].replace(0, np.nan)

    vol_mean = df['volume'].rolling(48).mean()
    df['volume_surge'] = df['volume'] / vol_mean.replace(0, np.nan)
    df['volume_trend'] = vol_mean.pct_change(12)

    # Length of the current run of red candles (close < open), 0 on a
    # non-red candle. Each non-red candle starts a new group, and the
    # cumulative sum inside a group counts the reds that follow it; this
    # replaces a per-row Python loop with the same result.
    is_red = (df['close'] < df['open']).astype(int)
    streaks = is_red.groupby((is_red == 0).cumsum()).cumsum()
    df['consecutive_red'] = streaks / 12.0

    rolling_high_24h = df['high'].rolling(288).max()
    df['drawdown_24h'] = (df['close'] - rolling_high_24h) / rolling_high_24h.replace(0, np.nan)

    # RSI over 14 bars using simple rolling means, rescaled to [-1, 1].
    delta = df['close'].diff()
    gain = delta.clip(lower=0).rolling(14).mean()
    loss = (-delta.clip(upper=0)).rolling(14).mean()
    rs = gain / loss.replace(0, np.nan)
    rsi = 100 - (100 / (1 + rs))
    df['rsi_14_norm'] = (rsi - 50) / 50

    # Bollinger %B: position of close inside the 20-bar, 2-sigma band.
    bb_mid = df['close'].rolling(20).mean()
    bb_std = df['close'].rolling(20).std()
    bb_upper = bb_mid + 2 * bb_std
    bb_lower = bb_mid - 2 * bb_std
    bb_range = (bb_upper - bb_lower).replace(0, np.nan)
    df['bb_pct_b'] = (df['close'] - bb_lower) / bb_range

    # Rolling 48-bar volume-weighted average price and distance from it.
    vwap_num = (df['close'] * df['volume']).rolling(48).sum()
    vwap_den = df['volume'].rolling(48).sum().replace(0, np.nan)
    vwap = vwap_num / vwap_den
    df['vwap_distance'] = (df['close'] - vwap) / vwap.replace(0, np.nan)

    # ---- GROUP 2: Daily Macro (10 features) ----
    # Daily values are repeated on every intraday row, so daily changes are
    # taken over 288 rows and the 20-day averages over 288 * 20 rows.
    print('  Group 2: Daily Macro (10 features)...')

    if 'spx' in df.columns:
        df['spx_return_1d'] = df.groupby('date')['spx'].transform('first')
        df['spx_return_1d'] = df['spx_return_1d'].pct_change(288)
        spx_sma_20 = df['spx'].rolling(288 * 20).mean()
        df['spx_vs_sma'] = (df['spx'] - spx_sma_20) / spx_sma_20.replace(0, np.nan)
    else:
        df['spx_return_1d'] = 0.0
        df['spx_vs_sma'] = 0.0

    if 'vix' in df.columns:
        df['vix_norm'] = df['vix'] / 30.0
        df['vix_change'] = df['vix'].pct_change(288)
        df['vix_extreme'] = (df['vix'] > 30).astype(float)
    else:
        df['vix_norm'] = 0.0
        df['vix_change'] = 0.0
        df['vix_extreme'] = 0.0

    if 'dxy' in df.columns:
        df['dxy_return_1d'] = df['dxy'].pct_change(288)
        dxy_sma = df['dxy'].rolling(288 * 20).mean()
        df['dxy_trend'] = (df['dxy'] - dxy_sma) / dxy_sma.replace(0, np.nan)
    else:
        df['dxy_return_1d'] = 0.0
        df['dxy_trend'] = 0.0

    if 'us10y' in df.columns:
        df['yield_level'] = df['us10y'] / 5.0
        df['yield_change'] = df['us10y'].pct_change(288)
    else:
        df['yield_level'] = 0.0
        df['yield_change'] = 0.0

    if 'fng_value' in df.columns:
        df['fng_norm'] = df['fng_value'] / 100.0
    else:
        df['fng_norm'] = 0.5

    # ---- GROUP 3: Derivatives (9 features) ----
    print('  Group 3: Derivatives (9 features)...')

    # The whole group is gated on funding-rate availability: without more
    # than 100 funding observations, every derivative feature is zeroed.
    has_derivatives = 'fundingRate' in df.columns and df['fundingRate'].notna().sum() > 100

    if has_derivatives:
        fr = df['fundingRate'].fillna(0)
        fr_mean = fr.rolling(288).mean()
        fr_std = fr.rolling(288).std().replace(0, np.nan)
        df['funding_z'] = (fr - fr_mean) / fr_std
        df['funding_extreme_long'] = (df['funding_z'] > 2).astype(float)
        df['funding_extreme_short'] = (df['funding_z'] < -2).astype(float)

        if 'sumOpenInterest' in df.columns:
            oi = df['sumOpenInterest'].ffill()
            df['oi_change_1h'] = oi.pct_change(12)
            df['oi_change_4h'] = oi.pct_change(48)
            # Spike = absolute 12-bar change above its rolling 288-bar
            # 95th percentile.
            oi_pct = oi.pct_change(12).abs()
            oi_thresh = oi_pct.rolling(288).quantile(0.95)
            df['oi_spike'] = (oi_pct > oi_thresh).astype(float)
        else:
            df['oi_change_1h'] = 0.0
            df['oi_change_4h'] = 0.0
            df['oi_spike'] = 0.0

        if 'longShortRatio' in df.columns:
            lsr = df['longShortRatio'].fillna(1.0)
            df['ls_ratio_norm'] = (lsr - 1.0) / 0.5
            df['ls_extreme_long'] = (lsr > 2.0).astype(float)
        else:
            df['ls_ratio_norm'] = 0.0
            df['ls_extreme_long'] = 0.0

        if 'buyVol' in df.columns and 'sellVol' in df.columns:
            buy = df['buyVol'].fillna(0)
            sell = df['sellVol'].fillna(0)
            total = (buy + sell).replace(0, np.nan)
            df['taker_imbalance'] = (buy - sell) / total
        else:
            df['taker_imbalance'] = 0.0
    else:
        print('    [WARN] Insufficient derivatives, filling with zeros')
        for feat in ['funding_z', 'funding_extreme_long', 'funding_extreme_short',
                     'oi_change_1h', 'oi_change_4h', 'oi_spike',
                     'ls_ratio_norm', 'ls_extreme_long', 'taker_imbalance']:
            df[feat] = 0.0

    # ---- GROUP 4: Intraday Macro (11 features) - ALL ZEROS ----
    print('  Group 4: Intraday Macro (11 placeholders = 0.0)...')
    intraday_cols = [
        'spx_return_5m', 'spx_return_15m', 'spx_return_1h',
        'spx_momentum_5m', 'spx_direction_5m',
        'vix_return_5m', 'vix_return_1h', 'vix_spike_5m',
        'ndx_return_5m', 'ndx_return_1h', 'has_intraday_macro',
    ]
    for col in intraday_cols:
        df[col] = 0.0

    # ---- GROUP 5: Cross-Asset -- BTC as lead indicator (6 features) ----
    print('  Group 5: Cross-Asset -- BTC lead (6 features)...')

    if 'btc_close' in df.columns and df['btc_close'].notna().sum() > 100:
        df['btc_return_1bar'] = df['btc_close'].pct_change(1)
        df['btc_return_6bar'] = df['btc_close'].pct_change(6)
        # Asset/BTC ratio change
        asset_btc_ratio = df['close'] / df['btc_close'].replace(0, np.nan)
        df['btc_asset_ratio_change'] = asset_btc_ratio.pct_change(12)
    else:
        df['btc_return_1bar'] = 0.0
        df['btc_return_6bar'] = 0.0
        df['btc_asset_ratio_change'] = 0.0

    # BTC lead signals (BTC moves first, alts follow)
    # Lagged 1-bar BTC returns: the values from 1, 2 and 3 bars earlier.
    btc_r1 = df['btc_close'].pct_change(1) if 'btc_close' in df.columns else pd.Series(0.0, index=df.index)
    df['btc_lead_1'] = btc_r1.shift(1)
    df['btc_lead_2'] = btc_r1.shift(2)
    df['btc_lead_3'] = btc_r1.shift(3)

    # ---- LABELS ----
    # Binary targets: 1 when the forward return over 2 bars (or 1 bar) is > 0.
    print('  Building labels...')
    df['forward_return_1'] = df['close'].shift(-1) / df['close'] - 1
    df['forward_return_2'] = df['close'].shift(-2) / df['close'] - 1
    df['label_binary'] = (df['forward_return_2'] > 0).astype(int)
    df['label_binary_1bar'] = (df['forward_return_1'] > 0).astype(int)

    # ---- Clean up ----
    # Any missing feature column is created as zeros; inf/NaN become 0.0.
    for col in FEATURE_COLS:
        if col not in df.columns:
            df[col] = 0.0
        df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Drop rows whose forward return is undefined (the tail of the series).
    # The integer label columns can never be NaN -- an undefined return was
    # silently encoded as label 0 -- so filtering on the labels drops nothing.
    df = df.dropna(subset=['forward_return_1', 'forward_return_2']).reset_index(drop=True)

    feature_datasets[name] = df
    save_path = f'{DRIVE_SAVE}/{name.lower()}_crash_features.csv'
    df.to_csv(save_path, index=False)

    print(f'\n  {name} feature summary:')
    print(f'    Total rows: {len(df):,}')
    print(f'    Features: {len(FEATURE_COLS)}')
    print(f'    Label balance (2bar): {df["label_binary"].mean():.4f}')
    print(f'    Label balance (1bar): {df["label_binary_1bar"].mean():.4f}')
    # Counts features that carry any signal at all (not identically zero).
    non_zero = sum(1 for c in FEATURE_COLS if df[c].abs().sum() > 0)
    print(f'    Non-zero features: {non_zero}/{len(FEATURE_COLS)}')

print('\n[DONE] Feature engineering complete for all assets')

In [None]:
# =============================================================================
# CELL 7: Train models for each asset (RECENT_ONLY, 2 horizons each)
# =============================================================================

# LightGBM hyper-parameters handed to lgb.train.
BASE_PARAMS = dict(
    # Task and booster
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    # Tree shape and step size
    num_leaves=31,
    learning_rate=0.05,
    # Row / column subsampling
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Regularisation
    min_child_samples=100,
    lambda_l1=0.1,
    lambda_l2=0.1,
    # Logging
    verbose=-1,
)


def train_and_eval(train_X, train_y, val_X, val_y, test_X, test_y,
                   params, label='model'):
    """Fit one LightGBM booster and score it on the validation and test splits.

    Training runs for at most 500 boosting rounds with early stopping
    (patience 30) monitored on the validation split.

    Args:
        train_X, train_y: Feature matrix and binary labels used for fitting.
        val_X, val_y: Split used for early stopping and validation metrics.
        test_X, test_y: Held-out split used for the reported test metrics.
        params: LightGBM parameter dict passed straight to ``lgb.train``.
        label: Display name printed in the summary.

    Returns:
        dict with the trained ``model``, accuracy/AUC for val and test,
        ``pred_std`` (std-dev of test predictions), ``n_rounds``, the raw
        ``test_pred`` array, the top-10 ``feat_imp`` pairs (by gain) and
        ``train_size``.
    """

    def _auc_or_chance(y_true, y_score):
        # roc_auc_score raises ValueError when it cannot compute a score;
        # report chance level (0.5) in that case.
        try:
            return roc_auc_score(y_true, y_score)
        except ValueError:
            return 0.5

    dtrain = lgb.Dataset(train_X, label=train_y)
    dval = lgb.Dataset(val_X, label=val_y, reference=dtrain)

    booster = lgb.train(
        params,
        dtrain,
        num_boost_round=500,
        valid_sets=[dval],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(0)],
    )

    val_pred = booster.predict(val_X)
    test_pred = booster.predict(test_X)

    # Hard class decisions use a 0.5 probability cut-off.
    val_acc = accuracy_score(val_y, (val_pred > 0.5).astype(int))
    test_acc = accuracy_score(test_y, (test_pred > 0.5).astype(int))
    val_auc = _auc_or_chance(val_y, val_pred)
    test_auc = _auc_or_chance(test_y, test_pred)

    pred_std = float(np.std(test_pred))
    # Fall back to the total tree count when no best iteration was recorded.
    n_rounds = booster.best_iteration or booster.num_trees()

    # Rank features by gain, highest first, and keep the ten strongest.
    gains = booster.feature_importance(importance_type='gain')
    ranked = sorted(zip(FEATURE_COLS, gains), key=lambda pair: pair[1], reverse=True)
    feat_imp = ranked[:10]

    print(f'\n  {label}:')
    print(f'    Train: {len(train_y):,} | Val: {len(val_y):,} | Test: {len(test_y):,}')
    print(f'    Val acc={val_acc:.4f}  Test acc={test_acc:.4f}  Test AUC={test_auc:.4f}')
    print(f'    Pred std={pred_std:.4f}  Rounds={n_rounds}')
    print(f'    Top 5 features:')
    for fname, fval in feat_imp[:5]:
        print(f'      {fname:30s} gain={fval:.1f}')

    return {
        'model': booster,
        'val_acc': val_acc,
        'test_acc': test_acc,
        'val_auc': val_auc,
        'test_auc': test_auc,
        'pred_std': pred_std,
        'n_rounds': n_rounds,
        'test_pred': test_pred,
        'feat_imp': feat_imp,
        'train_size': len(train_y),
    }


# Train all assets x horizons.
# Results are keyed '<ASSET>_<horizon>' (e.g. 'ETH_2bar') and hold the dict
# returned by train_and_eval.
all_results = {}

for name, cfg in ASSETS.items():
    print(f'\n{"=" * 60}')
    print(f'Training {name} crash models')
    print(f'{"=" * 60}')

    df = feature_datasets[name].copy()
    df = df.sort_values('timestamp').reset_index(drop=True)

    # RECENT_ONLY: only the 2021-22 window ("crash 2") and the 2025-26 window
    # ("crash 3") are used; rows outside both windows are ignored.
    crash2_mask = (df['timestamp'] >= pd.Timestamp('2021-11-10')) & (df['timestamp'] <= pd.Timestamp('2022-11-21'))
    crash3_mask = (df['timestamp'] >= pd.Timestamp('2025-10-06')) & (df['timestamp'] <= pd.Timestamp('2026-02-28'))

    crash2 = df[crash2_mask].copy()
    crash3 = df[crash3_mask].copy()

    print(f'  Crash 2 (2021-22): {len(crash2):,} rows')
    print(f'  Crash 3 (2025-26): {len(crash3):,} rows')

    if len(crash3) < 1000:
        print(f'  [SKIP] {name}: insufficient crash3 data ({len(crash3)} rows)')
        continue

    if len(crash2) >= 500:
        # Normal case: train on crash 2, validate on the first half of
        # crash 3 and test on the (chronologically later) second half.
        c3_half = len(crash3) // 2
        train_df = crash2
        val_df = crash3.iloc[:c3_half]
        test_df = crash3.iloc[c3_half:]
    else:
        # Too little (or no) crash-2 data: split crash 3 chronologically.
        # The decision is made on len(crash2) itself so that an EMPTY crash2
        # also lands here; otherwise the training set would be the first half
        # of crash 3, i.e. exactly the validation set, and early stopping
        # would be evaluated on training data.
        print(f'  [NOTE] {name}: small crash2 ({len(crash2)} rows), using crash3 70/15/15 split')
        c3_70 = int(len(crash3) * 0.70)
        c3_85 = int(len(crash3) * 0.85)
        train_df = crash3.iloc[:c3_70]
        val_df = crash3.iloc[c3_70:c3_85]
        test_df = crash3.iloc[c3_85:]

    # Horizon 1: 2-bar (10 min) -- PRIMARY
    print(f'\n  --- {name} 2-bar (10min) ---')
    r_2bar = train_and_eval(
        train_df[FEATURE_COLS].values, train_df['label_binary'].values,
        val_df[FEATURE_COLS].values, val_df['label_binary'].values,
        test_df[FEATURE_COLS].values, test_df['label_binary'].values,
        BASE_PARAMS, label=f'{name} 2-bar (10min)'
    )
    all_results[f'{name}_2bar'] = r_2bar

    # Horizon 2: 1-bar (5 min)
    print(f'\n  --- {name} 1-bar (5min) ---')
    train_1bar_y = train_df['label_binary_1bar'].values
    val_1bar_y = val_df['label_binary_1bar'].values
    test_1bar_y = test_df['label_binary_1bar'].values

    # Drop rows whose 1-bar label is NaN (a no-op when the label column is
    # integer-typed, kept as a guard for float labels).
    valid_train = ~np.isnan(train_1bar_y)
    valid_val = ~np.isnan(val_1bar_y)
    valid_test = ~np.isnan(test_1bar_y)

    r_1bar = train_and_eval(
        train_df[FEATURE_COLS].values[valid_train], train_1bar_y[valid_train],
        val_df[FEATURE_COLS].values[valid_val], val_1bar_y[valid_val],
        test_df[FEATURE_COLS].values[valid_test], test_1bar_y[valid_test],
        BASE_PARAMS, label=f'{name} 1-bar (5min)'
    )
    all_results[f'{name}_1bar'] = r_1bar

print(f'\n[DONE] All models trained: {len(all_results)} models')

In [None]:
# =============================================================================
# CELL 8: Cross-asset comparison table
# =============================================================================

print('MULTI-ASSET CRASH MODEL RESULTS')
print('=' * 75)
print(f'{"Asset":<6} {"Horizon":<12} {"Train":>8} {"Test Acc":>10} {"AUC":>8} {"Pred Std":>10} {"Rounds":>8} {"Deploy?":>8}')
print('-' * 75)

# BTC reference rows: loaded from the v3 metadata JSON when that file exists;
# otherwise the figures hardcoded from a previous run are printed instead.
btc_v3_meta = None
btc_meta_path = f'{DRIVE_V3}/crash_best_meta.json'
if os.path.exists(btc_meta_path):
    with open(btc_meta_path) as f:
        btc_v3_meta = json.load(f)

if btc_v3_meta:
    b_recent = btc_v3_meta.get('comparison', {}).get('RECENT_ONLY (21+25)', {})
    horizons = btc_v3_meta.get('horizon_comparison', {})
    # These three values come from the RECENT_ONLY comparison entry and are
    # the same for every horizon row, so look them up once.
    train_size = b_recent.get('train_size', 0)
    pred_std = b_recent.get('pred_std', 0)
    n_rounds = b_recent.get('n_rounds', 0)
    for hz_name, hz_data in horizons.items():
        # Read metrics with .get so a horizon entry missing a metric prints
        # 0 instead of raising KeyError halfway through the table.
        btc_acc = hz_data.get('test_acc', 0)
        btc_auc = hz_data.get('test_auc', 0)
        deploy = 'DEPLOY' if btc_acc > 0.52 else 'SKIP'
        print(f'{"BTC":<6} {hz_name:<12} {train_size:>8,} {btc_acc:>10.4f} {btc_auc:>8.4f} {pred_std:>10.4f} {n_rounds:>8} {deploy:>8}')
else:
    print('BTC    2bar(10m)   108,289     0.5329   0.5457     0.0233       13   DEPLOY')
    print('BTC    1bar(5m)    108,289     0.5241   0.5365     0.0257       20   DEPLOY')

# One row per model trained in this notebook (keys look like 'ETH_2bar').
for key, res in sorted(all_results.items()):
    parts = key.split('_')
    asset = parts[0]
    horizon = parts[1]
    hz_label = '2bar(10m)' if horizon == '2bar' else '1bar(5m)'
    # > 52% test accuracy: DEPLOY; 51-52%: MARGINAL; otherwise SKIP.
    deploy = 'DEPLOY' if res['test_acc'] > 0.52 else ('MARGINAL' if res['test_acc'] > 0.51 else 'SKIP')
    print(f'{asset:<6} {hz_label:<12} {res["train_size"]:>8,} {res["test_acc"]:>10.4f} {res["test_auc"]:>8.4f} {res["pred_std"]:>10.4f} {res["n_rounds"]:>8} {deploy:>8}')

print('=' * 75)

# Summary (counts only the models trained here, not the BTC reference rows)
deploy_count = sum(1 for r in all_results.values() if r['test_acc'] > 0.52)
total_count = len(all_results)
print(f'\nDeployable models: {deploy_count}/{total_count} (threshold: test_acc > 52%)')
print(f'Training config: {TRAINING_CONFIG}')
print(f'Feature set: {len(FEATURE_COLS)} features (shared across all assets)')

In [None]:
# =============================================================================
# CELL 9: Calibration analysis per asset
# =============================================================================

# Confidence buckets: (display name, mask over predicted P(up), class that
# counts as a correct call inside the bucket).
buckets = [
    ('52-55% UP', lambda p: (p > 0.52) & (p <= 0.55), 1),
    ('55-60% UP', lambda p: (p > 0.55) & (p <= 0.60), 1),
    ('60%+ UP', lambda p: p > 0.60, 1),
    ('52-55% DN', lambda p: (p < 0.48) & (p >= 0.45), 0),
    ('55-60% DN', lambda p: (p < 0.45) & (p >= 0.40), 0),
    ('60%+ DN', lambda p: p < 0.40, 0),
]

for key, res in sorted(all_results.items()):
    parts = key.split('_')
    asset = parts[0]
    horizon = parts[1]
    hz_label = '2-bar (10min)' if horizon == '2bar' else '1-bar (5min)'

    print(f'\n{"=" * 65}')
    print(f'{asset} {hz_label} CALIBRATION')
    print(f'{"=" * 65}')

    preds = res['test_pred']
    test_df = feature_datasets[asset].sort_values('timestamp').reset_index(drop=True)

    # Recover the rows the test predictions were made on. Training takes the
    # test split from the END of the crash-3 window, either the second half
    # or, in the 70/15/15 fallback, the last 15%. Taking the last len(preds)
    # rows therefore matches the predictions under both schemes, whereas
    # always re-deriving the 50% split would misalign labels in the fallback.
    crash3_mask = (test_df['timestamp'] >= pd.Timestamp('2025-10-06')) & (test_df['timestamp'] <= pd.Timestamp('2026-02-28'))
    crash3 = test_df[crash3_mask]
    if len(preds) > len(crash3):
        print(f'  [SKIP] {key}: {len(preds)} predictions but only {len(crash3)} crash3 rows')
        continue
    test_slice = crash3.iloc[len(crash3) - len(preds):]

    label_col = 'label_binary' if horizon == '2bar' else 'label_binary_1bar'
    labels = test_slice[label_col].values

    print(f'{"Bucket":<20} {"Count":>8} {"Accuracy":>10} {"Recommended":>14}')
    print('-' * 55)

    for bucket_name, mask_fn, target in buckets:
        mask = mask_fn(preds)
        count = mask.sum()
        if count > 0:
            # Share of bucket rows whose realised label matches the direction
            # the bucket predicts (1 = up, 0 = down).
            acc = (labels[mask] == target).mean() * 100
            recommended = 'YES' if acc > 52 and count > 50 else 'NO'
            print(f'{bucket_name:<20} {count:>8,} {acc:>9.1f}% {recommended:>14}')
        else:
            print(f'{bucket_name:<20} {0:>8} {"N/A":>10} {"NO":>14}')

    print(f'\n  Max confidence: {preds.max():.4f}')
    print(f'  Min confidence: {preds.min():.4f}')
    print(f'  Std: {preds.std():.4f}')
    # Share of predictions beyond the per-horizon confidence threshold.
    threshold = 0.53 if horizon == '2bar' else 0.52
    above = (preds > threshold).sum()
    below = (preds < (1 - threshold)).sum()
    print(f'  Above {threshold}: {above} ({above/len(preds)*100:.1f}%)')
    print(f'  Below {1-threshold}: {below} ({below/len(preds)*100:.1f}%)')

In [None]:
# =============================================================================
# CELL 10: Save all models to Google Drive
# =============================================================================

# Per-model metadata, keyed '<ASSET>_<horizon>', reused for the master summary.
saved_models = {}

for key, res in sorted(all_results.items()):
    parts = key.split('_')
    asset = parts[0].lower()
    horizon = parts[1]
    # The metrics may be NumPy scalars (depending on the scikit-learn
    # version), and comparing one yields numpy.bool_, which json.dump cannot
    # serialise. Coerce to built-in types before anything is written.
    test_acc = float(res['test_acc'])
    deploy = bool(test_acc > 0.52)

    tag = f'crash_{asset}_{horizon}'
    print(f'\n--- {tag} (test_acc={res["test_acc"]:.4f}, deploy={deploy}) ---')

    # Save pickle
    pkl_path = f'{DRIVE_SAVE}/{tag}_lightgbm.pkl'
    with open(pkl_path, 'wb') as f:
        pickle.dump(res['model'], f)
    print(f'  Saved: {pkl_path}')

    # Save LightGBM text format
    txt_path = f'{DRIVE_SAVE}/{tag}_lightgbm.txt'
    res['model'].save_model(txt_path)
    print(f'  Saved: {txt_path}')

    # Save metadata JSON
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept
    # here to avoid changing the cell's imports.
    meta = {
        'asset': asset.upper(),
        'horizon': horizon,
        'horizon_label': '2-bar (10min)' if horizon == '2bar' else '1-bar (5min)',
        'training_config': TRAINING_CONFIG,
        'test_acc': round(test_acc, 4),
        'test_auc': round(float(res['test_auc']), 4),
        'val_acc': round(float(res['val_acc']), 4),
        'val_auc': round(float(res['val_auc']), 4),
        'pred_std': round(float(res['pred_std']), 4),
        'n_rounds': int(res['n_rounds']),
        'train_size': int(res['train_size']),
        'feature_cols': FEATURE_COLS,
        'n_features': len(FEATURE_COLS),
        'deploy': deploy,
        'trained_at': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        'top_features': [{'name': n, 'gain': round(float(g), 1)} for n, g in res['feat_imp'][:10]],
    }
    meta_path = f'{DRIVE_SAVE}/{tag}_meta.json'
    with open(meta_path, 'w') as f:
        json.dump(meta, f, indent=2)
    print(f'  Saved: {meta_path}')

    saved_models[f'{asset.upper()}_{horizon}'] = meta

# Master summary
summary = {
    'trained_at': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
    'training_config': TRAINING_CONFIG,
    'feature_cols': FEATURE_COLS,
    'n_features': len(FEATURE_COLS),
    'models': {},
    'note': 'All models use BTC as cross-asset lead indicator. Feature set is identical across all assets.',
}

# Add BTC from v3 (btc_v3_meta is loaded by the comparison-table cell).
if btc_v3_meta:
    horizons = btc_v3_meta.get('horizon_comparison', {})
    for hz_name, hz_data in horizons.items():
        # Horizon names containing neither '2-bar' nor '1-bar' are filed as
        # '6bar'. NOTE(review): confirm against the key names used in the v3
        # metadata file; several unmatched names would overwrite each other.
        hz_key = '2bar' if '2-bar' in hz_name else ('1bar' if '1-bar' in hz_name else '6bar')
        summary['models'][f'BTC_{hz_key}'] = {
            'accuracy': hz_data.get('test_acc', 0),
            'auc': hz_data.get('test_auc', 0),
            'deploy': hz_data.get('test_acc', 0) > 0.52,
            'source': 'crash_models_v3',
        }

# Add new models
for key, meta in saved_models.items():
    summary['models'][key] = {
        'accuracy': meta['test_acc'],
        'auc': meta['test_auc'],
        'deploy': meta['deploy'],
        'source': 'crash_models_multi',
    }

summary_path = f'{DRIVE_SAVE}/crash_multi_asset_summary.json'
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)
print(f'\nMaster summary: {summary_path}')

# Final report
print(f'\n{"=" * 65}')
print('TRAINING COMPLETE -- MULTI-ASSET CRASH MODELS')
print(f'{"=" * 65}')

for asset_hz, meta in sorted(summary['models'].items()):
    status = 'DEPLOY' if meta['deploy'] else 'SKIP'
    src = meta.get('source', '')
    print(f'  {asset_hz:<12} {meta["accuracy"]:.4f} acc  {meta["auc"]:.4f} AUC  [{status}]  ({src})')

deploy_count = sum(1 for m in summary['models'].values() if m['deploy'])
total_count = len(summary['models'])
print(f'\nDeployable: {deploy_count}/{total_count}')
print(f'Feature set: {len(FEATURE_COLS)} features (shared)')
print(f'Drive path: {DRIVE_SAVE}/')
print(f'{"=" * 65}')