# Crash-Regime LightGBM Training
## Renaissance Trading Bot -- Regime-Specific Models

**What this does:**
- Downloads BTC + ETH candles, macro data (SPX, VIX, DXY), Binance derivatives
- Filters to crash periods only (2018, 2021-22, 2025-26)
- Engineers 40 crash-specific features across 4 groups
- Trains LightGBM on crash-only data with macro features
- Saves everything to Google Drive (survives runtime disconnects)

**Key insight:** BTC has 0.77-0.90 correlation with S&P 500 during crashes.
Our current models miss this signal entirely, achieving only 51% accuracy.
Training on crash-only data with macro features should improve this.

**Runtime:** Select GPU (T4) for faster training, though LightGBM trains in ~30 seconds either way.

## Cell 1: Setup & Google Drive Mount
Mount Drive first so all outputs survive runtime disconnects.

In [None]:
# ============================================================
# CELL 1: Setup & Google Drive Mount
# ============================================================
# Mount Drive FIRST -- all outputs save here to survive disconnects

import os

from google.colab import drive

# Mount Drive before anything else so later cells can persist their outputs.
drive.mount('/content/drive')

# Scratch directories on the runtime's local disk.
for local_dir in ('/content/data', '/content/models'):
    os.makedirs(local_dir, exist_ok=True)

# Drive folder that receives the backups written by later cells.
DRIVE_SAVE = '/content/drive/MyDrive/renaissance-bot-training/crash_models/'
os.makedirs(DRIVE_SAVE, exist_ok=True)

!pip install -q yfinance lightgbm pandas numpy scikit-learn

import numpy as np
import pandas as pd
import requests
import time
from datetime import datetime, timedelta

# Confirm the cell finished and show where outputs will be persisted.
print("\u2705 Setup complete")
print(f"Drive save path: {DRIVE_SAVE}")

## Cell 2: Download BTC + ETH 5-Minute Candles from Binance
Full history from Sep 2017. Covers all three crash periods. Takes 5-10 minutes.

In [None]:
# ============================================================
# CELL 2: Download BTC + ETH 5m candles from Binance
# ============================================================
# Need crash periods:
#   Crash 1: Jan 2018 -- Dec 2018
#   Crash 2: Nov 2021 -- Nov 2022
#   Crash 3: Oct 2025 -- Feb 2026 (current)

def fetch_binance_klines(symbol, interval, start_ms, end_ms, limit=1000):
    """Download klines for one symbol from the Binance spot REST API.

    Pages forward through ``/api/v3/klines`` (public, no auth) starting at
    ``start_ms``. Each page resumes one millisecond after the open time of
    the last row received. Paging stops when ``end_ms`` is reached, when a
    page comes back empty / not a list, or when a page is shorter than
    ``limit``.

    Args:
        symbol: Trading pair, e.g. ``'BTCUSDT'``.
        interval: Kline interval string, e.g. ``'5m'``.
        start_ms: Start of the window in epoch milliseconds.
        end_ms: End of the window in epoch milliseconds.
        limit: Number of rows requested per page.

    Returns:
        List of raw kline rows in the order received. If more than five
        consecutive attempts fail, the rows collected so far are returned
        and a warning is printed so a truncated history is not mistaken
        for a complete one.
    """
    url = "https://api.binance.com/api/v3/klines"
    max_retries = 5
    all_data = []
    current = start_ms
    retries = 0
    gave_up = False

    while current < end_ms:
        params = {
            'symbol': symbol,
            'interval': interval,
            'startTime': current,
            'endTime': end_ms,
            'limit': limit,
        }
        try:
            resp = requests.get(url, params=params, timeout=30)
            if resp.status_code == 429:
                # Rate limiting is not counted as a failure: wait and retry.
                print("  Rate limited, waiting 60s...")
                time.sleep(60)
                continue
            if resp.status_code != 200:
                print(f"  HTTP {resp.status_code}, retrying...")
                retries += 1
                if retries > max_retries:
                    gave_up = True
                    break
                time.sleep(5)
                continue

            data = resp.json()
            if not data or not isinstance(data, list):
                break

            all_data.extend(data)
            # Column 0 of a kline row is its open time in milliseconds.
            current = data[-1][0] + 1
            retries = 0

            if len(all_data) % 50000 == 0:
                print(f"  ... {len(all_data):,} candles so far")

            # A short page means the end of the available data was reached.
            if len(data) < limit:
                break

            time.sleep(0.1)

        except Exception as e:
            # Best-effort download: network/parse errors are retried a few times.
            print(f"  Error: {e}, retrying...")
            retries += 1
            if retries > max_retries:
                gave_up = True
                break
            time.sleep(5)

    if gave_up:
        print(f"  WARNING: gave up on {symbol} after {max_retries} retries; "
              f"returning {len(all_data):,} candles (history is incomplete)")

    return all_data

def klines_to_df(raw):
    """Turn raw Binance kline rows into a tidy, time-ordered DataFrame.

    Price/volume fields are cast to float and ``trades`` to int. A
    ``timestamp`` column is derived from ``open_time`` (milliseconds).
    Rows sharing an ``open_time`` are collapsed to the first occurrence,
    and the result is sorted by ``timestamp`` with a fresh 0..n-1 index.
    """
    column_names = [
        'open_time', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_volume', 'trades', 'taker_buy_base',
        'taker_buy_quote', 'ignore',
    ]
    float_columns = [
        'open', 'high', 'low', 'close', 'volume', 'quote_volume',
        'taker_buy_base', 'taker_buy_quote',
    ]

    frame = pd.DataFrame(raw, columns=column_names)
    frame = frame.astype({name: float for name in float_columns})
    frame['trades'] = frame['trades'].astype(int)
    frame['timestamp'] = pd.to_datetime(frame['open_time'], unit='ms')

    unique_bars = frame.drop_duplicates(subset=['open_time'])
    return unique_bars.sort_values('timestamp').reset_index(drop=True)

from datetime import timezone  # Cell 1 imports only `datetime` and `timedelta`

# Download window in epoch milliseconds, pinned to UTC so it does not depend
# on the runtime's local time zone. `datetime` is the class here (it has no
# `UTC` attribute), so the UTC tzinfo comes from `timezone.utc`.
start_ms = int(datetime(2017, 9, 1, tzinfo=timezone.utc).timestamp() * 1000)
end_ms = int(datetime.now(timezone.utc).timestamp() * 1000)

# Download BTC
print("Downloading BTC 5m candles from Binance (this takes 5-10 minutes)...")
raw = fetch_binance_klines('BTCUSDT', '5m', start_ms, end_ms)
if not raw:
    # Fail here with a clear message instead of on NaT formatting below.
    raise RuntimeError("No BTC candles were downloaded; cannot continue")
btc_5m = klines_to_df(raw)
btc_5m.to_csv('/content/data/btc_5m_full.csv', index=False)
print(f"\u2705 BTC: {len(btc_5m):,} candles ({btc_5m['timestamp'].min().date()} \u2192 {btc_5m['timestamp'].max().date()})")

# Download ETH (for cross-asset features)
print("\nDownloading ETH 5m candles...")
raw = fetch_binance_klines('ETHUSDT', '5m', start_ms, end_ms)
if not raw:
    raise RuntimeError("No ETH candles were downloaded; cannot continue")
eth_5m = klines_to_df(raw)
eth_5m.to_csv('/content/data/eth_5m_full.csv', index=False)
print(f"\u2705 ETH: {len(eth_5m):,} candles ({eth_5m['timestamp'].min().date()} \u2192 {eth_5m['timestamp'].max().date()})")

# Save to Drive as backup
btc_5m.to_csv(f'{DRIVE_SAVE}/btc_5m_full.csv', index=False)
eth_5m.to_csv(f'{DRIVE_SAVE}/eth_5m_full.csv', index=False)
print("\n\u2705 Backed up to Drive")

## Cell 3: Download Macro Data (S&P 500, VIX, DXY, Yields)
Daily macro data via yfinance (S&P 500, Nasdaq, VIX, DXY, 10-year yield, gold), plus the Fear & Greed Index from alternative.me. BTC has 0.77-0.90 correlation with SPX in crashes -- this is the NEW signal our current models don't have.

In [None]:
# ============================================================
# CELL 3: Download macro data via yfinance
# ============================================================
# BTC has 0.77-0.90 correlation with S&P 500 in crash periods.
# This is the signal our models are missing.

import yfinance as yf

# Short column name -> Yahoo Finance ticker.
macro_tickers = {
    'spx': '^GSPC',       # S&P 500
    'ndx': '^IXIC',       # Nasdaq Composite
    'vix': '^VIX',        # CBOE Volatility Index
    'dxy': 'DX-Y.NYB',    # US Dollar Index
    'us10y': '^TNX',      # 10-Year Treasury Yield
    'gold': 'GC=F',       # Gold Futures
}

# One single-column frame of daily closes per series that downloaded.
macro_daily = {}
for name, ticker in macro_tickers.items():
    print(f"Downloading {name} ({ticker})...")
    try:
        quotes = yf.download(ticker, start='2017-01-01', progress=False)
        if len(quotes) == 0:
            print(f"  \u26a0\ufe0f {name}: no data returned")
            continue

        # yfinance may return (field, ticker) MultiIndex columns; keep the field level.
        if isinstance(quotes.columns, pd.MultiIndex):
            quotes.columns = quotes.columns.get_level_values(0)

        macro_daily[name] = quotes[['Close']].rename(columns={'Close': name})
        span_start = quotes.index.min().date()
        span_end = quotes.index.max().date()
        print(f"  \u2705 {name}: {len(quotes):,} days ({span_start} \u2192 {span_end})")
    except Exception as e:
        # A failed series is reported and skipped; the others still merge.
        print(f"  \u274c {name}: {e}")

# Merge into one DataFrame: align on date, forward-fill weekends/holidays,
# then back-fill any leading NaNs.
macro_df = pd.concat(macro_daily.values(), axis=1)
macro_df.index = pd.to_datetime(macro_df.index)
macro_df = macro_df.ffill().bfill()
macro_df.to_csv('/content/data/macro_daily.csv')

macro_first = macro_df.index.min().date()
macro_last = macro_df.index.max().date()
print(f"\n\u2705 Macro data: {len(macro_df):,} days")
print(f"   Columns: {list(macro_df.columns)}")
print(f"   Date range: {macro_first} \u2192 {macro_last}")

# Fear & Greed Index (daily, from alternative.me \u2014 free API)
print("\nDownloading Fear & Greed Index...")
try:
    fng_resp = requests.get("https://api.alternative.me/fng/?limit=0", timeout=30)
    fng_data = fng_resp.json()['data']
    fng_raw = pd.DataFrame(fng_data)
    # The API's `timestamp` is cast to int and read as epoch seconds; `value` is cast to int.
    fng_df = (
        pd.DataFrame({
            'timestamp': pd.to_datetime(fng_raw['timestamp'].astype(int), unit='s'),
            'fng': fng_raw['value'].astype(int),
        })
        .set_index('timestamp')
        .sort_index()
    )
    fng_df.to_csv('/content/data/fear_greed.csv')
    print(f"  \u2705 Fear & Greed: {len(fng_df):,} days")
except Exception as e:
    # Optional input: report the failure and leave a None marker behind.
    print(f"  \u274c Fear & Greed: {e}")
    fng_df = None

# Back up the macro table to Drive
macro_df.to_csv(f'{DRIVE_SAVE}/macro_daily.csv')
print("\u2705 Backed up to Drive")

## Cell 4: Download Binance Derivatives Data
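Funding rate (8h), Open Interest (5m), Long/Short ratio (5m), Taker Buy/Sell volume (5m). Funding-rate history is requested from Sep 2019 onward. Note that Binance documents the open-interest, long/short-ratio and taker-volume statistics endpoints as covering only roughly the most recent 30 days, so those three series cannot reach back to the earlier crash periods.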

In [None]:
# ============================================================
# CELL 4: Download Binance Futures derivatives data
# ============================================================
# Funding rate, Open Interest, Long/Short ratio, Taker buy/sell
# Free endpoints, no API key needed.
# Available from approximately mid-2019.

def fetch_binance_futures(endpoint, symbol, period=None, limit=500,
                          start_ms=None, end_ms=None):
    """Page through a Binance USD-M futures REST endpoint.

    Args:
        endpoint: Path appended to ``https://fapi.binance.com``.
        symbol: Contract symbol, e.g. ``'BTCUSDT'``.
        period: Aggregation period (e.g. ``'5m'``). Not sent when the
            endpoint path contains ``'fundingRate'``.
        limit: Number of records requested per page.
        start_ms: Optional start time, epoch milliseconds.
        end_ms: Optional end time, epoch milliseconds.

    Returns:
        List of the JSON records received, in arrival order. Paging stops
        on an empty / non-list payload, on a page shorter than ``limit``,
        when records carry neither ``fundingTime`` nor ``timestamp``, or
        after more than three consecutive failures. In the failure case
        the reason is printed and whatever was collected is returned.
    """
    base = "https://fapi.binance.com"
    max_retries = 3
    all_data = []
    current = start_ms
    retries = 0

    while True:
        params = {'symbol': symbol, 'limit': limit}
        if period and 'fundingRate' not in endpoint:
            params['period'] = period
        if current:
            params['startTime'] = current
        if end_ms:
            params['endTime'] = end_ms

        try:
            resp = requests.get(f"{base}{endpoint}", params=params, timeout=30)
            if resp.status_code == 429:
                # Rate limiting is not counted as a failure: wait and retry.
                print("    Rate limited, waiting 60s...")
                time.sleep(60)
                continue
            if resp.status_code != 200:
                retries += 1
                if retries > max_retries:
                    # Surface the status and the start of the response body
                    # so rejected requests are not silently treated as
                    # "no data".
                    print(f"    Giving up on {endpoint}: HTTP {resp.status_code} "
                          f"{resp.text[:200]}")
                    break
                time.sleep(5)
                continue

            data = resp.json()
            if not data or not isinstance(data, list):
                break

            all_data.extend(data)
            retries = 0

            # Resume one millisecond after the last record's time field.
            if 'fundingTime' in data[-1]:
                current = data[-1]['fundingTime'] + 1
            elif 'timestamp' in data[-1]:
                current = data[-1]['timestamp'] + 1
            else:
                break

            # A short page means there is nothing further to fetch.
            if len(data) < limit:
                break

            if len(all_data) % 10000 == 0:
                print(f"    ... {len(all_data):,} records")

            time.sleep(0.2)

        except Exception as e:
            # Best-effort download: network/parse errors are retried a few times.
            retries += 1
            if retries > max_retries:
                print(f"    Giving up after {retries} retries: {e}")
                break
            time.sleep(5)

    return all_data

from datetime import timezone  # Cell 1 imports only `datetime` and `timedelta`

# Request window in epoch milliseconds, pinned to UTC. `datetime` is the class
# here (it has no `UTC` attribute), so the UTC tzinfo comes from `timezone.utc`.
start_ms = int(datetime(2019, 9, 1, tzinfo=timezone.utc).timestamp() * 1000)
end_ms = int(datetime.now(timezone.utc).timestamp() * 1000)

# Binance documents the /futures/data/* statistics endpoints (open interest,
# long/short ratio, taker volume) as serving only about the latest 30 days,
# so a 2019 start time cannot be honoured there. Their start is clamped with
# a one-day safety margin; the funding-rate endpoint keeps the full window.
STATS_LOOKBACK_MS = 29 * 24 * 60 * 60 * 1000
stats_start_ms = max(start_ms, end_ms - STATS_LOOKBACK_MS)

# Name -> tidy DataFrame for every dataset that actually downloaded.
derivatives = {}

# 1. Funding Rate (8-hourly)
print("Downloading BTC funding rate (8h intervals)...")
raw = fetch_binance_futures('/fapi/v1/fundingRate', 'BTCUSDT',
                            start_ms=start_ms, end_ms=end_ms)
if raw:
    fr_df = pd.DataFrame(raw)
    fr_df['timestamp'] = pd.to_datetime(fr_df['fundingTime'], unit='ms')
    fr_df['funding_rate'] = fr_df['fundingRate'].astype(float)
    derivatives['funding_rate'] = fr_df[['timestamp', 'funding_rate']]
    derivatives['funding_rate'].to_csv('/content/data/btc_funding_rate.csv', index=False)
    print(f"  \u2705 Funding rate: {len(fr_df):,} records")
else:
    print("  \u26a0\ufe0f No funding rate data")

# 2. Open Interest (5m)
print("Downloading BTC open interest (5m)...")
raw = fetch_binance_futures('/futures/data/openInterestHist', 'BTCUSDT',
                            period='5m', start_ms=stats_start_ms, end_ms=end_ms)
if raw:
    oi_df = pd.DataFrame(raw)
    oi_df['timestamp'] = pd.to_datetime(oi_df['timestamp'], unit='ms')
    oi_df['open_interest'] = oi_df['sumOpenInterest'].astype(float)
    oi_df['oi_value'] = oi_df['sumOpenInterestValue'].astype(float)
    derivatives['open_interest'] = oi_df[['timestamp', 'open_interest', 'oi_value']]
    derivatives['open_interest'].to_csv('/content/data/btc_open_interest.csv', index=False)
    print(f"  \u2705 Open interest: {len(oi_df):,} records")
else:
    print("  \u26a0\ufe0f No open interest data")

# 3. Long/Short Ratio (5m)
print("Downloading BTC long/short ratio (5m)...")
raw = fetch_binance_futures('/futures/data/globalLongShortAccountRatio',
                            'BTCUSDT', period='5m',
                            start_ms=stats_start_ms, end_ms=end_ms)
if raw:
    ls_df = pd.DataFrame(raw)
    ls_df['timestamp'] = pd.to_datetime(ls_df['timestamp'], unit='ms')
    ls_df['long_short_ratio'] = ls_df['longShortRatio'].astype(float)
    ls_df['long_account'] = ls_df['longAccount'].astype(float)
    ls_df['short_account'] = ls_df['shortAccount'].astype(float)
    derivatives['long_short'] = ls_df[['timestamp', 'long_short_ratio',
                                        'long_account', 'short_account']]
    derivatives['long_short'].to_csv('/content/data/btc_long_short.csv', index=False)
    print(f"  \u2705 Long/short ratio: {len(ls_df):,} records")
else:
    print("  \u26a0\ufe0f No long/short data")

# 4. Taker Buy/Sell Volume (5m)
print("Downloading BTC taker buy/sell volume (5m)...")
raw = fetch_binance_futures('/futures/data/takeBuySellVol', 'BTCUSDT',
                            period='5m', start_ms=stats_start_ms, end_ms=end_ms)
if raw:
    tv_df = pd.DataFrame(raw)
    tv_df['timestamp'] = pd.to_datetime(tv_df['timestamp'], unit='ms')
    tv_df['taker_buy_vol'] = tv_df['buyVol'].astype(float)
    tv_df['taker_sell_vol'] = tv_df['sellVol'].astype(float)
    # Small epsilon keeps the ratio finite when sell volume is zero.
    tv_df['taker_ratio'] = tv_df['taker_buy_vol'] / (tv_df['taker_sell_vol'] + 1e-10)
    derivatives['taker_vol'] = tv_df[['timestamp', 'taker_buy_vol',
                                       'taker_sell_vol', 'taker_ratio']]
    derivatives['taker_vol'].to_csv('/content/data/btc_taker_vol.csv', index=False)
    print(f"  \u2705 Taker volume: {len(tv_df):,} records")
else:
    print("  \u26a0\ufe0f No taker volume data")

print(f"\n\u2705 Derivatives data complete. Saved {len(derivatives)} datasets.")

## Cell 5: Label Crash Periods & Merge All Data
Merge BTC 5m candles with macro (daily), derivatives (variable freq via merge_asof), ETH cross-asset, and Fear & Greed. Filter to crash periods only.

In [None]:
# ============================================================
# CELL 5: Label crash periods and merge everything
# ============================================================

# Load the BTC 5m candles written by Cell 2, oldest first.
btc = (
    pd.read_csv('/content/data/btc_5m_full.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# -- Define crash periods --
# Based on BTC market cycle analysis:
#   Crash 1: Post-2017 bubble, Jan 2018 peak -> Dec 2018 bottom
#   Crash 2: Post-2021 bubble, Nov 2021 ATH $69K -> Nov 2022 bottom $15.5K
#   Crash 3: Post-2025 bubble, Oct 2025 ATH $126K -> ongoing (~$65K)
# Both bounds are inclusive. The date strings compare as midnight timestamps,
# so only the 00:00 bar of each end date falls inside its window.
CRASH_PERIODS = [
    ('2018-01-07', '2018-12-15'),
    ('2021-11-10', '2022-11-21'),
    ('2025-10-06', '2026-02-28'),
]

# One boolean mask per period, OR-ed together into the crash flag.
period_masks = [
    (btc['timestamp'] >= start) & (btc['timestamp'] <= end)
    for start, end in CRASH_PERIODS
]
btc['is_crash'] = np.logical_or.reduce(period_masks)

crash_data = btc[btc['is_crash']].copy()
print(f"Total BTC candles: {len(btc):,}")
print(f"Crash candles: {len(crash_data):,} ({100*len(crash_data)/len(btc):.1f}%)")
for (start, end), period_mask in zip(CRASH_PERIODS, period_masks):
    n = int(period_mask.sum())
    print(f"  {start} \u2192 {end}: {n:,} candles")

# \u2500\u2500 Merge macro data (daily \u2192 5m via date join) \u2500\u2500
macro = pd.read_csv('/content/data/macro_daily.csv', index_col=0, parse_dates=True)
macro.index = pd.to_datetime(macro.index)
if macro.index.tz is not None:
    # Candle timestamps are tz-naive; drop tz info so the dates line up.
    macro.index = macro.index.tz_localize(None)
macro.index = macro.index.normalize()
macro = macro[~macro.index.duplicated(keep='last')].sort_index()

# Avoid look-ahead: a daily close dated D only exists once that session has
# ended, but a plain date join would attach it to every 5m bar of D starting
# at 00:00. Expand to a full calendar (weekends/holidays carry the last
# close) and lag by one day, so each bar only sees closes dated before it.
last_day = max(macro.index.max(), crash_data['timestamp'].max().normalize())
calendar = pd.date_range(macro.index.min(), last_day, freq='D')
macro = macro.reindex(calendar).ffill().shift(1)

crash_data['date'] = crash_data['timestamp'].dt.strftime('%Y-%m-%d')
macro['date'] = macro.index.strftime('%Y-%m-%d')
crash_data = crash_data.merge(macro, on='date', how='left')
for col in ['spx', 'ndx', 'vix', 'dxy', 'us10y', 'gold']:
    if col in crash_data.columns:
        # Fill any dates the macro table does not cover.
        crash_data[col] = crash_data[col].ffill().bfill()
print(f"\n\u2705 Merged macro data")

# -- Merge Fear & Greed (daily -> 5m via date join) --
# NOTE(review): this assumes the index value dated D is already published at
# the start of day D; if it is published later, lag it like the macro data.
try:
    fng = pd.read_csv('/content/data/fear_greed.csv', parse_dates=['timestamp'])
    fng['date'] = fng['timestamp'].dt.strftime('%Y-%m-%d')
    crash_data = crash_data.merge(fng[['date', 'fng']], on='date', how='left')
    crash_data['fng'] = crash_data['fng'].ffill().bfill().fillna(50)
    print(f"\u2705 Merged Fear & Greed")
except Exception as e:
    # Optional input: fall back to the neutral midpoint of the 0-100 scale.
    crash_data['fng'] = 50
    print(f"\u26a0\ufe0f Fear & Greed failed, using default 50: {e}")

# \u2500\u2500 Merge derivatives (variable freq \u2192 5m via merge_asof) \u2500\u2500
# merge_asof needs both sides ordered by the join key.
crash_data = crash_data.sort_values('timestamp').reset_index(drop=True)

# Dataset name -> (CSV written by Cell 4, value columns to pull in).
deriv_files = {
    'funding_rate': ('/content/data/btc_funding_rate.csv', ['funding_rate']),
    'open_interest': ('/content/data/btc_open_interest.csv', ['open_interest', 'oi_value']),
    'long_short': ('/content/data/btc_long_short.csv', ['long_short_ratio', 'long_account', 'short_account']),
    'taker_vol': ('/content/data/btc_taker_vol.csv', ['taker_buy_vol', 'taker_sell_vol', 'taker_ratio']),
}

for name, spec in deriv_files.items():
    path, cols = spec
    try:
        source = (
            pd.read_csv(path, parse_dates=['timestamp'])
            .sort_values('timestamp')
            .reset_index(drop=True)
        )
        # Each candle takes the latest record at or before it, but only if
        # that record is at most 8h old (funding rate is 8-hourly).
        crash_data = pd.merge_asof(
            crash_data,
            source[['timestamp', *cols]],
            on='timestamp',
            direction='backward',
            tolerance=pd.Timedelta('8h'),
        )
        filled = crash_data[cols[0]].notna().sum()
        print(f"\u2705 Merged {name}: {filled:,}/{len(crash_data):,} rows filled")
    except Exception as e:
        # Missing/unreadable dataset: keep the schema with all-NaN columns.
        crash_data = crash_data.assign(**{col: np.nan for col in cols})
        print(f"\u26a0\ufe0f {name} merge failed: {e}")

# \u2500\u2500 Merge ETH cross-asset data \u2500\u2500
try:
    eth = (
        pd.read_csv('/content/data/eth_5m_full.csv')
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
        .sort_values('timestamp')
        .reset_index(drop=True)
    )
    # Only close and volume are carried over, prefixed to avoid clashing
    # with the BTC columns.
    eth_merge = eth.loc[:, ['timestamp', 'close', 'volume']].rename(
        columns={'close': 'eth_close', 'volume': 'eth_volume'}
    )
    # Match each BTC candle to the ETH candle at or up to 5 minutes before it.
    crash_data = pd.merge_asof(
        crash_data,
        eth_merge,
        on='timestamp',
        direction='backward',
        tolerance=pd.Timedelta('5min'),
    )
    filled = crash_data['eth_close'].notna().sum()
    print(f"\u2705 Merged ETH: {filled:,}/{len(crash_data):,} rows filled")
except Exception as e:
    # ETH is optional: keep the columns present but empty.
    crash_data = crash_data.assign(eth_close=np.nan, eth_volume=np.nan)
    print(f"\u26a0\ufe0f ETH merge failed: {e}")

# \u2500\u2500 Clean and save \u2500\u2500
# Rows without a close price are unusable; drop them and renumber.
crash_data = crash_data.dropna(subset=['close']).reset_index(drop=True)

# Write the merged table locally and to Drive.
for target in ('/content/data/crash_dataset_raw.csv',
               f'{DRIVE_SAVE}/crash_dataset_raw.csv'):
    crash_data.to_csv(target, index=False)

total_rows = len(crash_data)
print(f"\n\u2705 Crash dataset: {total_rows:,} rows, {len(crash_data.columns)} columns")

# Per-column coverage report, alphabetical.
print(f"\nColumn overview:")
coverage = crash_data.notna().sum()
for col in sorted(crash_data.columns):
    non_null = coverage[col]
    pct = 100 * non_null / total_rows
    print(f"  {col:30s} {non_null:>8,} non-null ({pct:.0f}%)")

## Cell 6: Engineer Crash-Specific Features
40 features in 4 groups: BTC price/volume (15), Macro correlation (10), Derivatives (9), Cross-asset (6). All features are normalized/relative (no absolute prices).

In [None]:
# ============================================================
# CELL 6: Build crash-specific features
# ============================================================
# 40 features across 4 groups, all scale-invariant

df = crash_data.copy()

# ============================================================
# GROUP 1: BTC PRICE & VOLUME (15 features)
# ============================================================

BARS_PER_DAY = 288  # 24h of 5-minute candles

# Returns at multiple horizons (relative, not absolute)
df['return_1bar'] = df['close'].pct_change(1)          # 5 min
df['return_6bar'] = df['close'].pct_change(6)          # 30 min
df['return_12bar'] = df['close'].pct_change(12)        # 1 hour
df['return_48bar'] = df['close'].pct_change(48)        # 4 hours
df['return_288bar'] = df['close'].pct_change(288)      # 24 hours

# Volatility (rolling std of returns -- already scale-invariant)
df['vol_12bar'] = df['return_1bar'].rolling(12).std()
df['vol_48bar'] = df['return_1bar'].rolling(48).std()
df['vol_ratio'] = df['vol_12bar'] / (df['vol_48bar'] + 1e-10)

# Volume (relative to own history -- scale-invariant)
df['vol_sma_20'] = df['volume'].rolling(20).mean()
df['volume_surge'] = df['volume'] / (df['vol_sma_20'] + 1e-10)
df['volume_trend'] = df['volume'].rolling(12).mean() / (df['volume'].rolling(48).mean() + 1e-10)

# Consecutive red candles (count). A candle is "green" when close > open;
# everything else (including close == open) extends a red run.
df['candle_dir'] = (df['close'] > df['open']).astype(int)
groups = (df['candle_dir'] != df['candle_dir'].shift()).cumsum()
# cumcount() numbers the rows of each run from 0, so add 1: the first red
# candle of a run counts as 1 and stays distinguishable from a green candle.
df['consecutive_red'] = (df.groupby(groups).cumcount() + 1).astype(float)
df.loc[df['candle_dir'] == 1, 'consecutive_red'] = 0.0

# Drawdown from rolling 24h high (relative)
df['rolling_high_24h'] = df['high'].rolling(BARS_PER_DAY).max()
df['drawdown_24h'] = df['close'] / df['rolling_high_24h'] - 1

# RSI (normalized to [-1, 1])
delta = df['close'].diff()
gain = delta.where(delta > 0, 0).rolling(14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
rs = gain / (loss + 1e-10)
df['rsi_14_norm'] = (100 - (100 / (1 + rs)) - 50) / 50

# Bollinger Band position (already normalized)
bb_mid = df['close'].rolling(20).mean()
bb_std = df['close'].rolling(20).std()
df['bb_pct_b'] = (df['close'] - bb_mid) / (2 * bb_std + 1e-10)

# VWAP distance (relative)
# Session VWAP resets every 288 rows: cumulative quote volume divided by
# cumulative base volume. Per-session cumsum keeps the result aligned with
# df's index without going through groupby.apply.
df['session'] = np.arange(len(df)) // BARS_PER_DAY
by_session = df.groupby('session')
vwap = by_session['quote_volume'].cumsum() / (by_session['volume'].cumsum() + 1e-10)
df['vwap_distance'] = df['close'] / (vwap + 1e-10) - 1

# crash_data stitches together periods that are years apart (and may contain
# missing candles), so row-based lookbacks right after a gap mix unrelated
# prices. Every feature above looks back at most 288 rows: keep a row only
# when the bar 288 rows earlier is exactly 24h older. The blanked rows are
# removed by the later dropna on the feature columns.
price_volume_features = [
    'return_1bar', 'return_6bar', 'return_12bar', 'return_48bar', 'return_288bar',
    'vol_12bar', 'vol_48bar', 'vol_ratio',
    'volume_surge', 'volume_trend',
    'consecutive_red', 'drawdown_24h',
    'rsi_14_norm', 'bb_pct_b', 'vwap_distance',
]
lookback_span = df['timestamp'] - df['timestamp'].shift(BARS_PER_DAY)
full_history = lookback_span == pd.Timedelta(minutes=5 * BARS_PER_DAY)
df.loc[~full_history, price_volume_features] = np.nan

# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
# GROUP 2: MACRO CORRELATION (10 features)
# THE NEW SIGNAL \u2014 BTC follows SPX in crashes
# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550

# The frame stitches together crash periods that are years apart. A break of
# more than one day between consecutive bars marks the start of a new period;
# day-over-day changes and rolling means must not reach across such a break.
_ONE_DAY = pd.Timedelta(days=1)
_macro_segment = (df['timestamp'].diff() > _ONE_DAY).cumsum()


def _macro_available(col):
    """True when `col` exists in df and has more than 100 non-null rows."""
    return col in df.columns and df[col].notna().sum() > 100


def _daily_change(col, relative=True):
    """Per-bar day-over-day change of `col` (same value for all bars of a day).

    Uses the last value of each date, as a pct change (`relative=True`) or a
    plain difference. Days whose predecessor in the data is not the previous
    calendar day get 0, like any other day without a known change.
    """
    daily = df.groupby('date')[col].last()
    change = daily.pct_change() if relative else daily.diff()
    calendar_day = pd.Series(pd.to_datetime(daily.index), index=daily.index)
    change = change.where(calendar_day.diff() == _ONE_DAY)
    return df['date'].map(change).fillna(0)


def _vs_rolling_mean(col, days):
    """Relative distance of `col` from its `days`-day mean within one period.

    NaN until a period has at least 288 bars of history.
    """
    window = 288 * days
    sma = df.groupby(_macro_segment)[col].transform(
        lambda s: s.rolling(window, min_periods=288).mean()
    )
    return df[col] / (sma + 1e-10) - 1


# S&P 500
if _macro_available('spx'):
    # Daily SPX return (same value for all bars in a day)
    df['spx_return_1d'] = _daily_change('spx')
    # SPX trend (vs 5-day SMA equivalent)
    df['spx_vs_sma'] = _vs_rolling_mean('spx', days=5)
else:
    df['spx_return_1d'] = 0.0
    df['spx_vs_sma'] = 0.0

# VIX
if _macro_available('vix'):
    df['vix_norm'] = (df['vix'] - 20) / 20
    df['vix_change'] = _daily_change('vix')
    df['vix_extreme'] = (df['vix'] > 30).astype(float)
else:
    df['vix_norm'] = 0.0
    df['vix_change'] = 0.0
    df['vix_extreme'] = 0.0

# Dollar Index
if _macro_available('dxy'):
    df['dxy_return_1d'] = _daily_change('dxy')
    df['dxy_trend'] = _vs_rolling_mean('dxy', days=20)
else:
    df['dxy_return_1d'] = 0.0
    df['dxy_trend'] = 0.0

# Treasury Yields
if _macro_available('us10y'):
    df['yield_level'] = (df['us10y'] - 3.0) / 2.0
    # Yields use an absolute day-over-day difference, not a percentage change.
    df['yield_change'] = _daily_change('us10y', relative=False)
else:
    df['yield_level'] = 0.0
    df['yield_change'] = 0.0

# Fear & Greed (centred on 50 and scaled by 50)
if _macro_available('fng'):
    df['fng_norm'] = (df['fng'] - 50) / 50
else:
    df['fng_norm'] = 0.0

# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
# GROUP 3: DERIVATIVES (9 features)
# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550

def _deriv_available(col):
    """True when `col` exists in df and has more than 100 non-null rows."""
    return col in df.columns and df[col].notna().sum() > 100


# Funding rate: z-score against a 7-day rolling window, plus two
# extreme-level flags.
if _deriv_available('funding_rate'):
    df['funding_rate'] = df['funding_rate'].ffill().fillna(0)
    funding_window = df['funding_rate'].rolling(288 * 7, min_periods=288)
    fr_mean = funding_window.mean()
    fr_std = funding_window.std()
    df['funding_z'] = (df['funding_rate'] - fr_mean) / (fr_std + 1e-10)
    df['funding_extreme_long'] = (df['funding_rate'] > 0.01).astype(float)
    df['funding_extreme_short'] = (df['funding_rate'] < -0.01).astype(float)
else:
    for missing in ('funding_z', 'funding_extreme_long', 'funding_extreme_short'):
        df[missing] = 0.0

# Open Interest: 1h / 4h relative change and a spike flag (|1h change| > 5%).
if _deriv_available('oi_value'):
    df['oi_value'] = df['oi_value'].ffill().bfill()
    df['oi_change_1h'] = df['oi_value'].pct_change(12)
    df['oi_change_4h'] = df['oi_value'].pct_change(48)
    df['oi_spike'] = (df['oi_change_1h'].abs() > 0.05).astype(float)
else:
    for missing in ('oi_change_1h', 'oi_change_4h', 'oi_spike'):
        df[missing] = 0.0

# Long/Short Ratio: centred on 1.0. The extreme flag is computed from the
# unfilled ratio, so rows without a ratio are flagged 0.
if _deriv_available('long_short_ratio'):
    ls_filled = df['long_short_ratio'].ffill().fillna(1.0)
    df['ls_ratio_norm'] = ls_filled - 1.0
    df['ls_extreme_long'] = (df['long_short_ratio'] > 2.0).astype(float)
else:
    for missing in ('ls_ratio_norm', 'ls_extreme_long'):
        df[missing] = 0.0

# Taker Buy/Sell: buy/sell volume ratio centred on 1.0.
if _deriv_available('taker_ratio'):
    taker_filled = df['taker_ratio'].ffill().fillna(1.0)
    df['taker_imbalance'] = taker_filled - 1.0
else:
    df['taker_imbalance'] = 0.0

# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
# GROUP 4: CROSS-ASSET (6 features)
# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550

eth_usable = 'eth_close' in df.columns and df['eth_close'].notna().sum() > 100

if eth_usable:
    eth_px = df['eth_close']
    df['eth_return_1bar'] = eth_px.pct_change(1)
    df['eth_return_6bar'] = eth_px.pct_change(6)
    # ETH priced in BTC, and its change over 12 bars.
    df['eth_btc_ratio'] = eth_px / (df['close'] + 1e-10)
    df['eth_btc_ratio_change'] = df['eth_btc_ratio'].pct_change(12)
    # Despite the "lead" name these are BTC's own 1-bar returns from
    # 1, 2 and 3 bars earlier.
    for lag in (1, 2, 3):
        df[f'btc_lead_{lag}'] = df['return_1bar'].shift(lag)
else:
    # No usable ETH data: neutral placeholders keep the feature schema intact.
    for placeholder in ('eth_return_1bar', 'eth_return_6bar', 'eth_btc_ratio_change',
                        'btc_lead_1', 'btc_lead_2', 'btc_lead_3'):
        df[placeholder] = 0.0

# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
# LABELS: 6-bar forward return (30 min ahead)
# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550

# Forward return over the next 6 bars (30 minutes).
df['forward_return_6'] = df['close'].shift(-6) / df['close'] - 1

# A target is only meaningful when the bar 6 rows ahead really is 30 minutes
# later. That is false for the last 6 rows (no future bar) and for rows just
# before a gap between crash periods. Drop those rows here: casting
# `NaN > 0` to int would otherwise label them 0 ("down").
target_span = df['timestamp'].shift(-6) - df['timestamp']
has_valid_target = target_span == pd.Timedelta(minutes=30)
df = df[has_valid_target].reset_index(drop=True)

df['label_binary'] = (df['forward_return_6'] > 0).astype(int)  # 1=up, 0=down
# Squashed continuous target in (-1, 1); a 1% move maps to tanh(1).
df['label_soft'] = np.tanh(df['forward_return_6'] * 100)

# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
# FEATURE LIST (40 features)
# \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550

# Model inputs, listed per feature group; the concatenation order below is
# the column order used for training.
btc_price_volume_cols = [
    'return_1bar', 'return_6bar', 'return_12bar', 'return_48bar', 'return_288bar',
    'vol_12bar', 'vol_48bar', 'vol_ratio',
    'volume_surge', 'volume_trend',
    'consecutive_red', 'drawdown_24h',
    'rsi_14_norm', 'bb_pct_b', 'vwap_distance',
]
macro_cols = [
    'spx_return_1d', 'spx_vs_sma',
    'vix_norm', 'vix_change', 'vix_extreme',
    'dxy_return_1d', 'dxy_trend',
    'yield_level', 'yield_change',
    'fng_norm',
]
derivative_cols = [
    'funding_z', 'funding_extreme_long', 'funding_extreme_short',
    'oi_change_1h', 'oi_change_4h', 'oi_spike',
    'ls_ratio_norm', 'ls_extreme_long',
    'taker_imbalance',
]
cross_asset_cols = [
    'eth_return_1bar', 'eth_return_6bar', 'eth_btc_ratio_change',
    'btc_lead_1', 'btc_lead_2', 'btc_lead_3',
]
FEATURE_COLS = [
    *btc_price_volume_cols,
    *macro_cols,
    *derivative_cols,
    *cross_asset_cols,
]

# Treat +/-inf as missing, then keep only rows with every feature and a label.
required_cols = FEATURE_COLS + ['label_binary']
df = (
    df.replace([np.inf, -np.inf], np.nan)
    .dropna(subset=required_cols)
    .reset_index(drop=True)
)

print(f"\n\u2705 Feature engineering complete")
print(f"Clean dataset: {len(df):,} rows with {len(FEATURE_COLS)} features")
print(f"Label balance: {df['label_binary'].mean():.3f} (1=up)")
print(f"\nFeature groups:")
for summary_line in (
    "  BTC price/volume:  15 features",
    "  Macro correlation: 10 features",
    "  Derivatives:        9 features",
    "  Cross-asset:        6 features",
    "  Total:             40 features",
):
    print(summary_line)

# Save feature dataset to Drive (features plus timestamp, price and targets)
export_cols = FEATURE_COLS + ['timestamp', 'close', 'label_binary', 'label_soft', 'forward_return_6']
df[export_cols].to_csv(f'{DRIVE_SAVE}/crash_features.csv', index=False)
print(f"\u2705 Feature dataset saved to Drive")

## Cell 7: Walk-Forward Split & Train LightGBM
Train on crash 1+2, validate on first half of crash 3, test on second half. Crash 1 gets 0.5x weight (older market structure).

In [None]:
# ============================================================
# CELL 7: Train crash-regime LightGBM
# ============================================================
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score
import pickle
import json
import shutil

# -- Split by crash period (walk-forward) --
# Each crash is selected purely by timestamp range.
ts = df['timestamp']
crash1 = df[ts < '2019-01-01']
crash2 = df[(ts >= '2021-11-01') & (ts < '2023-01-01')]
crash3 = df[ts >= '2025-10-01']

print(f"Crash 1 (2018):    {len(crash1):>8,} rows")
print(f"Crash 2 (2021-22): {len(crash2):>8,} rows")
print(f"Crash 3 (2025-26): {len(crash3):>8,} rows")

if len(crash3) <= 1000:
    # Not enough crash-3 data: carve validation and test out of the tail of
    # crash 2 instead (70% train / 15% val / 15% test, in row order).
    split_70 = int(len(crash2) * 0.7)
    split_85 = int(len(crash2) * 0.85)
    train_data = pd.concat([crash1, crash2.iloc[:split_70]])
    val_data = crash2.iloc[split_70:split_85]
    test_data = crash2.iloc[split_85:]
else:
    # Normal case: train on crash 1 + 2, validate on the first half of
    # crash 3, and test on the second half.
    split_idx = len(crash3) // 2
    train_data = pd.concat([crash1, crash2])
    val_data, test_data = crash3.iloc[:split_idx], crash3.iloc[split_idx:]

# Sample weights: crash-1 rows (older) count half as much as everything else.
crash1_mask = train_data['timestamp'] < '2019-01-01'
train_weights = np.where(crash1_mask.values, 0.5, 1.0)

# Plain NumPy matrices / label vectors for each partition.
X_train, y_train = train_data[FEATURE_COLS].values, train_data['label_binary'].values
X_val, y_val = val_data[FEATURE_COLS].values, val_data['label_binary'].values
X_test, y_test = test_data[FEATURE_COLS].values, test_data['label_binary'].values

print(f"\nTrain: {len(X_train):,} | Val: {len(X_val):,} | Test: {len(X_test):,}")
print(f"Train up%: {y_train.mean():.3f} | Val up%: {y_val.mean():.3f} | Test up%: {y_test.mean():.3f}")

# -- Train --
# Wrap the matrices as LightGBM datasets. Only the training set carries the
# per-row sample weights; the validation set is unweighted.
train_set = lgb.Dataset(X_train, label=y_train, weight=train_weights,
                         feature_name=FEATURE_COLS)
val_set = lgb.Dataset(X_val, label=y_val, feature_name=FEATURE_COLS)

# Binary classifier optimised and monitored on log-loss. The same dict is
# later written into the model metadata, so keep it JSON-serialisable.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 100,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
}

# The rocket emoji is written as one \U escape. Spelling it as a UTF-16
# surrogate pair (\ud83d\ude80) yields two lone surrogates in a Python str,
# which print() cannot encode to UTF-8 and would abort the cell.
print("\n\U0001F680 Training LightGBM (crash-regime)...\n")

# Up to 500 boosting rounds, with early stopping (patience 30) on the
# validation set and an evaluation log line every 50 rounds.
model = lgb.train(
    params,
    train_set,
    num_boost_round=500,
    valid_sets=[val_set],
    valid_names=['val'],
    callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)],
)

# -- Evaluate --
# Predicted probability of the positive ("up") class for both held-out sets.
val_probs = model.predict(X_val)
test_probs = model.predict(X_test)

# Hard class calls: strictly above 0.5 counts as "up".
val_calls = (val_probs > 0.5).astype(int)
test_calls = (test_probs > 0.5).astype(int)

# Threshold-based accuracy and threshold-free AUC for each set.
val_acc, test_acc = accuracy_score(y_val, val_calls), accuracy_score(y_test, test_calls)
val_auc, test_auc = roc_auc_score(y_val, val_probs), roc_auc_score(y_test, test_probs)

# Spread of the validation probabilities (near zero means the model emits
# almost the same probability for every row).
pred_std = np.std(val_probs)

print(f"\n{'='*60}")
print(f"CRASH-REGIME LIGHTGBM RESULTS")
print(f"{'='*60}")
print(f"  Val accuracy:  {val_acc:.4f}  ({val_acc*100:.1f}%)")
print(f"  Test accuracy: {test_acc:.4f}  ({test_acc*100:.1f}%)")
print(f"  Val AUC:       {val_auc:.4f}")
print(f"  Test AUC:      {test_auc:.4f}")
print(f"  Pred std:      {pred_std:.4f}")
print(f"  Best round:    {model.best_iteration}")

# -- Feature importance --
# Gain-based importance, paired with feature names and sorted descending.
importance = model.feature_importance(importance_type='gain')
feat_imp = sorted(zip(FEATURE_COLS, importance), key=lambda x: -x[1])

macro_feats = {'spx_return_1d','spx_vs_sma','vix_norm','vix_change','vix_extreme',
               'dxy_return_1d','dxy_trend','yield_level','yield_change','fng_norm'}
deriv_feats = {'funding_z','funding_extreme_long','funding_extreme_short',
               'oi_change_1h','oi_change_4h','oi_spike','ls_ratio_norm',
               'ls_extreme_long','taker_imbalance'}
cross_feats = {'eth_return_1bar','eth_return_6bar','eth_btc_ratio_change',
               'btc_lead_1','btc_lead_2','btc_lead_3'}

# Single source of truth for feature -> group. Any feature not listed in one
# of the three sets above is treated as a BTC price/volume feature.
feature_group = {}
for group_name, members in (('MACRO', macro_feats),
                            ('DERIV', deriv_feats),
                            ('CROSS', cross_feats)):
    for member in members:
        feature_group[member] = group_name

print(f"\nTop 20 features by gain:")
for i, (feat, imp) in enumerate(feat_imp[:20], 1):
    group = feature_group.get(feat, 'PRICE')
    print(f"  {i:3d}. [{group:5s}] {feat:30s} {imp:>12,.0f}")

# Calculate group importance totals
group_imp = {'PRICE': 0, 'MACRO': 0, 'DERIV': 0, 'CROSS': 0}
for feat, imp in feat_imp:
    group_imp[feature_group.get(feat, 'PRICE')] += imp

total_imp = sum(group_imp.values())
print(f"\nFeature group importance:")
for group, imp in sorted(group_imp.items(), key=lambda x: -x[1]):
    # If the model made no splits, every gain is 0; report 0.0% rather than
    # dividing by zero (which would print nan% / emit a runtime warning).
    share = 100 * imp / total_imp if total_imp > 0 else 0.0
    print(f"  {group:5s}: {imp:>12,.0f} ({share:.1f}%)")

# -- Save model --
# `timezone` is not imported anywhere else in the notebook; importing it here
# keeps the cell self-contained. `datetime` is re-imported as the class so the
# timestamp call below is unambiguous.
from datetime import datetime, timezone

# Two on-disk formats: LightGBM's native text dump and a pickle of the
# Booster object. NOTE: only ever unpickle files you produced yourself.
model.save_model('/content/models/crash_lightgbm_model.txt')
with open('/content/models/crash_lightgbm_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Metadata describing how the model was trained and how it scored.
meta = {
    'model_type': 'lightgbm_crash_regime',
    'regime': 'CRASH',
    'val_accuracy': float(val_acc),
    'test_accuracy': float(test_acc),
    'val_auc': float(val_auc),
    'test_auc': float(test_auc),
    'pred_std': float(pred_std),
    'best_round': int(model.best_iteration),
    'n_features': len(FEATURE_COLS),
    'feature_names': FEATURE_COLS,
    'feature_importance': {f: float(i) for f, i in feat_imp},
    'train_rows': int(len(X_train)),
    'val_rows': int(len(X_val)),
    'test_rows': int(len(X_test)),
    'crash_periods': CRASH_PERIODS,
    'crash1_weight': 0.5,
    'params': params,
    # Timezone-aware UTC timestamp. The previous `datetime.now(datetime.UTC)`
    # raised AttributeError (the datetime *class* has no UTC attribute),
    # aborting the cell before anything was copied to Drive.
    'trained_at': datetime.now(timezone.utc).isoformat(),
}
with open('/content/models/crash_lightgbm_meta.json', 'w') as f:
    json.dump(meta, f, indent=2)

# -- SAVE TO DRIVE IMMEDIATELY --
# Copy every file in the local models directory to Drive. os.path.join avoids
# the doubled slash that plain concatenation produced (DRIVE_SAVE already
# ends with '/').
for fname in os.listdir('/content/models/'):
    local_path = os.path.join('/content/models', fname)
    shutil.copy(local_path, os.path.join(DRIVE_SAVE, fname))
    sz = os.path.getsize(local_path) / 1024
    print(f"\u2705 Saved to Drive: {fname} ({sz:.1f} KB)")

# Emoji written as a single \U escape: a \ud83c\udfaf surrogate pair cannot be
# encoded by print() and would raise UnicodeEncodeError.
print(f"\n\U0001F3AF Crash-regime LightGBM trained and saved!")

## Cell 8: Confidence Calibration Analysis
Critical for Polymarket: when the model says 85% confident, how often is it actually right?

In [None]:
# ============================================================
# CELL 8: Confidence calibration analysis
# ============================================================
# This answers: "When the model says 85% confident, is it right 85% of the time?"
# Critical for Polymarket strategy which bets at 85%+ confidence.

# Calibration is measured on the TEST partition (out-of-sample). Work on
# copies so the original arrays stay untouched.
probs = test_probs.copy()
actuals = y_test.copy()

# `probs` is the model's probability of UP, in [0, 1].
# Confidence is the distance from a coin flip, rescaled to [0, 1]:
#   prob = 0.85 -> 85% sure of UP   -> confidence 0.70
#   prob = 0.15 -> 85% sure of DOWN -> confidence 0.70
confidence = 2 * np.abs(probs - 0.5)

# Hard call: strictly above 0.5 is UP (1), everything else is DOWN (0).
predicted_up = np.where(probs > 0.5, 1, 0)

# 1 where the hard call matches the realised direction, else 0.
correct = np.equal(predicted_up, actuals).astype(int)

# Bin by model probability (both directions)
print(f"{'Probability':>12} {'Direction':>10} {'Count':>8} {'Accuracy':>10} {'Bet?':>8}")
print("-" * 55)

# UP side: half-open bins [lo, hi). The last upper edge is 1.01 so that a
# probability of exactly 1.0 still lands in the top bin.
prob_bins = [
    (0.50, 0.55, 'UP'),
    (0.55, 0.60, 'UP'),
    (0.60, 0.65, 'UP'),
    (0.65, 0.70, 'UP'),
    (0.70, 0.75, 'UP'),
    (0.75, 0.80, 'UP'),
    (0.80, 0.85, 'UP'),
    (0.85, 0.90, 'UP'),
    (0.90, 0.95, 'UP'),
    (0.95, 1.01, 'UP'),
]

for lo, hi, direction in prob_bins:
    mask = (probs >= lo) & (probs < hi)
    if mask.sum() > 0:
        # An UP row is "right" when the market actually went up. Scoring
        # against the realised label directly (as the DOWN rows do) keeps a
        # probability of exactly 0.5 from being graded as a DOWN call.
        acc = (actuals[mask] == 1).mean()
        bet = "\u2705 BET" if lo >= 0.85 else "\u2014"
        # Cap the displayed upper edge at 100% (the 1.01 edge is only a
        # binning device).
        print(f"  {lo:.0%}-{min(hi, 1.0):.0%}      {direction:>10} {mask.sum():>8,} {acc:>10.1%} {bet:>8}")

print()
# Also check DOWN predictions (prob < 0.5). The 0-5% bin is included so the
# most confident DOWN calls are not silently left out of the table.
down_bins = [(0.00, 0.05), (0.05, 0.10), (0.10, 0.15), (0.15, 0.20), (0.20, 0.25),
             (0.25, 0.30), (0.30, 0.35), (0.35, 0.40), (0.40, 0.45), (0.45, 0.50)]
for lo, hi in down_bins:
    mask = (probs >= lo) & (probs < hi)
    if mask.sum() > 0:
        # For DOWN predictions, "correct" means actual=0 (down)
        down_correct = (actuals[mask] == 0).mean()
        bet = "\u2705 BET" if hi <= 0.15 else "\u2014"
        print(f"  {lo:.0%}-{hi:.0%}      {'DOWN':>10} {mask.sum():>8,} {down_correct:>10.1%} {bet:>8}")

# Summary for Polymarket thresholds
print(f"\n{'='*55}")
print(f"POLYMARKET DECISION THRESHOLDS")
print(f"{'='*55}")

# 85%+ confident UP (prob >= 0.85)
mask_85_up = probs >= 0.85
if mask_85_up.sum() > 0:
    acc = correct[mask_85_up].mean()
    print(f"  85%+ UP:   {mask_85_up.sum():>6,} predictions, {acc:.1%} accuracy")
else:
    print(f"  85%+ UP:   0 predictions")

# 85%+ confident DOWN (prob <= 0.15)
mask_85_down = probs <= 0.15
if mask_85_down.sum() > 0:
    # A DOWN call is right when the realised label is 0.
    acc = (actuals[mask_85_down] == 0).mean()
    print(f"  85%+ DOWN: {mask_85_down.sum():>6,} predictions, {acc:.1%} accuracy")
else:
    print(f"  85%+ DOWN: 0 predictions")

# Combined high confidence
mask_85_any = (probs >= 0.85) | (probs <= 0.15)
if mask_85_any.sum() > 0:
    # Score each high-confidence row against the direction it called.
    hc_correct = np.where(probs[mask_85_any] >= 0.5,
                          actuals[mask_85_any] == 1,
                          actuals[mask_85_any] == 0)
    print(f"  85%+ ANY:  {mask_85_any.sum():>6,} predictions, {hc_correct.mean():.1%} accuracy")
    # Build the verdict outside the f-string: a backslash escape inside an
    # f-string expression is a SyntaxError before Python 3.12.
    verdict = 'BET' if hc_correct.mean() > 0.60 else 'WAIT \u2014 models not calibrated yet'
    print(f"\n  \u2192 Polymarket should {verdict}")
else:
    print(f"  85%+ ANY:  0 predictions \u2014 model never reaches 85% confidence")
    # Check what confidence it does reach
    max_conf = confidence.max()
    # confidence is on a 0-1 scale; map it back to a 50-100% probability.
    print(f"  Max confidence seen: {50 + max_conf*50:.1f}%")
    for thresh in [0.60, 0.65, 0.70, 0.75, 0.80]:
        # Convert the probability threshold to the 0-1 confidence scale.
        mask = confidence >= (thresh - 0.5) * 2
        # Only report thresholds backed by more than 10 predictions.
        if mask.sum() > 10:
            hc = np.where(probs[mask] >= 0.5, actuals[mask] == 1, actuals[mask] == 0)
            print(f"  {thresh:.0%}+ conf: {mask.sum():>6,} predictions, {hc.mean():.1%} accuracy")

# Persist the per-row calibration inputs to Drive: one row per test sample
# with the raw probability, derived confidence, hard call, realised label and
# whether the call was right.
calibration_columns = {
    'probability': probs,
    'confidence': confidence,
    'predicted_up': predicted_up,
    'actual_up': actuals,
    'correct': correct,
}
cal_df = pd.DataFrame(calibration_columns)
calibration_csv = f'{DRIVE_SAVE}/calibration_analysis.csv'
cal_df.to_csv(calibration_csv, index=False)
print(f"\n\u2705 Calibration data saved to Drive")

# ============================================================
# FINAL SUMMARY
# ============================================================
# Headline numbers come from the TEST partition.
print(f"\n\n{'='*60}")
print(f"TRAINING COMPLETE \u2014 CRASH REGIME LIGHTGBM")
print(f"{'='*60}")
print(f"  Accuracy:       {test_acc:.1%}")
print(f"  AUC:            {test_auc:.4f}")
print(f"  Features:       {len(FEATURE_COLS)} (40)")
print(f"  Training data:  {len(X_train):,} crash-period rows")
print(f"  Model saved to: {DRIVE_SAVE}")
print(f"{'='*60}")

# Verdict by test accuracy: above 53% -> deploy, above 51% -> marginal,
# otherwise no improvement.
# Emoji outside the Basic Multilingual Plane are written as single \U escapes:
# the surrogate-pair spelling (\ud83c\udfaf, \ud83d\udcca) produces lone
# surrogates that print() cannot encode, raising UnicodeEncodeError.
if test_acc > 0.53:
    print(f"\n\U0001F3AF ACCURACY ABOVE 53% \u2014 Ready to deploy!")
    print(f"   Download crash_lightgbm_model.pkl from Google Drive")
    print(f"   and place in models/trained/ on the VPS.")
elif test_acc > 0.51:
    print(f"\n\U0001F4CA MARGINAL IMPROVEMENT \u2014 Consider adding more features")
    print(f"   Try intraday SPX data (1-min) instead of daily")
else:
    print(f"\n\u26a0\ufe0f NO IMPROVEMENT \u2014 Macro features may not help at 5-min resolution")
    print(f"   Consider longer prediction horizon (1h, 4h) or different features")