# LightGBM Directional Classifier Training

**Hypothesis 2**: A global directional classifier (LightGBM) with per-stock confidence thresholds can generate alpha.

## Approach
- Global model: one model across all stocks (captures cross-asset patterns)
- Binary classification: UP/DOWN direction
- Per-stock thresholds τ*: only trade when confidence > τ*
- Walk-forward validation: train pre-2022, optimize thresholds 2022-2023, test 2024+

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import sys
sys.path.insert(0, '../..')

from utils.features import compute_features, forward_return_direction, multi_timeframe_returns
from utils.evaluation import performance_report, print_report, sharpe_ratio
from utils.data_loaders import load_yahoo
from utils.model_store import ModelStore

plt.style.use('seaborn-v0_8-whitegrid')

## 1. Load Data — Diverse 19-Stock Universe

In [None]:
# Universe grouped by sector. Flattening the groups in insertion order
# produces the flat ticker list that the rest of the notebook uses.
sector_universe = {
    'Tech': ['AAPL', 'MSFT', 'NVDA', 'GOOGL'],
    'Finance': ['JPM', 'BAC', 'GS'],
    'Healthcare': ['JNJ', 'UNH', 'PFE'],
    'Consumer': ['AMZN', 'WMT', 'KO'],
    'Energy': ['XOM', 'CVX'],
    'Industrial': ['CAT', 'BA'],
    'Communication': ['META', 'DIS'],
}
tickers = [symbol for members in sector_universe.values() for symbol in members]

# One long frame for all tickers; the `ticker` column identifies each row's stock.
df = load_yahoo(tickers, start='2018-01-01', end='2024-12-31')
print(f'Loaded: {len(df)} rows, {df["ticker"].nunique()} tickers')

## 2. Compute Features + Target

In [None]:
TARGET_PERIODS = 5  # label horizon: direction of the 5-day forward return


def _featurize_ticker(symbol, prices):
    """Build the feature frame for one ticker and attach `ticker` and `target` columns.

    Rows are put in date order first so that the feature and label helpers
    see a chronologically ordered series.
    """
    ordered = prices.sort_values('date').reset_index(drop=True)
    frame = compute_features(ordered)
    frame['ticker'] = symbol
    frame['target'] = forward_return_direction(frame['close'], periods=TARGET_PERIODS)
    return frame


# Features are computed per ticker so nothing is calculated across stock boundaries.
all_featured = [_featurize_ticker(symbol, prices) for symbol, prices in df.groupby('ticker')]

# Stack all tickers, drop rows containing any NaN, and order the pooled rows by date.
data = pd.concat(all_featured, ignore_index=True)
data = data.dropna().sort_values('date').reset_index(drop=True)

# Identifier, label and raw price/volume columns are not model inputs.
exclude = ['date', 'ticker', 'target', 'open', 'high', 'low', 'close', 'volume', 'adj_close']
feature_cols = [c for c in data.columns if c not in exclude]

print(f'Feature matrix: {data.shape}')
print(f'Features: {len(feature_cols)}')
print(f'Target: {data["target"].value_counts().to_dict()}')

## 3. Walk-Forward Validation

In [None]:
# Chronological split (half-open date intervals):
#   train: date <  train_end
#   val:   train_end <= date < val_end   (early stopping + threshold tuning)
#   test:  date >= val_end               (held out)
train_end = '2022-01-01'
# Validation spans 2022-2023 so that the test set is 2024+, matching the
# walk-forward plan stated in the notebook's Approach section.
val_end = '2024-01-01'


def _purge_tail(frame, n_dates):
    """Return `frame` without the rows of its last `n_dates` distinct dates.

    Used to purge rows whose forward-looking label reaches into the next split.

    Raises:
        ValueError: if `frame` has no more than `n_dates` distinct dates.
    """
    if n_dates <= 0:
        return frame
    distinct_dates = frame['date'].drop_duplicates().sort_values()
    if len(distinct_dates) <= n_dates:
        raise ValueError(
            f'Cannot purge {n_dates} dates from a split with only {len(distinct_dates)} dates'
        )
    first_purged_date = distinct_dates.iloc[-n_dates]
    return frame[frame['date'] < first_purged_date]


train = data[data['date'] < train_end]
val = data[(data['date'] >= train_end) & (data['date'] < val_end)]
test = data[data['date'] >= val_end]

# The label looks TARGET_PERIODS rows ahead, so the last TARGET_PERIODS dates of
# train (val) carry labels computed from prices inside val (test). Dropping them
# prevents label leakage across the split boundaries.
# NOTE(review): assumes forward_return_direction labels row t from the close
# TARGET_PERIODS rows later and that all tickers share trading dates - confirm.
train = _purge_tail(train, TARGET_PERIODS).copy()
# Copies (not views of `data`) so later cells can add columns without a
# SettingWithCopyWarning.
val = _purge_tail(val, TARGET_PERIODS).copy()
test = test.copy()

print(f'Train: {len(train)} rows ({train["date"].min().date()} to {train["date"].max().date()})')
print(f'Val:   {len(val)} rows ({val["date"].min().date()} to {val["date"].max().date()})')
print(f'Test:  {len(test)} rows ({test["date"].min().date()} to {test["date"].max().date()})')

# Plain NumPy matrices; column order follows feature_cols.
X_train, y_train = train[feature_cols].values, train['target'].values
X_val, y_val = val[feature_cols].values, val['target'].values
X_test, y_test = test[feature_cols].values, test['target'].values

## 4. Train LightGBM

In [None]:
# Global gradient-boosted classifier trained on the pooled rows of all tickers.
model = lgb.LGBMClassifier(
    num_leaves=31,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=500,       # upper bound; early stopping below picks the actual count
    min_child_samples=20,
    subsample=0.8,
    subsample_freq=1,       # row subsampling only takes effect when the bagging frequency is > 0
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    class_weight='balanced',
    random_state=42,
    verbose=-1,
)

# The validation set is used only for monitoring: training stops once the
# validation metric has not improved for 50 consecutive rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[
        lgb.early_stopping(50, verbose=False),
        lgb.log_evaluation(period=50),
    ],
)

print(f'\nBest iteration: {model.best_iteration_}')

## 5. Evaluate on Test Set

In [None]:
# Hard class labels and per-class probabilities for the held-out test rows.
# Column 1 of y_prob is used as the positive-class score for AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

print('=== Test Set Results ===')
print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print(f'AUC: {roc_auc_score(y_test, y_prob[:, 1]):.4f}')
# NOTE(review): the DOWN/UP names assume the target is encoded 0 = DOWN, 1 = UP - confirm.
print(f'\n{classification_report(y_test, y_pred, target_names=["DOWN", "UP"])}')

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['DOWN', 'UP'],
    yticklabels=['DOWN', 'UP'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix — Test Set')
plt.show()

## 6. Optimize Per-Stock Thresholds

Use the validation set to find, for each stock, the confidence threshold τ* that maximizes the Sharpe ratio of the resulting long signals.

In [None]:
# Per-stock confidence thresholds, tuned on the validation set by maximising
# the Sharpe ratio of the trades each threshold would have taken.
DEFAULT_THRESHOLD = 0.6  # fallback when a stock cannot be tuned reliably
MIN_VAL_ROWS = 20        # minimum validation rows needed to attempt tuning
MIN_TRADES = 5           # a threshold must produce more than this many trades
# 0.50, 0.55, ..., 0.80 - built from integers so the stored values are exact
# two-decimal floats rather than drifted values such as 0.6000000000000001.
THRESHOLD_GRID = [pct / 100 for pct in range(50, 85, 5)]


def _tune_threshold(ticker_frame):
    """Return the grid threshold with the highest Sharpe ratio for one stock.

    A trade is opened on every row where P(UP) exceeds the threshold and is
    scored by the realised TARGET_PERIODS-row forward return of `close`, i.e.
    the payoff the classifier is trained to predict. Rows whose forward return
    would need prices beyond the end of `ticker_frame` are ignored.

    Args:
        ticker_frame: validation rows of a single ticker with `date`, `close`
            and `prob_up` columns.

    Returns:
        The best threshold, or DEFAULT_THRESHOLD when no grid value yields
        more than MIN_TRADES scoreable trades with a finite Sharpe ratio.
    """
    ordered = ticker_frame.sort_values('date')
    # NOTE(review): assumes one row per trading day, so a shift by rows equals
    # a shift by trading days - confirm.
    forward_return = ordered['close'].shift(-TARGET_PERIODS) / ordered['close'] - 1

    best_threshold = DEFAULT_THRESHOLD
    best_sharpe = -np.inf
    for threshold in THRESHOLD_GRID:
        # Simulate: buy when P(UP) > threshold.
        trade_returns = forward_return[ordered['prob_up'] > threshold].dropna()
        if len(trade_returns) <= MIN_TRADES:
            continue
        # Only the ranking across thresholds matters here, so the scaling
        # convention used by sharpe_ratio does not affect the choice.
        sr = sharpe_ratio(trade_returns)
        if np.isfinite(sr) and sr > best_sharpe:
            best_sharpe = sr
            best_threshold = threshold
    return best_threshold


# Predict on the validation set; column 1 is P(UP).
val_probs = model.predict_proba(X_val)[:, 1]
# assign() returns a new frame, so nothing is written into a slice of `data`.
val = val.assign(prob_up=val_probs)

optimal_thresholds = {}
for ticker in tickers:
    ticker_val = val[val['ticker'] == ticker]
    if len(ticker_val) < MIN_VAL_ROWS:
        optimal_thresholds[ticker] = DEFAULT_THRESHOLD
    else:
        optimal_thresholds[ticker] = _tune_threshold(ticker_val)

print('Optimal thresholds per stock:')
for ticker, t in sorted(optimal_thresholds.items()):
    print(f'  {ticker}: {t:.2f}')

print('Optimal thresholds per stock:')
for ticker, t in sorted(optimal_thresholds.items()):
    print(f'  {ticker}: {t:.2f}')

## 7. Feature Importance

In [None]:
# Pair each feature name with the importance score reported by the fitted
# model, then rank from most to least important.
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_,
})
importance = importance.sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = importance.head(20)
fig, ax = plt.subplots(figsize=(10, 8))
top_features.plot(kind='barh', x='feature', y='importance', ax=ax)
ax.set_title('Top 20 Feature Importances')
plt.tight_layout()
plt.show()

## 8. Save Model

In [None]:
import json

# Persist the fitted model locally together with the metadata recorded below.
store = ModelStore(local_dir='../../models')
model_metadata = {
    'target_periods': TARGET_PERIODS,
    'n_features': len(feature_cols),
    'feature_names': feature_cols,
    'optimal_thresholds': optimal_thresholds,
    'test_accuracy': float(accuracy_score(y_test, y_pred)),
    'test_auc': float(roc_auc_score(y_test, y_prob[:, 1])),
}
model_dir = store.save(model, 'lightgbm_directional', metadata=model_metadata)
print(f'Model saved to: {model_dir}')

# Also write the ordered feature list as a standalone JSON file next to the
# model, for the QuantConnect integration.
feature_names_path = f'{model_dir}/feature_names.json'
with open(feature_names_path, 'w') as f:
    json.dump(feature_cols, f)
print('Feature names saved for QuantConnect integration.')

## 9. Next Steps

- [ ] Upload model to QC ObjectStore for live strategy use
- [ ] Run backtest in QC with the directional_classifier strategy
- [ ] Compare to SPY buy-and-hold
- [ ] Test with different target periods (1, 5, 10, 21 days)
- [ ] Try incremental learning (warm-start each walk-forward fold from the model trained on the previous fold)