# Model Training: Triple-Barrier (Meta-Labeling)

This notebook trains a Random Forest classifier for meta-labeling with the Bollinger Band mean reversion strategy.

**Workflow:**
1. Load preprocessed training data (features, labels, metadata)
2. Train Random Forest classifier
3. Evaluate on test set with confusion matrix and metrics
4. Analyze feature importance
5. Save trained model for deployment

**Meta-Labeling Context:**
- Binary classification: 0 (SKIP_TRADE) vs 1 (TAKE_TRADE)
- Purpose: Filter false positives from primary Bollinger strategy
- Features: 62 technical indicators on EURUSD tick bars
- Labels: ~10k strategy signals (not every bar)

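**Model:** Random Forest with balanced class weights (`class_weight='balanced'`). Per-sample weighting is not applied in this notebook; it is listed as a follow-up under Next Steps.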

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
from datetime import datetime
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including any emitted by the libraries used below. Narrow or remove the filter
# when debugging unexpected metric values.
warnings.filterwarnings('ignore')

# Model and evaluation utilities from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    roc_auc_score
)

# Confirms the import cell ran to completion
print("Libraries loaded successfully")

## 1. Load Preprocessed Training Data

Load the most recent preprocessed dataset produced by `training_data_triple_barrier.ipynb`.

In [None]:
# Locate the most recent preprocessed dataset and load every split that belongs to it.
data_dir = Path('data/training')

# Lexicographic order of the file names is used as chronological order.
# NOTE(review): this assumes the file-name suffix is a sortable timestamp such as
# YYYYMMDD_HHMMSS — confirm against the preprocessing notebook.
train_files = sorted(data_dir.glob('X_train_triple_barrier_*.csv'))

if not train_files:
    raise FileNotFoundError("No preprocessed training data found. Run training_data_triple_barrier.ipynb first.")

# The dataset timestamp is the last two underscore-separated tokens of the file stem
latest_file = train_files[-1]
timestamp = '_'.join(latest_file.stem.split('_')[-2:])

print(f"Loading preprocessed data from: {timestamp}")


def _read_split(prefix):
    """Read one CSV of the selected dataset, using its first column as a date-parsed index."""
    return pd.read_csv(
        data_dir / f'{prefix}_triple_barrier_{timestamp}.csv',
        index_col=0,
        parse_dates=True,
    )


def _check_split(split_name, features, labels):
    """Fail early if a features/labels pair cannot be used together for training or scoring."""
    if features.empty:
        raise ValueError(f"{split_name} split is empty for dataset {timestamp}")
    if 'bin' not in labels.columns:
        raise KeyError(f"{split_name} labels have no 'bin' column (found: {list(labels.columns)})")
    if len(features) != len(labels):
        raise ValueError(
            f"{split_name} features/labels length mismatch: {len(features):,} vs {len(labels):,}"
        )
    # scikit-learn pairs rows by position, so a differing index would silently
    # train or score on misaligned labels.
    if not features.index.equals(labels.index):
        raise ValueError(f"{split_name} features and labels do not share the same index")


# Load all files with matching timestamp
X_train = _read_split('X_train')
y_train = _read_split('y_train')
X_test = _read_split('X_test')
y_test = _read_split('y_test')
metadata_train = _read_split('metadata_train')
metadata_test = _read_split('metadata_test')

# Validate the splits before anything downstream consumes them
_check_split('Train', X_train, y_train)
_check_split('Test', X_test, y_test)
if list(X_train.columns) != list(X_test.columns):
    raise ValueError("Train and test feature columns differ; re-run the preprocessing notebook")

print(f"\n✓ Data loaded successfully")
print(f"  Train samples: {len(X_train):,}")
print(f"  Test samples: {len(X_test):,}")
print(f"  Features: {X_train.shape[1]}")
print(f"  Train date range: {X_train.index[0]} to {X_train.index[-1]}")
print(f"  Test date range: {X_test.index[0]} to {X_test.index[-1]}")

In [None]:
# Report absolute counts and relative frequencies of the 'bin' label for each split
for position, (split_name, split_labels) in enumerate(
    (("Train", y_train['bin']), ("Test", y_test['bin']))
):
    # Only the second section is preceded by a blank line
    leading_gap = "\n" if position else ""
    print(f"{leading_gap}Label Distribution ({split_name}):")
    print(split_labels.value_counts())
    print(f"\nClass balance: {split_labels.value_counts(normalize=True)}")

## 2. Train Random Forest Classifier

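Train a Random Forest with a fixed set of hand-picked hyperparameters (these are not scikit-learn's defaults):
- `n_estimators=500` - Number of trees
- `max_depth=10` - Cap tree depth to limit overfitting
- `min_samples_split=20` - Minimum samples required to split a node
- `min_samples_leaf=10` - Minimum samples required in each leaf
- `random_state=42` - Reproducibility
- `n_jobs=-1` - Use all CPU cores
- `class_weight='balanced'` - Handle class imbalance if present
- `bootstrap=True`, `oob_score=True` - Bootstrap sampling with an out-of-bag score for validation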

In [None]:
# Hyperparameters for the meta-labeling Random Forest, collected in one place
rf_params = {
    'n_estimators': 500,
    'max_depth': 10,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
    'bootstrap': True,
    'oob_score': True,  # out-of-bag score, reported after fitting
    'verbose': 1,
}

# Build the (still unfitted) classifier
rf_clf = RandomForestClassifier(**rf_params)

# Announce what is about to be trained and when
n_train_rows, n_train_features = X_train.shape
start_clock = datetime.now().strftime('%H:%M:%S')

print("Training Random Forest classifier...")
print(f"Training samples: {n_train_rows:,}")
print(f"Features: {n_train_features}")
print(f"Starting training at: {start_clock}\n")

In [None]:
# Fit the forest on the training features against the binary 'bin' label
train_labels = y_train['bin']
rf_clf.fit(X_train, train_labels)

# Report completion time together with the fitted model's OOB score and feature count
finished_clock = datetime.now().strftime('%H:%M:%S')
oob_value = rf_clf.oob_score_
fitted_feature_count = rf_clf.n_features_in_

print(f"\n✓ Training completed at: {finished_clock}")
print(f"  OOB Score: {oob_value:.4f}")
print(f"  Number of features: {fitted_feature_count}")

## 3. Model Evaluation

### 3.1 Predictions

In [None]:
# Hard class predictions for both splits
y_train_pred, y_test_pred = rf_clf.predict(X_train), rf_clf.predict(X_test)

# Probability of the second class (column index 1 of predict_proba) for both splits
train_proba_matrix = rf_clf.predict_proba(X_train)
y_train_proba = train_proba_matrix[:, 1]
test_proba_matrix = rf_clf.predict_proba(X_test)
y_test_proba = test_proba_matrix[:, 1]

n_train_pred = len(y_train_pred)
n_test_pred = len(y_test_pred)

print("✓ Predictions generated")
print(f"  Train predictions: {n_train_pred:,}")
print(f"  Test predictions: {n_test_pred:,}")

### 3.2 Performance Metrics

In [None]:
# Ground-truth labels for each split
y_true_train = y_train['bin']
y_true_test = y_test['bin']

# Train-set scores (AUC is computed from probabilities, the rest from hard predictions)
train_accuracy = accuracy_score(y_true_train, y_train_pred)
train_precision = precision_score(y_true_train, y_train_pred)
train_recall = recall_score(y_true_train, y_train_pred)
train_f1 = f1_score(y_true_train, y_train_pred)
train_auc = roc_auc_score(y_true_train, y_train_proba)

# Test-set scores, same definitions
test_accuracy = accuracy_score(y_true_test, y_test_pred)
test_precision = precision_score(y_true_test, y_test_pred)
test_recall = recall_score(y_true_test, y_test_pred)
test_f1 = f1_score(y_true_test, y_test_pred)
test_auc = roc_auc_score(y_true_test, y_test_proba)

# One row per metric: (display name, train value, test value)
metric_rows = [
    ('Accuracy', train_accuracy, test_accuracy),
    ('Precision', train_precision, test_precision),
    ('Recall', train_recall, test_recall),
    ('F1 Score', train_f1, test_f1),
    ('ROC AUC', train_auc, test_auc),
]

# Render the comparison table
heavy_rule = "=" * 60
light_rule = "-" * 60

print(heavy_rule)
print("MODEL PERFORMANCE METRICS")
print(heavy_rule)
print(f"\n{'Metric':<20} {'Train':<15} {'Test':<15} {'Difference':<15}")
print(light_rule)
for metric_name, train_value, test_value in metric_rows:
    print(f"{metric_name:<20} {train_value:<15.4f} {test_value:<15.4f} {train_value - test_value:<15.4f}")
print(heavy_rule)

# Flag a train/test accuracy gap above 0.1 as a sign of overfitting
accuracy_gap = train_accuracy - test_accuracy
gap_message = (
    "\n⚠ Warning: Large train-test gap suggests overfitting"
    if accuracy_gap > 0.1
    else "\n✓ Train-test performance gap is reasonable"
)
print(gap_message)

In [None]:
# Per-class precision/recall/F1 on the test set, printed to four decimals
class_display_names = ['SKIP_TRADE (0)', 'TAKE_TRADE (1)']

print("\nClassification Report (Test Set):")
print("=" * 60)

test_report_text = classification_report(
    y_test['bin'],
    y_test_pred,
    target_names=class_display_names,
    digits=4,
)
print(test_report_text)

### 3.3 Confusion Matrix

In [None]:
# Compute the confusion matrix for the test set.
# labels=[0, 1] pins the result to a 2x2 matrix even when one class is missing
# from both the true labels and the predictions; without it the matrix can be
# 1x1, which breaks the reshape(2, 2) and the four-way unpacking below.
cm = confusion_matrix(y_test['bin'], y_test_pred, labels=[0, 1])
cm_total = np.sum(cm)

# Create labeled confusion matrix
plt.figure(figsize=(10, 8))

# Cell annotations: outcome name, raw count, and share of all test samples.
# The order follows cm.flatten(): row 0 = actual 0, row 1 = actual 1.
group_names = ['True Neg\n(Correct Skip)', 'False Pos\n(Wrong Take)', 
               'False Neg\n(Missed Trade)', 'True Pos\n(Correct Take)']
group_counts = [f'{value:,}' for value in cm.flatten()]
group_percentages = [f'{value:.2%}' for value in cm.flatten() / cm_total]

labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)

# Plot heatmap; fmt='' because the annotations are already formatted strings
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues', 
            xticklabels=['SKIP_TRADE (0)', 'TAKE_TRADE (1)'],
            yticklabels=['SKIP_TRADE (0)', 'TAKE_TRADE (1)'],
            cbar_kws={'label': 'Count'})

plt.title('Confusion Matrix - Meta-Labeling (Test Set)', fontsize=14, fontweight='bold')
plt.ylabel('Actual Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.show()

# Calculate and display key metrics from confusion matrix
tn, fp, fn, tp = cm.ravel()
print(f"\nConfusion Matrix Breakdown:")
print(f"  True Negatives (Correct Skip):  {tn:,} ({tn/cm_total:.2%})")
print(f"  False Positives (Wrong Take):    {fp:,} ({fp/cm_total:.2%})")
print(f"  False Negatives (Missed Trade):  {fn:,} ({fn/cm_total:.2%})")
print(f"  True Positives (Correct Take):   {tp:,} ({tp/cm_total:.2%})")

print(f"\nInterpretation for Meta-Labeling:")
print(f"  • False Positives: Model says TAKE but strategy loses (cost: trade execution)")
print(f"  • False Negatives: Model says SKIP but strategy wins (cost: missed profit)")
print(f"  • Goal: Minimize false positives to filter bad signals from primary strategy")

### 3.4 ROC Curve

In [None]:
# ROC operating points for the test set, from the class-1 probabilities
fpr, tpr, thresholds = roc_curve(y_test['bin'], y_test_proba)

# Draw the model's curve against the chance diagonal
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, 'b-', label=f'Random Forest (AUC = {test_auc:.4f})', linewidth=2)
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.5000)', linewidth=1)

ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve - Meta-Labeling (Test Set)', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

print(f"\nROC AUC Score: {test_auc:.4f}")

# Verdict bands, checked from the highest cutoff down; anything else is "poor"
auc_verdicts = (
    (0.7, "✓ Good discrimination between classes"),
    (0.6, "⚠ Moderate discrimination - consider feature engineering"),
)
for auc_cutoff, auc_verdict in auc_verdicts:
    if test_auc > auc_cutoff:
        break
else:
    auc_verdict = "⚠ Poor discrimination - model needs improvement"
print(auc_verdict)

## 4. Feature Importance Analysis

In [None]:
# Rank features by the forest's impurity-based importance, highest first
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_clf.feature_importances_
}).sort_values('importance', ascending=False)

# Show at most top_n features. The count actually shown is taken from the data so
# the bar chart does not fail with a shape mismatch when fewer features exist.
top_n = 20
top_features = feature_importance.head(top_n)
n_shown = len(top_features)

# Display the top features as a text table
print(f"Top {n_shown} Most Important Features:")
print("=" * 60)
for _, row in top_features.iterrows():
    print(f"{row['feature']:<30} {row['importance']:.6f}")

# Plot feature importance
plt.figure(figsize=(12, 10))

plt.barh(range(n_shown), top_features['importance'])
plt.yticks(range(n_shown), top_features['feature'])
plt.xlabel('Importance Score (Mean Decrease in Impurity)', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title(f'Top {n_shown} Feature Importances - Random Forest', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # most important feature at the top
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Save Trained Model

In [None]:
# Create models directory if it doesn't exist
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

# Use a dedicated name for the model's timestamp so the global `timestamp`, which
# identifies the dataset that was loaded, is not overwritten by this cell.
model_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
model_filename = models_dir / f'rf_meta_labeling_{model_timestamp}.pkl'

# Save model with metadata
model_data = {
    'model': rf_clf,
    'feature_names': X_train.columns.tolist(),
    'feature_importance': feature_importance,
    'train_date_range': (str(X_train.index[0]), str(X_train.index[-1])),
    'test_date_range': (str(X_test.index[0]), str(X_test.index[-1])),
    'train_samples': len(X_train),
    'test_samples': len(X_test),
    'metrics': {
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1,
        'test_auc': test_auc,
        'oob_score': rf_clf.oob_score_
    },
    # When the model was saved (same key and meaning as before)
    'timestamp': model_timestamp,
    # Which preprocessed dataset the model was trained on
    'data_timestamp': timestamp
}

joblib.dump(model_data, model_filename)

print(f"\n✓ Model saved successfully")
print(f"  Location: {model_filename}")
print(f"  Size: {model_filename.stat().st_size / 1024 / 1024:.2f} MB")
# NOTE: joblib files are pickle-based; only load model files from a trusted source.
print(f"\nTo load this model later:")
print(f"  model_data = joblib.load('{model_filename}')")
print(f"  rf_clf = model_data['model']")

## 6. Summary

Review the model training results and next steps.

In [None]:
# Final recap: dataset size and period, test-set metrics, top features, saved file.
print("=" * 70)
print("MODEL TRAINING SUMMARY")
print("=" * 70)
print(f"\nModel Type: Random Forest Classifier (Meta-Labeling)")
print(f"Strategy: Bollinger Band Mean Reversion")
print(f"\nDataset:")
print(f"  Training samples: {len(X_train):,}")
print(f"  Test samples: {len(X_test):,}")
print(f"  Features: {X_train.shape[1]}")
print(f"  Train period: {X_train.index[0].date()} to {X_train.index[-1].date()}")
print(f"  Test period: {X_test.index[0].date()} to {X_test.index[-1].date()}")
print(f"\nModel Performance (Test Set):")
print(f"  Accuracy:  {test_accuracy:.4f}")
print(f"  Precision: {test_precision:.4f}")
print(f"  Recall:    {test_recall:.4f}")
print(f"  F1 Score:  {test_f1:.4f}")
print(f"  ROC AUC:   {test_auc:.4f}")
print(f"  OOB Score: {rf_clf.oob_score_:.4f}")
print(f"\nTop 5 Most Important Features:")
# feature_importance keeps its pre-sort index labels (the feature's column
# position), so the rank has to come from enumeration, not from the index.
for rank, (_, row) in enumerate(feature_importance.head(5).iterrows(), start=1):
    print(f"  {rank}. {row['feature']}: {row['importance']:.6f}")
print(f"\nModel saved: {model_filename.name}")
print("=" * 70)
print("\nNext Steps:")
print("  1. Run backtesting to evaluate trading performance")
print("  2. Compare with baseline (primary strategy without ML filter)")
print("  3. Try different probability thresholds to optimize precision/recall")
# The scikit-learn class is RandomizedSearchCV (there is no RandomSearchCV)
print("  4. Consider hyperparameter tuning (GridSearchCV, RandomizedSearchCV)")
print("  5. Experiment with sample weighting (concurrency, return attribution)")
print("=" * 70)