# Feature Matrix Generation

This notebook generates a comprehensive feature matrix for EURUSD bars (the bar type is set by `BAR_TYPE` in the load cell, currently `'time'`) using:
- ForexFeatureEngine (MA features, volatility, trend, risk, market structure)
- Technical indicators (RSI, MACD, ADX, ATR, Bollinger Bands)
- Momentum and autocorrelation features
- Higher-order return moments (skewness, kurtosis)
- Lagged returns and volatility estimates

Output: `data/features_triple_barrier.csv` (plus the column list in `data/feature_list.txt`), ready for labeling and model training

In [None]:
# Imports
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from features import create_meta_features
from load_data import load_bars
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the kernel session, including any deprecation or runtime
# warnings later cells might raise. Consider narrowing it to the specific
# warnings that are known to be noise.
warnings.filterwarnings('ignore')

## 1. Load Data

In [None]:
# Load the EURUSD bars that every later cell builds on.
# SYMBOL and BAR_TYPE are forwarded to load_bars(); LOOKBACK is the window
# that the feature-generation cell passes as `lookback_window`.
SYMBOL = 'EURUSD'
BAR_TYPE = 'time'
LOOKBACK = 100  # must be an int, not the string '100'

print(f"Loading {SYMBOL} {BAR_TYPE} bars...")
df = load_bars(SYMBOL, BAR_TYPE)

# Fail fast with a readable message: the date-range line below indexes the
# first and last rows and would otherwise die with a bare IndexError.
if len(df) == 0:
    raise ValueError(f"load_bars returned no rows for {SYMBOL} {BAR_TYPE} bars")

print(f"✓ Loaded {len(df):,} bars")
print(f"  Date range: {df.index[0]} to {df.index[-1]}")
print(f"  Columns: {list(df.columns)}")
df.head()

## 2. Generate Core Features

In [None]:
# Build the feature matrix from the loaded bars.
# bb_period / bb_std are the Bollinger Bands period and standard-deviation
# settings handed to create_meta_features.
features = create_meta_features(data=df, lookback_window=LOOKBACK, bb_period=20, bb_std=2)

# Report how many feature columns came back and the overall shape.
print(
    f"\n✓ Generated {features.shape[1]} features\n"
    f"  Shape: {features.shape}"
)
features.head(50)

## 3. Feature Summary

In [None]:
# Print a headline summary of the feature matrix: column and row counts, the
# index span, how many columns fall into each named feature family, the
# memory footprint and the total number of missing values.
print("\n" + "="*60)
print("TRIPLE-BARRIER FEATURE MATRIX SUMMARY")
print("="*60)
print(f"Total features: {len(features.columns)}")
print(f"Total observations: {len(features):,}")
print(f"Date range: {features.index[0]} to {features.index[-1]}")
print(f"\nFeature categories:")

# One predicate per category, applied to each column name. Keeping the rules
# in a single table makes overlaps between categories easy to spot.
# str.startswith accepts a tuple of prefixes, so no inner any(...) is needed.
category_rules = {
    'Spread': lambda c: 'spread' in c,
    'Bollinger Bands': lambda c: c.startswith('bb_'),
    'Returns (Lagged)': lambda c: c.startswith('returns'),
    'Volatility (Yang-Zhang)': lambda c: c.startswith('vol'),
    'Autocorrelation': lambda c: c.startswith('autocorr'),
    'Period Volatility (H/D)': lambda c: c.startswith(('H1', 'H4', 'D1')),
    'Higher Moments': lambda c: any(x in c for x in ('skew', 'kurt')),
    'True Range/ATR': lambda c: c in ('tr', 'atr'),
    'MA Differences': lambda c: c.startswith('sma_diff'),
    'Momentum': lambda c: c.startswith('mom_'),
    'RSI': lambda c: 'rsi' in c.lower(),
    'Stochastic': lambda c: c.startswith('stoch'),
    'ADX/Directional': lambda c: any(x in c for x in ('adx', 'dmp', 'dmn', 'dm_net')),
    'MACD': lambda c: c.startswith('macd'),
    # The 'ad' prefix covers both 'ad*' and 'adosc*' columns, but it would also
    # swallow 'adx*' columns, which are already counted under ADX/Directional.
    # Exclude them here so ADX features are not reported twice.
    'Volume (OBV/AD)': lambda c: c.startswith(('obv', 'ad')) and not c.startswith('adx'),
    'MA Bias': lambda c: c.startswith('bias_'),
    'Acceleration': lambda c: c.startswith('acc_'),
}

# Count matching columns per category, preserving the order of the table.
categories = {
    cat: sum(1 for c in features.columns if rule(c))
    for cat, rule in category_rules.items()
}

# Only categories that matched at least one column are listed.
for cat, count in categories.items():
    if count > 0:
        print(f"  {cat}: {count}")

print(f"\nMemory usage: {features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Missing values: {features.isnull().sum().sum():,}")

## 4. Data Quality Check

In [None]:
# Sanity checks on the generated features: infinities, per-column NaN counts,
# and descriptive statistics.
print("Data quality checks:")

# Count +/-inf entries across the numeric columns only.
inf_count = np.isinf(features.select_dtypes(include=[np.number])).sum().sum()
print(f"  Infinite values: {inf_count}")

# Per-column NaN counts, keeping only affected columns, worst offenders first.
nan_cols = (
    features.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
if nan_cols.empty:
    print("  ✓ No NaN values")
else:
    print("\n  Columns with NaN values (top 10):")
    print(nan_cols.head(10))

# Descriptive statistics, one row per feature (first 50 features shown).
print("\nFeature statistics:")
features.describe().transpose().head(50)

## 5. Save Feature Matrix

In [None]:
# Persist the feature matrix as CSV, plus a numbered plain-text list of its
# columns for reference.
output_file = 'data/features_triple_barrier.csv'
feature_list_file = 'data/feature_list.txt'

# Make sure the target directories exist; on a fresh checkout 'data/' may be
# missing, and both writes below would fail.
for target in (output_file, feature_list_file):
    Path(target).parent.mkdir(parents=True, exist_ok=True)

print(f"Saving feature matrix to {output_file}...")

features.to_csv(output_file)
print(f"✓ Saved {len(features):,} rows × {len(features.columns)} columns")

# Also save feature list for reference.
# The encoding is explicit so the file does not depend on the platform's
# locale default.
with open(feature_list_file, 'w', encoding='utf-8') as f:
    f.write("Feature Matrix Columns\n")
    f.write("=" * 60 + "\n\n")
    f.writelines(f"{i}. {col}\n" for i, col in enumerate(features.columns, 1))

print(f"✓ Saved feature list to {feature_list_file}")
print("\nFeature matrix generation complete!")
print("Next step: Create training_data.ipynb to add labels and split data")

In [None]:
# Show the last ten rows of the feature matrix as a final visual check.
features.iloc[-10:]