# Trend-Scanning Feature Generation

This notebook generates features specifically for **trend-scanning labels**.

**Approach:**
- Uses `trend_scanning_labels()` with `lookforward=False` to generate features
- These features capture adaptive trend characteristics without lookahead bias
- Output: `data/features_trend_scanning.csv` (plus the column list in `data/feature_list_trend_scanning.txt`), ready for labeling

**Key Features:**
- Linear regression features (window, slope, t_value, rsquared)
- Trend strength and direction
- Statistical significance of trends

In [10]:
# Imports
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from load_data import load_bars
from ma_crossover_feature_engine import ForexFeatureEngine
from trend_scanning import trend_scanning_labels
warnings.filterwarnings('ignore')

## 1. Load Data

In [11]:
# Configuration: which instrument and bar type to read.
SYMBOL = 'EURUSD'
BAR_TYPE = 'tick'

print(f"Loading {SYMBOL} {BAR_TYPE} bars...")

# `load_bars` is a project helper; the frame it returns is indexed by bar
# timestamp (see the preview rendered by `df.head()` below).
df = load_bars(SYMBOL, BAR_TYPE)
close = df['close']

# Report how much data was loaded and the period it spans.
bar_count = len(df)
first_bar, last_bar = df.index[0], df.index[-1]
print(f"✓ Loaded {bar_count:,} bars")
print(f"  Date range: {first_bar} to {last_bar}")

# Final expression: rich preview of the first rows.
df.head()

INFO: Loading tick bars from: EURUSD_tick_bars_20251102_113310.csv


Loading EURUSD tick bars...


INFO: Loaded 686,033 tick bars
INFO:   Start: 2023-01-02 07:33:51.458001
INFO:   End: 2025-10-31 22:58:59.181001
INFO:   Columns: ['open', 'high', 'low', 'close', 'tick_volume', 'bid_open', 'bid_high', 'bid_low', 'bid_close', 'ask_open', 'ask_high', 'ask_low', 'ask_close']
INFO:   Start: 2023-01-02 07:33:51.458001
INFO:   End: 2025-10-31 22:58:59.181001
INFO:   Columns: ['open', 'high', 'low', 'close', 'tick_volume', 'bid_open', 'bid_high', 'bid_low', 'bid_close', 'ask_open', 'ask_high', 'ask_low', 'ask_close']


✓ Loaded 686,033 bars
  Date range: 2023-01-02 07:33:51.458001 to 2025-10-31 22:58:59.181001


Unnamed: 0_level_0,open,high,low,close,tick_volume,bid_open,bid_high,bid_low,bid_close,ask_open,ask_high,ask_low,ask_close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-01-02 07:33:51.458001,1.070205,1.070205,1.069125,1.069415,60,1.07017,1.07017,1.06909,1.06938,1.07024,1.07024,1.06916,1.06945
2023-01-02 07:38:30.862001,1.069445,1.069725,1.069355,1.069545,60,1.06941,1.06969,1.06932,1.06951,1.06948,1.06976,1.06939,1.06958
2023-01-02 07:44:08.259001,1.069535,1.069725,1.069525,1.069575,60,1.0695,1.06969,1.06949,1.06954,1.06957,1.06976,1.06956,1.06961
2023-01-02 07:48:52.326001,1.069545,1.069755,1.069495,1.069705,60,1.06951,1.06972,1.06946,1.06967,1.06958,1.06979,1.06953,1.06974
2023-01-02 07:52:47.369001,1.069725,1.070045,1.069485,1.069835,60,1.06969,1.07001,1.06945,1.0698,1.06976,1.07008,1.06952,1.06987


In [12]:
# Build the feature engine for the configured pair.
engine = ForexFeatureEngine(pair_name=SYMBOL)

# Arguments forwarded to the engine, collected in one place so they are easy to tune.
# NOTE(review): the input is tick bars but `timeframe` is 'M5' — confirm the
# engine interprets this as intended.
feature_args = dict(
    price_data=df,
    timeframe='M5',        # timeframe string (e.g. 'M1', 'M5', 'M15', 'H1')
    lr_period=(5, 100),    # trend-scanning window range
    additional_pairs=None, # no additional pairs are supplied
)

print("Generating features with ForexFeatureEngine...")
features = engine.calculate_all_features(**feature_args)

# Report the size of the generated matrix.
print(f"✓ Generated {len(features.columns)} features")
print(f"  Shape: {features.shape}")

Generating features with ForexFeatureEngine...
Memory usage reduced from 500.27 MB to 223.52 MB (55.3% reduction)
Memory usage reduced from 500.27 MB to 223.52 MB (55.3% reduction)
✓ Generated 95 features
  Shape: (686033, 95)
✓ Generated 95 features
  Shape: (686033, 95)


## 2. Clean and Prepare Features

In [13]:
# Summarise the cleaning the engine is stated to perform, then confirm the
# two properties this notebook can verify directly: no NaN and no inf values.
# NOTE(review): the ffill / shift-by-1 statements describe ForexFeatureEngine
# behaviour that is not checked here — confirm against the engine source.
print("\n".join([
    "Features already cleaned by ForexFeatureEngine:",
    "  - Forward filled (ffill)",
    "  - Shifted by 1 to prevent lookahead",
    "  - Inf values replaced with NaN and filled with 0",
]))
print(f"\n✓ Feature matrix ready")
print(f"  Shape: {features.shape}")

# Total NaN cells across the whole frame.
missing_total = features.isnull().sum().sum()
print(f"  Missing values: {missing_total}")

# np.isinf only applies to numeric data, so restrict to numeric columns first.
numeric_features = features.select_dtypes(include=[np.number])
inf_total = np.isinf(numeric_features).sum().sum()
print(f"  Inf values: {inf_total}")

Features already cleaned by ForexFeatureEngine:
  - Forward filled (ffill)
  - Shifted by 1 to prevent lookahead
  - Inf values replaced with NaN and filled with 0

✓ Feature matrix ready
  Shape: (686033, 95)
  Missing values: 0
  Inf values: 0
  Inf values: 0


## 3. Feature Summary

In [14]:
# Print a summary of the feature matrix (size, date range, per-category column
# counts, memory, missing values) and display descriptive statistics for the
# trend-scanning columns.
print("\n" + "="*60)
print("TREND-SCANNING FEATURE MATRIX SUMMARY")
print("="*60)
print(f"Total features: {len(features.columns)}")
print(f"Total observations: {len(features):,}")
print(f"Date range: {features.index[0]} to {features.index[-1]}")

# Categories are assigned by prefix/substring matching on the column name.
# The rules are not mutually exclusive, so one column can be counted in more
# than one category and the counts may add up to more than the column total.
print(f"\nFeature categories:")
categories = {
    'Moving Averages': sum(1 for c in features.columns if c.startswith('ma_') or 'ribbon' in c),
    'Trend-Scanning (LR)': sum(1 for c in features.columns if c.startswith('trend_')),
    'Volatility': sum(1 for c in features.columns if any(x in c for x in ['vol', 'atr'])),
    'Bollinger Bands': sum(1 for c in features.columns if c.startswith('bb_')),
    'ADX/Directional': sum(1 for c in features.columns if any(x in c for x in ['adx', 'dmp', 'dmn'])),
    'Momentum': sum(1 for c in features.columns if any(x in c for x in ['roc', 'momentum', 'efficiency'])),
    'Time Features': sum(1 for c in features.columns if any(x in c for x in ['hour', 'day', 'session'])),
    'Market Structure': sum(1 for c in features.columns if any(x in c for x in ['fractal', 'doji', 'hammer', 'inside', 'outside', 'near_'])),
    'Risk Metrics': sum(1 for c in features.columns if any(x in c for x in ['var', 'cvar', 'stress', 'drawdown', 'skew', 'kurt'])),
    'Price Position': sum(1 for c in features.columns if 'price_above' in c or 'hl_range' in c),
    'Trend Persistence': sum(1 for c in features.columns if 'persistence' in c or 'hh_ll' in c),
}

for cat, count in categories.items():
    print(f"  {cat}: {count}")

print(f"\nMemory usage: {features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Missing values: {features.isnull().sum().sum():,}")

# Display statistics for key trend features
print(f"\nKey Trend-Scanning Feature Statistics:")
trend_cols = [col for col in features.columns if col.startswith('trend_')]

# Jupyter only renders a bare expression when it is the last top-level
# statement of the cell; inside an `if` body the table was silently discarded.
# Keeping the guard as a conditional expression shows the table when there are
# trend columns and shows nothing (None) otherwise.
features[trend_cols].describe() if trend_cols else None


TREND-SCANNING FEATURE MATRIX SUMMARY
Total features: 95
Total observations: 686,033
Date range: 2023-01-02 07:33:51.458001 to 2025-10-31 22:58:59.181001

Feature categories:
  Moving Averages: 14
  Trend-Scanning (LR): 6
  Volatility: 15
  Bollinger Bands: 5
  ADX/Directional: 6
  Momentum: 5
  Time Features: 24
  Market Structure: 12
  Risk Metrics: 6
  Price Position: 5
  Trend Persistence: 2

Memory usage: 340.63 MB
Missing values: 0

Key Trend-Scanning Feature Statistics:

TREND-SCANNING FEATURE MATRIX SUMMARY
Total features: 95
Total observations: 686,033
Date range: 2023-01-02 07:33:51.458001 to 2025-10-31 22:58:59.181001

Feature categories:
  Moving Averages: 14
  Trend-Scanning (LR): 6
  Volatility: 15
  Bollinger Bands: 5
  ADX/Directional: 6
  Momentum: 5
  Time Features: 24
  Market Structure: 12
  Risk Metrics: 6
  Price Position: 5
  Trend Persistence: 2

Memory usage: 340.63 MB
Missing values: 0

Key Trend-Scanning Feature Statistics:


In [20]:
# Quick data-quality pass: infinite values, NaN counts per column, and a
# transposed statistics table.
print("Data quality checks:")

# Infinite values (np.isinf is only defined for numeric columns).
numeric_features = features.select_dtypes(include=[np.number])
inf_count = np.isinf(numeric_features).sum().sum()
print(f"  Infinite values: {inf_count}")

# Per-column NaN counts, keeping only the affected columns, worst first.
nan_counts = features.isnull().sum()
nan_cols = nan_counts[nan_counts > 0].sort_values(ascending=False)
if nan_cols.empty:
    print("  ✓ No NaN values")
else:
    print(f"\n  Columns with NaN values (top 10):")
    print(nan_cols.head(10))

# One row per feature; final expression so the notebook renders the table.
print(f"\nFeature statistics:")
features.describe().transpose().head(90)

Data quality checks:
  Infinite values: 0
  ✓ No NaN values

Feature statistics:
  ✓ No NaN values

Feature statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ma_10,686033.0,1.100525,0.038581,0.0,1.077005,1.090996,1.132458,1.191034
ma_20,686033.0,1.100508,0.038809,0.0,1.077006,1.090994,1.132470,1.190642
ma_50,686033.0,1.100460,0.039482,0.0,1.077012,1.090992,1.132480,1.189997
ma_100,686033.0,1.100379,0.040580,0.0,1.077029,1.090998,1.132501,1.189486
ma_200,686033.0,1.100217,0.042691,0.0,1.077026,1.090942,1.132556,1.187964
...,...,...,...,...,...,...,...,...
inside_bar,686033.0,0.102919,0.303854,0.0,0.000000,0.000000,0.000000,1.000000
outside_bar,686033.0,0.085920,0.280246,0.0,0.000000,0.000000,0.000000,1.000000
near_recent_high,686033.0,0.916008,0.277375,0.0,1.000000,1.000000,1.000000,1.000000
near_recent_low,686033.0,0.915428,0.278244,0.0,1.000000,1.000000,1.000000,1.000000


## 5. Save Feature Matrix

## 4. Integrity Verification

In [None]:
# Run comprehensive integrity check.
#
# Each status marker is derived from the outcome of its check, so a failing
# check is shown with ⚠ rather than an unconditional ✓, and the closing
# "ready for labeling" line is only printed when the blocking checks pass.
def _mark(ok):
    """Return '✓' when a check passed and '⚠' when it did not."""
    return "✓" if ok else "⚠"


print("="*80)
print("FEATURE INTEGRITY VERIFICATION")
print("="*80)

# Informational lines (nothing to pass or fail here).
print(f"\n✓ Shape: {features.shape[0]:,} rows × {features.shape[1]} columns")
print(f"✓ Date range: {features.index[0]} to {features.index[-1]}")
print(f"✓ Memory usage: {features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Compute every metric once so the marker and the printed value always agree.
missing_total = features.isnull().sum().sum()
inf_total = np.isinf(features.select_dtypes(include=[np.number])).sum().sum()
duplicate_rows = features.duplicated().sum()
duplicate_timestamps = features.index.duplicated().sum()

print(f"\nData Quality:")
print(f"  {_mark(missing_total == 0)} Missing values: {missing_total:,}")
print(f"  {_mark(inf_total == 0)} Infinite values: {inf_total:,}")
print(f"  {_mark(duplicate_rows == 0)} Duplicate rows: {duplicate_rows:,}")
print(f"  {_mark(duplicate_timestamps == 0)} Duplicate timestamps: {duplicate_timestamps:,}")

# Check for constant columns (at most one distinct non-null value).
constant_cols = [col for col, n_unique in features.nunique().items() if n_unique <= 1]
if constant_cols:
    print(f"  ⚠ Constant columns ({len(constant_cols)}): {constant_cols}")
else:
    print(f"  ✓ No constant columns")

# Check for zero-heavy columns (>95% zeros), computed for all columns at once.
zero_pct_by_col = (features == 0).mean() * 100
zero_heavy = [(col, pct) for col, pct in zero_pct_by_col.items() if pct > 95]

if zero_heavy:
    print(f"  ⚠ Columns with >95% zeros ({len(zero_heavy)}):")
    for col, pct in zero_heavy:
        print(f"    - {col}: {pct:.1f}% zeros")
else:
    print(f"  ✓ No zero-heavy columns")

index_is_monotonic = features.index.is_monotonic_increasing
index_is_unique = features.index.is_unique

print(f"\nIndex Integrity:")
print(f"  {_mark(index_is_monotonic)} Monotonic increasing: {index_is_monotonic}")
print(f"  {_mark(index_is_unique)} Unique timestamps: {index_is_unique}")

# Constant / zero-heavy columns and duplicate rows are reported as warnings
# only; missing or infinite values and a broken index block the "ready" message.
blocking_checks_pass = (
    missing_total == 0
    and inf_total == 0
    and duplicate_timestamps == 0
    and index_is_monotonic
    and index_is_unique
)
if blocking_checks_pass:
    print(f"\n✓ Feature matrix ready for labeling")
else:
    print(f"\n⚠ Feature matrix has integrity issues — review the warnings above before labeling")

In [16]:
# Save the feature matrix to CSV, plus a numbered list of its columns.
output_file = 'data/features_trend_scanning.csv'
print(f"Saving feature matrix to {output_file}...")

# Create the target directory first: `to_csv` does not create missing parent
# directories, so a fresh checkout without `data/` would otherwise fail here.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
features.to_csv(output_file)
print(f"✓ Saved {len(features):,} rows × {len(features.columns)} columns")

# Also save feature list for reference
feature_list_file = 'data/feature_list_trend_scanning.txt'
Path(feature_list_file).parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding so the file is written identically on every platform.
with open(feature_list_file, 'w', encoding='utf-8') as f:
    f.write("Trend-Scanning Feature Matrix Columns\n")
    f.write("=" * 60 + "\n\n")
    # One line per column, numbered from 1 in frame order.
    f.writelines(f"{i}. {col}\n" for i, col in enumerate(features.columns, 1))

print(f"✓ Saved feature list to {feature_list_file}")
print("\nTrend-scanning feature matrix generation complete!")
print("Next step: Create training_data_trend_scanning.ipynb to add labels")

Saving feature matrix to data/features_trend_scanning.csv...
✓ Saved 686,033 rows × 95 columns
✓ Saved feature list to data/feature_list_trend_scanning.txt

Trend-scanning feature matrix generation complete!
Next step: Create training_data_trend_scanning.ipynb to add labels
✓ Saved 686,033 rows × 95 columns
✓ Saved feature list to data/feature_list_trend_scanning.txt

Trend-scanning feature matrix generation complete!
Next step: Create training_data_trend_scanning.ipynb to add labels


In [21]:
# Preview the most recent rows of the feature matrix (final expression, so the
# notebook renders it as a table).
PREVIEW_ROWS = 100
features.tail(PREVIEW_ROWS)

Unnamed: 0_level_0,ma_10,ma_20,ma_50,ma_100,ma_200,ma_10_20_cross,ma_20_50_cross,ma_50_200_cross,ma_spread_10_20,ma_spread_20_50,...,inside_bar,outside_bar,near_recent_high,near_recent_low,fractal_trend_strength,fractal_trend_direction,fractal_ma_ratio,fractal_trend_confirmation,distance_to_fractal_resistance,distance_to_fractal_support
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-10-31 19:27:02.364001,1.152521,1.152764,1.153199,1.153257,1.153345,-1.0,-1.0,-1.0,-1.100038,-1.971911,...,0.0,0.0,1.0,1.0,0.15,-1.0,-0.000385,0.0,0.00515,0.05663
2025-10-31 19:28:50.238001,1.152495,1.152715,1.153164,1.153253,1.153331,-1.0,-1.0,-1.0,-0.999428,-2.031038,...,0.0,1.0,1.0,1.0,0.20,-1.0,-0.000248,0.0,0.00515,0.05663
2025-10-31 19:30:19.049001,1.152481,1.152673,1.153135,1.153249,1.153320,-1.0,-1.0,-1.0,-0.854949,-2.052101,...,0.0,0.0,1.0,1.0,0.20,-1.0,-0.000168,0.0,0.00515,0.05663
2025-10-31 19:31:16.704001,1.152464,1.152647,1.153107,1.153244,1.153308,-1.0,-1.0,-1.0,-0.838646,-2.101870,...,0.0,0.0,1.0,1.0,0.20,-1.0,-0.000050,0.0,0.00515,0.05663
2025-10-31 19:32:48.078001,1.152452,1.152627,1.153090,1.153240,1.153297,-1.0,-1.0,-1.0,-0.780515,-2.068645,...,0.0,0.0,1.0,1.0,0.20,-1.0,0.000090,0.0,0.00515,0.05663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-10-31 22:56:19.897001,1.153411,1.153191,1.152915,1.152688,1.152984,1.0,1.0,-1.0,1.052408,1.321729,...,0.0,1.0,1.0,1.0,0.20,1.0,0.000368,0.0,0.00515,0.05663
2025-10-31 22:56:52.872001,1.153458,1.153247,1.152939,1.152701,1.152989,1.0,1.0,-1.0,1.006929,1.461472,...,0.0,0.0,1.0,0.0,0.15,1.0,0.000484,0.0,0.00515,0.05663
2025-10-31 22:57:43.578001,1.153525,1.153298,1.152960,1.152715,1.152995,1.0,1.0,-1.0,1.087727,1.625701,...,0.0,0.0,1.0,1.0,0.15,1.0,0.000387,0.0,0.00515,0.05663
2025-10-31 22:58:27.978001,1.153596,1.153346,1.152987,1.152732,1.153000,1.0,1.0,-1.0,1.204015,1.731420,...,0.0,0.0,1.0,0.0,0.15,1.0,0.000471,0.0,0.00515,0.05663
