# Feature & Target Pipeline
Quick tests and evaluation on new targets/features/models

In [8]:
import pandas as pd
from pathlib import Path
from typing import Optional
import time
from data_pipeline import load_data  # This just loads the data and cleans it
from featureEngineer import FeatureEngineer
from targetEngineer import ExpirationTargetEngineer
from ML_setup import CONFIG
from ML_general_tools import *
from pathlib import Path

print("Imports and configuration ready")

# Load the raw history. t0 doubles as the pipeline-wide start time and is read
# again when the total runtime is reported at the end of this cell.
t0 = time.time()
raw_history = load_data(CONFIG["data"]["path"])
load_seconds = time.time() - t0
print(f"Loaded raw data: {raw_history.shape} in {load_seconds:.2f}s")

# Working subset fed to the feature/target builders. `[:]` selects every row;
# narrow the slice (for example raw_history[-3000:]) for quicker experiments.
history_slice = raw_history[:]
print(f"Using slice: {history_slice.shape}")

# Shallow-copy the configured feature parameters so later edits in this
# notebook do not write back into CONFIG's own dict.
feature_params = dict(CONFIG["features"]["params"])
heavy_cache_cfg = CONFIG["features"].get("heavy_cache", {})
heavy_cache_root = Path(heavy_cache_cfg.get("directory", "cache/heavy_features"))

current_output_root_str = CONFIG["output"]["directory"]
current_output_root_path = Path(current_output_root_str)

# Output locations: the root itself plus one entry per configured subdirectory.
paths = {"root": current_output_root_path}
for path_key, subdir_key in (
    ("feature_selection", "features"),
    ("trained_models", "models"),
    ("hpt_studies", "hpt"),
    ("feature_cache", "cache"),
):
    paths[path_key] = current_output_root_path / CONFIG["output"]["subdirectories"][subdir_key]

# Ensure the heavy-feature cache directory exists, then list the cache files in it.
# The listing is sorted by path, so "last" below means last in name order.
cache_dir = heavy_cache_root
cache_dir.mkdir(parents=True, exist_ok=True)
cache_files = sorted(cache_dir.glob("heavy_features_v*.pkl"))
cache_ready = len(cache_files) > 0
if not cache_ready:
    print(f"No heavy cache file found in {cache_dir}; initial fit will populate.")
else:
    print(f"Heavy cache ready: {cache_files[-1].name} (total {len(cache_files)}) in {cache_dir}")

## Feature Engineering
# Forward the configured parameters, but always run with verbose logging on
# (any configured "verbose" entry is filtered out to avoid a duplicate keyword).
fe = FeatureEngineer(verbose=True, **{k: v for k, v in feature_params.items() if k != "verbose"})

## Cache usage
# `cache_ready` comes from the heavy-cache directory scan earlier in this cell.
# Fast path: at least one cache file exists AND the engineer reports a
# successful load. Anything else falls back to a full fit.
manual_features = None
if cache_ready and fe.heavy_cache.load():
    print("\n✓ Using heavy cache (only prev_cycle features cached)")
    print("  Note: Rolling/stateless features still computed on-the-fly")
    t1 = time.time()
    # Wire the loaded payload into the engineer by hand instead of calling fit().
    # NOTE(review): this relies on FeatureEngineer private attributes/methods;
    # confirm it stays in sync with what fit() sets up.
    fe._heavy_payload = fe.heavy_cache.payload
    reference = fe._prepare_reference_frame(history_slice)
    fe._full_reference = reference
    manual_features = fe._compute_all_features(reference, build_heavy=False)
    fe.feature_names_out_ = manual_features.columns.tolist()
    fe._reference_features = manual_features
    print(f"  Features computed in {time.time()-t1:.2f}s -> shape: {manual_features.shape}")
else:
    print("\n⚠ Heavy cache not available; running full fit (slower)")
    t1 = time.time()
    # Drop "verbose" from the shared params dict (it is forced to True below);
    # the popped value itself is not needed.
    feature_params.pop("verbose", None)
    # Start from a fresh engineer so the fit does not reuse the instance on
    # which the cache load was attempted.
    fe = FeatureEngineer(verbose=True, **feature_params)
    fe.fit(history_slice)
    manual_features = fe.transform(history_slice)
    print(f"  Full fit+transform in {time.time()-t1:.2f}s -> shape: {manual_features.shape}")

# Stable aliases used by the rest of the notebook; `features` is a copy so
# downstream edits do not touch the engineer's own frame.
feature_engineer = fe
features = manual_features.copy()

## 2a. Volatility Regime Target Engineering ---
from targetEngineer import VolatilityRegimeEngineer

print("\n--- Building Volatility Regime Targets ---")
t2 = time.time()

# Engineer settings; window lengths are written as multiples of 24 (hours per day).
regime_settings = {
    "lookback_window": 24 * 3,    # 3 days lookback for vol
    "seasonal_window": 24 * 30,   # 30 days to learn patterns
    "forward_window": 24,         # 24h classification
    "trend_std": 1.2,             # 1.2 daily sigmas
    "jump_std": 3.0,              # 3.0 daily sigmas
    "jump_speed_window": 6,       # 6h window for jump detection
}
regime_engineer = VolatilityRegimeEngineer(**regime_settings)

# Fit on the feature frame, then derive the target columns from the same frame.
regime_engineer.fit(features)
targets = regime_engineer.transform(features)
target_seconds = time.time() - t2
print(f"Regime targets built in {target_seconds:.2f}s -> shape: {targets.shape}")

# Class balance of the regime labels.
# NOTE(review): the recorded run shows ~341s for target construction and ~689s
# for the whole pipeline, which suggests this call repeats the regime
# computation; if so, counting labels from `targets` would be cheaper - confirm
# against VolatilityRegimeEngineer before changing.
dist = regime_engineer.get_regime_distribution(features)
print("\nRegime distribution:")
print(dist)

# Features and targets side by side, aligned on the index.
combined_df = pd.concat([features, targets], axis="columns")
print(f"\nCombined shape: {combined_df.shape}")
print(f"Total pipeline time: {time.time()-t0:.2f}s")


Imports and configuration ready
=== Loading .hist_db_1h.csv ===

Initial rows: 53,963

=== FOUND ISSUES (prior to automated fixes) ===

🔴 TEMPORAL: Missing hours: 1 cases
  Missing timestamps sample:
    2025-11-04 13:00:00

🔴 DATA INTEGRITY: Identical consecutive OHLC rows: 174 cases
  Sample cases:
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
  Affected dates (sample): 2020-01-02, 2020-01-03, 2020-01-04, 2020-01-05, 2020-01-06

=== APPLYING AUTOMATED FIXES ===
ACTION: Resampled/Reindexed to 53964 hourly intervals (was 53963).
ACTION: Forward-filled NaNs after resampling. (5 NaNs potentially filled by ffill).

=== FINAL STATUS (after automated fixes) ===
DataFrame shape post-fixes: (53964, 5) (Original: (53963, 6))
Date range: 2019-10-01 00:00:00 to 2025-11-26 11:00:

  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df[f"{vol_feat}_x_tte_sqrt"] = df[vol_feat] * tte_sqrt
  df[f"{vol_feat}_x_tte"] = df[vol_feat] * tte
  df[f"{vol_feat}_x_tte_sq"] = df[vol_feat] * (tte_normalized ** 2)
  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df["vol_term_x_tte_sqrt"] = vol_term_slope * tte_sqrt  # Black-Scholes scaling
  df["vol_term_x_tte"] = vol_term_slope * tte
  df["vol_term_x_tte_sq"] = vol_term_slope * (tte_normalized ** 2)
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] *

[FeatureEngineer] feature build complete; rows=53964, cols=450, total=10.64s [stateless:234.9ms, merge_stateless:4.3ms, temporal:12.6ms, rolling:762.1ms, prev_week_cycle:44.5ms, current_cycle:9125.5ms, non_linear:250.3ms, custom_interactions:27.6ms, cleanup:177.9ms]
  Features computed in 10.64s -> shape: (53964, 450)

--- Building Volatility Regime Targets ---
Regime targets built in 341.23s -> shape: (53964, 6)
Regime targets built in 341.23s -> shape: (53964, 6)

Regime distribution:
regime_label
0    43673
1     6364
2     3795
Name: count, dtype: Int64

Combined shape: (53964, 456)
Total pipeline time: 689.30s

Regime distribution:
regime_label
0    43673
1     6364
2     3795
Name: count, dtype: Int64

Combined shape: (53964, 456)
Total pipeline time: 689.30s


In [None]:
import pickle
from pathlib import Path

# Define cache paths
# NOTE(review): this cell needs `paths` and `time` from the first cell, even
# though the printed instructions below suggest skipping that cell - confirm
# the intended workflow.
cache_root = paths["root"]
cache_root.mkdir(parents=True, exist_ok=True)

feature_cache = cache_root / "features_cache.pkl"
target_cache = cache_root / "targets_cache.pkl"
combined_cache = cache_root / "combined_cache.pkl"

# Option 1: Load from cache if exists
FORCE_REBUILD = False  # Set to True to rebuild from scratch

# The load path reads all three pickles, so all three must be present;
# otherwise a missing combined cache would fail with FileNotFoundError
# halfway through loading.
cache_complete = all(
    cache_file.exists() for cache_file in (feature_cache, target_cache, combined_cache)
)

if not FORCE_REBUILD and cache_complete:
    print("=" * 60)
    print("Loading cached features and targets...")
    t_load = time.time()

    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(feature_cache, 'rb') as f:
        features = pickle.load(f)
    with open(target_cache, 'rb') as f:
        targets = pickle.load(f)
    with open(combined_cache, 'rb') as f:
        combined_df = pickle.load(f)

    print(f"✓ Loaded from cache in {time.time()-t_load:.2f}s")
    print(f"  Features: {features.shape}")
    print(f"  Targets: {targets.shape}")
    print(f"  Combined: {combined_df.shape}")
    print(f"  Date range: {features.index[0]} to {features.index[-1]}")
    print("=" * 60)

else:
    print("=" * 60)
    print("Cache not found or FORCE_REBUILD=True - will save after first run")
    print("To use cache next time:")
    print("  1. Run the first cell with history_slice = raw_history[:]")
    print("  2. Wait for features/targets to compute")
    print("  3. This cell will save them")
    print("  4. Next time, set FORCE_REBUILD=False and skip the first cell")
    print("=" * 60)

    # Save the current run to cache. All three frames are written, so all three
    # must already exist; checking up front avoids a NameError after some of
    # the files have been written.
    results_available = all(
        name in globals() for name in ("features", "targets", "combined_df")
    )
    if results_available:
        print("\nSaving current features and targets to cache...")
        t_save = time.time()

        with open(feature_cache, 'wb') as f:
            pickle.dump(features, f)
        with open(target_cache, 'wb') as f:
            pickle.dump(targets, f)
        with open(combined_cache, 'wb') as f:
            pickle.dump(combined_df, f)

        print(f"✓ Saved to cache in {time.time()-t_save:.2f}s")
        print(f"  Location: {cache_root}")
    else:
        print("⚠ No features/targets to save yet - run the first cell first")


Cache not found or FORCE_REBUILD=True - will save after first run
To use cache next time:
  1. Run the first cell with history_slice = raw_history[:]
  2. Wait for features/targets to compute
  3. This cell will save them
  4. Next time, set FORCE_REBUILD=False and skip the first cell

Saving current features and targets to cache...
✓ Saved to cache in 0.36s
  Location: research_vol
✓ Saved to cache in 0.36s
  Location: research_vol


In [None]:
# Comprehensive NaN analysis in features
banner = "=" * 70
print(banner, "NaN Analysis in Features", banner, sep="\n")

# 1. Overall NaN statistics
# Per-column NaN counts; nan_features keeps only the affected columns, worst first.
nan_counts = features.isna().sum()
has_nans = nan_counts > 0
nan_features = nan_counts.loc[has_nans].sort_values(ascending=False)

print(f"\nTotal features: {features.shape[1]}")
print(f"Features with NaNs: {nan_features.size}")
print(f"Total rows: {features.shape[0]}")

# 2. Group NaN features by prefix to identify source
print(f"\n{'=' * 70}")
print("NaN Features Grouped by Source:")
print("=" * 70)

# Map each prefix to a list of (feature name, NaN count) pairs.
feature_groups = {}
for feat in nan_features.index:
    # The text before the first underscore names the group; feature names
    # without an underscore are collected under 'other'.
    prefix = feat.split('_')[0] if '_' in feat else 'other'
    feature_groups.setdefault(prefix, []).append((feat, nan_counts[feat]))


def _group_nan_total(group_item):
    """Return the summed NaN count of one (prefix, [(feature, count), ...]) item."""
    return sum(count for _, count in group_item[1])


# Sort groups by total NaN count, largest first
sorted_groups = sorted(feature_groups.items(), key=_group_nan_total, reverse=True)

for prefix, features_list in sorted_groups[:10]:  # Top 10 groups
    total_nans = sum(count for _, count in features_list)
    print(f"\n{prefix.upper()} features: {len(features_list)} features, {total_nans:,} total NaNs")
    # Show the five worst features within each group
    worst_first = sorted(features_list, key=lambda x: x[1], reverse=True)
    for feat, count in worst_first[:5]:
        pct = (count / len(features)) * 100
        print(f"  {feat:50s} {count:6,} NaNs ({pct:5.2f}%)")

# 3. Analyze NaN patterns (start/middle/end)
print("\n" + "=" * 70)
print("NaN Location Analysis (Top 10 worst features):")
print("=" * 70)

# For each of the ten features with the most NaNs, split the NaN count into
# leading (before the first valid value), trailing (after the last valid
# value) and interior NaNs.
for feat in nan_features.head(10).index:
    series = features[feat]
    nan_mask = series.isna()

    # Row positions (not index labels) of the non-NaN entries. Working with
    # positions keeps the counts correct even if index labels repeat.
    valid_positions = (~nan_mask).to_numpy().nonzero()[0]
    if len(valid_positions) == 0:
        print(f"\n{feat}: ALL NaNs!")
        continue

    first_valid = series.index[valid_positions[0]]
    last_valid = series.index[valid_positions[-1]]

    # Every row before the first valid position is a leading NaN, and every row
    # after the last valid position is a trailing NaN. No "- 1" correction is
    # needed: the valid rows themselves are not NaN and never enter the count.
    nan_total = int(nan_mask.sum())
    start_nans = int(valid_positions[0])
    end_nans = int(len(series) - 1 - valid_positions[-1])
    middle_nans = nan_total - start_nans - end_nans

    print(f"\n{feat}:")
    print(f"  Total NaNs: {nan_total:,} ({nan_total/len(features)*100:.2f}%)")
    print(f"  Start NaNs: {start_nans:,} (before {first_valid})")
    print(f"  Middle NaNs: {middle_nans:,}")
    print(f"  End NaNs: {end_nans:,} (after {last_valid})")

# 4. Check specific feature types that are expected
print(f"\n{'=' * 70}")
print("Expected NaN Sources (prev_weekend, empirical, etc.):")
print("=" * 70)

# Pick NaN-carrying features by substring of their column name.
prev_weekend_features = [
    f for f in nan_features.index
    if any(tag in f for tag in ('prev_saturday', 'prev_sunday'))
]
empirical_features = [f for f in nan_features.index if 'emp_' in f]
prev_weekday_features = [f for f in nan_features.index if 'prev_weekday' in f]

# One report per family: the number of matches, then up to five examples.
for heading, matched in (
    ("\nprev_saturday/sunday features with NaNs: ", prev_weekend_features),
    ("\nemp_ (empirical) features with NaNs: ", empirical_features),
    ("\nprev_weekday features with NaNs: ", prev_weekday_features),
):
    print(f"{heading}{len(matched)}")
    for feat in matched[:5]:
        print(f"  {feat}: {nan_counts[feat]:,} NaNs")

# 5. Check which rows have NaNs
print(f"\n{'=' * 70}")
print("Row-wise NaN Analysis:")
print("=" * 70)

# True for every row that has a NaN in at least one feature column.
rows_with_nans = features.isna().any(axis=1)
nan_row_total = rows_with_nans.sum()
print(f"Rows with ANY NaNs: {nan_row_total:,} / {len(features):,} ({nan_row_total/len(features)*100:.2f}%)")

# Show first and last rows with NaNs
nan_row_indices = features.index[rows_with_nans]
if len(nan_row_indices) > 0:
    print(f"First row with NaNs: {nan_row_indices[0]}")
    print(f"Last row with NaNs: {nan_row_indices[-1]}")

    # Length of the unbroken run of NaN rows at each end of the frame.
    # The running count of clean rows stays at 0 until the first clean row,
    # so the number of zeros equals the length of the leading NaN run; the
    # same count on the reversed series gives the trailing run.
    clean_rows = ~rows_with_nans
    consecutive_start = int(clean_rows.cumsum().eq(0).sum())
    consecutive_end = int(clean_rows.iloc[::-1].cumsum().eq(0).sum())

    print(f"Consecutive NaN rows at start: {consecutive_start}")
    print(f"Consecutive NaN rows at end: {consecutive_end}")


  features[100:][nan_mask]


Unnamed: 0,o,h,l,c,volCcy,time_to_exp1_hr,time_elapsed,hour,day_of_week,is_weekend,...,prev_saturday_range_x_vol,prev_sunday_range_x_vol,weekday_vs_saturday_prog,weekday_vs_sunday_prog,prev_cycle_progress_x_hour,extreme_range_vol,skew_vol_extreme,kurtosis_vol_extreme,distance_vol_extreme,vol_surprise_clustering
2019-10-05 04:00:00,8149.4,8156.7,8139.9,8139.9,370.0,3.0,21.0,5,5,1,...,0.005596,0.008488,-0.000906,-0.000241,0.000108,0.003825,-0.195495,-0.766529,0.017159,-4.370304
2019-10-05 05:00:00,8139.9,8140.0,8124.2,8128.0,355.0,2.0,22.0,6,5,1,...,0.005495,0.008589,0.000240,0.000740,-0.000755,0.002073,-0.197283,-0.751756,0.015109,-3.933097
2019-10-05 06:00:00,8128.0,8145.4,8115.3,8142.3,615.0,1.0,23.0,7,5,1,...,0.005467,0.008732,-0.000522,0.000732,-0.000281,0.002731,-0.163173,-0.746458,0.015558,-3.672246
2019-10-05 07:00:00,8142.3,8142.3,8112.6,8123.3,844.0,24.0,0.0,8,5,1,...,0.000000,0.000000,0.000000,0.000000,-0.000000,0.004386,-0.164850,-0.763307,0.014923,-1.011547
2019-10-05 08:00:00,8123.3,8123.3,8081.0,8102.8,1864.0,23.0,1.0,9,5,1,...,0.001047,0.001030,0.000259,-0.000048,0.000060,0.006320,-0.182369,-0.680221,0.015917,0.973009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-18 14:00:00,16739.7,16739.7,16739.7,16739.7,0.0,17.0,7.0,15,6,1,...,0.001108,0.001329,-0.000538,0.000193,0.000034,0.000000,-0.000000,-0.317772,0.014483,-1.000000
2022-12-18 15:00:00,16739.7,16739.7,16739.7,16739.7,0.0,16.0,8.0,16,6,1,...,0.001227,0.001406,-0.000350,0.000076,-0.000000,0.000000,-0.000000,-0.316647,0.014138,-1.000000
2022-12-18 16:00:00,16739.7,16739.7,16739.7,16739.7,0.0,15.0,9.0,17,6,1,...,0.000804,0.000895,-0.000991,-0.000439,0.000113,0.000000,-0.000000,-0.186966,0.013975,-1.000000
2022-12-18 17:00:00,16739.7,16739.7,16739.7,16739.7,0.0,14.0,10.0,18,6,1,...,0.000404,0.000451,0.000272,0.000524,-0.000120,0.000000,-0.000000,-0.087721,0.013408,-1.000000


In [17]:


# Cleaning knobs: how many leading calendar months to discard here, and how
# many trailing rows to discard afterwards (minimal cleaning for a small dataset).
months_to_drop, tail_rows_to_drop = 1, 11

# Keep rows at or after (earliest index value + months_to_drop calendar months).
first_timestamp = combined_df.index.min()
cutoff = first_timestamp + pd.DateOffset(months=months_to_drop)
print(f"Removing data before {cutoff:%Y-%m-%d} (first {months_to_drop} months)")
rows_to_keep = combined_df.index >= cutoff
combined_df_clean = combined_df.loc[rows_to_keep]

# Count the rows that still carry at least one NaN after the cutoff.
nan_mask_clean = combined_df_clean.isna().any(axis="columns")
remaining_nan_rows = nan_mask_clean.sum()
print(f"NaN rows after cutoff removal: {remaining_nan_rows} / {len(combined_df_clean)}")
combined_df_clean


Removing data before 2019-11-01 (first 1 months)
NaN rows after cutoff removal: 173 / 53220


Unnamed: 0,o,h,l,c,volCcy,time_to_exp1_hr,time_elapsed,hour,day_of_week,is_weekend,...,skew_vol_extreme,kurtosis_vol_extreme,distance_vol_extreme,vol_surprise_clustering,regime_label,max_fwd_z_score,max_jump_z_score,box_std_deseasonalized,box_std_raw,seasonal_vol
2019-11-01 00:00:00,9151.2,9155.8,9115.0,9135.0,1174.0,7.0,17.0,1,4,0,...,1.088773,0.125074,0.121527,-4.691296,0,0.572082,1.144163,0.006387,0.008368,0.005900
2019-11-01 01:00:00,9135.0,9149.0,9108.8,9108.8,987.0,6.0,18.0,2,4,0,...,1.117588,0.127607,0.122305,-3.974554,0,0.641508,1.283017,0.006610,0.008345,0.005686
2019-11-01 02:00:00,9108.9,9148.8,9075.2,9137.3,1655.0,5.0,19.0,3,4,0,...,1.063914,0.100095,0.098497,-5.231596,0,0.461672,0.923345,0.007804,0.008362,0.004826
2019-11-01 03:00:00,9137.3,9137.3,9061.5,9082.9,1315.0,4.0,20.0,4,4,0,...,0.568728,0.023961,0.064775,-2.253028,0,0.640387,1.280775,0.007529,0.008361,0.005001
2019-11-01 04:00:00,9083.0,9123.4,9060.0,9099.7,1496.0,3.0,21.0,5,4,0,...,0.478705,-0.114750,0.069909,-4.428023,0,0.661488,1.322976,0.006719,0.008273,0.005546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-26 07:00:00,87725.2,87900.0,87637.7,87872.6,195.0,24.0,0.0,8,2,0,...,0.013903,-1.149395,0.028068,-1.972040,0,0.467373,0.000000,0.006393,0.005483,0.003863
2025-11-26 08:00:00,87872.7,87881.8,87342.9,87361.7,232.0,23.0,1.0,9,2,0,...,0.014572,-1.115242,0.026972,-6.318691,0,0.249501,0.000000,0.007204,0.005481,0.003427
2025-11-26 09:00:00,87353.5,87396.7,86627.9,86776.2,567.0,22.0,2.0,10,2,0,...,-0.054797,-1.139544,0.030126,-4.989376,0,0.092720,0.000000,0.005668,0.005495,0.004367
2025-11-26 10:00:00,86772.2,86999.9,86595.8,86879.7,195.0,21.0,3.0,11,2,0,...,-0.108033,-0.985923,0.033643,7.130272,0,0.017461,0.000000,0.008198,0.005503,0.003023


In [None]:



# Rebuild the cleaned frame from `combined_df` and `cutoff` (both set in the
# previous cell) before trimming. Trimming `combined_df_clean` in place would
# remove another `tail_rows_to_drop` rows every time this cell is re-run.
combined_df_clean = combined_df.loc[combined_df.index >= cutoff]
if tail_rows_to_drop > 0:
    print(f"Dropping last {tail_rows_to_drop} rows to avoid trailing NaNs")
    combined_df_clean = combined_df_clean.iloc[:-tail_rows_to_drop]

# Fail with a clear message instead of an IndexError on index[0] below.
if len(combined_df_clean) == 0:
    raise ValueError(
        "No rows left after cleaning; reduce months_to_drop or tail_rows_to_drop"
    )

print(f"Rows after cleaning: {len(combined_df_clean)} (from {combined_df_clean.index[0]} to {combined_df_clean.index[-1]})")

# Split into train/val/test (80/10/10) by row position; the test set takes
# whatever remains after the two rounded-down cut points.
# NOTE(review): assumes rows are in chronological order so the split is
# train -> val -> test in time - confirm the index is sorted.
n_samples = len(combined_df_clean)
train_end = int(n_samples * 0.8)
val_end = train_end + int(n_samples * 0.1)

# Get feature and target columns (only those still present in the cleaned frame)
feature_cols = features.columns.intersection(combined_df_clean.columns)
target_cols = targets.columns.intersection(combined_df_clean.columns)

X_all = combined_df_clean[feature_cols]
y_all = combined_df_clean[target_cols]

X_train, X_val, X_test = X_all.iloc[:train_end], X_all.iloc[train_end:val_end], X_all.iloc[val_end:]
y_train, y_val, y_test = y_all.iloc[:train_end], y_all.iloc[train_end:val_end], y_all.iloc[val_end:]

print(f"\nX shapes -> train {X_train.shape}, val {X_val.shape}, test {X_test.shape}")
print(f"y shapes -> train {y_train.shape}, val {y_val.shape}, test {y_test.shape}")

# Quick NaN check on training data
train_nans = X_train.isna().sum()
if train_nans.sum() > 0:
    worst_column_nans = train_nans.max()
    print(f"\n⚠ Training features with NaNs: {(train_nans > 0).sum()} columns")
    print(f"  Max NaNs in any column: {worst_column_nans} ({worst_column_nans/len(X_train):.1%})")
else:
    print("\n✓ No NaNs in training features")
