# Feature & Target Pipeline
Quick tests and evaluation on new targets/features/models

In [4]:
import pandas as pd
from pathlib import Path
from typing import Optional
import time
from data_pipeline import load_data  # This just loads the data and cleans it
from featureEngineer import FeatureEngineer
from targetEngineer import ExpirationTargetEngineer
from ML_setup import CONFIG
from ML_general_tools import *
from pathlib import Path

print("Imports and configuration ready")

# Build features, targets, and combined dataframe
t0 = time.time()
raw_history = load_data(CONFIG["data"]["path"])
print(f"Loaded raw data: {raw_history.shape} in {time.time()-t0:.2f}s")

# Use slice for faster testing (or use [:] for full data)
history_slice = raw_history[:]  # Last 3000 rows for faster testing
print(f"Using slice: {history_slice.shape}")

feature_params = dict(CONFIG["features"]["params"])
heavy_cache_cfg = CONFIG["features"].get("heavy_cache", {})
heavy_cache_root = Path(heavy_cache_cfg.get("directory", "cache/heavy_features"))

current_output_root_str = CONFIG["output"]["directory"]
current_output_root_path = Path(current_output_root_str)

paths = {
    "root": current_output_root_path,
    "feature_selection": current_output_root_path / CONFIG["output"]["subdirectories"]["features"],
    "trained_models": current_output_root_path / CONFIG["output"]["subdirectories"]["models"],
    "hpt_studies": current_output_root_path / CONFIG["output"]["subdirectories"]["hpt"],
    "feature_cache": current_output_root_path / CONFIG["output"]["subdirectories"]["cache"]
}

cache_dir = heavy_cache_root
cache_dir.mkdir(parents=True, exist_ok=True)
cache_files = sorted(cache_dir.glob("heavy_features_v*.pkl"))
cache_ready = bool(cache_files)
if cache_ready:
    print(f"Heavy cache ready: {cache_files[-1].name} (total {len(cache_files)}) in {cache_dir}")
else:
    print(f"No heavy cache file found in {cache_dir}; initial fit will populate.")

## Feature Engineering
fe = FeatureEngineer(verbose=True, **{k: v for k, v in feature_params.items() if k != "verbose"})

## Cache usage
cache_ready = bool(cache_files)  # Use actual cache status
cache_ready = False
manual_features = None
if cache_ready and fe.heavy_cache.load():
    print("\n‚úì Using heavy cache (only prev_cycle features cached)")
    print("  Note: Rolling/stateless features still computed on-the-fly")
    t1 = time.time()
    fe._heavy_payload = fe.heavy_cache.payload
    reference = fe._prepare_reference_frame(history_slice)
    fe._full_reference = reference
    manual_features = fe._compute_all_features(reference, build_heavy=False)
    fe.feature_names_out_ = manual_features.columns.tolist()
    fe._reference_features = manual_features
    print(f"  Features computed in {time.time()-t1:.2f}s -> shape: {manual_features.shape}")
else:
    print("\n‚ö† Heavy cache not available; running full fit (slower)")
    t1 = time.time()
    verbose_flag = feature_params.pop("verbose", False)
    fe = FeatureEngineer(verbose=True, **feature_params)
    fe.fit(history_slice)
    manual_features = fe.transform(history_slice)
    print(f"  Full fit+transform in {time.time()-t1:.2f}s -> shape: {manual_features.shape}")

feature_engineer = fe
features = manual_features.copy()

## 2a. Volatility Regime Target Engineering ---
from targetEngineer import VolatilityRegimeEngineer

print("\n--- Building Volatility Regime Targets ---")
t2 = time.time()

regime_engineer = VolatilityRegimeEngineer(
    lookback_window=24*3,    # 3 days lookback for vol
    seasonal_window=24*30,   # 30 days to learn patterns
    forward_window=24,       # 24h classification
    trend_std=1.2,           # 1.2 daily sigmas
    jump_std=3.0,            # 3.0 daily sigmas
    jump_speed_window=6,     # 6h window for jump detection
)

regime_engineer.fit(features)
targets = regime_engineer.transform(features)
print(f"Regime targets built in {time.time()-t2:.2f}s -> shape: {targets.shape}")

# Check distribution
dist = regime_engineer.get_regime_distribution(features)
print("\nRegime distribution:")
print(dist)

# Combine
combined_df = pd.concat([features, targets], axis=1)
print(f"\nCombined shape: {combined_df.shape}")
print(f"Total pipeline time: {time.time()-t0:.2f}s")


Imports and configuration ready
=== Loading .hist_db_1h.csv ===

Initial rows: 53,963

=== FOUND ISSUES (prior to automated fixes) ===

üî¥ TEMPORAL: Missing hours: 1 cases
  Missing timestamps sample:
    2025-11-04 13:00:00

üî¥ DATA INTEGRITY: Identical consecutive OHLC rows: 174 cases
  Sample cases:
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
  Affected dates (sample): 2020-01-02, 2020-01-03, 2020-01-04, 2020-01-05, 2020-01-06

=== APPLYING AUTOMATED FIXES ===
ACTION: Resampled/Reindexed to 53964 hourly intervals (was 53963).
ACTION: Forward-filled NaNs after resampling. (5 NaNs potentially filled by ffill).

=== FINAL STATUS (after automated fixes) ===
DataFrame shape post-fixes: (53964, 5) (Original: (53963, 6))
Date range: 2019-10-01 00:00:00 to 2025-11-26 11:00:

  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df[f"{vol_feat}_x_tte_sqrt"] = df[vol_feat] * tte_sqrt
  df[f"{vol_feat}_x_tte"] = df[vol_feat] * tte
  df[f"{vol_feat}_x_tte_sq"] = df[vol_feat] * (tte_normalized ** 2)
  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df["vol_term_x_tte_sqrt"] = vol_term_slope * tte_sqrt  # Black-Scholes scaling
  df["vol_term_x_tte"] = vol_term_slope * tte
  df["vol_term_x_tte_sq"] = vol_term_slope * (tte_normalized ** 2)
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] *

[FeatureEngineer] feature build complete; rows=53964, cols=450, total=173.38s [stateless:251.9ms, merge_stateless:2.4ms, temporal:15.7ms, rolling:642.0ms, prev_week_cycle:163155.7ms, current_cycle:8857.9ms, non_linear:265.1ms, custom_interactions:27.0ms, cleanup:167.2ms]
[FeatureEngineer] fit complete; rows=53964, cols=450, elapsed=173.39s
[FeatureEngineer] transform start; rows=53964


  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df[f"{vol_feat}_x_tte_sqrt"] = df[vol_feat] * tte_sqrt
  df[f"{vol_feat}_x_tte"] = df[vol_feat] * tte
  df[f"{vol_feat}_x_tte_sq"] = df[vol_feat] * (tte_normalized ** 2)
  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df["vol_term_x_tte_sqrt"] = vol_term_slope * tte_sqrt  # Black-Scholes scaling
  df["vol_term_x_tte"] = vol_term_slope * tte
  df["vol_term_x_tte_sq"] = vol_term_slope * (tte_normalized ** 2)
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] *

[FeatureEngineer] feature build complete; rows=53964, cols=450, total=10.51s [stateless:234.7ms, merge_stateless:2.1ms, temporal:12.9ms, rolling:639.1ms, prev_week_cycle:51.9ms, current_cycle:9106.4ms, non_linear:264.7ms, custom_interactions:28.5ms, cleanup:168.7ms]
[FeatureEngineer] transform complete; rows=53964, cols=450, elapsed=10.51s
  Full fit+transform in 183.90s -> shape: (53964, 450)

--- Building Volatility Regime Targets ---
Regime targets built in 340.01s -> shape: (53964, 6)
Regime targets built in 340.01s -> shape: (53964, 6)

Regime distribution:
regime_label
0    43673
1     6364
2     3795
Name: count, dtype: Int64

Combined shape: (53964, 456)
Total pipeline time: 862.56s

Regime distribution:
regime_label
0    43673
1     6364
2     3795
Name: count, dtype: Int64

Combined shape: (53964, 456)
Total pipeline time: 862.56s


In [None]:
import pickle
from pathlib import Path

# Define cache paths
cache_root = paths["root"]
cache_root.mkdir(parents=True, exist_ok=True)

feature_cache = cache_root / "features_cache.pkl"
target_cache = cache_root / "targets_cache.pkl"
combined_cache = cache_root / "combined_cache.pkl"

# Option 1: Load from cache if exists
FORCE_REBUILD = False  # Set to True to rebuild from scratch

if not FORCE_REBUILD and feature_cache.exists() and target_cache.exists():
    print("=" * 60)
    print("Loading cached features and targets...")
    t_load = time.time()
    
    with open(feature_cache, 'rb') as f:
        features = pickle.load(f)
    with open(target_cache, 'rb') as f:
        targets = pickle.load(f)
    with open(combined_cache, 'rb') as f:
        combined_df = pickle.load(f)
    
    print(f"‚úì Loaded from cache in {time.time()-t_load:.2f}s")
    print(f"  Features: {features.shape}")
    print(f"  Targets: {targets.shape}")
    print(f"  Combined: {combined_df.shape}")
    print(f"  Date range: {features.index[0]} to {features.index[-1]}")
    print("=" * 60)
    
else:
    print("=" * 60)
    print("Cache not found or FORCE_REBUILD=True - will save after first run")
    print("To use cache next time:")
    print("  1. Run the first cell with history_slice = raw_history[:]")
    print("  2. Wait for features/targets to compute")
    print("  3. This cell will save them")
    print("  4. Next time, set FORCE_REBUILD=False and skip the first cell")
    print("=" * 60)
    
    # Save the current run to cache
    if 'features' in globals() and 'targets' in globals():
        print("\nSaving current features and targets to cache...")
        t_save = time.time()
        
        with open(feature_cache, 'wb') as f:
            pickle.dump(features, f)
        with open(target_cache, 'wb') as f:
            pickle.dump(targets, f)
        with open(combined_cache, 'wb') as f:
            pickle.dump(combined_df, f)
        
        print(f"‚úì Saved to cache in {time.time()-t_save:.2f}s")
        print(f"  Location: {cache_root}")
    else:
        print("‚ö† No features/targets to save yet - run the first cell first")


Cache not found or FORCE_REBUILD=True - will save after first run
To use cache next time:
  1. Run the first cell with history_slice = raw_history[:]
  2. Wait for features/targets to compute
  3. This cell will save them
  4. Next time, set FORCE_REBUILD=False and skip the first cell

Saving current features and targets to cache...
‚úì Saved to cache in 0.44s
  Location: research_vol
‚úì Saved to cache in 0.44s
  Location: research_vol


In [28]:
# Comprehensive NaN analysis in features
print("=" * 70)
print("NaN Analysis in Features")
print("=" * 70)

# 1. Overall NaN statistics
nan_counts = features.isna().sum()
nan_features = nan_counts[nan_counts > 0].sort_values(ascending=False)

print(f"\nTotal features: {len(features.columns)}")
print(f"Features with NaNs: {len(nan_features)}")
print(f"Total rows: {len(features)}")

# 2. Group NaN features by prefix to identify source
print("\n" + "=" * 70)
print("NaN Features Grouped by Source:")
print("=" * 70)

feature_groups = {}
for feat in nan_features.index:
    # Extract prefix (everything before first underscore or digit)
    if '_' in feat:
        prefix = feat.split('_')[0]
    else:
        prefix = 'other'
    
    if prefix not in feature_groups:
        feature_groups[prefix] = []
    feature_groups[prefix].append((feat, nan_counts[feat]))

# Sort groups by total NaN count
sorted_groups = sorted(feature_groups.items(), 
                       key=lambda x: sum(count for _, count in x[1]), 
                       reverse=True)

for prefix, features_list in sorted_groups[:10]:  # Top 10 groups
    total_nans = sum(count for _, count in features_list)
    print(f"\n{prefix.upper()} features: {len(features_list)} features, {total_nans:,} total NaNs")
    # Show top 5 within each group
    for feat, count in sorted(features_list, key=lambda x: x[1], reverse=True)[:5]:
        pct = (count / len(features)) * 100
        print(f"  {feat:50s} {count:6,} NaNs ({pct:5.2f}%)")

# 3. Analyze NaN patterns (start/middle/end)
print("\n" + "=" * 70)
print("NaN Location Analysis (Top 10 worst features):")
print("=" * 70)

for feat in nan_features.head(10).index:
    series = features[feat]
    nan_mask = series.isna()
    
    # Find first and last valid index
    valid_indices = series[~nan_mask].index
    if len(valid_indices) == 0:
        print(f"\n{feat}: ALL NaNs!")
        continue
    
    first_valid = valid_indices[0]
    last_valid = valid_indices[-1]
    
    # Count NaNs at start, middle, end
    start_nans = nan_mask.loc[:first_valid].sum() - 1  # -1 to exclude first valid
    end_nans = nan_mask.loc[last_valid:].sum() - 1  # -1 to exclude last valid
    middle_nans = nan_mask.sum() - start_nans - end_nans
    
    print(f"\n{feat}:")
    print(f"  Total NaNs: {nan_mask.sum():,} ({nan_mask.sum()/len(features)*100:.2f}%)")
    print(f"  Start NaNs: {start_nans:,} (before {first_valid})")
    print(f"  Middle NaNs: {middle_nans:,}")
    print(f"  End NaNs: {end_nans:,} (after {last_valid})")

# 4. Check specific feature types that are expected
print("\n" + "=" * 70)
print("Expected NaN Sources (prev_weekend, empirical, etc.):")
print("=" * 70)

prev_weekend_features = [f for f in nan_features.index if 'prev_saturday' in f or 'prev_sunday' in f]
empirical_features = [f for f in nan_features.index if 'emp_' in f]
prev_weekday_features = [f for f in nan_features.index if 'prev_weekday' in f]

print(f"\nprev_saturday/sunday features with NaNs: {len(prev_weekend_features)}")
if prev_weekend_features:
    for feat in prev_weekend_features[:5]:
        print(f"  {feat}: {nan_counts[feat]:,} NaNs")

print(f"\nemp_ (empirical) features with NaNs: {len(empirical_features)}")
if empirical_features:
    for feat in empirical_features[:5]:
        print(f"  {feat}: {nan_counts[feat]:,} NaNs")

print(f"\nprev_weekday features with NaNs: {len(prev_weekday_features)}")
if prev_weekday_features:
    for feat in prev_weekday_features[:5]:
        print(f"  {feat}: {nan_counts[feat]:,} NaNs")

# 5. Check which rows have NaNs
print("\n" + "=" * 70)
print("Row-wise NaN Analysis:")
print("=" * 70)

rows_with_nans = features.isna().any(axis=1)
print(f"Rows with ANY NaNs: {rows_with_nans.sum():,} / {len(features):,} ({rows_with_nans.sum()/len(features)*100:.2f}%)")

# Show first and last rows with NaNs
nan_row_indices = features[rows_with_nans].index
if len(nan_row_indices) > 0:
    print(f"First row with NaNs: {nan_row_indices[0]}")
    print(f"Last row with NaNs: {nan_row_indices[-1]}")
    
    # Count consecutive NaNs at start and end
    consecutive_start = 0
    for i in range(len(rows_with_nans)):
        if rows_with_nans.iloc[i]:
            consecutive_start += 1
        else:
            break
    
    consecutive_end = 0
    for i in range(len(rows_with_nans)-1, -1, -1):
        if rows_with_nans.iloc[i]:
            consecutive_end += 1
        else:
            break
    
    print(f"Consecutive NaN rows at start: {consecutive_start}")
    print(f"Consecutive NaN rows at end: {consecutive_end}")


NaN Analysis in Features

Total features: 450
Features with NaNs: 207
Total rows: 53964

NaN Features Grouped by Source:

VOL features: 53 features, 2,697 total NaNs
  vol_gkyz_288h_x_tte_sqrt                              288 NaNs ( 0.53%)
  vol_gkyz_288h_x_tte                                   288 NaNs ( 0.53%)
  vol_gkyz_288h_x_tte_cos                               288 NaNs ( 0.53%)
  vol_gkyz_288h_x_tte_sin                               288 NaNs ( 0.53%)
  vol_gkyz_288h_x_tte_sq                                288 NaNs ( 0.53%)

STOCH features: 7 features, 1,032 total NaNs
  stoch_pos_3h                                          173 NaNs ( 0.32%)
  stoch_pos_6h                                          169 NaNs ( 0.31%)
  stoch_pos_12h                                         162 NaNs ( 0.30%)
  stoch_pos_24h                                         156 NaNs ( 0.29%)
  stoch_pos_288h                                        144 NaNs ( 0.27%)

VLM features: 18 features, 597 total NaNs
  vlm

In [9]:
# Clean combined dataframe - only drop start and end NaNs, preserve middle for debugging
# From analysis: 288 consecutive NaN rows at start, 0 at end

rows_with_nans = combined_df.isna().any(axis=1)

# Find first and last valid rows
valid_rows = ~rows_with_nans
valid_indices = combined_df[valid_rows].index

if len(valid_indices) > 0:
    first_valid = valid_indices[0]
    last_valid = valid_indices[-1]
    
    # Count start and end NaNs
    start_nans = rows_with_nans.loc[:first_valid].sum() - 1
    end_nans = rows_with_nans.loc[last_valid:].sum() - 1
    
    print(f"NaN Summary:")
    print(f"  First valid row: {first_valid}")
    print(f"  Last valid row: {last_valid}")
    print(f"  Start NaNs to drop: {start_nans}")
    print(f"  End NaNs to drop: {max(0, end_nans)}")
    
    # Slice from first valid to last valid (inclusive)
    combined_df_clean = combined_df.loc[first_valid:last_valid].copy()
    
    # Check for middle NaNs (these are preserved for inspection)
    middle_nans = combined_df_clean.isna().any(axis=1).sum()
    
    print(f"\nAfter trimming start/end:")
    print(f"  Rows: {len(combined_df_clean)} (from {combined_df_clean.index[0]} to {combined_df_clean.index[-1]})")
    print(f"  Middle rows with NaNs: {middle_nans}")
    
    if middle_nans > 0:
        print(f"  ‚ö†Ô∏è WARNING: {middle_nans} rows with NaNs in middle - preserved for debugging")
else:
    print("No valid rows found!")
    combined_df_clean = combined_df.iloc[0:0]  # Empty dataframe

combined_df_clean

NaN Summary:
  First valid row: 2019-10-13 00:00:00
  Last valid row: 2025-11-26 10:00:00
  Start NaNs to drop: 287
  End NaNs to drop: 0

After trimming start/end:
  Rows: 53675 (from 2019-10-13 00:00:00 to 2025-11-26 10:00:00)
  Middle rows with NaNs: 97


Unnamed: 0,o,h,l,c,volCcy,time_to_exp1_hr,time_elapsed,hour,day_of_week,is_weekend,...,skew_vol_extreme,kurtosis_vol_extreme,distance_vol_extreme,vol_surprise_clustering,regime_label,max_fwd_z_score,max_jump_z_score,box_std_deseasonalized,box_std_raw,seasonal_vol
2019-10-13 00:00:00,8308.5,8341.3,8289.9,8336.7,718.0,7.0,17.0,1,6,1,...,-0.361831,-0.141242,0.016816,-2.814803,0,0.687067,1.374134,0.006729,0.005135,0.003437
2019-10-13 01:00:00,8336.7,8368.5,8336.7,8349.9,796.0,6.0,18.0,2,6,1,...,-0.325121,-0.212071,0.015524,-0.215035,0,0.901515,1.803030,0.005487,0.005147,0.004225
2019-10-13 02:00:00,8350.0,8358.6,8340.0,8346.9,421.0,5.0,19.0,3,6,1,...,-0.329969,-0.205131,0.014965,0.169210,0,0.720972,1.441945,0.006759,0.005132,0.003420
2019-10-13 03:00:00,8346.9,8348.0,8340.0,8345.0,154.0,4.0,20.0,4,6,1,...,-0.330016,-0.225318,0.015040,-1.322802,0,0.638300,1.276600,0.007561,0.005110,0.003044
2019-10-13 04:00:00,8345.0,8363.4,8341.1,8341.7,576.0,3.0,21.0,5,6,1,...,-0.364801,-0.232202,0.015188,-3.175828,0,0.788462,1.576923,0.006019,0.005107,0.003822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-26 06:00:00,87452.5,87827.3,87383.4,87733.9,338.0,1.0,23.0,7,2,0,...,0.005392,-1.329192,0.032090,-9.314568,0,0.391165,0.000000,0.006814,0.005511,0.003643
2025-11-26 07:00:00,87725.2,87900.0,87637.7,87872.6,195.0,24.0,0.0,8,2,0,...,0.013903,-1.149395,0.028068,-1.972040,0,0.467373,0.000000,0.006393,0.005483,0.003863
2025-11-26 08:00:00,87872.7,87881.8,87342.9,87361.7,232.0,23.0,1.0,9,2,0,...,0.014572,-1.115242,0.026972,-6.318691,0,0.249501,0.000000,0.007204,0.005481,0.003427
2025-11-26 09:00:00,87353.5,87396.7,86627.9,86776.2,567.0,22.0,2.0,10,2,0,...,-0.054797,-1.139544,0.030126,-4.989376,0,0.092720,0.000000,0.005668,0.005495,0.004367


In [7]:
# Analyze which features have the middle NaNs and verify stochastic fix
if 'combined_df_clean' in globals() and len(combined_df_clean) > 0:
    print("=" * 70)
    print("Analyzing Middle NaN Features")
    print("=" * 70)
    
    # Get rows with NaNs
    rows_with_middle_nans = combined_df_clean.isna().any(axis=1)
    
    if rows_with_middle_nans.sum() > 0:
        # Count NaNs per feature
        middle_nan_counts = combined_df_clean.isna().sum()
        features_with_middle_nans = middle_nan_counts[middle_nan_counts > 0].sort_values(ascending=False)
        
        print(f"\nFeatures with middle NaNs: {len(features_with_middle_nans)}")
        print(f"\nTop 20 features by NaN count:")
        for feat, count in features_with_middle_nans.head(20).items():
            pct = (count / len(combined_df_clean)) * 100
            print(f"  {feat:40s} {count:4d} NaNs ({pct:5.2f}%)")
        
        # Group by prefix
        print(f"\nGrouped by feature type:")
        feature_groups = {}
        for feat in features_with_middle_nans.index:
            prefix = feat.split('_')[0] if '_' in feat else 'other'
            if prefix not in feature_groups:
                feature_groups[prefix] = 0
            feature_groups[prefix] += features_with_middle_nans[feat]
        
        for prefix, total_nans in sorted(feature_groups.items(), key=lambda x: x[1], reverse=True):
            print(f"  {prefix:20s} {total_nans:6,} total NaNs")
        
        # Check specifically for stochastic features
        stoch_features_in_nans = [f for f in features_with_middle_nans.index if 'stoch' in f]
        print(f"\nStochastic features with NaNs: {len(stoch_features_in_nans)}")
        if stoch_features_in_nans:
            print("  ‚ö†Ô∏è STOCHASTIC FIX NOT APPLIED! Should be 0.")
            for feat in stoch_features_in_nans:
                print(f"    {feat}: {features_with_middle_nans[feat]} NaNs")
        else:
            print("  ‚úì No stochastic NaNs - fix is working!")
        
        # Show date range of NaN occurrences
        nan_dates = combined_df_clean[rows_with_middle_nans].index
        print(f"\nDate range of middle NaNs:")
        print(f"  First: {nan_dates[0]}")
        print(f"  Last: {nan_dates[-1]}")
        
    else:
        print("\n‚úì No middle NaNs found!")
else:
    print("No data to analyze")

Analyzing Middle NaN Features

Features with middle NaNs: 4

Top 20 features by NaN count:
  regime_label                               96 NaNs ( 0.18%)
  max_fwd_z_score                            96 NaNs ( 0.18%)
  max_jump_z_score                           96 NaNs ( 0.18%)
  vol_ratio_24h_144h                         23 NaNs ( 0.04%)

Grouped by feature type:
  max                     192 total NaNs
  regime                   96 total NaNs
  vol                      23 total NaNs

Stochastic features with NaNs: 0
  ‚úì No stochastic NaNs - fix is working!

Date range of middle NaNs:
  First: 2020-01-05 16:00:00
  Last: 2020-01-09 16:00:00


In [None]:
# Investigate why regime_label has NaNs in Jan 2020
print("=" * 70)
print("Investigating Regime Label NaNs")
print("=" * 70)

# Get the rows with regime NaNs
regime_nans = combined_df_clean['regime_label'].isna()
nan_rows = combined_df_clean[regime_nans]

print(f"\nTotal rows with regime_label NaN: {regime_nans.sum()}")
print(f"Date range: {nan_rows.index[0]} to {nan_rows.index[-1]}")

# Check if this is a continuous block
print(f"\nFirst 10 NaN timestamps:")
for ts in nan_rows.index[:10]:
    print(f"  {ts}")

# Check the raw data around this period
if 'raw_history' in globals():
    print(f"\nChecking raw data around NaN period:")
    check_start = nan_rows.index[0] - pd.Timedelta(hours=24)
    check_end = nan_rows.index[-1] + pd.Timedelta(hours=24)
    
    raw_slice = raw_history.loc[check_start:check_end]
    print(f"  Raw data rows in this period: {len(raw_slice)}")
    print(f"  Expected rows (hourly): {int((check_end - check_start).total_seconds() / 3600)}")
    
    # Check for gaps
    if len(raw_slice) > 0:
        time_diffs = raw_slice.index.to_series().diff()
        gaps = time_diffs[time_diffs > pd.Timedelta(hours=1)]
        if len(gaps) > 0:
            print(f"\n  ‚ö†Ô∏è Found {len(gaps)} time gaps:")
            for gap_time, gap_size in gaps.items():
                print(f"    {gap_time}: {gap_size}")
        else:
            print(f"  ‚úì No time gaps found")

# Check what features look like during this period
print(f"\nFeature values during NaN period (first NaN row):")
first_nan_row = combined_df_clean.loc[nan_rows.index[0]]
print(f"  vol_rs_24h: {first_nan_row.get('vol_rs_24h', 'N/A')}")
print(f"  vol_rs_72h: {first_nan_row.get('vol_rs_72h', 'N/A')}")
print(f"  trend_strength_24h: {first_nan_row.get('trend_strength_24h', 'N/A')}")
print(f"  logret_24h: {first_nan_row.get('logret_24h', 'N/A')}")

In [8]:
# Split into train/val/test (80/10/10)
n_samples = len(combined_df_clean)
train_end = int(n_samples * 0.8)
val_end = train_end + int(n_samples * 0.1)

# Get feature and target columns
feature_cols = features.columns.intersection(combined_df_clean.columns)
target_cols = targets.columns.intersection(combined_df_clean.columns)

X_train = combined_df_clean[feature_cols].iloc[:train_end]
X_val = combined_df_clean[feature_cols].iloc[train_end:val_end]
X_test = combined_df_clean[feature_cols].iloc[val_end:]

y_train = combined_df_clean[target_cols].iloc[:train_end]
y_val = combined_df_clean[target_cols].iloc[train_end:val_end]
y_test = combined_df_clean[target_cols].iloc[val_end:]

print(f"Split sizes:")
print(f"  Train: {len(X_train):,} rows ({len(X_train)/n_samples*100:.1f}%)")
print(f"  Val:   {len(X_val):,} rows ({len(X_val)/n_samples*100:.1f}%)")
print(f"  Test:  {len(X_test):,} rows ({len(X_test)/n_samples*100:.1f}%)")

print(f"\nX shapes -> train {X_train.shape}, val {X_val.shape}, test {X_test.shape}")
print(f"y shapes -> train {y_train.shape}, val {y_val.shape}, test {y_test.shape}")

# Final NaN check on all splits
train_nans = X_train.isna().sum().sum()
val_nans = X_val.isna().sum().sum()
test_nans = X_test.isna().sum().sum()

if train_nans + val_nans + test_nans == 0:
    print("\n‚úì No NaNs in any split - ready for training!")
else:
    print(f"\n‚ö† NaNs found:")
    if train_nans > 0:
        print(f"  Train: {train_nans} NaNs")
    if val_nans > 0:
        print(f"  Val: {val_nans} NaNs")
    if test_nans > 0:
        print(f"  Test: {test_nans} NaNs")

Split sizes:
  Train: 42,940 rows (80.0%)
  Val:   5,367 rows (10.0%)
  Test:  5,368 rows (10.0%)

X shapes -> train (42940, 450), val (5367, 450), test (5368, 450)
y shapes -> train (42940, 6), val (5367, 6), test (5368, 6)

‚ö† NaNs found:
  Train: 23 NaNs
