# Feature & Target Pipeline
Quick tests and evaluation on new targets/features/models

In [4]:
import pandas as pd
from pathlib import Path
from typing import Optional
import time
from data_pipeline import load_data  # This just loads the data and cleans it
from featureEngineer import FeatureEngineer
from targetEngineer import ExpirationTargetEngineer
from ML_setup import CONFIG
from ML_general_tools import *
from pathlib import Path

print("Imports and configuration ready")

# --- Load the raw history; t0 is reused at the end of the cell for the total runtime ---
t0 = time.time()
data_path = CONFIG["data"]["path"]
raw_history = load_data(data_path)
load_seconds = time.time() - t0
print(f"Loaded raw data: {raw_history.shape} in {load_seconds:.2f}s")

# Work on the trailing 3000 rows only so iterations stay fast;
# switch to raw_history[:] to run on the full history.
history_slice = raw_history[-3000:]
print(f"Using slice: {history_slice.shape}")

# --- Feature parameters and heavy-feature cache location ---
features_cfg = CONFIG["features"]
feature_params = dict(features_cfg["params"])  # shallow copy, CONFIG itself stays untouched
heavy_cache_cfg = features_cfg.get("heavy_cache", {})
heavy_cache_root = Path(heavy_cache_cfg.get("directory", "cache/heavy_features"))

# --- Output directory layout (root plus one entry per configured subdirectory) ---
output_cfg = CONFIG["output"]
current_output_root_str = output_cfg["directory"]
current_output_root_path = Path(current_output_root_str)

output_subdirs = output_cfg["subdirectories"]
paths = {
    "root": current_output_root_path,
    **{
        label: current_output_root_path / output_subdirs[config_key]
        for label, config_key in (
            ("feature_selection", "features"),
            ("trained_models", "models"),
            ("hpt_studies", "hpt"),
            ("feature_cache", "cache"),
        )
    },
}

# --- Probe the heavy-feature cache: create the directory if needed, then list cache files ---
cache_dir = heavy_cache_root
cache_dir.mkdir(parents=True, exist_ok=True)
cache_files = sorted(cache_dir.glob("heavy_features_v*.pkl"))  # sorted by path (lexicographic)
cache_ready = len(cache_files) > 0
if not cache_ready:
    print(f"No heavy cache file found in {cache_dir}; initial fit will populate.")
else:
    print(f"Heavy cache ready: {cache_files[-1].name} (total {len(cache_files)}) in {cache_dir}")

## Feature Engineering
# `verbose` is forced on below, so drop any configured value once here to avoid
# passing the keyword twice. Building a filtered copy (instead of popping from
# feature_params) keeps the shared dict identical whichever branch runs.
fe_kwargs = {k: v for k, v in feature_params.items() if k != "verbose"}
fe = FeatureEngineer(verbose=True, **fe_kwargs)

## Cache usage
cache_ready = bool(cache_files)  # Use actual cache status

manual_features = None
if cache_ready and fe.heavy_cache.load():
    # Fast path: reuse the cached heavy payload and compute the remaining features.
    print("\n✓ Using heavy cache (only prev_cycle features cached)")
    print("  Note: Rolling/stateless features still computed on-the-fly")
    t1 = time.time()
    # NOTE(review): this drives FeatureEngineer through its private attributes and
    # methods; presumably it mirrors what fit() sets up internally - confirm
    # against featureEngineer.py whenever that class changes.
    fe._heavy_payload = fe.heavy_cache.payload
    reference = fe._prepare_reference_frame(history_slice)
    fe._full_reference = reference
    manual_features = fe._compute_all_features(reference, build_heavy=False)
    fe.feature_names_out_ = manual_features.columns.tolist()
    fe._reference_features = manual_features
    print(f"  Features computed in {time.time()-t1:.2f}s -> shape: {manual_features.shape}")
else:
    # Slow path: no usable cache, so run the regular fit + transform.
    print("\n⚠ Heavy cache not available; running full fit (slower)")
    t1 = time.time()
    # Start from a fresh instance, as the original code did, before fitting.
    fe = FeatureEngineer(verbose=True, **fe_kwargs)
    fe.fit(history_slice)
    manual_features = fe.transform(history_slice)
    print(f"  Full fit+transform in {time.time()-t1:.2f}s -> shape: {manual_features.shape}")

feature_engineer = fe
# Copy so later edits to `features` do not touch the frame stored on the engineer.
features = manual_features.copy()

## 2a. Volatility Regime Target Engineering ---
from targetEngineer import VolatilityRegimeEngineer

print("\n--- Building Volatility Regime Targets ---")
t2 = time.time()

# Window lengths are expressed in rows; the day/hour notes assume hourly bars.
hours_per_day = 24
regime_settings = {
    "lookback_window": 3 * hours_per_day,    # 3 days lookback for vol
    "seasonal_window": 30 * hours_per_day,   # 30 days to learn patterns
    "forward_window": hours_per_day,         # 24h classification
    "trend_std": 1.2,                        # 1.2 daily sigmas
    "jump_std": 3.0,                         # 3.0 daily sigmas
    "jump_speed_window": 6,                  # 6h window for jump detection
}
regime_engineer = VolatilityRegimeEngineer(**regime_settings)

# Fit on the feature frame, then derive the target columns from that same frame.
regime_engineer.fit(features)
targets = regime_engineer.transform(features)
target_seconds = time.time() - t2
print(f"Regime targets built in {target_seconds:.2f}s -> shape: {targets.shape}")

# Class balance of the generated regime labels
dist = regime_engineer.get_regime_distribution(features)
print("\nRegime distribution:")
print(dist)

# Features and targets side by side in a single frame
combined_df = pd.concat([features, targets], axis="columns")
print(f"\nCombined shape: {combined_df.shape}")
total_seconds = time.time() - t0
print(f"Total pipeline time: {total_seconds:.2f}s")


Imports and configuration ready
=== Loading .hist_db_1h.csv ===

Initial rows: 53,963

=== FOUND ISSUES (prior to automated fixes) ===

🔴 TEMPORAL: Missing hours: 1 cases
  Missing timestamps sample:
    2025-11-04 13:00:00

🔴 DATA INTEGRITY: Identical consecutive OHLC rows: 174 cases
  Sample cases:
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
  Affected dates (sample): 2020-01-02, 2020-01-03, 2020-01-04, 2020-01-05, 2020-01-06

=== APPLYING AUTOMATED FIXES ===
ACTION: Resampled/Reindexed to 53964 hourly intervals (was 53963).
ACTION: Forward-filled NaNs after resampling. (5 NaNs potentially filled by ffill).

=== FINAL STATUS (after automated fixes) ===
DataFrame shape post-fixes: (53964, 5) (Original: (53963, 6))
Date range: 2019-10-01 00:00:00 to 2025-11-26 11:00:

  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df[f"{vol_feat}_x_tte_sqrt"] = df[vol_feat] * tte_sqrt
  df[f"{vol_feat}_x_tte"] = df[vol_feat] * tte
  df[f"{vol_feat}_x_tte_sq"] = df[vol_feat] * (tte_normalized ** 2)
  df[f"{vol_feat}_x_tte_sin"] = df[vol_feat] * df["tte_phase_sin"]
  df[f"{vol_feat}_x_tte_cos"] = df[vol_feat] * df["tte_phase_cos"]
  df["vol_term_x_tte_sqrt"] = vol_term_slope * tte_sqrt  # Black-Scholes scaling
  df["vol_term_x_tte"] = vol_term_slope * tte
  df["vol_term_x_tte_sq"] = vol_term_slope * (tte_normalized ** 2)
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] * (1 - df["is_weekend"])
  df[f"{vol_feat}_weekend"] = df[vol_feat] * df["is_weekend"]
  df[f"{vol_feat}_weekday"] = df[vol_feat] *

[FeatureEngineer] feature build complete; rows=3000, cols=450, total=0.69s [stateless:34.3ms, merge_stateless:0.6ms, temporal:1.0ms, rolling:98.1ms, prev_week_cycle:3.3ms, current_cycle:472.9ms, non_linear:32.2ms, custom_interactions:23.3ms, cleanup:19.9ms]
  Features computed in 0.69s -> shape: (3000, 450)

--- Building Volatility Regime Targets ---
Regime targets built in 18.67s -> shape: (3000, 6)
Regime targets built in 18.67s -> shape: (3000, 6)

Regime distribution:
regime_label
0    2379
1     385
2     200
Name: count, dtype: Int64

Combined shape: (3000, 456)
Total pipeline time: 38.29s

Regime distribution:
regime_label
0    2379
1     385
2     200
Name: count, dtype: Int64

Combined shape: (3000, 456)
Total pipeline time: 38.29s


In [1]:

# Flag each row of `features` that holds at least one missing value,
# then keep only the flagged rows for inspection.
nan_mask = features.isnull().any(axis="columns")
features_with_nans = features.loc[nan_mask]



NameError: name 'features' is not defined

In [None]:


# Clean combined dataframe - drop first month and last 11 rows (minimal cleaning for small dataset)
months_to_drop = 1  # Only 1 month for small dataset
tail_rows_to_drop = 11

# Drop the leading warm-up period, measured from the first timestamp in the index.
cutoff = combined_df.index.min() + pd.DateOffset(months=months_to_drop)
print(f"Removing data before {cutoff:%Y-%m-%d} (first {months_to_drop} months)")
combined_df_clean = combined_df.loc[combined_df.index >= cutoff]

if tail_rows_to_drop > 0:
    print(f"Dropping last {tail_rows_to_drop} rows to avoid trailing NaNs")
    combined_df_clean = combined_df_clean.iloc[:-tail_rows_to_drop]

# Fail with a clear message instead of an IndexError on index[0] below.
if combined_df_clean.empty:
    raise ValueError(
        "No rows left after cleaning; reduce months_to_drop/tail_rows_to_drop "
        "or use a larger history slice."
    )

print(f"Rows after cleaning: {len(combined_df_clean)} (from {combined_df_clean.index[0]} to {combined_df_clean.index[-1]})")

# Split into train/val/test (80/10/10), in index order with no shuffling;
# the test split takes whatever remains after train and val.
n_samples = len(combined_df_clean)
train_end = int(n_samples * 0.8)
val_end = train_end + int(n_samples * 0.1)

# Get feature and target columns
feature_cols = features.columns.intersection(combined_df_clean.columns)
target_cols = targets.columns.intersection(combined_df_clean.columns)

X_all = combined_df_clean[feature_cols]
y_all = combined_df_clean[target_cols]

X_train = X_all.iloc[:train_end]
X_val = X_all.iloc[train_end:val_end]
X_test = X_all.iloc[val_end:]

y_train = y_all.iloc[:train_end]
y_val = y_all.iloc[train_end:val_end]
y_test = y_all.iloc[val_end:]

print(f"\nX shapes -> train {X_train.shape}, val {X_val.shape}, test {X_test.shape}")
print(f"y shapes -> train {y_train.shape}, val {y_val.shape}, test {y_test.shape}")

# Quick NaN check on training data
train_nans = X_train.isna().sum()
if train_nans.sum() > 0:
    print(f"\n⚠ Training features with NaNs: {(train_nans > 0).sum()} columns")
    print(f"  Max NaNs in any column: {train_nans.max()} ({train_nans.max()/len(X_train):.1%})")
else:
    print("\n✓ No NaNs in training features")

# NaN check on targets, per split.
# NOTE(review): the regime targets are built with forward_window=24 while only
# 11 trailing rows are dropped here, so unlabeled rows may survive at the end of
# the data - confirm and raise tail_rows_to_drop if this check reports any.
for split_name, y_split in (("train", y_train), ("val", y_val), ("test", y_test)):
    rows_missing_target = int(y_split.isna().any(axis=1).sum())
    if rows_missing_target > 0:
        print(f"⚠ {split_name} targets: {rows_missing_target} rows with NaNs")
