# Feature & Target Pipeline
Quick tests and evaluation on new targets/features/models

In [4]:
import pandas as pd
from pathlib import Path
from typing import Optional

from data_pipeline import load_data  # This just loads the data and cleans it

from featureEngineer import FeatureEngineer
from targetEngineer import ExpirationTargetEngineer

from ML_setup import CONFIG
from ML_general_tools import *
from pathlib import Path

print("Imports and configuration ready")

# --- Load history -----------------------------------------------------------
# load_data returns the cleaned history; only the last 1000 rows are kept so
# that this notebook stays fast for quick experiments.
raw_history = load_data(CONFIG["data"]["path"])
history_slice = raw_history[-1000:]

# --- Feature configuration --------------------------------------------------
# feature_params is a shallow copy, so later edits do not touch CONFIG itself.
features_cfg = CONFIG["features"]
feature_params = dict(features_cfg["params"])
heavy_cache_cfg = features_cfg.get("heavy_cache", {})
heavy_cache_root = Path(heavy_cache_cfg.get("directory", "cache/heavy_features"))

# --- Output locations -------------------------------------------------------
output_cfg = CONFIG["output"]
current_output_root_str = output_cfg["directory"]
current_output_root_path = Path(current_output_root_str)

# Maps each logical artefact name to the CONFIG sub-directory key that holds it.
output_subdir_keys = (
    ("feature_selection", "features"),
    ("trained_models", "models"),
    ("hpt_studies", "hpt"),
    ("feature_cache", "cache"),
)
paths = {
    "root": current_output_root_path,
    **{
        label: current_output_root_path / output_cfg["subdirectories"][subdir_key]
        for label, subdir_key in output_subdir_keys
    },
}

# --- Heavy-feature cache probe ----------------------------------------------
# Make sure the cache directory exists, then look for previously written payloads.
cache_dir = heavy_cache_root
cache_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): sorting is lexicographic on the path, so "v10" would sort before
# "v2"; confirm the version suffix format if "latest" is meant to be cache_files[-1].
cache_files = sorted(cache_dir.glob("heavy_features_v*.pkl"))
cache_ready = len(cache_files) > 0
if not cache_ready:
    print(f"No heavy cache file found in {cache_dir}; initial fit will populate.")
else:
    print(f"Heavy cache ready: {cache_files[-1].name} (total {len(cache_files)}) in {cache_dir}")

# --- Build the feature matrix ------------------------------------------------
# Either restores the heavy payload from the on-disk cache and recomputes only
# the remaining features, or runs a full fit/transform on history_slice.
#
# NOTE: an earlier revision optionally reused a `feature_engineer` instance left
# in globals() by a previous run of this cell. That path was disabled, so a
# fresh FeatureEngineer is always constructed here.

# Split the constructor arguments once, without mutating feature_params.
fe_verbose = feature_params.get("verbose", False)
fe_kwargs = {k: v for k, v in feature_params.items() if k != "verbose"}

# Set to False to allow the heavy cache detected on disk to be reused.
FORCE_HEAVY_REBUILD = True
if FORCE_HEAVY_REBUILD:
    cache_ready = False  # Force rebuild for testing

fe = None
manual_features = None
heavy_cache_loaded = False
if cache_ready:
    # Only construct an instance for the cache attempt when a cache may be used.
    fe = FeatureEngineer(verbose=fe_verbose, **fe_kwargs)
    heavy_cache_loaded = bool(fe.heavy_cache.load())

if heavy_cache_loaded:
    print("Loaded heavy cache payload from disk; skipping rebuild.")
    # Populate the engineer's internal state by hand from the cached payload.
    # NOTE(review): this relies on private FeatureEngineer attributes and
    # presumably mirrors what fit() would set -- confirm against the class.
    fe._heavy_payload = fe.heavy_cache.payload
    reference = fe._prepare_reference_frame(history_slice)
    fe._full_reference = reference
    manual_features = fe._compute_all_features(reference, build_heavy=False)
    fe.feature_names_out_ = manual_features.columns.tolist()
    fe._reference_features = manual_features
else:
    print("Heavy cache not available or failed to load; running full fit.")
    # Start from a fresh instance so a failed load attempt leaves no state behind.
    fe = FeatureEngineer(verbose=fe_verbose, **fe_kwargs)
    fe.fit(history_slice)
    manual_features = fe.transform(history_slice)

# Expose the fitted engineer and an independent copy of the feature frame.
feature_engineer = fe
features = manual_features.copy()

## Alternative (disabled): standard expiration targets
# target_engineer = ExpirationTargetEngineer(**CONFIG["targets"]["params"])
# target_engineer.fit(features)
# targets = target_engineer.transform(features)

## 2a. Volatility Regime Target Engineering Test ---
from targetEngineer import VolatilityRegimeEngineer

# Work on a copy so the target engineer cannot modify `features` in place.
df_train = features.copy()

# Window lengths below are expressed in rows; the inline notes (carried over
# from the original author) read them as hours.
HOURS_PER_DAY = 24
regime_params = dict(
    lookback_window=HOURS_PER_DAY * 3,    # 3 days lookback for vol
    seasonal_window=HOURS_PER_DAY * 30,   # 30 days to learn patterns
    forward_window=HOURS_PER_DAY,         # 24h classification
    trend_std=1.2,                        # 1.2 daily sigmas
    jump_std=3.0,                         # 3.0 daily sigmas (author's note: scaled internally for 6h window)
    jump_speed_window=6,                  # 6h window for jump detection
    # Hardening parameters
    trend_min_efficiency=0.15,            # Allows looser/messier trends
    trend_min_r2=0.6,                     # Requires moderate linear fit
)
regime_engineer = VolatilityRegimeEngineer(**regime_params)

# Fit on the feature frame, then derive the target columns from the same frame.
regime_engineer.fit(df_train)
targets = regime_engineer.transform(df_train)

# Report how the rows are distributed across regimes.
dist = regime_engineer.get_regime_distribution(df_train)
print(dist)




# --- 2b. Adding Derived Probability Features (from pre-trained models) ---
# derived_proba_config = CONFIG["features"].get("derived_probability_features", {})
# print(f'derived_proba_config: {derived_proba_config}')
# if derived_proba_config.get("enabled", False):
#     print("\n2b. Adding Derived Probability Features")
#     features, newly_added_proba_features = generate_and_add_derived_probability_features(
#         features.copy(),
#         derived_proba_config,
#         paths
#     )
#     if newly_added_proba_features:
#         print(f"  Successfully added derived probability features: {newly_added_proba_features}")
#         # Check for NaNs in each new feature
#         for feat in newly_added_proba_features:
#             if feat in features.columns:
#                 n_nans = features[feat].isna().sum()
#                 print(f"    [NaN check] {feat}: {n_nans} NaNs ({n_nans/len(features):.2%} of rows)")
#             else:
#                 print(f"    [NaN check] {feat}: not found in features DataFrame!")
#     else:
#         print("  No new derived probability features were added (or generation was disabled).")

# Snapshot the feature column names before the targets are appended, and show them.
initial_feature_names = [*features.columns]
print(initial_feature_names)


## Optional (disabled): drop raw price columns from features
# drop_cols = [col for col in ['o', 'h', 'l', 'c'] if col in features.columns]
# if drop_cols: features = features.drop(columns=drop_cols)


# Combine: place the feature and target columns side by side, aligned on the index.
combined_df = pd.concat([features, targets], axis="columns")


Imports and configuration ready
=== Loading .hist_db_1h.csv ===

Initial rows: 53,963

=== FOUND ISSUES (prior to automated fixes) ===

🔴 TEMPORAL: Missing hours: 1 cases
  Missing timestamps sample:
    2025-11-04 13:00:00

🔴 DATA INTEGRITY: Identical consecutive OHLC rows: 174 cases
  Sample cases:
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
  Affected dates (sample): 2020-01-02, 2020-01-03, 2020-01-04, 2020-01-05, 2020-01-06

=== APPLYING AUTOMATED FIXES ===
ACTION: Resampled/Reindexed to 53964 hourly intervals (was 53963).
ACTION: Forward-filled NaNs after resampling. (5 NaNs potentially filled by ffill).

=== FINAL STATUS (after automated fixes) ===
DataFrame shape post-fixes: (53964, 5) (Original: (53963, 6))
Date range: 2019-10-01 00:00:00 to 2025-11-26 11:00:00

In [6]:
# Display the engineered feature frame (rendered below as this cell's output).
features

Unnamed: 0,o,h,l,c,volCcy,time_to_exp1_hr,time_elapsed,hour,day_of_week,is_weekend,...,prev_saturday_range_x_vol,prev_sunday_range_x_vol,weekday_vs_saturday_prog,weekday_vs_sunday_prog,prev_cycle_progress_x_hour,extreme_range_vol,skew_vol_extreme,kurtosis_vol_extreme,distance_vol_extreme,vol_surprise_clustering
2025-10-15 20:00:00,111260.1,111550.8,110508.3,111099.8,329.0,11.0,13.0,21,2,0,...,,,-0.006907,-0.021694,-0.004058,,,,,
2025-10-15 21:00:00,111102.0,111432.7,110463.3,110844.9,283.0,10.0,14.0,22,2,0,...,,,-0.007739,-0.023823,-0.004043,,,,,
2025-10-15 22:00:00,110847.4,111399.0,110653.1,110653.1,177.0,9.0,15.0,23,2,0,...,,,-0.009830,-0.031548,-0.004341,,,,,
2025-10-15 23:00:00,110653.2,111015.7,110577.1,110753.4,73.0,8.0,16.0,0,3,0,...,,,-0.008758,-0.025742,-0.003576,,,,,
2025-10-16 00:00:00,110753.3,110857.6,110366.0,110438.0,198.0,7.0,17.0,1,3,0,...,,,-0.008818,-0.021567,-0.003745,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-26 07:00:00,87725.2,87900.0,87637.7,87872.6,195.0,24.0,0.0,8,2,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.012725,0.013903,-1.149395,0.028068,-1.972040
2025-11-26 08:00:00,87872.7,87881.8,87342.9,87361.7,232.0,23.0,1.0,9,2,0,...,0.001486,0.001143,-0.001789,-0.001249,-0.000759,0.007598,0.014572,-1.115242,0.026972,-6.318691
2025-11-26 09:00:00,87353.5,87396.7,86627.9,86776.2,567.0,22.0,2.0,10,2,0,...,0.001571,0.003979,-0.002210,-0.007621,-0.001622,0.014426,-0.054797,-1.139544,0.030126,-4.989376
2025-11-26 10:00:00,86772.2,86999.9,86595.8,86879.7,195.0,21.0,3.0,11,2,0,...,0.001920,0.003870,-0.004913,-0.007516,-0.003209,0.018931,-0.108033,-0.985923,0.033643,7.130272


In [8]:
# Clean the combined dataframe, then split it in index order into train/val/test.
#
# Cleaning drops the first `months_to_drop` calendar months (measured from the
# earliest index value) and the last `tail_rows_to_drop` rows. The values are
# kept minimal because this notebook works on a small slice of history.
# NOTE(review): the regime targets above use forward_window=24 while only 11
# tail rows are dropped here -- confirm that no NaN targets remain at the end.
months_to_drop = 1  # Only 1 month for small dataset
tail_rows_to_drop = 11

cutoff = combined_df.index.min() + pd.DateOffset(months=months_to_drop)
print(f"Removing data before {cutoff:%Y-%m-%d} (first {months_to_drop} months)")
combined_df_clean = combined_df.loc[combined_df.index >= cutoff]

if tail_rows_to_drop > 0:
    print(f"Dropping last {tail_rows_to_drop} rows to avoid trailing NaNs")
    combined_df_clean = combined_df_clean.iloc[:-tail_rows_to_drop]

# Fail with an explicit message instead of an IndexError on index[0] below.
if combined_df_clean.empty:
    raise ValueError(
        "No rows left after cleaning: reduce months_to_drop/tail_rows_to_drop "
        "or use a longer history slice."
    )

print(f"Rows after cleaning: {len(combined_df_clean)} (from {combined_df_clean.index[0]} to {combined_df_clean.index[-1]})")

# Split into train/val/test (80/10/10); the test set takes whatever rows remain
# after integer truncation of the train and validation sizes.
n_samples = len(combined_df_clean)
train_end = int(n_samples * 0.8)
val_end = train_end + int(n_samples * 0.1)

# Get feature and target columns (only those actually present in the cleaned frame)
feature_cols = features.columns.intersection(combined_df_clean.columns)
target_cols = targets.columns.intersection(combined_df_clean.columns)

X_train = combined_df_clean[feature_cols].iloc[:train_end]
X_val = combined_df_clean[feature_cols].iloc[train_end:val_end]
X_test = combined_df_clean[feature_cols].iloc[val_end:]

y_train = combined_df_clean[target_cols].iloc[:train_end]
y_val = combined_df_clean[target_cols].iloc[train_end:val_end]
y_test = combined_df_clean[target_cols].iloc[val_end:]

print(f"\nX shapes -> train {X_train.shape}, val {X_val.shape}, test {X_test.shape}")
print(f"y shapes -> train {y_train.shape}, val {y_val.shape}, test {y_test.shape}")

# Quick NaN check on training data (per-column NaN counts)
train_nans = X_train.isna().sum()
if train_nans.sum() > 0:
    print(f"\n⚠ Training features with NaNs: {(train_nans > 0).sum()} columns")
    print(f"  Max NaNs in any column: {train_nans.max()} ({train_nans.max()/len(X_train):.1%})")
else:
    print("\n✓ No NaNs in training features")


Removing data before 2025-11-15 (first 1 months)
Dropping last 11 rows to avoid trailing NaNs
Rows after cleaning: 245 (from 2025-11-15 20:00:00 to 2025-11-26 00:00:00)

X shapes -> train (196, 450), val (24, 450), test (25, 450)
y shapes -> train (196, 6), val (24, 6), test (25, 6)

⚠ Training features with NaNs: 55 columns
  Max NaNs in any column: 75 (38.3%)
