# Feature & Target Pipeline
Quick tests and evaluation on new targets/features/models

In [1]:
import pandas as pd
from pathlib import Path
from typing import Optional
from data_pipeline import load_data  # This just loads the data and cleans it
from featureEngineer import FeatureEngineer
from targetEngineer import ExpirationTargetEngineer
from ML_setup import CONFIG
from ML_general_tools import *
from pathlib import Path

print("Imports and configuration ready")

# --- Data -------------------------------------------------------------------
# Load the cleaned history and take the working slice (currently every row).
raw_history = load_data(CONFIG["data"]["path"])
history_slice = raw_history[:]

# --- Feature configuration --------------------------------------------------
# dict(...) makes a shallow copy so edits here never touch CONFIG itself.
feature_params = dict(CONFIG["features"]["params"])
heavy_cache_cfg = CONFIG["features"].get("heavy_cache", {})
heavy_cache_root = Path(heavy_cache_cfg.get("directory", "cache/heavy_features"))

# --- Output locations -------------------------------------------------------
current_output_root_str = CONFIG["output"]["directory"]
current_output_root_path = Path(current_output_root_str)

# Each entry maps a key in `paths` to the CONFIG subdirectory key it is built from.
output_subdirs = CONFIG["output"]["subdirectories"]
paths = {"root": current_output_root_path}
paths.update(
    {
        label: current_output_root_path / output_subdirs[subdir_key]
        for label, subdir_key in (
            ("feature_selection", "features"),
            ("trained_models", "models"),
            ("hpt_studies", "hpt"),
            ("feature_cache", "cache"),
        )
    }
)

# --- Heavy-feature cache detection ------------------------------------------
# Make sure the cache directory exists, then look for versioned payload files.
cache_dir = heavy_cache_root
cache_dir.mkdir(parents=True, exist_ok=True)
# Sorted by path, so the last entry is the one reported below.
cache_files = sorted(cache_dir.glob("heavy_features_v*.pkl"))
cache_ready = len(cache_files) > 0
if not cache_ready:
    print(f"No heavy cache file found in {cache_dir}; initial fit will populate.")
else:
    print(f"Heavy cache ready: {cache_files[-1].name} (total {len(cache_files)}) in {cache_dir}")

## Feature Engineering
# Split the configured params once: both construction sites below then use
# exactly the same arguments, and `feature_params` itself is never mutated.
verbose_flag = feature_params.get("verbose", False)
engineer_kwargs = {k: v for k, v in feature_params.items() if k != "verbose"}
fe = FeatureEngineer(verbose=verbose_flag, **engineer_kwargs)

## cache
# NOTE: this override replaces the file-based `cache_ready` detection computed
# earlier, so a cache load is ALWAYS attempted and `fe.heavy_cache.load()`
# alone decides which branch runs. Set it to False to force a full rebuild.
cache_ready = True

manual_features = None
if cache_ready and fe.heavy_cache.load():
    print("Loaded heavy cache payload from disk; skipping rebuild.")
    # Wire the cached payload into the engineer by hand instead of calling
    # fit(). NOTE(review): this touches private attributes of FeatureEngineer
    # -- confirm it stays in sync with what fit() sets up.
    fe._heavy_payload = fe.heavy_cache.payload
    reference = fe._prepare_reference_frame(history_slice)
    fe._full_reference = reference
    # build_heavy=False: the heavy part is not rebuilt on this path.
    manual_features = fe._compute_all_features(reference, build_heavy=False)
    fe.feature_names_out_ = manual_features.columns.tolist()
    fe._reference_features = manual_features
else:
    print("Heavy cache not available or failed to load; running full fit.")
    # Re-create the engineer before fitting (presumably so a failed cache
    # load cannot leave stale state behind -- TODO confirm).
    fe = FeatureEngineer(verbose=verbose_flag, **engineer_kwargs)
    fe.fit(history_slice)
    manual_features = fe.transform(history_slice)

feature_engineer = fe
# Work on a copy so later cells cannot modify the engineer's own frame.
features = manual_features.copy()

## Standard expiration targets (disabled alternative to the regime targets below)
# target_engineer = ExpirationTargetEngineer(**CONFIG["targets"]["params"])
# target_engineer.fit(features)
# targets = target_engineer.transform(features)

## 2a. Volatility regime target engineering test
from targetEngineer import VolatilityRegimeEngineer

df_train = features.copy()

# Window lengths are expressed in bars; the original notes treat 24 bars as one day.
HOURS_PER_DAY = 24
regime_params = {
    "lookback_window": HOURS_PER_DAY * 3,    # 3 days lookback for vol
    "seasonal_window": HOURS_PER_DAY * 30,   # 30 days to learn patterns
    "forward_window": HOURS_PER_DAY,         # 24h classification
    "trend_std": 1.2,                        # 1.2 daily sigmas
    "jump_std": 3.0,                         # 3.0 daily sigmas (scaled internally for 6h window)
    "jump_speed_window": 6,                  # 6h window for jump detection
    # Hardening parameters
    "trend_min_efficiency": 0.15,            # allows looser/messier trends
    "trend_min_r2": 0.6,                     # requires moderate linear fit
}
regime_engineer = VolatilityRegimeEngineer(**regime_params)

# Fit on the feature frame, then derive the targets from that same frame.
regime_engineer.fit(df_train)
targets = regime_engineer.transform(df_train)

# Inspect how the regimes are distributed.
dist = regime_engineer.get_regime_distribution(df_train)
print(dist)


# Record and show the feature names as produced by the feature engineer.
initial_feature_names = list(features.columns)
print(initial_feature_names)


## Optionally drop the raw price columns from the features (disabled)
# drop_cols = [col for col in ['o', 'h', 'l', 'c', 'volCcy'] if col in features.columns]
# if drop_cols: features = features.drop(columns=drop_cols)


# Join features and targets side by side on the shared index.
combined_df = pd.concat([features, targets], axis=1)


  from .autonotebook import tqdm as notebook_tqdm


Imports and configuration ready
=== Loading .hist_db_1h.csv ===

Initial rows: 53,963

=== FOUND ISSUES (prior to automated fixes) ===

🔴 TEMPORAL: Missing hours: 1 cases
  Missing timestamps sample:
    2025-11-04 13:00:00

🔴 DATA INTEGRITY: Identical consecutive OHLC rows: 174 cases
  Sample cases:
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
    {'o': '7110.10', 'h': '7110.10', 'l': '7110.10', 'c': '7110.10', 'volCcy': '0.00'}
  Affected dates (sample): 2020-01-02, 2020-01-03, 2020-01-04, 2020-01-05, 2020-01-06

=== APPLYING AUTOMATED FIXES ===
ACTION: Resampled/Reindexed to 53964 hourly intervals (was 53963).
ACTION: Forward-filled NaNs after resampling. (5 NaNs potentially filled by ffill).

=== FINAL STATUS (after automated fixes) ===
DataFrame shape post-fixes: (53964, 5) (Original: (53963, 6))
Date range: 2019-10-01 00:00:00 to 2025-11-26 11:00:

In [None]:



# Trim the combined frame: discard a warm-up period at the start and a few
# rows at the end (kept minimal because the dataset is small).
months_to_drop = 1       # calendar months removed from the start
tail_rows_to_drop = 11   # rows removed from the end

# Everything earlier than (first timestamp + N months) is discarded.
first_timestamp = combined_df.index.min()
cutoff = first_timestamp + pd.DateOffset(months=months_to_drop)
print(f"Removing data before {cutoff:%Y-%m-%d} (first {months_to_drop} months)")
keep_mask = combined_df.index >= cutoff
combined_df_clean = combined_df.loc[keep_mask]

# Guarded because .iloc[:-0] would select nothing instead of everything.
if tail_rows_to_drop > 0:
    print(f"Dropping last {tail_rows_to_drop} rows to avoid trailing NaNs")
    combined_df_clean = combined_df_clean.iloc[:-tail_rows_to_drop]

print(f"Rows after cleaning: {len(combined_df_clean)} (from {combined_df_clean.index[0]} to {combined_df_clean.index[-1]})")

# Chronological train/val/test split (80/10/10): rows keep their order, the
# test set takes whatever remains after the first two slices.
train_frac, val_frac = 0.8, 0.1
n_samples = len(combined_df_clean)
train_end = int(n_samples * train_frac)
val_end = train_end + int(n_samples * val_frac)

# Positional row ranges for the three partitions.
train_rows = slice(None, train_end)
val_rows = slice(train_end, val_end)
test_rows = slice(val_end, None)
split_rows = (train_rows, val_rows, test_rows)

# Columns of the cleaned frame that came from `features` / from `targets`.
feature_cols = features.columns.intersection(combined_df_clean.columns)
target_cols = targets.columns.intersection(combined_df_clean.columns)

# Select the columns first, then cut by position, once per partition.
X_train, X_val, X_test = (
    combined_df_clean[feature_cols].iloc[rows] for rows in split_rows
)
y_train, y_val, y_test = (
    combined_df_clean[target_cols].iloc[rows] for rows in split_rows
)

print(f"\nX shapes -> train {X_train.shape}, val {X_val.shape}, test {X_test.shape}")
print(f"y shapes -> train {y_train.shape}, val {y_val.shape}, test {y_test.shape}")

# Quick NaN check on the training features.
# `train_nans` holds one NaN count per feature column.
train_nans = X_train.isna().sum()
if train_nans.sum() > 0:
    # Report how many columns are affected and the worst column, both as an
    # absolute count and as a share of the training rows.
    # (Status glyphs restored from mojibake: they were UTF-8 read as cp1252.)
    print(f"\n⚠ Training features with NaNs: {(train_nans > 0).sum()} columns")
    print(f"  Max NaNs in any column: {train_nans.max()} ({train_nans.max()/len(X_train):.1%})")
else:
    print("\n✓ No NaNs in training features")
