# Feature Engineering Workbench

**Objective:** This notebook is a laboratory for exploring, visualizing, and testing new features for the Daily Bias prediction models. 

**Workflow:**
1.  Load historical multi-timeframe data for a single symbol.
2.  Apply the existing feature engineering logic (from `train_daily_bias_models.py`).
3.  Visualize the relationships between features and the target bias.
4.  Experiment with creating new, experimental features.
5.  Once a new feature is proven to be valuable, its logic should be copied into the main `engineer_features` function in the production training script.

## 1. Setup and Data Loading

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import pickle
from datetime import datetime, timedelta
import warnings
import alpaca_trade_api as tradeapi
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns

# Add project root to path to allow imports from config.
# NOTE: '__file__' is a quoted string literal here, so abspath() resolves it
# against the current working directory; the three dirname() calls then yield
# the directory two levels above the working directory.
# NOTE(review): presumably quoted because `__file__` is not defined in a
# notebook kernel -- confirm the notebook is always launched from its own folder.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath('__file__')))))
from config.settings import WATCHLIST

# Silence every Python warning for the rest of the session (this also hides
# deprecation warnings from pandas/seaborn) and use seaborn's 'whitegrid' style.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# --- Load API Keys ---
# The .env path is relative to the working directory; os.getenv() returns None
# for any variable that is not set.
load_dotenv(dotenv_path='../../.env')
api_key = os.getenv('ALPACA_API_KEY')
secret_key = os.getenv('ALPACA_SECRET_KEY')

# REST client reused by the data-loading cell; base_url targets the
# paper-api host rather than a live-trading one.
api = tradeapi.REST(
    key_id=api_key,
    secret_key=secret_key,
    base_url='https://paper-api.alpaca.markets'
)

print("Libraries loaded and Alpaca API client initialized.")

In [None]:
# --- Single Symbol Data Loading ---
SYMBOL_TO_ANALYZE = 'SPY' # Change this to analyze different symbols

print(f"Fetching data for {SYMBOL_TO_ANALYZE}...")

# Alpaca's bars endpoint expects RFC 3339 timestamps (which carry a UTC offset)
# or plain YYYY-MM-DD dates. A naive datetime.now().isoformat() has no offset,
# so build both bounds from a single timezone-aware "now" instead.
# NOTE(review): some data subscriptions reject an `end` that is too close to
# the present -- if the request fails for that reason, move `end` back slightly.
now_utc = pd.Timestamp.now(tz='UTC')
end_date = now_utc.isoformat()
start_1d = (now_utc - timedelta(days=5*365)).isoformat()  # roughly five years back

df_daily = api.get_bars(SYMBOL_TO_ANALYZE, '1Day', start=start_1d, end=end_date).df

print(f"Downloaded {len(df_daily)} daily bars.")
df_daily.head()

## 2. Feature Engineering Logic

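This section mirrors the feature engineering functions from the production `train_daily_bias_models.py` script so that findings made here carry over to production. The code is a manual copy: whenever either version changes, update the other so the research and production environments stay in sync.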

In [None]:
def _safe_divide(numerator, denominator, default=0):
    """Divide element-wise, substituting ``default`` where the denominator is 0.

    Args:
        numerator: Scalar or array-like dividend.
        denominator: Scalar or array-like divisor.
        default: Value placed wherever ``denominator`` equals zero.

    Returns:
        numpy.ndarray holding the quotient, with ``default`` in every position
        whose denominator is zero (a 0-d array for scalar inputs). A NaN
        denominator is not equal to zero, so NaN propagates to the result.
    """
    # The quotient is computed for every element before the mask is applied.
    # np.true_divide never raises ZeroDivisionError for plain Python numbers
    # (the `/` operator does), and errstate keeps the throw-away inf/NaN
    # entries from emitting RuntimeWarnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        quotient = np.true_divide(numerator, denominator)
    return np.where(np.not_equal(denominator, 0), quotient, default)

def engineer_features(df):
    """Derive the daily-bias feature columns from an OHLCV bar DataFrame.

    Args:
        df: DataFrame whose columns, once lower-cased, include 'open',
            'high', 'low', 'close' and 'volume'. The input is not modified.

    Returns:
        A new DataFrame with lower-cased column names plus the previous-bar,
        gap, range, volume-ratio, sweep, candle-body/wick and close-position
        columns. Rows containing NaN or +/-inf in any column are dropped, which
        always includes the first row (it has no previous bar).

    NOTE(review): current_range_pct, swept_prev_high/low, body_pct, the wick
    percentages and close_vs_range are computed from the same bar's
    high/low/close -- confirm these are available at prediction time.
    """
    out = df.copy()
    out.columns = [name.lower() for name in out.columns]

    open_ = out['open']
    high = out['high']
    low = out['low']
    close = out['close']
    volume = out['volume']

    # Previous bar's reference prices.
    prev_close = close.shift(1)
    prev_high = high.shift(1)
    prev_low = low.shift(1)
    out['prev_close'] = prev_close
    out['prev_high'] = prev_high
    out['prev_low'] = prev_low

    # Basic & ICT Features
    gap_pct = _safe_divide(open_ - prev_close, prev_close) * 100
    out['gap_pct'] = gap_pct
    out['gap_size_abs'] = np.abs(gap_pct)
    out['prev_day_range_pct'] = _safe_divide(prev_high - prev_low, prev_close) * 100
    out['current_range_pct'] = _safe_divide(high - low, open_) * 100

    # Volume relative to its trailing 5-bar mean (the window includes the current bar).
    trailing_volume = volume.rolling(5, min_periods=1).mean()
    out['volume_ratio_5d'] = _safe_divide(volume, trailing_volume, 1.0)

    # 1 when the bar traded beyond the previous bar's extreme, else 0.
    out['swept_prev_high'] = (high > prev_high).astype(int)
    out['swept_prev_low'] = (low < prev_low).astype(int)

    # Candle anatomy, expressed as percentages of the bar's full range.
    body = (close - open_).abs()
    full_range = high - low
    candle_top = out[['open', 'close']].max(axis=1)
    candle_bottom = out[['open', 'close']].min(axis=1)
    out['body_size'] = body
    out['total_range'] = full_range
    out['body_pct'] = _safe_divide(body, full_range) * 100
    out['upper_wick_pct'] = _safe_divide(high - candle_top, full_range) * 100
    out['lower_wick_pct'] = _safe_divide(candle_bottom - low, full_range) * 100

    # Position of the close inside the range; the range is floored at 1e-10.
    floored_range = np.maximum(full_range, 1e-10)
    out['close_vs_range'] = _safe_divide(close - low, floored_range)

    cleaned = out.replace([np.inf, -np.inf], np.nan)
    return cleaned.dropna()

def define_bias_label(df, threshold=0.5):
    """Attach a 'bias_label' column classifying each bar's open-to-close move.

    Args:
        df: DataFrame with 'open' and 'close' columns.
        threshold: Open-to-close move, in percent, that must be exceeded for a
            bar to count as directional.

    Returns:
        A copy of ``df`` with a 'bias_label' column: 'bullish' when the move is
        above ``threshold``, 'bearish' when it is below ``-threshold`` and
        'choppy' otherwise. The input frame is left untouched, matching how
        engineer_features treats its argument.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    labeled = df.copy()
    daily_move_pct = _safe_divide(labeled['close'] - labeled['open'], labeled['open']) * 100
    conditions = [daily_move_pct > threshold, daily_move_pct < -threshold]
    choices = ['bullish', 'bearish']
    labeled['bias_label'] = np.select(conditions, choices, default='choppy')
    return labeled

# --- Run the feature engineering pipeline ---
# Build the feature columns from the raw daily bars, then attach the
# 'bias_label' target column. Both names are reused by the cells below.
df_features = engineer_features(df_daily)
df_labeled = define_bias_label(df_features)

print("Feature engineering complete.")
# Preview a handful of engineered features next to the label (cell output).
df_labeled[['gap_pct', 'volume_ratio_5d', 'swept_prev_high', 'body_pct', 'bias_label']].head()

## 3. Visualization and Analysis

Let's visualize some of the features we created to understand their relationship with the daily bias.

In [None]:
# Class balance check: how many rows carry each bias label.
# NOTE(review): recent seaborn releases deprecate `palette` without `hue`;
# warnings are silenced in the setup cell, so confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df_labeled, x='bias_label', palette='viridis', ax=ax)
ax.set_title(f'Distribution of Daily Bias Labels for {SYMBOL_TO_ANALYZE}', fontsize=16)
ax.set_xlabel('Bias Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Opening-gap distribution per bias label, with a dashed zero line for reference.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_labeled, x='bias_label', y='gap_pct', palette='coolwarm', ax=ax)
ax.set_title('Gap Percentage vs. Daily Bias', fontsize=16)
ax.axhline(0, color='grey', linestyle='--')
ax.set_xlabel('Bias Label')
ax.set_ylabel('Gap %')
plt.show()

In [None]:
# --- Correlation Heatmap ---
# Numeric feature columns included in the pairwise correlation matrix.
feature_names = [
    'gap_pct',
    'gap_size_abs',
    'prev_day_range_pct',
    'current_range_pct',
    'volume_ratio_5d',
    'swept_prev_high',
    'swept_prev_low',
    'body_pct',
    'upper_wick_pct',
    'lower_wick_pct',
    'close_vs_range',
]

corr_matrix = df_labeled.loc[:, feature_names].corr()

# Annotated heatmap; each cell shows the coefficient to two decimals.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='viridis', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=16)
plt.show()

## 4. Experimental Zone: Creating a New Feature

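This is where you can test ideas for new features. Let's try creating a `consecutive_day_direction` feature: a signed streak counter that records how many bars in a row, up to and including the previous bar, closed in the same direction (positive for up streaks, negative for down streaks, 0 when the previous bar was flat or missing).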

In [None]:
def add_experimental_feature(df):
    """Return a copy of ``df`` with a signed 'consecutive_day_direction' column.

    Args:
        df: DataFrame with 'open' and 'close' columns, one row per bar in
            chronological order. The input is not modified.

    Returns:
        A copy of ``df`` where 'consecutive_day_direction' is the length of the
        current run of same-direction bars ending at the previous row,
        multiplied by that direction (+1 up, -1 down, 0 flat). The first row,
        which has no previous bar, gets 0.
    """
    enriched = df.copy()

    # Sign of each bar's open-to-close move, lagged one row so a row only
    # sees the bar before it.
    candle_sign = np.sign(enriched['close'] - enriched['open'])
    prev_sign = candle_sign.shift(1).fillna(0)

    # A new run starts wherever the lagged sign differs from the row above;
    # the running total of those starts gives each run its own id.
    run_starts = prev_sign.ne(prev_sign.shift())
    run_id = run_starts.cumsum()
    run_length = prev_sign.groupby(run_id).cumcount() + 1

    enriched['consecutive_day_direction'] = run_length * prev_sign
    return enriched

# Layer the experimental column on top of the labeled feature frame.
df_experimental = add_experimental_feature(df_labeled)

# Visualize the new feature: streak distribution per bias label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_experimental,
    x='bias_label',
    y='consecutive_day_direction',
    palette='plasma',
    ax=ax,
)
ax.set_title('Experimental Feature: Consecutive Day Direction vs. Bias', fontsize=16)
ax.set_xlabel('Bias Label')
ax.set_ylabel('Consecutive Days (Positive=Up, Negative=Down)')
plt.show()

# Show the first ten rows of the new column beside the label (cell output).
df_experimental[['consecutive_day_direction', 'bias_label']].head(10)