In [3]:
import os
os.chdir("/Users/auguste/Desktop/Dossiers/HEC/Courses/RP/rp-adaptive-ml-trading/data/raw/AAPL")

In [None]:
import pandas as pd
import numpy as np

# === Load and prepare ===
df = pd.read_csv("AAPL_2014_2024_technical_cleaned.csv", parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)

# === Clean column names ===
df.columns = (
    df.columns
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.replace(r"\s+", "_", regex=True)
    .str.strip("_")
)

# === Label construction ===
df['y_up_1d']  = (df['close'].shift(-1) > df['close']).astype(int)
df['y_up_5d']  = (df['close'].shift(-5) > df['close']).astype(int)
df['y_up_20d'] = (df['close'].shift(-20) > df['close']).astype(int)

df['y_ret_1d'] = df['close'].pct_change().shift(-1)
df['y_ret_5d'] = df['close'].pct_change(5).shift(-5)
df['y_vol_5d'] = df['close'].pct_change().rolling(5).std().shift(-5)

# === Intelligent feature matching ===
candidate_keywords = [
    "rsi", "macd", "signal", "hist", "momentum", "money", "obv", "roc",
    "osc", "trix", "williams", "cci", "std", "vol", "schaff", "ultimate",
    "trix", "price", "trend", "acc", "ulcer", "klinger", "psar", "mass"
]

# Heuristic filter: keep features with numeric type and matching any keyword
feature_candidates = [
    col for col in df.columns
    if any(k in col for k in candidate_keywords)
    and df[col].dtype in [np.float64, np.float32, np.int64]
]

# Optional: drop columns with too much missing data
valid_features = [col for col in feature_candidates if df[col].isna().mean() < 0.3]

# Final feature matrix
X = df[["date"] + valid_features].copy()
y = df[["date", "y_up_1d", "y_up_5d", "y_up_20d", "y_ret_1d", "y_ret_5d", "y_vol_5d"]].copy()

# === Diagnostics ===
print(" Patched technical features script.")
print(f"Selected {len(valid_features)} features.")
print(f"Feature matrix shape: {X.shape}")
print(f"Label matrix shape:   {y.shape}")


✅ Patched technical features script.
✔ Selected 51 features.
📐 Feature matrix shape: (2769, 52)
🎯 Label matrix shape:   (2769, 7)
