In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import talib

In [2]:
# Paths
# Directory holding the processed per-market CSV files this notebook reads.
# The path is relative, so it resolves against the notebook's working directory.
PROCESSED_DATA_DIR = Path("../data/processed")
# Feature files are written to the same directory as their inputs: this name is
# bound to the very same Path object, not to a separate folder.
FEATURES_DATA_DIR = PROCESSED_DATA_DIR

In [3]:
# Helper function for features
def add_technical_indicators(df):
    """Return a date-sorted copy of ``df`` with return, volatility, momentum and trend columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history containing at least the columns ``Date``, ``Close``,
        ``High`` and ``Low``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by ``Date`` with a fresh 0..n-1 index and the
        following columns appended, in this order: ``Return``,
        ``Volatility_5d``, ``Volatility_21d``, ``RSI_14``, ``MACD``,
        ``MACD_signal``, ``MACD_hist``, ``ATR_14``, ``SMA_20``, ``SMA_50``,
        ``SMA_200``. Rolling columns are NaN until their window is full, so the
        leading rows contain NaNs that the caller is expected to handle.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Fail up front with a clear message instead of part-way through the build.
    required = ("Date", "Close", "High", "Low")
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"add_technical_indicators: missing required column(s) {missing}")

    # sort_values/reset_index return new objects, so the input frame stays untouched.
    # NOTE(review): "Date" is sorted in whatever form it is stored; string dates only
    # order chronologically when they are ISO-8601 formatted - confirm upstream format.
    df = df.sort_values("Date").reset_index(drop=True)

    # TA-Lib's wrappers reject anything that is not float64 ("input array type is
    # not double"), which happens e.g. when a CSV's prices parse as integers.
    # Work on float64 copies; the stored price columns keep their original dtype.
    close = df["Close"].astype("float64")
    high = df["High"].astype("float64")
    low = df["Low"].astype("float64")

    # Daily returns
    df["Return"] = close.pct_change()

    # Historical volatility: rolling std of returns scaled by sqrt(252)
    # (presumably the trading-days-per-year annualisation factor).
    annualization = np.sqrt(252)
    df["Volatility_5d"] = df["Return"].rolling(window=5).std() * annualization
    df["Volatility_21d"] = df["Return"].rolling(window=21).std() * annualization

    # RSI (Relative Strength Index)
    df["RSI_14"] = talib.RSI(close, timeperiod=14)

    # MACD line, its signal line and their difference (histogram)
    macd, macd_signal, macd_hist = talib.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
    df["MACD"] = macd
    df["MACD_signal"] = macd_signal
    df["MACD_hist"] = macd_hist

    # ATR (Average True Range)
    df["ATR_14"] = talib.ATR(high, low, close, timeperiod=14)

    # Simple moving averages of the close
    df["SMA_20"] = close.rolling(window=20).mean()
    df["SMA_50"] = close.rolling(window=50).mean()
    df["SMA_200"] = close.rolling(window=200).mean()

    return df

In [4]:
# Build and persist a feature file for each known market dataset
market_files = [
    "AAPL_market.csv",
    "IBEX35_market.csv",
    "MSFT_market.csv",
    "NASDAQ_market.csv",
    "S&P500_market.csv"
]

for file in market_files:
    file_path = PROCESSED_DATA_DIR / file

    # Guard clause: report a missing input and move on to the next dataset
    if not file_path.exists():
        print(f"File not found: {file}")
        continue

    print(f"Processing features for {file}...")

    # Load the data and normalise headers (trim whitespace, capitalise) so the
    # column names match what add_technical_indicators looks up
    df = pd.read_csv(file_path).rename(columns=lambda col: col.strip().capitalize())

    # Compute the indicators, then discard every row that still holds a missing
    # value in any column (this includes the rolling-window warm-up rows)
    df_features = add_technical_indicators(df).dropna()

    # Write "<name>_features.csv" alongside the inputs, without the index column
    output_path = FEATURES_DATA_DIR / file.replace("_market.csv", "_features.csv")
    df_features.to_csv(output_path, index=False)
    print(f"Saved features to {output_path}")

Processing features for AAPL_market.csv...
Saved features to ..\data\processed\AAPL_features.csv
Processing features for IBEX35_market.csv...
Saved features to ..\data\processed\IBEX35_features.csv
Processing features for MSFT_market.csv...
Saved features to ..\data\processed\MSFT_features.csv
Processing features for NASDAQ_market.csv...
Saved features to ..\data\processed\NASDAQ_features.csv
Processing features for S&P500_market.csv...
Saved features to ..\data\processed\S&P500_features.csv


In [5]:
# Sanity check: reload the saved AAPL feature file and show its last rows
aapl_features_path = FEATURES_DATA_DIR / "AAPL_features.csv"
sample_df = pd.read_csv(aapl_features_path)
sample_df.tail()

Unnamed: 0,Date,Close,High,Low,Open,Volume,Ticker,Return,Volatility_5d,Volatility_21d,RSI_14,MACD,MACD_signal,MACD_hist,ATR_14,SMA_20,SMA_50,SMA_200
2456,2025-07-25,213.880005,215.240005,213.399994,214.699997,40268800,AAPL,0.000561,0.076108,0.144279,63.143004,2.609528,2.397952,0.211576,3.521561,210.6245,205.4042,221.665316
2457,2025-07-28,214.050003,214.850006,213.059998,214.029999,37858000,AAPL,0.000795,0.069335,0.143043,63.486928,2.589338,2.43623,0.153109,3.397879,211.273,205.4386,221.630999
2458,2025-07-29,211.270004,214.809998,210.820007,214.179993,51411700,AAPL,-0.012988,0.09104,0.153304,54.526394,2.322245,2.413433,-0.091187,3.440172,211.578,205.435,221.562453
2459,2025-07-30,209.050003,212.389999,207.720001,211.899994,45512500,AAPL,-0.010508,0.103043,0.144831,48.62442,1.909426,2.312631,-0.403205,3.528017,211.6395,205.3908,221.464023
2460,2025-07-31,207.570007,209.839996,207.160004,208.490005,80698400,AAPL,-0.00708,0.100215,0.140482,45.118237,1.44617,2.139339,-0.693169,3.467444,211.396,205.3666,221.360685
