# Fetch data from Yahoo Finance

In [None]:
import os
import yfinance as yf

# Symbols to download and the folder that receives one raw CSV per symbol.
tickers = ["GE", "MSFT", "JNJ", "IBM", "AAPL"]
output_folder = "data/raw/stock_data"
os.makedirs(output_folder, exist_ok=True)

# Symbols whose download produced no rows; reported at the end instead of
# being written out as header-only CSV files.
failed_tickers = []

for ticker in tickers:
    print(f"Fetching data for {ticker}...")
    data = yf.download(ticker, period="max", interval="1d")

    # A failed download (unknown symbol, network error, rate limit) can come
    # back as an empty frame rather than an exception, so check explicitly
    # before saving anything.
    if data is None or data.empty:
        print(f"No data returned for {ticker}; skipping.")
        failed_tickers.append(ticker)
        continue

    # Reset index to make Date a column
    data = data.reset_index()

    # Flatten column names (removing ticker name): tuple labels from a
    # MultiIndex keep only their first element, plain labels pass through.
    data.columns = [col[0] if isinstance(col, tuple) else col for col in data.columns]

    # Save to CSV
    file_path = os.path.join(output_folder, f"{ticker}_daily.csv")
    data.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

# Only claim success when every symbol actually produced a file.
if failed_tickers:
    print(f"Finished, but no data was saved for: {', '.join(failed_tickers)}")
else:
    print("All data downloaded and flattened correctly!")


Fetching data for AAPL...


[*********************100%***********************]  1 of 1 completed

Saved: data/raw/stock_data\AAPL_daily.csv
All data downloaded and flattened correctly!





# Feature engineering

In [41]:
import pandas as pd
import numpy as np
import os
from glob import glob

# Paths
# input_folder: directory that is globbed for "*.csv" files further down.
# output_folder: directory that receives the "<name>_features.csv" files;
# it is created here, and exist_ok=True makes re-running the cell harmless.
input_folder = "data/raw/stock_data"
output_folder = "data/processed/stock_data"
os.makedirs(output_folder, exist_ok=True)

def compute_rsi(series, period=14):
    """Compute the Relative Strength Index from simple rolling means.

    Parameters
    ----------
    series : pd.Series
        Values to measure, in the order the changes should be taken.
    period : int, default 14
        Number of consecutive changes averaged for each RSI value.

    Returns
    -------
    pd.Series
        RSI values carrying the same index as ``series``, so the result can
        be assigned straight back onto the originating frame. The first
        ``period`` entries are NaN because a full window of ``period``
        changes is not yet available. A window with no losses yields 100;
        a window with neither gains nor losses yields NaN (0 / 0).
    """
    delta = series.diff()

    # clip() keeps the original index and leaves the NaN of the first diff
    # in place, so the first window is not padded with a fake zero change.
    gain = delta.clip(lower=0)
    loss = delta.clip(upper=0).abs()

    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()

    # Division by a zero average loss gives inf, which maps to RSI = 100.
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def feature_engineering(df):
    """Build technical-indicator features and a next-day log-return target.

    The input frame is not modified: all work happens on a date-sorted copy
    whose index is reset to 0..n-1, so every derived column lines up row by
    row regardless of the order or index of the input.

    Parameters
    ----------
    df : pd.DataFrame
        Price data with ``Date``, ``Open``, ``High``, ``Low``, ``Close`` and
        ``Volume`` columns.

    Returns
    -------
    tuple
        ``(features, dropped_rows)`` where ``features`` is the sorted frame
        with the added columns and every row containing NaN removed, and
        ``dropped_rows`` is the number of rows removed by that final filter.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # sort_values returns a new frame (the caller's data stays untouched);
    # resetting the index keeps label-based column assignment aligned with
    # positional results such as the RSI helper's output.
    df = df.sort_values("Date").reset_index(drop=True)

    # % Change in Volume
    df['Volume_pct_change'] = df['Volume'].pct_change()

    # **Log Returns**
    df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))

    # Lag features
    df['Close_lag1'] = df['Close'].shift(1)
    df['High_lag1'] = df['High'].shift(1)
    df['Low_lag1'] = df['Low'].shift(1)
    df['Open_lag1'] = df['Open'].shift(1)
    df['Log_Return_lag1'] = df['Log_Return'].shift(1)

    # Simple Moving Averages (SMA)
    df['SMA_10'] = df['Close'].rolling(window=10).mean()
    df['SMA_5'] = df['Close'].rolling(window=5).mean()

    # Exponential Moving Average (EMA 10 periods)
    df['EMA_10'] = df['Close'].ewm(span=10, adjust=False).mean()

    # RSI 14
    df['RSI_14'] = compute_rsi(df['Close'], 14)

    # Stochastic %K (14 days); a flat window (high == low) divides by zero
    # and is removed by the inf/NaN clean-up below.
    low_14 = df['Low'].rolling(window=14).min()
    high_14 = df['High'].rolling(window=14).max()
    df['Stoch_%K_14'] = (df['Close'] - low_14) / (high_14 - low_14) * 100

    # Stochastic %D (3-day SMA of %K)
    df['Stoch_%D_14'] = df['Stoch_%K_14'].rolling(window=3).mean()

    # MACD (12EMA - 26EMA) and Signal line (9EMA)
    ema_12 = df['Close'].ewm(span=12, adjust=False).mean()
    ema_26 = df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema_12 - ema_26
    df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Bollinger Bands (20 periods, 2 std dev); the rolling std is computed
    # once and shared by both bands.
    bb_period = 20
    close_window = df['Close'].rolling(window=bb_period)
    bb_std = close_window.std()
    df['BB_middle'] = close_window.mean()
    df['BB_upper'] = df['BB_middle'] + (2 * bb_std)
    df['BB_lower'] = df['BB_middle'] - (2 * bb_std)
    df['BB_width'] = df['BB_upper'] - df['BB_lower']

    # **Target: Next-day log return** (NaN on the last row, which is dropped)
    df['Target'] = np.log(df['Close'].shift(-1) / df['Close'])

    # Replace inf/-inf with NaN so the filter below removes those rows too
    df = df.replace([np.inf, -np.inf], np.nan)

    # Drop NaNs and report how many rows that cost
    before_drop = len(df)
    df = df.dropna()
    dropped_rows = before_drop - len(df)

    return df, dropped_rows

# Process all CSVs
# glob returns matches in an OS-dependent order; sorting makes the run (and
# its log output) reproducible across machines.
for file_path in sorted(glob(os.path.join(input_folder, "*.csv"))):
    # splitext removes only the trailing extension, whereas a plain
    # str.replace would also strip ".csv" from the middle of a name.
    stock_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"Processing {stock_name}...")
    df = pd.read_csv(file_path, parse_dates=["Date"])
    df_processed, dropped_rows = feature_engineering(df)
    output_file = os.path.join(output_folder, f"{stock_name}_features.csv")
    df_processed.to_csv(output_file, index=False)
    print(f"Saved: {output_file} | Dropped rows: {dropped_rows}")

print("\nAll stocks processed with log returns!")


Processing AAPL_daily...
Saved: data/processed/stock_data\AAPL_daily_features.csv | Dropped rows: 21
Processing GE_daily...
Saved: data/processed/stock_data\GE_daily_features.csv | Dropped rows: 21
Processing IBM_daily...
Saved: data/processed/stock_data\IBM_daily_features.csv | Dropped rows: 23
Processing JNJ_daily...
Saved: data/processed/stock_data\JNJ_daily_features.csv | Dropped rows: 30
Processing MSFT_daily...
Saved: data/processed/stock_data\MSFT_daily_features.csv | Dropped rows: 20

All stocks processed with log returns!
