In [1]:
import yfinance as yf
import pandas as pd

spy = yf.download("SPY", start="2010-01-01", auto_adjust=True)
spy.head(), spy.tail(), spy.shape


[*********************100%***********************]  1 of 1 completed


(Price           Close       High        Low       Open     Volume
 Ticker            SPY        SPY        SPY        SPY        SPY
 Date                                                             
 2010-01-04  85.027962  85.072976  83.662473  84.307704  118944600
 2010-01-05  85.253036  85.290552  84.667828  84.975441  111579900
 2010-01-06  85.313057  85.523131  85.102983  85.170504  116074400
 2010-01-07  85.673210  85.778247  84.915437  85.155523  131091100
 2010-01-08  85.958313  85.995829  85.275571  85.448130  126402800,
 Price            Close        High         Low        Open     Volume
 Ticker             SPY         SPY         SPY         SPY        SPY
 Date                                                                 
 2025-12-19  680.590027  681.090027  676.469971  676.590027  103599500
 2025-12-22  684.830017  685.359985  680.590027  683.940002   69556700
 2025-12-23  687.960022  688.200012  683.869995  683.919983   64840000
 2025-12-24  690.380005  690.830017  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

import yfinance as yf

pd.set_option("display.max_columns", 50)


In [None]:
# Downloads SPY dataset 

df = yf.download("SPY", start="2010-01-01", auto_adjust=True)
df = df.rename(columns=str.lower)
df.head()


[*********************100%***********************]  1 of 1 completed


Price,close,high,low,open,volume
Ticker,spy,spy,spy,spy,spy
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2010-01-04,85.027939,85.072953,83.66245,84.307682,118944600
2010-01-05,85.253059,85.290575,84.667851,84.975464,111579900
2010-01-06,85.313057,85.523131,85.102983,85.170504,116074400
2010-01-07,85.67318,85.778217,84.915407,85.155492,131091100
2010-01-08,85.95829,85.995806,85.275549,85.448107,126402800


In [None]:
df["ret_1d"] = df["close"].pct_change() # calculates next day return

H = 5 # looking 5 days ahead for the next 5 day return

df["y"] = df["close"].pct_change(H).shift(-H) # calculates future H-day return

In [None]:
# Feature engineering step

# Momentum
df["mom_5"]  = df["close"].pct_change(5)
df["mom_20"] = df["close"].pct_change(20)
df["mom_60"] = df["close"].pct_change(60)

# Moving averages
df["ma_20"] = df["close"].rolling(20).mean()
df["ma_60"] = df["close"].rolling(60).mean()
df["trend_20_60"] = (df["ma_20"] - df["ma_60"]) / df["ma_60"]

# Volatility 
df["vol_20"] = df["ret_1d"].rolling(20).std()

# Mean reversion
roll_mean = df["close"].rolling(20).mean()
roll_std  = df["close"].rolling(20).std()
df["z_20"] = (df["close"] - roll_mean) / roll_std

# We will include more non-price features after the baseline model is tested.

In [None]:
# Cleaning the dataset to drop NaNs

feature_cols = ["mom_5", "mom_20", "mom_60", "trend_20_60", "vol_20", "z_20"]
data = df[feature_cols + ["y"]].dropna()

X = data[feature_cols]
y = data["y"]
data.tail()



In [None]:
# Walk forward training (with Ridge Regression) and testing

tscv = TimeSeriesSplit(n_splits=8) # Splits data into 8 partitions, so that we can use earlier data for training and later ones for testing

model = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0))
]) # Builds a pipeline model that standardizes the data (subtract mean, divide by SD), then uses ridge regression

pred = pd.Series(index=X.index, dtype=float) # Creates a series to store predictions for every date in X

# Loops through our time based splits, uses some for training, some for testing
for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train = y.iloc[train_idx]

    model.fit(X_train, y_train)
    pred.iloc[test_idx] = model.predict(X_test)  # Populates our pred series with the predictions

# Align realized y with pred
out = pd.DataFrame({"y": y, "pred": pred}).dropna()
out.head()



In [None]:
# Looking at the metrics to evaluate model performance

mse = mean_squared_error(out["y"], out["pred"])
mae = mean_absolute_error(out["y"], out["pred"])
ic  = out["y"].corr(out["pred"])  # Information Coefficient

mse, mae, ic



In [None]:
# Use rolling standard deviations of predictions to scale risk
scale = out["pred"].rolling(252).std()
signal = (out["pred"] / scale).replace([np.inf, -np.inf], np.nan).fillna(0.0)

pos = signal.clip(-1, 1)  # position in [-1, 1]
out["pos"] = pos


In [None]:
tcost = 0.0005  # 5 bps per 1.0 change in position (tweakable)

out["pos_prev"] = out["pos"].shift(1).fillna(0.0)
out["turnover"] = (out["pos"] - out["pos_prev"]).abs()
out["strat_ret"] = out["pos_prev"] * out["y"] - tcost * out["turnover"]


def sharpe(r, periods=252):
    r = r.dropna()
    if r.std() == 0:
        return np.nan
    return np.sqrt(periods) * r.mean() / r.std()

sh = sharpe(out["strat_ret"])
bh = sharpe(out["y"])  # buy & hold proxy (always long)

sh, bh
