In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import time
from ta import add_all_ta_features
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor

In [2]:
TICKERS = ["AAPL", "MSFT", "AMZN", "GOOG", "META"]   
START    = "2018-01-01"                             
END      = "2024-12-31"                  

In [3]:
LOOKBACK = 30
HORIZON  = 1

In [4]:
def get_prices(tickers, start, end):
    data = yf.download(
        tickers=" ".join(tickers),
        start=start, end=end,
        auto_adjust=False, progress=True, group_by="ticker"
    )
    return data

In [5]:
raw_prices = get_prices(TICKERS, START, END)

[*********************100%***********************]  5 of 5 completed


In [6]:
raw_prices.to_parquet("raw_prices.parquet")

In [7]:
raw = pd.read_parquet("raw_prices.parquet")

long = (
    raw.stack(level=0, future_stack=True)            
       .reset_index()            
       .rename(columns={"level_1":"Ticker"})
)

HORIZON = 1
long["return_fwd"] = (
    long.groupby("Ticker")["Adj Close"]
        .pct_change(HORIZON)
        .shift(-HORIZON)
)
long["direction"] = (long["return_fwd"] > 0).astype(int)

long = long.dropna(subset=["return_fwd"]).reset_index(drop=True)

print(long.head())
print(f"# samples：{len(long)}")

Price       Date Ticker        Open        High         Low       Close  \
0     2018-01-02   GOOG   52.417000   53.347000   52.261501   53.250000   
1     2018-01-03   META  181.880005  184.779999  181.330002  184.669998   
2     2018-01-03   AMZN   59.415001   60.274502   59.415001   60.209999   
3     2018-01-03   MSFT   86.059998   86.510002   85.970001   86.349998   
4     2018-01-03   AAPL   43.132500   43.637501   42.990002   43.057499   

Price   Adj Close     Volume  return_fwd  direction  
0       52.935345   24752000    0.017914          1  
1      183.803726   16886600    0.012775          1  
2       60.209999   62176000    0.004654          1  
3       79.697723   26061400   -0.000174          0  
4       40.419792  118071600    0.016413          1  
# samples：8795


In [8]:
df = (
    long
    .sort_values(["Ticker","Date"])        
    .groupby("Ticker", group_keys=False)   
    .apply(lambda d: add_all_ta_features(
        d,
        open="Open", high="High", low="Low",
        close="Adj Close", volume="Volume",
        fillna=True
    ))
    .reset_index(drop=True)
)

print(df.columns[-10:])
print(df.head())

Index(['momentum_ppo', 'momentum_ppo_signal', 'momentum_ppo_hist',
       'momentum_pvo', 'momentum_pvo_signal', 'momentum_pvo_hist',
       'momentum_kama', 'others_dr', 'others_dlr', 'others_cr'],
      dtype='object', name='Price')
Price       Date Ticker       Open       High        Low      Close  \
0     2018-01-03   AAPL  43.132500  43.637501  42.990002  43.057499   
1     2018-01-04   AAPL  43.134998  43.367500  43.020000  43.257500   
2     2018-01-05   AAPL  43.360001  43.842499  43.262501  43.750000   
3     2018-01-08   AAPL  43.587502  43.902500  43.482498  43.587502   
4     2018-01-09   AAPL  43.637501  43.764999  43.352501  43.582500   

Price  Adj Close     Volume  return_fwd  direction  ...  momentum_ppo  \
0      40.419792  118071600    0.016413          1  ...      0.000000   
1      40.607529   89738400    0.003621          1  ...      0.037039   
2      41.069847   94640000    0.014571          1  ...      0.156664   
3      40.917320   82271200    0.004273       

  long


## ML

In [9]:
feat_cols = [c for c in df.columns
             if c not in ["Date","Ticker","return_fwd","direction",
                          "Open","High","Low","Close","Adj Close","Volume"]]

In [79]:
feat_try = ["momentum_rsi","trend_macd","trend_sma_fast","volatility_atr","volume_obv"]

In [80]:
split_dates = {
    "train_end": pd.Timestamp("2022-12-31"),
    "val_end"  : pd.Timestamp("2023-12-31"),
}

train_mask = df["Date"] <= split_dates["train_end"]
val_mask   = (df["Date"] > split_dates["train_end"]) & (df["Date"] <= split_dates["val_end"])
test_mask  = df["Date"] > split_dates["val_end"]

X_train, y_train_reg, y_train_clf = (
    df.loc[train_mask, feat_try],
    df.loc[train_mask, "return_fwd"],
    df.loc[train_mask, "direction"]
)
X_val, y_val_reg, y_val_clf = (
    df.loc[val_mask, feat_try],
    df.loc[val_mask, "return_fwd"],
    df.loc[val_mask, "direction"]
)
X_test, y_test_reg, y_test_clf = (
    df.loc[test_mask, feat_try],
    df.loc[test_mask, "return_fwd"],
    df.loc[test_mask, "direction"]
)

print("Train size:", len(X_train),
      "| Val size:", len(X_val),
      "| Test size:", len(X_test))

Train size: 6291 | Val size: 1250 | Test size: 1254


In [81]:
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)

In [82]:
df_val  = df.loc[val_mask].reset_index(drop=True)
df_test = df.loc[test_mask].reset_index(drop=True)

In [83]:
def _daily_sharpe(signal, returns, dates, min_pairs=1):
    df = pd.DataFrame({"Date": dates, "sig": signal, "ret": returns})

    def long_minus_short(g):
        long = g.loc[g.sig ==  1, "ret"]
        if len(long) >= min_pairs:
            return long.mean()

    daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
    if len(daily_ret) < 60:
        return np.nan      
    return daily_ret.mean() / daily_ret.std(ddof=1) * np.sqrt(252)

In [84]:
df23 = df[df["Date"].dt.year==2023].copy()
df24 = df[df["Date"].dt.year==2024].copy()

In [85]:
def test_sharpe(df_year):
    daily = (df_year.groupby("Date")["return_fwd"].mean())
    return daily.mean()/daily.std(ddof=1)*np.sqrt(252), daily

sh_23,daily23 = test_sharpe(df23)
sh_24,daily24 = test_sharpe(df24)

print(sh_23)
print(sh_24)

3.212634904481462
2.1258680851797975


In [86]:
def evaluate_direction(df_rows, y_pred):
    sig = np.where(y_pred == 1, 1, -1)

    return {
        "Acc":  accuracy_score(df_rows["direction"], y_pred),
        "Prec": precision_score(df_rows["direction"], y_pred, zero_division=0),
        "Rec":  recall_score(df_rows["direction"], y_pred, zero_division=0),
        "F1":   f1_score(df_rows["direction"], y_pred, zero_division=0),
        "Sharpe": _daily_sharpe(sig, df_rows["return_fwd"].values, df_rows["Date"])
    }

In [87]:
def evaluate_regression(df_rows, y_pred):
    mse  = mean_squared_error(df_rows["return_fwd"], y_pred)
    rmse = np.sqrt(mse)
    sig  = np.sign(y_pred); sig[sig == 0] = -1
    return {
        "MSE": mse,
        "RMSE": rmse,
        "Sharpe": _daily_sharpe(sig, df_rows["return_fwd"].values, df_rows["Date"])
    }

Classification Model  

In [88]:
models_cls = {
    "DT" : DecisionTreeClassifier(max_depth=5, random_state=42),
    "RF" : RandomForestClassifier(n_estimators=300, max_depth=8, random_state=42, n_jobs=-1),
    "LR" : LogisticRegression(max_iter=500, random_state=42),
    "NN" : MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42)
}

In [89]:
val_cls, test_cls = {}, {}
for name, clf in models_cls.items():
    X_tr = X_train_s if name in ["LR","NN"] else X_train
    X_va = X_val_s   if name in ["LR","NN"] else X_val
    X_te = X_test_s  if name in ["LR","NN"] else X_test
    
    clf.fit(X_tr, y_train_clf)
    val_cls[name]  = evaluate_direction(df_val,  clf.predict(X_va))
    test_cls[name] = evaluate_direction(df_test, clf.predict(X_te))

  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()


In [90]:
print("=== Classification Validation (2023) ===")
display(pd.DataFrame(val_cls).T.round(3))

print("=== Classification Test (2024) ===")
display(pd.DataFrame(test_cls).T.round(3))

=== Classification Validation (2023) ===


Unnamed: 0,Acc,Prec,Rec,F1,Sharpe
DT,0.574,0.573,0.859,0.687,3.454
RF,0.56,0.574,0.749,0.65,2.703
LR,0.593,0.602,0.746,0.666,3.237
NN,0.581,0.61,0.639,0.624,3.506


=== Classification Test (2024) ===


Unnamed: 0,Acc,Prec,Rec,F1,Sharpe
DT,0.582,0.594,0.774,0.672,1.944
RF,0.573,0.61,0.637,0.624,1.906
LR,0.573,0.605,0.662,0.632,1.567
NN,0.586,0.647,0.558,0.599,3.283


Regression Model  

In [91]:
models_reg = {
    "DTreg" : DecisionTreeRegressor(max_depth=5, random_state=42),
    "RFreg" : RandomForestRegressor(n_estimators=300, max_depth=8, random_state=42, n_jobs=-1),
    "Ridge" : Ridge(alpha=1.0, random_state=42),
    "MLPreg": MLPRegressor(hidden_layer_sizes=(64,32), max_iter=500,
                           learning_rate_init=1e-3, random_state=42)
}

In [92]:
val_reg, test_reg = {}, {}
for name, reg in models_reg.items():
    X_tr = X_train_s if name in ["Ridge","MLPreg"] else X_train
    X_va = X_val_s   if name in ["Ridge","MLPreg"] else X_val
    X_te = X_test_s  if name in ["Ridge","MLPreg"] else X_test
    
    reg.fit(X_tr, y_train_reg)
    val_reg[name]  = evaluate_regression(df_val,  reg.predict(X_va))
    test_reg[name] = evaluate_regression(df_test, reg.predict(X_te))

  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()
  daily_ret = df.groupby("Date").apply(long_minus_short).dropna()


In [93]:
print("=== Regression Validation (2023) ===")
display(pd.DataFrame(val_reg).T.round(6))

print("=== Regression Test (2024) ===")
display(pd.DataFrame(test_reg).T.round(6))

=== Regression Validation (2023) ===


Unnamed: 0,MSE,RMSE,Sharpe
DTreg,0.00038,0.019504,2.772812
RFreg,0.000362,0.019025,3.130667
Ridge,0.000361,0.018987,3.285315
MLPreg,0.000646,0.025407,3.574048


=== Regression Test (2024) ===


Unnamed: 0,MSE,RMSE,Sharpe
DTreg,0.000444,0.021072,1.78491
RFreg,0.000312,0.017662,1.978496
Ridge,0.000302,0.017375,1.282334
MLPreg,0.001834,0.042829,2.435792
