In [1]:
import yfinance as yf
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [2]:
sp500 = yf.Ticker("^GSPC")
sp500 = sp500.history(period="max")
sp500.index

del sp500["Dividends"]
del sp500["Stock Splits"]

# set up target
sp500["Tomorrow"] = sp500["Close"].shift(-10)
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)

# set up indicators
sp500["2dreturn"] = sp500['Close'] / sp500["Close"].shift(2) - 1
sp500["2dspread"] = sp500['Close'] / sp500["Close"].rolling(2).mean() - 1 
sp500["5dreturn"] = sp500['Close'] / sp500["Close"].shift(5) - 1
sp500["5dspread"] = sp500['Close'] / sp500["Close"].rolling(5).mean() - 1  
sp500["60dreturn"] = sp500['Close'] / sp500["Close"].shift(60) - 1
sp500["60dspread"] = sp500['Close'] / sp500["Close"].rolling(60).mean() - 1 
sp500["250dreturn"] = sp500['Close'] / sp500["Close"].shift(250) - 1
sp500["250dspread"] = sp500['Close'] / sp500["Close"].rolling(250).mean() - 1 
sp500["1000dreturn"] = sp500['Close'] / sp500["Close"].shift(1000) - 1
sp500["1000dspread"] = sp500['Close'] / sp500["Close"].rolling(1000).mean() - 1 


sp500 = sp500.loc["1990-01-01":].copy()
sp500 = sp500.iloc[1100:]
sp500.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7444 entries, 1994-05-09 00:00:00-04:00 to 2023-11-29 00:00:00-05:00
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Open         7444 non-null   float64
 1   High         7444 non-null   float64
 2   Low          7444 non-null   float64
 3   Close        7444 non-null   float64
 4   Volume       7444 non-null   int64  
 5   Tomorrow     7434 non-null   float64
 6   Target       7444 non-null   int64  
 7   2dreturn     7444 non-null   float64
 8   2dspread     7444 non-null   float64
 9   5dreturn     7444 non-null   float64
 10  5dspread     7444 non-null   float64
 11  60dreturn    7444 non-null   float64
 12  60dspread    7444 non-null   float64
 13  250dreturn   7444 non-null   float64
 14  250dspread   7444 non-null   float64
 15  1000dreturn  7444 non-null   float64
 16  1000dspread  7444 non-null   float64
dtypes: float64(15), int64(2)
memory usage: 1.0

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)

# not use cross validation / split train and test
train = sp500.iloc[:-1000]
test = sp500.iloc[-1000:]

predictors = ["2dreturn", "2dspread", "5dreturn", "5dspread", "60dreturn",
              "60dspread", "250dreturn", "250dspread", "1000dreturn", "1000dspread"]
# model.fit(train[predictors], train["Target"])

# preds = model.predict(test[predictors])

In [4]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

def backtest(data, model, predictor, start=2500, step=500):
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
    return pd.concat(all_predictions)

In [5]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)
predictions = backtest(sp500, model, predictors)
print(predictions["Predictions"].value_counts())
precision_score(predictions["Target"], predictions["Predictions"])

Predictions
1    4096
0     848
Name: count, dtype: int64


0.614501953125

In [6]:
sp500['Target'].sum()/sp500.shape[0]

0.6037076840408383

In [None]:
from sklearn.preprocessing import StandardScaler

X = np.array(predictors)

In [11]:
predictors

['2dreturn',
 '2dspread',
 '5dreturn',
 '5dspread',
 '60dreturn',
 '60dspread',
 '250dreturn',
 '250dspread',
 '1000dreturn',
 '1000dspread']