In [40]:
import pandas as pd
import yfinance as yf
#Reads in data from csv file
sp500 = pd.read_csv("sp500.csv", index_col='Date')
#set the Date column as the index
sp500.index = pd.to_datetime(sp500.index)
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1950-01-03,16.660000,16.660000,16.660000,16.660000,1260000,0,0
1950-01-04,16.850000,16.850000,16.850000,16.850000,1890000,0,0
1950-01-05,16.930000,16.930000,16.930000,16.930000,2550000,0,0
1950-01-06,16.980000,16.980000,16.980000,16.980000,2010000,0,0
1950-01-09,17.080000,17.080000,17.080000,17.080000,2520000,0,0
...,...,...,...,...,...,...,...
2022-09-06,3930.889893,3942.550049,3886.750000,3908.189941,2209800080,0,0
2022-09-07,3909.429932,3987.889893,3906.030029,3979.870117,0,0,0
2022-09-08,3959.939941,4010.500000,3944.810059,4006.179932,0,0,0
2022-09-09,4022.939941,4076.810059,4022.939941,4067.360107,0,0,0


In [41]:
#'Tomorrow' column contains the closing price for the next day.
sp500["Tomorrow"] = sp500["Close"].shift(-1)
#'Target' column is set to either 1 or 0 depending if the price went up or not. True value!
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)
#Remove the rows from before 1990
sp500 = sp500.loc["1990-01-01":].copy()


In [42]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
#train data from beginning to end-100 days
train = sp500.iloc[:-100]
#test data on last 100 days
test = sp500.iloc[-100:]

#Fit the model using the below attributes
predictors = ["Close", "Volume", "Open", "High", "Low"]
model.fit(train[predictors], train["Target"])


RandomForestClassifier(min_samples_split=100, random_state=1)

In [43]:
#First prediction model without using the backtest. Accuracy 47%
#This model tests the last 100 days over the period starting from 1990.
#Only one prediction model.
from sklearn.metrics import precision_score
#Predicts last 100 days
prediction = model.predict(test[predictors])
#Converts numPy array to pandas' series
prediction = pd.Series(prediction, index=test.index)
#Percentage of correct predictions
precision_score(test["Target"], prediction)

0.47058823529411764

In [44]:
#Second prediction model using the backtest. Accuracy 53%
#Backtest involves prediction once every year starting from year 1999.
#Years 1990-1999 are excluded to use for the 11th year prediction.
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined
#start=2500 (10 trading years)
#step=250 (1 trading year)
def backtest(data, model, predictors, start=2500, step=250):
    all_predictions = []
#loop from row 2500 to the end with a step of 250
#train list contains first 10 years and goes up by a year every loop.
#test list contains the subsequent 1 year that comes after train list.
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
    
    return pd.concat(all_predictions)


In [45]:
predictions = backtest(sp500, model, predictors)
precision_score(predictions['Target'], predictions['Predictions'])
#Number of predictions for the stock market to go up is 2401
predictions['Predictions'].value_counts()

0    3337
1    2401
Name: Predictions, dtype: int64

In [46]:
#Third prediction model by adding more predictor attributes and modifying 'predict' function
#Accuracy 56%

#Adding 2 columns for each of the elements of horizon
#'horizon' elements correspond to 1 day, 1 week, 3 months, 1 year, and 4 years
#ratio_column is the ratio between closing price of today and average closing price over specified period of time
#trend_column is the sum of 'Target' over specified period of time
horizons = [2,5,60,250,1000]
new_predictors = []

for horizon in horizons:
    rolling_averages = sp500.rolling(horizon).mean()
    
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / rolling_averages["Close"]
    
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500.shift(1).rolling(horizon).sum()["Target"]
    
    new_predictors+= [ratio_column, trend_column]
    
sp500 = sp500.dropna(subset=sp500.columns[sp500.columns != "Tomorrow"])
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Tomorrow,Target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1993-12-14,465.730011,466.119995,462.459991,463.059998,275050000,0,0,461.839996,0,0.997157,1.0,0.996617,1.0,1.000283,32.0,1.028047,127.0,1.176082,512.0
1993-12-15,463.059998,463.690002,461.839996,461.839996,331770000,0,0,463.339996,1,0.998681,0.0,0.995899,1.0,0.997329,32.0,1.025151,126.0,1.172676,512.0
1993-12-16,461.859985,463.980011,461.859985,463.339996,284620000,0,0,466.380005,1,1.001621,1.0,0.999495,2.0,1.000311,32.0,1.028274,127.0,1.176163,513.0
1993-12-17,463.339996,466.380005,463.339996,466.380005,363750000,0,0,465.850006,0,1.003270,2.0,1.004991,3.0,1.006561,32.0,1.034781,128.0,1.183537,514.0
1993-12-20,466.380005,466.899994,465.529999,465.850006,255900000,0,0,465.299988,0,0.999431,1.0,1.003784,2.0,1.005120,32.0,1.033359,128.0,1.181856,513.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-06,3930.889893,3942.550049,3886.750000,3908.189941,2209800080,0,0,3979.870117,1,0.997948,0.0,0.989893,1.0,0.982136,26.0,0.902791,120.0,1.103594,542.0
2022-09-07,3909.429932,3987.889893,3906.030029,3979.870117,0,0,0,4006.179932,1,1.009087,1.0,1.008370,2.0,0.999819,27.0,0.919786,121.0,1.123489,543.0
2022-09-08,3959.939941,4010.500000,3944.810059,4006.179932,0,0,0,4067.360107,1,1.003294,2.0,1.012411,3.0,1.005349,28.0,0.926253,122.0,1.130564,543.0
2022-09-09,4022.939941,4076.810059,4022.939941,4067.360107,0,0,0,4107.279785,1,1.007578,2.0,1.022676,3.0,1.019287,29.0,0.940748,122.0,1.147454,543.0


In [47]:
#Modifying existing 'predict' function
#Instead of using model's predict function, we are using predict_proba
#For each day instead of predicting 1 or 0, it gives the probability of those two
#Set to 1 only if it's probability is equal to or more than 60%, otherwaise set to 0
#By doing so, it increases our chances of predicting stock market going up
#Therefore, it predicts stock market going up less often.
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict_proba(test[predictors])[:,1]
    preds[preds >=.6] = 1
    preds[preds <.6] = 0
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [48]:
#Run the backtest using the new predictors and the modified 'predict' function
predictions = backtest(sp500, model, new_predictors)
#Number of predictions for the stock market to go up is 679 as opposed to
# 2401 predictions in the second model
predictions["Predictions"].value_counts()

0.0    4059
1.0     679
Name: Predictions, dtype: int64

In [49]:
#Accuracy 56%
precision_score(predictions["Target"], predictions["Predictions"])

0.5655375552282769

In [None]:
#Done