In [2]:
import yfinance as yf
import os
import pandas as pd
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from datetime import timedelta
import numpy as np
import random as rand
load_dotenv()

True

In [9]:
symbol = "MSFT"
DATA_PATH = f"data/{symbol}_data.json"
if os.path.exists(DATA_PATH):
    # Read from file if we've already downloaded the data.
    with open(DATA_PATH) as f:
        ticker_hist = pd.read_json(DATA_PATH)

else:
    ticker = yf.Ticker(symbol)
    ticker_hist = ticker.history(period="max")

    # Save file to json in case we need it later.  This prevents us from having to re-download it every time.
    ticker_hist.to_json(DATA_PATH)
# Ensure we know the actual closing price
data = ticker_hist[["Close"]]
data = data.rename(columns = {'Close':'Actual_Close'})

# Setup our target.  This identifies if the price went up or down
data["Target"] = ticker_hist.rolling(2).apply(lambda x: x.iloc[1] > x.iloc[0])["Close"]

# Shift stock prices forward one day, so we're predicting tomorrow's stock prices from today's prices.
ticker_prev = ticker_hist.copy()
ticker_prev = ticker_prev.shift(1)

predictors = ["Close", "Volume", "Open", "High", "Low"]
data = data.join(ticker_prev[predictors]).iloc[1:]
model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)

In [10]:
def backtest(data, model, predictors, start=1000, step=750):
    predictions = []
    # Loop over the dataset in increments
    for i in range(start, data.shape[0], step):
        # Split into train and test sets
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()

        # Fit the random forest model
        model.fit(train[predictors], train["Target"])

        # Make predictions
        preds = model.predict_proba(test[predictors])[:,1]
        preds = pd.Series(preds, index=test.index)
        preds[preds > .6] = 1
        preds[preds <= .6] = 0
        
        # Combine predictions and test values
        combined = pd.concat({"Target": test["Target"],"Predictions": preds}, axis=1)

        predictions.append(combined)

    return pd.concat(predictions)

In [77]:
def create_new_predictors(data):
    weekly_mean = data.rolling(7).mean()["Close"]
    quarterly_mean = data.rolling(90).mean()["Close"]
    annual_mean = data.rolling(365).mean()["Close"]

    weekly_trend = data.shift(1).rolling(7).sum()["Target"]

    data["weekly_mean"] = weekly_mean / data["Close"]
    data["quarterly_mean"] = quarterly_mean / data["Close"]
    data["annual_mean"] = annual_mean / data["Close"]

    data["annual_weekly_mean"] = data["annual_mean"] / data["weekly_mean"]
    data["annual_quarterly_mean"] = data["annual_mean"] / data["quarterly_mean"]

    data["weekly_trend"] = weekly_trend

    data["open_close_ratio"] = data["Open"] / data["Close"]
    data["high_close_ratio"] = data["High"] / data["Close"]
    data["low_close_ratio"] = data["Low"] / data["Close"]

    return data

In [38]:
data = create_new_predictors(data)
# print(data)

full_predictors = predictors + ["weekly_mean", "quarterly_mean", "annual_mean", "annual_weekly_mean", "annual_quarterly_mean", 
                                "open_close_ratio", "high_close_ratio", "low_close_ratio", "weekly_trend"]

predictions = backtest(data.iloc[365:], model, full_predictors)

                     Actual_Close  Target       Close        Volume  \
1986-03-14 05:00:00      0.062427     1.0    0.060274  1.031789e+09   
1986-03-17 05:00:00      0.063504     1.0    0.062427  3.081600e+08   
1986-03-18 05:00:00      0.061889     0.0    0.063504  1.331712e+08   
1986-03-19 05:00:00      0.060812     0.0    0.061889  6.776640e+07   
1986-03-20 05:00:00      0.059198     0.0    0.060812  4.789440e+07   
...                           ...     ...         ...           ...   
2023-12-15 05:00:00    370.730011     1.0  365.929993  4.327750e+07   
2023-12-18 05:00:00    372.649994     1.0  370.730011  7.847820e+07   
2023-12-19 05:00:00    373.260010     1.0  372.649994  2.180290e+07   
2023-12-20 05:00:00    370.619995     0.0  373.260010  2.060370e+07   
2023-12-21 05:00:00    373.540009     1.0  370.619995  2.631670e+07   

                           Open        High         Low  weekly_mean  \
1986-03-14 05:00:00    0.054893    0.062965    0.054893          NaN   
198

In [None]:
predictions.plot(backend='plotly')

In [None]:
def forwardPrediction(data, model, predictors, n_days):
    predictions = []
    d = data[-100:].copy()

    for _ in range(n_days):
        train = d
        returns_d = (d / d.shift(1)).iloc[1:]
        
        # test_dict = {}

        row = rand.randint(0, 98)
        t = returns_d.iloc[row]

        test_dict = t * d.iloc[99]
        test_dict["Target"] = 0 if t["Close"] <= 1 else 1
        

        # Index for new day
        last_date = d.iloc[[-1]].index[0]
        last_date = last_date + timedelta(days=1)


        test = pd.DataFrame([test_dict], index = [last_date])
        
        model.fit(train[predictors], train["Target"])
        
        preds = model.predict_proba(test[predictors])[:,1]
        preds = pd.Series(preds, index=test.index)
        preds[preds > .6] = 1
        preds[preds <= .6] = 0
        
        # Combine predictions and test values
        combined = pd.concat({"Target": test["Target"],"Predictions": preds}, axis=1)
        # with open("testing", 'a') as f:
        #     print(d.tail, file=f)
        #     print(d.iloc[99], file=f)

        d = pd.concat([d, test])[-100:]

        predictions.append(combined)
    print(d)
    return predictions

In [81]:
def generate_intraday_returns(mu, sigma, steps_per_day, days):

    T = days / 252.0  # Business days in a year
    dt = T / (steps_per_day * days)  # 4.0 is needed as four prices per day are required

    # dt = 1 / steps_per_day  # time step
    total_steps = steps_per_day * days
    returns = np.exp((mu - sigma**2 / 2) * dt + sigma * np.random.normal(0, np.sqrt(dt), size=(days, steps_per_day)))
    return returns

In [82]:
def generate_intraday_prices(last_date, initial_price, mu, sigma, steps_per_day, days):
    returns = generate_intraday_returns(mu, sigma, steps_per_day, days)
    # print(returns)
    
    close_prices = initial_price * np.cumprod(returns, axis=0)
    
    # Reshape the close prices to steps_per_day rows and days columns
    close_prices_reshaped = close_prices.reshape((days, steps_per_day))
    
    # Extract open, high, low, and close prices from intraday data
    open_prices = close_prices_reshaped[:, 0]
    close_prices = close_prices_reshaped[:, -1]
    high_prices = np.max(close_prices_reshaped, axis=1)
    low_prices = np.min(close_prices_reshaped, axis=1)
    

    ind_array = np.array([last_date + timedelta(days=i) for i in range(1, days+1)])

    # Create a DataFrame to store the prices
    price_data = pd.DataFrame({
        'Open': open_prices,
        'High': high_prices,
        'Low': low_prices,
        'Close': close_prices
    }, index = ind_array)

    # print(price_data)
    
    return price_data

In [85]:
def forecast_GBM(data, steps_per_day, days):
    close = data["Close"]
    close_ret = ((data["Close"] / data["Close"].shift(1)) - 1).iloc[1:]
    mu = close_ret.mean()
    sigma = close_ret.std()
    last_date = data.iloc[[-1]].index[0]
    last_date = last_date + timedelta(days=1)
    init_price = close.iloc[-1]
    d = generate_intraday_prices(last_date, init_price, mu, sigma, steps_per_day, days)
    d["Target"] = ticker_hist.rolling(2).apply(lambda x: x.iloc[1] > x.iloc[0])["Close"] # FIX THE FACT THAT WE CHANGED DATA IN BEGINNING TO PREDICT ONE DAY AHEAD
    return create_new_predictors(d)

In [86]:
forecast_GBM(data, 4, 500)

Unnamed: 0,Open,High,Low,Close,Target,weekly_mean,quarterly_mean,annual_mean,annual_weekly_mean,annual_quarterly_mean,weekly_trend,open_close_ratio,high_close_ratio,low_close_ratio
2023-12-23 05:00:00,371.061556,371.061556,370.432333,370.594705,,,,,,,,1.001260,1.001260,0.999562
2023-12-24 05:00:00,371.118824,371.118824,370.328185,370.328185,,,,,,,,1.002135,1.002135,1.000000
2023-12-25 05:00:00,370.651646,370.799419,369.849396,370.508343,,,,,,,,1.000387,1.000786,0.998222
2023-12-26 05:00:00,370.450143,370.721499,369.518978,370.578337,,,,,,,,0.999654,1.000386,0.997141
2023-12-27 05:00:00,370.325824,370.791109,369.589680,370.791109,,,,,,,,0.998745,1.000000,0.996760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-01 05:00:00,361.682832,384.041608,361.682832,377.789300,,1.000283,1.002142,0.995606,0.995324,0.993479,,0.957367,1.016550,0.957367
2025-05-02 05:00:00,361.456245,384.068171,361.456245,378.091804,,0.999602,1.001264,0.994865,0.995261,0.993610,,0.956001,1.015807,0.956001
2025-05-03 05:00:00,361.209114,384.387361,361.209114,378.159040,,0.999596,1.001010,0.994744,0.995146,0.993740,,0.955178,1.016470,0.955178
2025-05-04 05:00:00,361.546629,384.590691,361.546629,377.854542,,1.000366,1.001728,0.995597,0.995233,0.993880,,0.956841,1.017827,0.956841


In [None]:
def runSims(data, model, predictors, n_days, n_sims):
    sims = []
    
    for _ in range(n_sims):
        last_date = data.iloc[[-1]].index[0]
        last_date = last_date + timedelta(days=1)
        predictions = pd.concat(forwardPrediction(data, model, predictors, n_days))
        print(predictions)
        score = precision_score(predictions["Target"], predictions["Predictions"])
        sims.append(score)
    return sims

In [None]:
predictions = runSims(data, model, full_predictors, 100, 1)
predictions

: 

Clear issues with low is not low and high is not high

In [8]:
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [1, 2, 3]
}, index=[1, 2, 3])

df['a'][-1]

KeyError: -1