# Project: Stock Price Predictor

The goal of investing in stocks is to make money, either in the short term or the long term. Generally, people buy what they believe will go up. In this project I wanted to practice building an ML model that predicts whether a stock's price will increase tomorrow, so that I can buy today.

## Prepare Data for Machine Learning

In [36]:
import pandas as pd
import yfinance as yf 
import matplotlib
from sklearn.ensemble import RandomForestClassifier


# Fetch the price history for the MSFT ticker through yfinance. period="max"
# requests the longest history available (the preview below starts in 1986).
msft = yf.Ticker("MSFT")
msft_hist = msft.history(period="max")
# Preview the first five rows.
msft_hist.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13 00:00:00-05:00,0.05538,0.063524,0.05538,0.060809,1031788800,0.0,0.0
1986-03-14 00:00:00-05:00,0.060809,0.064067,0.060809,0.06298,308160000,0.0,0.0
1986-03-17 00:00:00-05:00,0.06298,0.064609,0.06298,0.064067,133171200,0.0,0.0
1986-03-18 00:00:00-05:00,0.064067,0.064609,0.061894,0.062437,67766400,0.0,0.0
1986-03-19 00:00:00-05:00,0.062437,0.06298,0.060809,0.061351,47894400,0.0,0.0


In [37]:
# Keep the same-day close under its own name ("Actual Close") so it stays
# distinct from the "Close" predictor column that is joined onto `data` later.
data = msft_hist[["Close"]].rename(columns={"Close": "Actual Close"})

# Target is 1.0 when a row's close is higher than the previous row's close and
# 0.0 otherwise. The first row has no previous close, so it stays NaN.
# A vectorised diff on the single Close column replaces
# msft_hist.rolling(2).apply(lambda ...), which called a Python lambda for every
# window of every column only to keep the Close result.
close_change = msft_hist["Close"].diff()
data["Target"] = close_change.gt(0).astype(float).where(close_change.notna())
data.head(5)

Unnamed: 0_level_0,Actual Close,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1986-03-13 00:00:00-05:00,0.060809,
1986-03-14 00:00:00-05:00,0.06298,1.0
1986-03-17 00:00:00-05:00,0.064067,1.0
1986-03-18 00:00:00-05:00,0.062437,0.0
1986-03-19 00:00:00-05:00,0.061351,0.0


In [44]:
# Columns fed to the model. They are read from msft_hist shifted down by one
# row, so every date is paired with the values of the row before it.
predictors = ['Close', 'High', 'Low', 'Open', 'Volume']
msft_prev = msft_hist.shift(1)  # shift() returns a new frame; msft_hist is untouched

# Leave out the shifted frame's first row (empty after the shift), attach the
# predictor columns by date, then drop any row that still has a missing value.
data = (
    data
    .join(msft_prev[predictors].iloc[1:], rsuffix="_prev")
    .dropna()
)

## Training a Machine Learning Model

In [45]:
from sklearn.ensemble import RandomForestClassifier

# Hold out the last 100 rows for evaluation; every earlier row is training data.
train, test = data.iloc[:-100], data.iloc[-100:]

# random_state is pinned so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=200,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

In [47]:
from sklearn.metrics import precision_score
# Predict the held-out rows and re-attach their dates so the predictions line
# up with test["Target"].
preds = pd.Series(model.predict(test[predictors]), index=test.index)
preds

Date
2022-09-13 00:00:00-04:00    0.0
2022-09-14 00:00:00-04:00    1.0
2022-09-15 00:00:00-04:00    1.0
2022-09-16 00:00:00-04:00    0.0
2022-09-19 00:00:00-04:00    1.0
                            ... 
2023-01-30 00:00:00-05:00    0.0
2023-01-31 00:00:00-05:00    1.0
2023-02-01 00:00:00-05:00    1.0
2023-02-02 00:00:00-05:00    0.0
2023-02-03 00:00:00-05:00    1.0
Length: 100, dtype: float64

In [48]:
# Precision on the held-out rows: of the rows predicted as 1, the share whose
# Target really is 1.
precision_score(y_true=test["Target"], y_pred=preds)

0.52

In [50]:
# Put the true labels and the predictions side by side, aligned on the date index.
combined = pd.concat(
    {"Target": test["Target"], "Predictions": preds},
    axis=1,
)
combined
                     

Unnamed: 0_level_0,Target,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-09-13 00:00:00-04:00,0.0,0.0
2022-09-14 00:00:00-04:00,1.0,1.0
2022-09-15 00:00:00-04:00,0.0,1.0
2022-09-16 00:00:00-04:00,0.0,0.0
2022-09-19 00:00:00-04:00,0.0,1.0
...,...,...
2023-01-30 00:00:00-05:00,0.0,0.0
2023-01-31 00:00:00-05:00,1.0,1.0
2023-02-01 00:00:00-05:00,1.0,1.0
2023-02-02 00:00:00-05:00,1.0,0.0


## Backtest Engine

In [51]:
# NOTE(review): backtest() does not read these module-level values. Its own
# `start` and `step` parameters (defaults 100 and 750) shadow them inside the
# function, so pass start=start, step=step explicitly if 1000 / 750 is intended.
start = 1000
step = 750
def backtest(data, model, predicors, start=100, step=750, threshold=0.6):
    """Walk-forward backtest of `model` over `data`.

    The model is refit on all rows before position ``i`` and used to predict
    the next ``step`` rows, for ``i = start, start + step, ...``. Each fold's
    predictions are collected next to the true labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Target" column and every column named in `predicors`.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    predicors : list of str
        Feature column names. The misspelt parameter name is kept so existing
        keyword callers keep working.
    start : int, default 100
        Number of leading rows used to train the first fold.
    step : int, default 750
        Number of rows predicted per fold.
    threshold : float, default 0.6
        A row is predicted 1.0 only when its class-1 probability is strictly
        greater than this value; otherwise 0.0.

    Returns
    -------
    pandas.DataFrame
        Columns "Target" and "Predictions", indexed like the predicted rows.
        Empty (with those two columns) when `start` leaves no rows to predict.
    """
    # Use the caller's feature list; the original body read a global instead.
    predictors = list(predicors)
    predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[:i]
        test = data.iloc[i:i + step]

        model.fit(train[predictors], train["Target"])

        # predict_proba returns one column per class; column 1 is taken as the
        # probability of Target == 1 (assumes a binary target with both classes
        # present in the training window).
        proba = model.predict_proba(test[predictors])[:, 1]
        preds = pd.Series(proba, index=test.index)
        preds = (preds > threshold).astype(float)

        combined = pd.concat({"Target": test["Target"], "Predictions": preds}, axis=1)
        predictions.append(combined)

    if not predictions:
        # pd.concat([]) raises, so return an explicitly empty result instead.
        return pd.DataFrame(columns=["Target", "Predictions"])
    return pd.concat(predictions)
                              

## Improving Accuracy

In [52]:
# Rolling averages of every column over 7-, 90- and 365-row windows. The
# windows count rows of `data`, not calendar days.
weekly_mean, quarterly_mean, annual_mean = (
    data.rolling(window).mean() for window in (7, 90, 365)
)

# Mean Target of the 7 rows before each row; shifting by one first keeps a
# row's own Target out of its window.
weekly_trend = data["Target"].shift(1).rolling(7).mean()

In [None]:
import pandas as pd
import yfinance as yf 
import matplotlib
from sklearn.ensemble import RandomForestClassifier


# NOTE(review): this cell repeats the earlier notebook cells in one block.
# Bare expressions that are not the last statement of a cell (msft_hist.head,
# data.head, preds, precision_score, combined) are evaluated but not displayed.

# Fetch the price history for the MSFT ticker through yfinance.
msft = yf.Ticker("MSFT")
msft_hist = msft.history(period="max")
msft_hist.head(5)

# Keep the same-day close under its own name so it stays distinct from the
# "Close" predictor column joined in below.
data = msft_hist[['Close']]
data = data.rename(columns = {'Close':'Actual Close'})
# Target is 1.0 when a row's close is higher than the previous row's close and
# 0.0 otherwise; the first row has no previous close and stays NaN. A vectorised
# diff on Close replaces msft_hist.rolling(2).apply(lambda ...), which ran a
# Python lambda over every window of every column only to keep the Close result.
close_change = msft_hist["Close"].diff()
data["Target"] = close_change.gt(0).astype(float).where(close_change.notna())
data.head(5)

# Predictors are read from msft_hist shifted down by one row, so every date is
# paired with the values of the row before it; rows with missing values are dropped.
msft_prev = msft_hist.copy()
msft_prev = msft_prev.shift(1)
predictors = ['Close', 'High', 'Low', 'Open', 'Volume']
data = data.join(msft_prev[predictors].iloc[1:], rsuffix="_prev").dropna()

from sklearn.ensemble import RandomForestClassifier

# random_state is pinned so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, min_samples_split=200, random_state=1)

# Hold out the last 100 rows for evaluation; every earlier row is training data.
train = data.iloc[:-100]
test = data.iloc[-100:]

model.fit(train[predictors], train["Target"])

from sklearn.metrics import precision_score
preds = model.predict(test[predictors])

# Re-attach the dates so the predictions line up with test["Target"].
preds = pd.Series(preds, index=test.index)
preds
precision_score(test["Target"], preds)

# True labels and predictions side by side, aligned on the date index.
combined = pd.concat({"Target":test["Target"], "Predictions":preds}, axis=1)
combined

# NOTE(review): backtest() does not read these two values. Its own `start` and
# `step` parameters (defaults 100 and 750) shadow them inside the function.
start = 1000
step = 750
def backtest(data, model, predicors, start=100, step=750, threshold=0.6):
    """Walk-forward backtest of `model` over `data`.

    The model is refit on all rows before position ``i`` and used to predict
    the next ``step`` rows, for ``i = start, start + step, ...``. Each fold's
    predictions are collected next to the true labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Target" column and every column named in `predicors`.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    predicors : list of str
        Feature column names. The misspelt parameter name is kept so existing
        keyword callers keep working.
    start : int, default 100
        Number of leading rows used to train the first fold.
    step : int, default 750
        Number of rows predicted per fold.
    threshold : float, default 0.6
        A row is predicted 1.0 only when its class-1 probability is strictly
        greater than this value; otherwise 0.0.

    Returns
    -------
    pandas.DataFrame
        Columns "Target" and "Predictions", indexed like the predicted rows.
        Empty (with those two columns) when `start` leaves no rows to predict.
    """
    # Use the caller's feature list; the original body read a global instead.
    predictors = list(predicors)
    predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[:i]
        test = data.iloc[i:i + step]

        model.fit(train[predictors], train["Target"])

        # predict_proba returns one column per class; column 1 is taken as the
        # probability of Target == 1 (assumes a binary target with both classes
        # present in the training window).
        proba = model.predict_proba(test[predictors])[:, 1]
        preds = pd.Series(proba, index=test.index)
        preds = (preds > threshold).astype(float)

        combined = pd.concat({"Target": test["Target"], "Predictions": preds}, axis=1)
        predictions.append(combined)

    if not predictions:
        # pd.concat([]) raises, so return an explicitly empty result instead.
        return pd.DataFrame(columns=["Target", "Predictions"])
    return pd.concat(predictions)
    
# Rolling averages of every column over 7-, 90- and 365-row windows. The
# windows count rows of `data`, not calendar days.
weekly_mean, quarterly_mean, annual_mean = (
    data.rolling(window).mean() for window in (7, 90, 365)
)

# Mean Target of the 7 rows before each row; shifting by one first keeps a
# row's own Target out of its window.
weekly_trend = data["Target"].shift(1).rolling(7).mean()
          

In [None]:
# Each rolling-mean close expressed relative to the row's own "Close" value.
data["weekly_mean"] = weekly_mean["Close"].div(data["Close"])
data["quarterly_mean"] = quarterly_mean["Close"].div(data["Close"])
data["annual_mean"] = annual_mean["Close"].div(data["Close"])

# The 365-row ratio compared with the 7-row and 90-row ratios.
data["annual_weekly_mean"] = data["annual_mean"].div(data["weekly_mean"])
data["annual_quarterly_mean"] = data["annual_mean"].div(data["quarterly_mean"])

# Rolling mean of the shifted Target column, computed beforehand.
data["weekly_trend"] = weekly_trend

# Open / High / Low expressed relative to "Close" in the same row.
data["open_close_ratio"] = data["Open"].div(data["Close"])
data["high_close_ratio"] = data["High"].div(data["Close"])
data["low_close_ratio"] = data["Low"].div(data["Close"])

In [None]:
# Base predictors followed by the engineered ratio and trend columns.
full_predictors = predictors + [
    "weekly_mean",
    "quarterly_mean",
    "annual_mean",
    "annual_weekly_mean",
    "annual_quarterly_mean",
    "open_close_ratio",
    "high_close_ratio",
    "low_close_ratio",
    "weekly_trend",
]

# Skip the first 365 rows, where the 365-row rolling mean is not yet available,
# and backtest on the remainder with backtest()'s default start and step.
predictions = backtest(data.iloc[365:], model, full_predictors)

In [None]:
# Precision across all backtest folds: of the rows predicted as 1, the share
# whose Target really is 1.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

In [None]:
# Show how many trades we would make: the number of rows predicted 1.0 versus 0.0.
# print() is required because a cell only auto-displays its last expression;
# as a bare statement this result was computed and then silently discarded.
print(predictions["Predictions"].value_counts())

# Plot Target against Predictions for the last 100 rows. The trailing semicolon
# suppresses the Axes repr so only the figure is shown.
predictions.iloc[-100:].plot();