In [None]:
# For this lab, I want to predict whether the value of a stock will go up or down. For this reason, I will use Random Forest Classification rather than Regression (which would be used for trying to predict the actual value)

# install pip yfinance (if not installed already :))
#! pip install yfinance

In [1]:
# import yfinance library which contains historical data for stocks
import yfinance as yf

In [37]:
# import other relevant libraries and modules needed for this lab
import pandas as pd # for data manipulation and analysis
import matplotlib.pyplot as plt # for data visualisation - plots, graphs & charts
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier # module that includes two randomised decision tree algorithms; Random Forest & Extra Trees Method (we're just using Random Forest now)
from sklearn.metrics import precision_score # module for calculating the precision score (observations that the model correctly predicted out of all predictions made)
from sklearn.metrics import confusion_matrix

In [43]:
# define the ticker for the specific stock we want to predict (in this case it's Amazon)
amazon = yf.Ticker("AMZN")

In [44]:
# retrieve the historical data for this ticker and store to the variable
amazon  = amazon.history(period="max")

In [45]:
# create extra columns in order to calculate the incidences where the value went up (represented by 1). Otherwise it's 0.

# "Tomorrow" aligns the closing value of the current day with the open prices of the following day
amazon["Tomorrow"] = amazon["Close"].shift(-1)

# "Target" returns 1 if the value increased, 0 if it didn't increase
amazon["Target"] = (amazon["Tomorrow"]> amazon["Close"]).astype(int)

In [47]:
# the data coming in from yfinance is real-time and is already cleaned, but we can check our new columns for null values (there is only one in the  'Tomorrow' column)
amazon.isna().sum()

Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
Tomorrow        1
Target          0
dtype: int64

In [48]:
# start creating our model and defining our train and test datasets

# defining the n_estimators value means tuning our parameters aka the number of trees we want to build. The higher the better, but the slower the model
# min_samples_split evaluates the number of samples in the node. if the sample number is less than 100, then it does not split and the node will become a leaf
# random_state set to 1 results in a fixed dataset/will produce the same split
rf_model = RandomForestClassifier(n_estimators = 100, min_samples_split = 100, random_state = 1)

# train_set contains all values up until the last 100, test_set is the last 100 values
train_set = amazon.iloc[:-100]
test_set = amazon.iloc[-100:]

# define independent/predictor columns
predictor_columns = ["Close", "Volume", "Open", "High", "Low"]

# .fit() method is the training part of the model & finds the coefficients for the equation via the algorithm being used (in our case, Random Forest)
rf_model.fit(train_set[predictor_columns], train_set["Target"])

In [49]:
# create a prediction function that will train the model on our training set and then make predictions on our testing set

# this bit trains our model on our training set
def predict_func(train_set, test_set, predictor_columns, rf_model):
    rf_model.fit(train_set[predictor_columns], train_set["Target"])

# create variable that contains the predictions on our test set
    predictions_test = rf_model.predict(test_set[predictor_columns])

# then change variable to a dataframe series
    predictions_test = pd.Series(predictions_test, index = test_set.index, name = "Predictions")

# then combine the actual target values and the predicted values from the above dataframe
    combine_target_predictions = pd.concat([test_set["Target"], predictions_test], axis = 1)

    return combine_target_predictions

In [51]:
# create a backtesting function, that creates a rolling analysis
# e.g. if we have 10 years of data, this will be used to predict the 11th year. Then the 11 years of data will be used to predict the 12th year, and so forth.

def backtest_func(data, rf_model, predictor_columns, start = 2500, step = 250):
    predictions_result = []

    for i in range(start, data.shape[0], step):
        train_set = data.iloc[0:i].copy()
        test_set = data.iloc[i:(i+step)].copy()

        predictions = predict_func(train_set, test_set, predictor_columns, rf_model)

        predictions_result.append(predictions)

        return pd.concat(predictions_result)

In [52]:
# store our backtest function in the predictions variable.
predictions = backtest_func(amazon, rf_model, predictor_columns)

In [53]:
# use prediction_score module from sklearn.metrics, which gives us a score of observations that were correctly predicted in our model
# the result is 0.62, which is OK but can be improved

precision_score(predictions["Target"], predictions["Predictions"])

0.6190476190476191

In [54]:
# we can try experimenting with different n_estimator_values and min_samples_split_values, to see if the precision_score improves

n_estimators_values = [50, 100, 200]
min_samples_split_values = [2, 5, 10]

# now iterate over different combinations
for n_estimators in n_estimators_values:
    for min_samples_split in min_samples_split_values:

        # create and train the Random Forest model
        rf_model = RandomForestClassifier(n_estimators=n_estimators, min_samples_split=min_samples_split, random_state=1)
        rf_model.fit(train_set[predictor_columns], train_set["Target"])
        
        # create predictio and precision variables again
        predictions = predict_func(train_set, test_set, predictor_columns, rf_model)
        precision = precision_score(predictions["Target"], predictions["Predictions"])
        
        # Print the results
        print(f'n_estimators={n_estimators}, min_samples_split={min_samples_split}, Precision={precision}')

n_estimators=50, min_samples_split=2, Precision=0.5357142857142857
n_estimators=50, min_samples_split=5, Precision=0.5535714285714286
n_estimators=50, min_samples_split=10, Precision=0.576271186440678
n_estimators=100, min_samples_split=2, Precision=0.5185185185185185
n_estimators=100, min_samples_split=5, Precision=0.5471698113207547
n_estimators=100, min_samples_split=10, Precision=0.576271186440678
n_estimators=200, min_samples_split=2, Precision=0.5645161290322581
n_estimators=200, min_samples_split=5, Precision=0.5689655172413793
n_estimators=200, min_samples_split=10, Precision=0.5737704918032787


In [None]:
# we can see that these results are in fact worse than the previous result, but we can try alternative values/combinations to see if there's improvement
# furthermore, we can further try to improve the model through, for example, using feature engineering, or even other algorithms.