In [5]:
# ----------------------------------------------------------
# COMP 542 – Machine Learning
# Class Number # 17086
# Group Project by: Jonathan Cordova and Phani Challabotla
# NEURAL NETWORK MODEL for Stock Price Prediction
# ----------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import talib as ta
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [6]:
# Model
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential, Model

In [7]:
# get the s&p 500 index fund data
stock_data = yf.Ticker("^GSPC")

# get historical price data for s&p 500
df = pd.DataFrame(stock_data.history(period="max"))

# Print
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0,0.0,0.0
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0,0.0,0.0
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0,0.0,0.0
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-05-30 00:00:00-04:00,5259.770020,5260.209961,5222.100098,5235.479980,3818750000,0.0,0.0
2024-05-31 00:00:00-04:00,5243.209961,5280.330078,5191.680176,5277.509766,5437160000,0.0,0.0
2024-06-03 00:00:00-04:00,5297.149902,5302.109863,5234.319824,5283.399902,4046920000,0.0,0.0
2024-06-04 00:00:00-04:00,5278.240234,5298.799805,5257.629883,5291.339844,3707900000,0.0,0.0


In [8]:
#
# --------------------------
# Preprocessing - Creating a custom functions for data cleaning:
# --------------------------
# 

def clean_dataset(df_series):

    # Calculating Momentum (Technical Indicator) and add new data column
    df_series['Momentum_Indicator'] = ta.MOM(df_series['Close'], timeperiod=2)

    # Calculating Money Flow Index (Technical Indicator) and add new data column
    df_series['Money_Flow_Index'] = ta.MFI(df_series['High'], df_series['Low'], df_series['Close'], df_series['Volume'], timeperiod=2)

    # Calculating Relative Strength Index (Technical Indicator) and add new data column
    df_series['Relative_Strength_Index'] = ta.RSI(df_series['Close'], timeperiod=2)

    # Calculating Rate of Change (Technical Indicator) and add new data column
    df_series['Rate_of_Change_Ratio'] = ta.ROCR(df_series['Close'], timeperiod=2)

    # Dropping all NaN rows
    df_series.dropna(subset=['Momentum_Indicator', 'Money_Flow_Index', 'Relative_Strength_Index', 'Rate_of_Change_Ratio'], inplace=True)

    # Drop columns Stock Splits and Dividends, axis = 1 represents type to drop 'column'
    df_series = df_series.drop(['Stock Splits', 'Dividends'], axis=1)

    # Format the Money Flow Index Indicator column and plot
    df_series['Money_Flow_Index'] = df_series['Money_Flow_Index'].round(6)
    
    # Add additional features including range (close - open) and daily mean ((high + low) / 2.0 )
    df_series['Daily_Range'] = (df_series['Close'] - df_series['Open'])
    df_series['Daily_Mean'] = (df_series['High'] + df_series['Low']) / 2.0

    # Turns Date index into a feature (column)
    df_series = df_series.reset_index()

    # Filter data between two dates to reflect data from 10-01-2000 to 10-01-2022
    filtered_df_series = df_series.loc[(df_series['Date'] >= '1990-10-01')
                         & (df_series['Date'] <= '2022-10-01')]

    # Resets the row index to 0
    filtered_df_series = filtered_df_series.reset_index(drop=True)

    # Makes the Date column back into an index 
    filtered_df_series = filtered_df_series.set_index('Date')
    df_series = filtered_df_series
    
    # Drop column 'Rate_of_Change_Ratio' due to high correlation with 'Momentum_Indicator', axis = 1 represents type column
    df_series = df_series.drop(['Rate_of_Change_Ratio'], axis=1)

    return df_series


# Cleaning all datasets for the S&P 500 and sector funds using custom function
df = clean_dataset(df)

In [9]:
# Show the count of tuples (rows) for each data set

print("S&P 500 Tuples (Rows) count: ", len(df.index))

S&P 500 Tuples (Rows) count:  8063


In [10]:
# Using info method to get a quick desc of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8063 entries, 1990-10-01 00:00:00-04:00 to 2022-09-30 00:00:00-04:00
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Open                     8063 non-null   float64
 1   High                     8063 non-null   float64
 2   Low                      8063 non-null   float64
 3   Close                    8063 non-null   float64
 4   Volume                   8063 non-null   int64  
 5   Momentum_Indicator       8063 non-null   float64
 6   Money_Flow_Index         8063 non-null   float64
 7   Relative_Strength_Index  8063 non-null   float64
 8   Daily_Range              8063 non-null   float64
 9   Daily_Mean               8063 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 692.9 KB


In [11]:
# Information for numerical attributes (features)
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Momentum_Indicator,Money_Flow_Index,Relative_Strength_Index,Daily_Range,Daily_Mean
count,8063.0,8063.0,8063.0,8063.0,8063.0,8063.0,8063.0,8063.0,8063.0,8063.0
mean,1511.357602,1520.300416,1501.698549,1511.606558,2450039000.0,0.820919,54.068904,55.379125,0.248956,1510.999483
std,975.513523,980.647994,969.712286,975.446771,1838858000.0,27.792306,38.193554,31.08119,17.455088,975.149794
min,295.450012,301.450012,294.51001,295.459991,14990000.0,-401.590088,-0.0,0.071289,-150.220215,297.980011
25%,909.600006,918.804993,898.915009,909.705017,681500000.0,-7.5,0.0,26.307692,-5.0,908.837494
50%,1260.819946,1268.189941,1253.030029,1260.680054,2403470000.0,1.299927,50.741857,59.693967,0.540009,1260.625
75%,1949.119995,1961.429993,1937.534973,1950.38501,3796280000.0,11.154984,100.0,83.846606,6.685059,1948.565002
max,4804.509766,4818.620117,4780.040039,4796.560059,11456230000.0,238.160156,100.0,99.995138,141.030029,4796.445068


In [12]:
# ----------------------------
# Training Data vs Test Data
# ----------------------------
# Separate the training set and the testing set


# Turns Date index into a feature (column)
df = df.reset_index()

# Get training and testing set
training_set = df.loc[df['Date'] <= '2022-04-01']
testing_set = df.loc[df['Date'] >= '2022-04-01']

# Resets the row index to 0
df = df.reset_index(drop=True)
training_set = training_set.reset_index(drop=True)
testing_set = testing_set.reset_index(drop=True)

# Makes the Date column back into an index 
df = df.set_index('Date')
training_set = training_set.set_index('Date')
testing_set = testing_set.set_index('Date')


# Put in a different variable for later use
data_training = pd.DataFrame()
data_testing = pd.DataFrame()

data_training['Close']  = training_set['Close']
data_testing['Close'] = testing_set['Close']


print(data_training.shape)
print(data_testing.shape)

(7938, 1)
(126, 1)


In [13]:
data_testing.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-01 00:00:00-04:00,4545.859863
2022-04-04 00:00:00-04:00,4582.640137
2022-04-05 00:00:00-04:00,4525.120117
2022-04-06 00:00:00-04:00,4481.149902
2022-04-07 00:00:00-04:00,4500.209961


In [14]:
data_training.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
1990-10-01 00:00:00-04:00,314.940002
1990-10-02 00:00:00-04:00,315.209991
1990-10-03 00:00:00-04:00,311.399994
1990-10-04 00:00:00-04:00,312.690002
1990-10-05 00:00:00-04:00,311.5


In [15]:
# Scaling the data with Min-Max Scaler (This scaler will bring the training data set between 0 and 1)
scaler = MinMaxScaler(feature_range=(0,1)).fit(data_training)
data_training_array = scaler.transform(data_training)

In [16]:
# Making sure the training set is transformed using the Min-Max Scaler
data_training_array

array([[0.00432783],
       [0.00438782],
       [0.00354136],
       ...,
       [0.95687502],
       [0.94087003],
       [0.94430246]])

In [17]:
data_training_array.shape[0]

7938

In [18]:
# create x and y training sets to use 100 days of 'Close' price input as features and predicted target as current 'Close' price
x_train = []
y_train = []

for i in range(100, data_training_array.shape[0]):
    x_train.append(data_training_array[i-100: i]) # 0 through 100, then 1 through 101, etc.
    y_train.append(data_training_array[i, 0]) # starting at index 100 as 'target'


x_train, y_train = np.array(x_train, dtype=float), np.array(y_train, dtype=float)

In [19]:
x_train.shape

(7838, 100, 1)

In [20]:
# Start building neural network model, using Sequential() as model will have 1 tensor input and 1 tensor output

model = Sequential()
model.add(LSTM(units = 50, activation = 'relu', return_sequences = True, 
              input_shape = (x_train.shape[1], 1)))
model.add(Dropout(0.2))

model.add(LSTM(units = 60, activation = 'relu', return_sequences = True))
model.add(Dropout(0.3))

model.add(LSTM(units = 80, activation = 'relu', return_sequences = True))
model.add(Dropout(0.4))

model.add(LSTM(units = 120, activation = 'relu'))
model.add(Dropout(0.5))

model.add(Dense(units= 1))


  super().__init__(**kwargs)


In [21]:
model.summary()

In [22]:
# Track metrics while the model compiles

model.compile(optimizer='adam', loss= 'mean_squared_error', metrics=['mse', 'mae', 'mean_absolute_percentage_error'])
model.fit(x_train, y_train, epochs = 5)

Epoch 1/5
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 111ms/step - loss: 0.0192 - mae: 0.0856 - mean_absolute_percentage_error: 61.6704 - mse: 0.0192
Epoch 2/5
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 111ms/step - loss: 0.0031 - mae: 0.0369 - mean_absolute_percentage_error: 18.4453 - mse: 0.0031
Epoch 3/5
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 133ms/step - loss: 0.0023 - mae: 0.0318 - mean_absolute_percentage_error: 17.1572 - mse: 0.0023
Epoch 4/5
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 149ms/step - loss: 0.0020 - mae: 0.0291 - mean_absolute_percentage_error: 17.9252 - mse: 0.0020
Epoch 5/5
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 146ms/step - loss: 0.0018 - mae: 0.0277 - mean_absolute_percentage_error: 19.3442 - mse: 0.0018


<keras.src.callbacks.history.History at 0x2015e551250>

In [23]:
data_testing.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2022-04-01 00:00:00-04:00,4545.859863
2022-04-04 00:00:00-04:00,4582.640137
2022-04-05 00:00:00-04:00,4525.120117
2022-04-06 00:00:00-04:00,4481.149902
2022-04-07 00:00:00-04:00,4500.209961


In [24]:
# For the testing set, in order to predict the first target 'close' stock price we need to append 
# the first 100 days as features to the test set

past_100_days = data_training.tail(100)
final_df = past_100_days.append(data_testing, ignore_index=True)
final_df.head()

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# Apply the min-max scaler metric calculated from the training set and transform the test set
# input_data = scaler.fit_transform(final_df)
input_data = scaler.transform(final_df)
input_data

In [None]:
input_data.shape

In [None]:
x_test = []
y_test = []

for i in range(100, input_data.shape[0]):
    x_test.append(input_data[i-100: i]) # 0 through 100, then 1 through 101, etc.
    y_test.append(input_data[i, 0]) # starting at index 100 as 'target'

In [None]:
x_test, y_test = np.array(x_test), np.array(y_test)
print(x_test.shape)
print(y_test.shape)

In [None]:
#Evaluation
score = model.evaluate(x_test, y_test)
print('Test loss (mean squared error):', score[0])
print('Test mean squared error:', score[1])
print('Test mean absolute error:', score[2])
print('Test mean absolute error percentage:', score[3])

In [None]:
# Predictions
y_predicted = model.predict(x_test)

In [None]:
y_predicted.shape

In [None]:
y_test

In [None]:
y_predicted

In [None]:
# Evaluations:
m = tf.keras.metrics.MeanSquaredError()
m.update_state(y_predicted, y_test)
mse = m.result().numpy()


# Get mean squared error and r2 score
r2s = r2_score(y_predicted, y_test)
mae = mean_absolute_error(y_predicted, y_test)
rmse = mean_squared_error(y_predicted, y_test, squared=False)

# Mean squared error
print("Mean Squared Error is: ", mse)

# R-squared score measures the variance of the independent variables influence on the dependent variable
print("R2 Score is: ", r2s)

# Mean Absolute Error and Root Mean Squared Error
print("Mean Absolute Error is: ", mae)
print("Root Mean Squared Error is: ", rmse)

In [None]:
# Note: Both are scaled here
score = model.evaluate(x_test, y_test, verbose=0)
print('Test Loss (Mean Squared Error):', score[0])
print('Test Mean Squared Error:', score[1])

In [None]:
y_predicted.shape

In [None]:
y_test.shape

In [None]:
# transfer to a dataframe shape for scaling
y_test = pd.DataFrame(y_test)

# Reverse the Min-Max scaling on the y-predicted and y-test sets
y_predicted = scaler.inverse_transform(y_predicted)
y_test = scaler.inverse_transform(y_test)

In [None]:
# Grabbing the 'Date' index from test set
Date_index = data_testing.index
y_test_with_dates = pd.DataFrame()
y_predicted_with_dates = pd.DataFrame()

# Add dates to the y-test set and y-predicted
y_test_with_dates['Date'] = Date_index
y_test_with_dates['Close'] = y_test
y_test_with_dates = y_test_with_dates.set_index('Date')

# Add dates to the y-test set and y-predicted
y_predicted_with_dates['Date'] = Date_index
y_predicted_with_dates['Close'] = y_predicted
y_predicted_with_dates = y_predicted_with_dates.set_index('Date')


# Plotting our results
plt.figure(figsize=(18, 6))
plt.plot(y_test_with_dates, 'b', label = 'Test Set Closing Price')
plt.plot(y_predicted_with_dates, 'r', label = 'Predicted Closing Price')
plt.xlabel('Dates')
plt.ylabel('Close Price')
plt.legend()
plt.show()

In [None]:
#------------------------------------------------------------------------------
# Evaluations: y_test vs y_predicted - AFTER scaling back to the original size
#------------------------------------------------------------------------------
m = tf.keras.metrics.MeanSquaredError()
m.update_state(y_predicted, y_test)
mse = m.result().numpy()


# Get r2 score, mean absolute error, root mean squared error
r2s = r2_score(y_predicted, y_test)
mae = mean_absolute_error(y_predicted, y_test)
rmse = mean_squared_error(y_predicted, y_test, squared=False)

# Mean squared error
print("Mean Squared Error is: ", mse)

# R-squared score measures the variance of the independent variables influence on the dependent variable
print("R2 Score is: ", r2s)

# Mean Absolute Error and Root Mean Squared Error
print("Mean Absolute Error is: ", mae)
print("Root Mean Squared Error is: ", rmse)