In [None]:
import pandas as pd
import yfinance as yf
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, LSTM, Dense, LeakyReLU
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [None]:
# Date boundaries ('YYYY-MM-DD' strings) of the training and prediction periods.
TRAIN_START_DATE = '1960-01-01'
TRAIN_END_DATE = '2015-12-31'
PREDICT_START_DATE = '2016-01-01'
PREDICT_END_DATE = '2019-12-31'
# Number of consecutive rows in every input window (also the LSTM's time dimension).
WINDOW_SIZE = 7

In [None]:
# Fetch the S&P 500 index (^GSPC) history from Yahoo Finance for the whole
# train + predict span and move the downloaded index into a regular column.
# NOTE(review): yfinance documents `end` as exclusive, so PREDICT_END_DATE
# itself is presumably not part of the download -- confirm.
df = yf.download('^GSPC', start=TRAIN_START_DATE, end=PREDICT_END_DATE).reset_index()

def str_to_datetime(s):
    """Parse a 'YYYY-MM-DD' string into a ``datetime.datetime`` at midnight.

    Only the first three dash-separated fields are read; each one is
    converted with ``int``.
    """
    fields = s.split('-')
    return datetime.datetime(int(fields[0]), int(fields[1]), int(fields[2]))

# Round-trip the dates through 'YYYY-MM-DD' text so that each value is rebuilt
# by str_to_datetime from its year, month and day alone.
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d').apply(str_to_datetime)

# Promote 'Date' from a column to the index
df = df.set_index('Date')

# Remove the column this notebook does not need
df = df.drop('Adj Close', axis=1)

In [None]:
# Split the frame chronologically into a training part and a prediction (test) part.
# Label-based slicing on the date index includes both endpoints, so the training
# part stops at TRAIN_END_DATE rather than at PREDICT_START_DATE; otherwise a row
# dated PREDICT_START_DATE would end up in both parts.
# .copy() makes each part an independent frame before 'Close' is popped from it.
X_train = df.loc[:TRAIN_END_DATE].copy()
X_test = df.loc[PREDICT_START_DATE:PREDICT_END_DATE].copy()

# Separate the target column ('Close') from the remaining feature columns
y_train = X_train.pop('Close')
y_test = X_test.pop('Close')

def df_to_windowed_df(feature, df, window_size, df_slice=slice(None, None)):
    """Build sliding windows over one column of an indexed dataframe.

    Parameters
    ----------
    feature : label of the column to window.
    df : source dataframe.
    window_size : number of consecutive rows in each window.
    df_slice : row selection applied before windowing. A ``slice`` is applied
        with ``df.loc`` (the default keeps every row); anything else is treated
        as a collection of index labels and applied with ``df.reindex``, so
        labels missing from ``df`` become all-NaN rows.

    Rows containing any NaN (in any column) are dropped before windowing.

    Returns
    -------
    (windows, dates)
        windows : array holding one window per row, i.e.
            ``len(rows) - window_size + 1`` windows of ``window_size`` values.
        dates : list of the index labels from position ``window_size`` onward,
            i.e. the label of the row that follows each window. It therefore
            has one entry fewer than ``windows`` (the last window has no
            following row).
    """
    if isinstance(df_slice, slice):
        # reindex() cannot take a slice object, which made the default
        # argument unusable; select the rows by label slice instead.
        df = df.loc[df_slice]
    else:
        df = df.reindex(df_slice)  # introduces NaN rows for labels not in df
    df = df.dropna()

    date_list = df.index.to_list()
    feature_values = df[feature].to_numpy()

    window_count = len(feature_values) - window_size + 1
    windowed_df = [feature_values[start:start + window_size] for start in range(window_count)]

    return np.array(windowed_df), date_list[window_size:]


# Every calendar day of the prediction period; df_to_windowed_df reindexes onto
# these labels and then drops the days that end up with NaN values.
predict_days = pd.date_range(PREDICT_START_DATE, PREDICT_END_DATE)

# WINDOW_SIZE-long windows for each feature (date_list is overwritten by each call)
open_windowed_df, date_list = df_to_windowed_df('Open', df, df_slice=predict_days, window_size=WINDOW_SIZE)
high_windowed_df, date_list = df_to_windowed_df('High', df, df_slice=predict_days, window_size=WINDOW_SIZE)
low_windowed_df, date_list = df_to_windowed_df('Low', df, df_slice=predict_days, window_size=WINDOW_SIZE)
volume_windowed_df, date_list = df_to_windowed_df('Volume', df, df_slice=predict_days, window_size=WINDOW_SIZE)

# Length-1 windows for each feature (the accompanying date lists are discarded)
to_combine_open_windowed_df, _ = df_to_windowed_df('Open', df, df_slice=predict_days, window_size=1)
to_combine_high_windowed_df, _ = df_to_windowed_df('High', df, df_slice=predict_days, window_size=1)
to_combine_low_windowed_df, _ = df_to_windowed_df('Low', df, df_slice=predict_days, window_size=1)
to_combine_volume_windowed_df, _ = df_to_windowed_df('Volume', df, df_slice=predict_days, window_size=1)

# Join the four per-feature window arrays on a new trailing axis
stacked_windowed_df = np.stack(
    [open_windowed_df, high_windowed_df, low_windowed_df, volume_windowed_df],
    axis=-1,
)

# Express the result explicitly as (n_windows, WINDOW_SIZE, 4)
combined_windowed_df = stacked_windowed_df.reshape((-1, WINDOW_SIZE, 4))

def windowed_df_to_date_X_y(windowed_df, date_list):
    """Pair every window with a date and with the last value of the next window.

    For sample ``k`` the outputs hold:
      * date -> ``date_list[k + 1]``
      * X    -> ``windowed_df[k]``
      * y    -> the last step of ``windowed_df[k + 1]`` (its first channel when
                ``windowed_df`` is 3-D)

    A sample is produced only while both ``k + 1 < len(windowed_df)`` and
    ``k + 1 < len(date_list)`` hold.

    Returns the tuple ``(dates, X, y)``, each converted to a NumPy array.
    """
    sample_count = min(len(windowed_df), len(date_list)) - 1

    def next_target(k):
        # Last time step of the following window; channel 0 for 3-D input
        following = windowed_df[k + 1]
        return following[-1, 0] if windowed_df.ndim == 3 else following[-1]

    dates = [date_list[k + 1] for k in range(sample_count)]
    inputs = [windowed_df[k] for k in range(sample_count)]
    targets = [next_target(k) for k in range(sample_count)]
    return np.array(dates), np.array(inputs), np.array(targets)

# Turn each per-feature window array into (dates, X, y); the dates are kept
# from the 'Open' call only.
date_df, X_open, y_open = windowed_df_to_date_X_y(open_windowed_df, date_list)
X_high, y_high = windowed_df_to_date_X_y(high_windowed_df, date_list)[1:]
X_low, y_low = windowed_df_to_date_X_y(low_windowed_df, date_list)[1:]
X_volume, y_volume = windowed_df_to_date_X_y(volume_windowed_df, date_list)[1:]

# From the combined 4-feature windows only the inputs are used
X_train_combined = windowed_df_to_date_X_y(combined_windowed_df, date_list)[1]

# Take as many trailing y_train values as there are combined samples
n_combined = len(X_train_combined)
y_train_combined = y_train[len(y_train) - n_combined:].to_numpy()

In [None]:
# Build the model inputs and targets over the whole dataframe.
#   X[k] = the 4 feature columns of rows k .. k + WINDOW_SIZE - 1
#   y[k] = the 'Close' value of row k + WINDOW_SIZE (the row right after the window)
# The columns are pulled out as arrays once, instead of slicing the dataframe
# and materialising a row on every iteration.
feature_matrix = df[['Open', 'High', 'Low', 'Volume']].values
close_prices = df['Close'].values

# The last WINDOW_SIZE rows cannot start a window because no target row follows
n_samples = len(df) - WINDOW_SIZE

X = np.array([feature_matrix[start:start + WINDOW_SIZE] for start in range(n_samples)])
y = np.array([close_prices[start + WINDOW_SIZE] for start in range(n_samples)])

In [None]:
# For each boundary date take the last index label on or before it
train_start_index = df.index.asof(pd.to_datetime(TRAIN_START_DATE))
train_end_index = df.index.asof(pd.to_datetime(TRAIN_END_DATE))
predict_start_index = df.index.asof(pd.to_datetime(PREDICT_START_DATE))

# asof() gives a missing value when the date precedes the first row;
# fall back to the first row in that case
train_start_index = df.index[0] if pd.isna(train_start_index) else train_start_index

# Convert the labels into integer row positions
train_start_index = df.index.get_loc(train_start_index)
train_end_index = df.index.get_loc(train_end_index)
predict_start_index = df.index.get_loc(predict_start_index)

# y[k] is the Close of row k + WINDOW_SIZE, so subtracting WINDOW_SIZE turns a
# target-row position into the position of the window that predicts it
train_start_index = max(0, train_start_index - WINDOW_SIZE)
train_end_index -= WINDOW_SIZE
predict_start_index -= WINDOW_SIZE

# The test slice needs exactly one sample per entry of date_df, because later
# cells plot y_test and the predictions against date_df. This was a hard-coded
# 997, which only fits one specific download; derive it from the data instead.
predict_end_index = predict_start_index + len(date_df)

# Slice the X and y arrays using the window positions
X_train = X[train_start_index:train_end_index]
y_train = y[train_start_index:train_end_index]
X_test = X[predict_start_index:predict_end_index]
y_test = y[predict_start_index:predict_end_index]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the last 20% of the training windows for validation.
# shuffle=False keeps the split chronological. Consecutive windows share
# WINDOW_SIZE - 1 rows, so a shuffled split would place near-duplicates of the
# training samples (and samples from later dates) into the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False
)

# Per-feature statistics, computed over samples and time steps of the training
# part only, so that validation/test data do not influence the scaling
mean = X_train.mean(axis=(0, 1))
std = X_train.std(axis=(0, 1))

# Standardise the inputs with the training statistics
X_train = (X_train - mean) / std
X_val = (X_val - mean) / std

In [None]:
# Define a function that creates the model with desired parameters
def create_model(lstm_units=50, activation='relu', optimizer='adam'):
    """Build and compile a single-layer LSTM regressor.

    The network takes inputs of shape (WINDOW_SIZE, 4), runs them through one
    LSTM layer with ``lstm_units`` units and the given ``activation``, and
    emits one value through a Dense layer. It is compiled with the given
    ``optimizer`` and mean-squared-error loss, and returned unfitted.
    """
    recurrent = LSTM(lstm_units, activation=activation, input_shape=(WINDOW_SIZE, 4))
    head = Dense(1)
    model = Sequential([recurrent, head])
    model.compile(loss='mse', optimizer=optimizer)
    return model

# Wrap the Keras builder so that scikit-learn can drive it
estimator = KerasRegressor(build_fn=create_model, verbose=1, batch_size=32, epochs=10)

# Candidate hyper-parameters: 3 x 2 x 2 = 12 combinations
param_grid = dict(
    lstm_units=[50, 100, 150],
    activation=['relu', 'tanh'],
    optimizer=['adam', 'rmsprop'],
)

# Exhaustive search over the grid with 3-fold cross-validation
grid_search = GridSearchCV(estimator, param_grid, cv=3)
grid_result = grid_search.fit(X_train, y_train)

# Report the winning combination
best_params = grid_result.best_params_
print("Best parameters found: ", best_params)

# Rebuild a fresh model with the winning combination (its keys are exactly
# create_model's parameters) and train it while monitoring the validation set
best_model = create_model(**best_params)

best_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    shuffle=False,
    verbose=1,
)

In [None]:
# Bare expression: the notebook displays the repr of the trained model object
best_model

In [None]:
# Scale the test windows with the statistics computed on the training data
X_test_normalized = (X_test - mean) / std

# Close-price predictions for the test windows
y_pred = best_model.predict(X_test_normalized)

# Actual vs. predicted close price over the prediction dates
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(date_df, y_test, label='Actual')
ax.plot(date_df, y_pred, label='Predicted')
ax.set(xlabel='Date', ylabel='Close Price', title='S&P 500 Close Price Prediction')
ax.legend()
plt.show()

In [None]:
# Load the four saved per-feature models from the working directory
model2_open, model2_high, model2_low, model2_volume = [
    tf.keras.models.load_model(path)
    for path in ('open_model.h5', 'high_model.h5', 'low_model.h5', 'volume_model.h5')
]

# Run every feature model on the windows of its own feature
y_pred_open, y_pred_high, y_pred_low, y_pred_volume = [
    feature_model.predict(feature_windows)
    for feature_model, feature_windows in (
        (model2_open, X_open),
        (model2_high, X_high),
        (model2_low, X_low),
        (model2_volume, X_volume),
    )
]

In [None]:
def create_windows(data, window_size):
    """Collect runs of ``window_size`` consecutive items of ``data`` into an array.

    Windows start at positions ``0 .. len(data) - window_size - 1``, so the
    final window that would end on the last item of ``data`` is not included.
    """
    window_count = len(data) - window_size
    return np.array([data[start:start + window_size] for start in range(window_count)])

# Window each per-feature prediction series produced by the loaded models
# (y_pred_open, y_pred_high, y_pred_low and y_pred_volume must already exist)
open_windows, high_windows, low_windows, volume_windows = [
    create_windows(predicted_series, WINDOW_SIZE)
    for predicted_series in (y_pred_open, y_pred_high, y_pred_low, y_pred_volume)
]

# Place the four features side by side on a new axis 2, then remove axis 3.
# NOTE(review): the squeeze only works if axis 3 has length 1, i.e. presumably
# each prediction array is (n, 1) -- confirm against the loaded models.
X_test2 = np.squeeze(
    np.stack([open_windows, high_windows, low_windows, volume_windows], axis=2),
    axis=3,
)

In [None]:
# Scale the windows built from the per-feature predictions with the training statistics
X_test2_normalized = (X_test2 - mean) / std

# Close-price predictions driven by the predicted (not the observed) features
model2_pred = best_model.predict(X_test2_normalized)

# Plot the predicted values against the actual values from y_test.
# X_test2 was windowed with WINDOW_SIZE, so model2_pred is WINDOW_SIZE entries
# shorter than date_df; skip that many dates (previously a hard-coded 7).
plt.figure(figsize=(14, 6))
plt.plot(date_df, y_test, label='Actual')
plt.plot(date_df[WINDOW_SIZE:], model2_pred, label='Predicted')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.title('S&P 500 Close Price Prediction')
plt.legend()
plt.show()

In [None]:
# Same comparison as the previous plot, with different legend labels.
# model2_pred is WINDOW_SIZE entries shorter than date_df (its inputs were
# windowed with WINDOW_SIZE), so skip that many dates instead of a hard-coded 7.
plt.figure(figsize=(14, 6))
plt.plot(date_df, y_test, label='Close')
plt.plot(date_df[WINDOW_SIZE:], model2_pred, label='Close`')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.title('S&P 500 Close Price Prediction')
plt.legend()
plt.show()

In [None]:
import sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Evaluate the close-price predictions (y_pred) against the actual values (y_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: ", mse)

# The MAE was previously computed with mean_squared_error, so the value printed
# as "Mean absolute error" was just the MSE again.
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error: ", mae)