NASDAQ STOCK PREDICTION

In [None]:
!pip install tensorflow
!pip install yfinance
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Yahoo Finance symbol whose history is analysed in this notebook.
stock = "NQ=F"
# Request the full available history (period="max") in a single chained call.
df = yf.Ticker(stock).history(period="max")
# df.to_csv(f"csv files/{stock}.csv")

In [None]:
# Display the downloaded price history as this cell's output.
df

In [None]:
# Univariate forecasting: keep only the closing price for now.
# The column is selected by name rather than by position so that this cell
# can be re-run safely (a second run of `iloc[:, [3]]` would fail on the
# already-reduced frame) and does not depend on the column order.
df = df[["Close"]]
df

In [None]:
# Closing price across the whole downloaded history.
close_prices = df["Close"]
plt.plot(close_prices.index, close_prices)

In [None]:
import numpy as np
import pandas as pd

def df_to_windowed_df(dataframe, first_date_str, last_date_str, n=3, step_days=7):
    """Turn a price history into a table of sliding windows over ``Close``.

    Starting at ``first_date_str`` and advancing ``step_days`` calendar days
    at a time, one sample is emitted for every target date that is not later
    than ``last_date_str``. For each target date the last ``n + 1`` rows of
    ``dataframe`` on or before that date are taken: the first ``n`` closing
    prices become the inputs and the final one becomes the target.

    Note that ``Target Date`` holds the requested calendar date; when that
    date has no row of its own, the values come from the most recent rows
    before it.

    Args:
        dataframe: DataFrame with a sorted DatetimeIndex and a ``Close`` column.
        first_date_str: First target date (anything ``pd.to_datetime`` accepts).
        last_date_str: Last admissible target date (inclusive).
        n: Number of past closing prices used as inputs per sample.
        step_days: Calendar days between consecutive target dates.

    Returns:
        DataFrame with columns ``Target Date``, ``Target-n`` ... ``Target-1``
        and ``Target`` (empty when ``first_date_str`` is after
        ``last_date_str``), or ``None`` after printing an error when fewer
        than ``n + 1`` rows exist on or before a target date.

    Raises:
        ValueError: If ``step_days`` is not positive.
    """
    if step_days <= 0:
        raise ValueError(f'step_days must be positive, got {step_days}')

    first_date = pd.to_datetime(first_date_str)
    last_date = pd.to_datetime(last_date_str)

    # A timezone-aware index cannot be sliced with a timezone-naive
    # timestamp in recent pandas versions, so the slice bound is localized
    # to the index's timezone when necessary.
    index_tz = getattr(dataframe.index, 'tz', None)

    dates = []
    X, Y = [], []

    target_date = first_date
    # Stop as soon as the next target would lie beyond last_date so that no
    # sample overshoots the requested range.
    while target_date <= last_date:
        slice_end = target_date
        if index_tz is not None and slice_end.tzinfo is None:
            slice_end = slice_end.tz_localize(index_tz)

        df_subset = dataframe.loc[:slice_end].tail(n+1)

        if len(df_subset) != n+1:
            print(f'Error: Window of size {n} is too large for date {target_date}')
            return

        values = df_subset['Close'].to_numpy()

        dates.append(target_date)
        X.append(values[:-1])
        Y.append(values[-1])

        target_date = target_date + pd.DateOffset(days=step_days)

    # Explicit reshape keeps the column slicing below valid for zero samples.
    X = np.array(X).reshape(len(dates), n)

    columns = {'Target Date': dates}
    for i in range(0, n):
        columns[f'Target-{n-i}'] = X[:, i]
    columns['Target'] = Y

    return pd.DataFrame(columns)

# Example usage: one sample per step between the two dates, 3 past closes each
windowed_df = df_to_windowed_df(
    df,
    first_date_str='2021-02-24',
    last_date_str='2023-12-15',
    n=3,
)
windowed_df




```
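# Result interpretation:
Target Date = x (the date being predicted)
Target = closing price at x
Target-1 = closing price one row (trading day) before x; Target-2 and Target-3 go two and three rows back

Why do we need a history of H-3? So that the model can learn from the closing prices of the previous 3 trading days.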

*Below*
dates = the Target Date column
X = 3D array built from Target-3, Target-2, Target-1 (samples x 3 time steps x 1 feature)
y = Target -> the model output
```



In [None]:
# Convert df into numpy array -> H-3 as input (X - 3d matrix), H as output (y) for supervised learning

def windowed_df_to_date_X_y(windowed_dataframe):
  """Split a windowed table into dates, LSTM inputs and targets.

  The first column is returned unchanged as the dates, the last column
  becomes the float32 target vector, and every column in between is
  reshaped to ``(rows, columns, 1)`` float32 so that each lagged value is
  one time step with a single feature.
  """
  table = windowed_dataframe.to_numpy()
  n_rows = table.shape[0]

  # First column: the target dates.
  date_column = table[:, 0]

  # Middle columns: the lagged values, given a trailing feature axis of
  # size 1 (univariate forecasting) as expected by the LSTM.
  lag_values = table[:, 1:-1]
  n_lags = lag_values.shape[1]
  features = lag_values.reshape((n_rows, n_lags, 1)).astype(np.float32)

  # Last column: the value to predict.
  targets = table[:, -1].astype(np.float32)

  return date_column, features, targets

# Convert the windowed table built earlier into dates, inputs and targets.
dates, X, y = windowed_df_to_date_X_y(windowed_df)

# Show the shape of each of the three arrays.
tuple(array.shape for array in (dates, X, y))

In [None]:
# Chronological split: first 80% train, next 10% validation, last 10% test.
n_samples = len(dates)

# Row positions of the 80% and 90% marks.
q_80 = int(n_samples * .80)
q_90 = int(n_samples * .9)

train_part = slice(None, q_80)   # 0% - 80%
val_part = slice(q_80, q_90)     # 80% - 90%
test_part = slice(q_90, None)    # 90% - 100%

dates_train, X_train, y_train = dates[train_part], X[train_part], y[train_part]
dates_val, X_val, y_val = dates[val_part], X[val_part], y[val_part]
dates_test, X_test, y_test = dates[test_part], X[test_part], y[test_part]

# One line per split, drawn in train / validation / test order.
for split_dates, split_targets in (
    (dates_train, y_train),
    (dates_val, y_val),
    (dates_test, y_test),
):
    plt.plot(split_dates, split_targets)

plt.legend(['Train', 'Validation', 'Test'])

In [None]:
# THIS IS WHERE YOU SHOULD PLAY AROUND WITH THE NUMBERS
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling give the same result on every "Restart & Run All".
set_random_seed(42)

model = Sequential([
    # Input shape is (time steps, features). It is read from the training
    # data so the model keeps working when the window size is changed.
    layers.Input(X_train.shape[1:]),
    # Number of LSTM units: bigger = more capacity = more prone to overfitting.
    layers.LSTM(64),
    # Two fully connected layers with rectified linear unit activation.
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    # Single output with the default linear activation: one value is forecast.
    layers.Dense(1),
])

model.compile(
    loss='mse',                           # minimise the mean squared error
    optimizer=Adam(learning_rate=0.001),  # Adam optimizer. play around with learning rate
    metrics=['mean_absolute_error'],      # extra metric reported next to the loss
)

# epochs -> 100 passes through the training data; the validation set is
# evaluated after every epoch.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100)


In [None]:
# Validation metrics recorded for the final training epoch, rounded to 2 decimals.
final_epoch = {name: values[-1] for name, values in history.history.items()}
last_val_loss = round(final_epoch['val_loss'], 2)
last_val_mae = round(final_epoch['val_mean_absolute_error'], 2)

# Loss and MAE on the held-out test split, rounded the same way.
test_loss, test_mae = model.evaluate(X_test, y_test)
test_loss, test_mae = round(test_loss, 2), round(test_mae, 2)

print(f"Validation Loss: {last_val_loss}")
print(f"Validation MAE: {last_val_mae}")
print(f"Test Loss: {test_loss}")
print(f"Test MAE: {test_mae}")



In [None]:
# Predict on the training inputs and flatten the result to a 1D array.
train_predictions = model.predict(X_train).flatten()

# Predictions first, observations second, to match the legend order.
for series in (train_predictions, y_train):
    plt.plot(dates_train, series)
plt.legend(['Training Predictions', 'Training Observations'])

In [None]:
# Predict on the validation inputs and flatten the result to a 1D array.
val_predictions = model.predict(X_val).flatten()

# Predictions first, observations second, to match the legend order.
for series in (val_predictions, y_val):
    plt.plot(dates_val, series)
plt.legend(['Validation Predictions', 'Validation Observations'])

In [None]:
# Predict on the test inputs and flatten the result to a 1D array.
test_predictions = model.predict(X_test).flatten()

# Predictions first, observations second, to match the legend order.
for series in (test_predictions, y_test):
    plt.plot(dates_test, series)
plt.legend(['Testing Predictions', 'Testing Observations'])

In [None]:
# Every split on one figure; curves are drawn in the same order as the legend.
curves = (
    (dates_train, train_predictions),
    (dates_train, y_train),
    (dates_val, val_predictions),
    (dates_val, y_val),
    (dates_test, test_predictions),
    (dates_test, y_test),
)
for curve_dates, curve_values in curves:
    plt.plot(curve_dates, curve_values)

plt.legend([
    'Training Predictions',
    'Training Observations',
    'Validation Predictions',
    'Validation Observations',
    'Testing Predictions',
    'Testing Observations',
])