In [269]:
# import lib
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, BatchNormalization, Dropout
from keras.regularizers import L1L2
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

sys.path.append("../scripts")
from data_loader import get_stocks, get_etfs, get_technical_indicators

In [271]:
# define global constant
STOCKS = ['AAPL', 'MSFT', 'GOOGL']
HORIZONS = [1, 3, 7, 30, 90] # 1 day, 3 days, 1 week, 1 month, 3 months 

In [273]:
def prepare_data(stock, testing_size, seq_length=720, horizon=1): 
    # load the corresponding stock
    df = get_stocks(stock)
    df = get_technical_indicators(df)
    df = df[len(df)-testing_size:]

    # set up the data 
    df['Target'] = (df['Close'].shift(-horizon) - df['Close']) / df['Close'] # set up the training target 
    X = df.drop(columns=['Close', 'Date', 'Stock', 'Target']) # drop the irrevalent variables for training set
    y = df['Close']

    # print(f"len(X):{len(X)}, len(y):{len(y)}")
    # normalise the vector
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X)
    y = scaler.fit_transform(y.values.reshape(-1, 1))

    return X, y, scaler

In [275]:
# create sequence data for lstm
def get_sequences2(X, y, seq_length):
    X_seq, y_seq = [], []
    for i in range(seq_length, len(X)):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq) 

In [277]:
def get_sequences(X, y, seq_length):
    """Create properly aligned sequences where:
    - Each X sequence contains seq_length past observations
    - Each y sequence contains corresponding targets
    - Both have exactly the same length"""
    X_seq, y_seq = [], []
    for i in range(seq_length, len(X)):
            X_seq.append(X[i-seq_length:i])  # Past seq_length observations
            y_seq.append(y[i-seq_length:i])   # Corresponding targets
    return np.array(X_seq), np.array(y_seq)

In [279]:
# build lstm model
def get_lstm(seq_len, num_param):
    model = Sequential([
        Input(shape=(seq_len, num_param)),
        LSTM(50, return_sequences=True),
        LSTM(50),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

def get_enhanced_lstm(seq_len, num_features):
    model = Sequential([
        Input(shape=(seq_len, num_features)),
        LSTM(128, return_sequences=True, 
             kernel_regularizer=L1L2(l1=1e-5, l2=1e-4),
             recurrent_dropout=0.2),
        BatchNormalization(),
        LSTM(64, return_sequences=True,
             kernel_regularizer=L1L2(l1=1e-5, l2=1e-4)),
        Dropout(0.3),
        LSTM(32),
        BatchNormalization(),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    
    return model

# # Example usage
# model = get_enhanced_lstm(seq_len=60, num_features=5)  # 60 timesteps, 5 features
# model.summary()

In [295]:
# train the model by rolling window and predict the outcome
def rolling_window_train(stock, horizon=1, train_ratio=0.8, window_size=720, epochs=5, batch_size=32, testing_size=1000):
    predictions = []
    actuals = []
    
    # prepare data for the seq_length
    X, y, scaler = prepare_data(stock, seq_length=window_size, testing_size=testing_size, horizon=horizon)
    test_start = int(len(X)*train_ratio)
    
    # get sequence data
    X, y = get_sequences(X, y, window_size)
    # print(f"len(X_seq):{len(X_seq)}, len(y_seq):{len(y_seq)}")
    # print(y_seq[len(y_seq)-1], y[len(y)-1-seq_length:len(y)-1])

    i = test_start
    while (i < len(X)):
        i_seq = i-window_size
        # reset the model
        model = get_enhanced_lstm(window_size, X.shape[2])

        X_window, y_window = X[i_seq-window_size:i_seq], y[i_seq-window_size:i_seq]
        steps = window_size // batch_size // epochs
        if (i == test_start):
            print(len(X_window), window_size, steps*epochs*batch_size)

        # fit the model with rolling window
        model.fit(X_window, y_window, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps, verbose=0)

        # Predict next value
        pred = model.predict(X[i_seq:i_seq+1].reshape(1, window_size, X.shape[2]))
        pred = scaler.inverse_transform(pred[0,0].reshape(-1,1))
        predictions.append(pred)
        y_test_original = scaler.inverse_transform(y[i_seq][0].reshape(-1,1))
        actuals.append(y_test_original)

        i += 1

    return np.array(predictions).flatten(), np.array(actuals).flatten()

In [311]:
def expanding_window_train(stock, horizon, train_ratio=0.8, window_size=720, epochs=5, batch_size=32, testing_size=1000):
    predictions = []
    actuals = []
    X, y, scaler = prepare_data(stock, seq_length=window_size, testing_size=testing_size, horizon=horizon)
    test_start = int(len(X)*train_ratio)

    i = test_start
    while (i < len(X)):
        i_seq = i-window_size
        X_seq, y_seq = get_sequences(X, y, window_size)
        # reset the model
        model = get_enhanced_lstm(window_size, X_seq.shape[2])

        X_window, y_window = X_seq[i_seq-window_size:i_seq], y_seq[i_seq-window_size:i_seq]
        steps = window_size // batch_size // epochs
        if (i == test_start):
            print(len(X_window), window_size, steps*epochs*batch_size)

        # fit the model with rolling window
        model.fit(X_window, y_window, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps, verbose=0)

        # Predict next value
        print(X_seq[i_seq:i_seq+1].shape)
        pred = model.predict(X_seq[i_seq:i_seq+1].reshape(1, window_size, X_seq.shape[2]))
        pred = scaler.inverse_transform(pred[0,0].reshape(-1,1))
        predictions.append(pred)
        y_test_original = scaler.inverse_transform(y_seq[i_seq][0].reshape(-1,1))
        actuals.append(y_test_original)

        i += 1
        window_size += 1

    return np.array(predictions).flatten(), np.array(actuals).flatten()

In [299]:
def main():
    # predictions, actuals = rolling_window_train(STOCKS[0], HORIZONS[0], window_size=10, epochs=2, batch_size=5, testing_size=100)
    predictions, actuals = expanding_window_train(STOCKS[0], HORIZONS[0], window_size=10, epochs=2, batch_size=5, testing_size=100)
    # evaulate the model performance
    print(predictions - actuals)
    rmse = np.sqrt(mean_squared_error(actuals.reshape(-1, 1), predictions.reshape(-1, 1)))
    print(f'RMSE: {rmse}')

In [287]:
def test():
    X, y, scaler = prepare_data(STOCKS[0], horizon=HORIZONS[0], seq_length=730, train_ratio=0.8)
    print(X_train.shape)

In [None]:
if __name__ == '__main__':
    main()
    # test()

✅ Processed data already exists. Skipping processing.
✅ Technical indicators added successfully.
10 10 10
(1, 10, 22)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 695ms/step
