# Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import math
from sklearn.model_selection import TimeSeriesSplit

2024-06-11 08:29:26.001178: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 08:29:26.001313: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 08:29:26.138268: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Read Data

In [2]:
def _read_indexed_csv(path):
    """Read a CSV file, using its first column as the DataFrame index."""
    return pd.read_csv(path, index_col=[0])


# Features and targets for the train and test periods, all read from the same input directory
X_train = _read_indexed_csv("/kaggle/input/ibm-classification-feature-selection/X_train_normalized.csv")
X_test = _read_indexed_csv("/kaggle/input/ibm-classification-feature-selection/X_test_normalized.csv")
y_train = _read_indexed_csv("/kaggle/input/ibm-classification-feature-selection/y_train.csv")

# Only the test targets have their index replaced by a fresh 0..n-1 range
y_test = _read_indexed_csv("/kaggle/input/ibm-classification-feature-selection/y_test.csv").reset_index(drop=True)

In [3]:
# Preview the first five rows of the training features (5 is the default for head)
X_train.head()

Unnamed: 0,Volume,Sector_Basic Materials,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,...,Open 60-Day Shifted Differenced,Adj Close 7-Day Upper Bollinger Band Differenced,True Range Differenced,True Range 26-Day SMA Differenced,Adj Close Differenced,High Differenced,Low Differenced,Open Differenced,Date,Symbol
0,-0.082146,-0.211123,-0.211123,-0.355936,-0.279668,4.506073,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.099365,-0.065462,0.021662,0.207475,-0.143825,0.001395,-0.21695,-0.124697,2014-09-04,14
1,-0.199298,-0.211123,-0.211123,-0.355936,-0.279668,4.506073,-0.394063,-0.386806,-0.411949,-0.256814,...,0.128093,-0.026616,-0.135444,0.292787,-0.199996,-0.087753,-0.355281,-0.261008,2014-09-04,262
2,0.256066,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.127797,-0.051266,-0.014738,0.014914,0.041423,-0.083876,-0.089539,-0.100843,2014-09-04,105
3,0.809517,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,2.537663,-0.386806,-0.411949,-0.256814,...,0.163632,0.078857,0.052865,0.340484,0.049223,0.117676,0.048791,-0.012241,2014-09-04,395
4,-0.205682,-0.211123,-0.211123,2.809494,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,0.106768,-0.007671,-0.240677,-0.105232,-0.049846,-0.320311,-0.064057,-0.346203,2014-09-04,202


In [4]:
# Preview the first five rows of the test features (5 is the default for head)
X_test.head()

Unnamed: 0,Volume,Sector_Basic Materials,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,...,Open 60-Day Shifted Differenced,Adj Close 7-Day Upper Bollinger Band Differenced,True Range Differenced,True Range 26-Day SMA Differenced,Adj Close Differenced,High Differenced,Low Differenced,Open Differenced,Date,Symbol
0,-0.263726,-0.211123,4.736577,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.181107,-0.262042,-0.154274,-1.323139,-0.492231,-0.014108,-0.377124,0.161556,2021-01-04,139
1,-0.186121,-0.211123,-0.211123,-0.355936,3.575666,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.216647,-0.008371,-0.027088,-0.381858,-0.122099,0.016899,-0.064058,0.035468,2021-01-04,255
2,-0.309133,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,2.537663,-0.386806,-0.411949,-0.256814,...,-0.778181,-0.13468,-0.002115,-0.513907,-0.962961,0.303723,-0.621021,0.386469,2021-01-04,378
3,-0.225167,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,2.427487,-0.256814,...,0.348439,-0.106183,-0.067764,-0.330715,-0.801156,0.156436,-0.497253,0.076363,2021-01-04,141
4,0.330085,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,3.893864,...,-0.056716,-0.11587,0.046599,-0.063529,-0.173969,0.040155,-0.140504,0.015022,2021-01-04,253


In [5]:
# Preview the first five training targets (5 is the default for head)
y_train.head()

Unnamed: 0,1-week Forward Return Sign
0,0.0
1,0.0
2,1.0
3,0.0
4,1.0


In [6]:
# Preview the first five test targets (5 is the default for head)
y_test.head()

Unnamed: 0,1-week Forward Return Sign
0,1.0
1,0.0
2,1.0
3,1.0
4,1.0


# Data Prep

LSTMs require the data to be structured differently from most ML models. The idea is to look at a 'window' of data (e.g. the last 5 timesteps) and use it to predict the target at the next timestep. We then roll the window forward one timestep at a time so that the whole dataset is covered, while each sample only ever contains a single window.

To do this, we convert our dataframe to a 3D array X and a 1D array y. The array X has shape (n_samples, window_size, num_features) and the array y simply contains the target for each sample. The structure of the array X for a single stock is shown below for a window size of 2 (LHS) and the array y is shown on the RHS.

[Stock1Feature1(0), Stock1Feature2(0), ....., Stock1Featuren(0)], [Stock1Feature1(1), Stock1Feature2(1), ....., Stock1Featuren(1)]                                                                             [Stock1Target(2)]

[Stock1Feature1(1), Stock1Feature2(1), ....., Stock1Featuren(1)], [Stock1Feature1(2), Stock1Feature2(2), ....., Stock1Featuren(2)]                                                                             [Stock1Target(3)]

[Stock1Feature1(2), Stock1Feature2(2), ....., Stock1Featuren(2)], [Stock1Feature1(3), Stock1Feature2(3), ....., Stock1Featuren(3)]                                                                             [Stock1Target(4)]

....





In [7]:
# Window size - the last `window` rows of a stock are used to predict the following row
window = 5

# Gap size for purging, in rows (4000 rows is roughly 10 days)
gap = 4000

# Time series split for cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Append the targets to the features as the last column and remove the 'Date' column.
# 'Symbol' is deliberately kept: prep_data_for_lstm uses it to build windows one stock at a time.
df = pd.concat([X_train, y_train], axis=1).drop(columns=['Date'])

In [8]:
def prep_data_for_lstm(df, window):
    """Convert a dataframe into windowed arrays in the format an LSTM expects.

    Windows are built one stock at a time so that a lookback window never mixes rows from
    different stocks. Rows are consumed in the order they appear in `df`, so each stock's rows
    must already be in chronological order.

    Args:
        df: DataFrame containing a 'Symbol' column identifying the stock, the feature columns,
            and the target as the LAST column.
        window: Number of consecutive rows (timesteps) in each sample. Must be at least 1.

    Returns:
        Tuple `(X, y)` of numpy arrays. `X` has shape (n_samples, window, n_features), where
        sample `i` of a stock holds the features of that stock's rows `i .. i + window - 1`
        (the target column is excluded to avoid look-ahead bias). `y` has shape (n_samples,)
        and holds the target of the row immediately after each window. Stocks with `window`
        rows or fewer contribute no samples; if no stock has enough rows, `X` is returned
        empty with shape (0, window, n_features) so downstream shape lookups still work.

    Raises:
        ValueError: If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be at least 1, got {window}")

    # Feature count = all columns except 'Symbol' and the trailing target column
    n_features = max(df.shape[1] - 2, 0)

    # Per-stock chunks of samples and labels, concatenated once at the end
    X_parts = []
    y_parts = []

    # One groupby pass instead of re-filtering the whole frame for every stock.
    # sort=False keeps the stocks in order of first appearance.
    for _, stock_df in df.groupby('Symbol', sort=False):
        stock_data = stock_df.drop('Symbol', axis=1).to_numpy()

        # A stock needs at least window + 1 rows: `window` rows of features plus one row for the label
        if len(stock_data) <= window:
            continue

        # Every length-`window` run of feature rows; shape (n_rows - window + 1, n_features, window)
        windows = np.lib.stride_tricks.sliding_window_view(stock_data[:, :-1], window, axis=0)

        # The final window has no following row to take a label from, so drop it, then
        # reorder the axes to (sample, timestep, feature)
        X_parts.append(windows[:-1].transpose(0, 2, 1))

        # Label for each window is the target of the first row after it
        y_parts.append(stock_data[window:, -1])

    if not X_parts:
        return np.empty((0, window, n_features)), np.empty((0,))

    return np.concatenate(X_parts), np.concatenate(y_parts)

# Fit LSTM
An explanation of the model's layers can be seen in the comments in the code.

In [9]:
from keras.models import Sequential
from keras.layers import InputLayer, Dense, LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.metrics import AUC
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.utils import class_weight

In [10]:
def build_network(X, window, lstm_units=64, dropout=0.2, recurrent_dropout=0.2,
                  dense_units=8, l2_penalty=0.01, learning_rate=0.01):
    """Build and compile the LSTM binary classifier.

    Args:
        X: 3D array of shape (n_samples, window, n_features). Only its last dimension is read,
            to size the input layer.
        window: Number of timesteps in each sample.
        lstm_units: Number of units in the LSTM layer.
        dropout: Dropout fraction applied to the LSTM layer's inputs.
        recurrent_dropout: Dropout fraction applied to the LSTM layer's recurrent state.
        dense_units: Number of units in the hidden dense layer.
        l2_penalty: L2 regularization factor for the hidden dense layer's kernel.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled keras Sequential model with a single sigmoid output unit, using binary
        cross-entropy loss and reporting AUC under the metric name 'auc'.

    Raises:
        ValueError: If `X` is not 3-dimensional.
    """
    if len(X.shape) != 3:
        raise ValueError(
            f"X must have shape (n_samples, window, n_features), got shape {X.shape}"
        )

    # Initialize sequential model in keras. This is a model that expects a linear stack of layers,
    # meaning that each layer has one input tensor and one output tensor.
    model = Sequential()

    # Add input layer. Each input tensor will be an element of X so the tensor will have shape (window, n_features)
    model.add(InputLayer((window, X.shape[2])))

    # Add LSTM layer. This is a recurrent layer that can learn long-term dependencies in sequences.
    model.add(
        LSTM(
            units=lstm_units,
            dropout=dropout,  # Randomly drop a fraction of the input connections during training to reduce reliance on individual units and prevent overfitting
            recurrent_dropout=recurrent_dropout  # Same idea, applied to the recurrent connections
        )
    )

    # Add hidden layer to take LSTM layer output and transform it to a higher-level feature space.
    model.add(
        Dense(
            units=dense_units,
            activation='relu',  # relu activation function adds some non-linearity without being computationally expensive
            kernel_regularizer=l2(l2_penalty)  # Add l2 regularization on layer to reduce magnitude of weights, reducing overfitting
        )
    )

    # Add output layer with sigmoid activation function to output a value between 0 and 1,
    # interpreted as the probability of the positive class
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learning_rate),
        metrics=[AUC(name='auc')]
    )

    return model

In [11]:
def create_class_weight_dict(y):
    """Create a dictionary of class weights that are inversely proportional to class frequencies.

    Args:
        y: 1D array-like of class labels. Labels are assumed to be integer-valued
            (e.g. 0.0 / 1.0), because each label is cast to `int` to form the dictionary key.

    Returns:
        Dict mapping each class label present in `y` (as an int) to its 'balanced' weight
        (as a float), in the form expected by the `class_weight` argument of `model.fit`.
    """
    # Labels actually present in y, in sorted order
    classes = np.unique(y)

    # Adjust class weights inversely proportional to class frequencies
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=y
    )

    # Key each weight by its class label rather than by its position in `classes`:
    # the two only coincide when the labels happen to be exactly 0, 1, ..., k-1
    # (e.g. a fold containing only class 1 would otherwise put its weight on class 0).
    class_weight_dict = {
        int(label): float(weight) for label, weight in zip(classes, class_weights)
    }

    return class_weight_dict

In [12]:
def create_callbacks(name):
    """Build the list of callbacks to pass to the model's fit method.

    Args:
        name: Base name of the checkpoint file; the model is saved to '<name>.keras'.

    Returns:
        A two-element list: the model checkpoint callback followed by the early stopping callback.
    """
    return [
        # Save the model to disk, keeping only the best version seen so far
        ModelCheckpoint(f'{name}.keras', save_best_only=True),
        # Stop once val_loss has gone 5 epochs without improving, and roll back to the best weights
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ]

In [13]:
# Fit a model for each fold of the time series split.
# The per-epoch metrics of every fold are collected so folds can be compared after the loop
# instead of being read back from the training log.
fold_histories = []

for i, (train_index, val_index) in enumerate(tscv.split(df)):

    # Split data into training fold and validation fold, purging the first `gap` validation rows
    df_train_fold, df_val_fold = df.iloc[train_index], df.iloc[val_index[gap:]]

    # Get data in form ready for LSTM
    X_train_fold_lstm, y_train_fold_lstm = prep_data_for_lstm(
        df=df_train_fold,
        window=window
    )
    X_val_fold_lstm, y_val_fold_lstm = prep_data_for_lstm(
        df=df_val_fold,
        window=window
    )

    # Build network
    model = build_network(
        X=X_train_fold_lstm,
        window=window,
    )

    # Get list of callbacks for model
    callbacks = create_callbacks(name=f"lstm_model_{i}")

    # Get dictionary of class weights to pass to fit function
    class_weight_dict = create_class_weight_dict(y=y_train_fold_lstm)

    # Fit model against validation set
    history = model.fit(
        X_train_fold_lstm,
        y_train_fold_lstm,
        validation_data=(X_val_fold_lstm, y_val_fold_lstm),
        epochs=100, # Upper bound on passes through the data; early stopping usually ends training sooner
        callbacks=callbacks, # Checkpoint the best model after each epoch and stop early when val_loss stalls
        class_weight=class_weight_dict # Handle imbalance of classes
    )

    # history.history maps each metric name (loss, auc, val_loss, val_auc) to its per-epoch values
    fold_histories.append(history.history)

Epoch 1/100
[1m3201/3201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - auc: 0.7224 - loss: 0.6263 - val_auc: 0.4733 - val_loss: 0.7947
Epoch 2/100
[1m3201/3201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - auc: 0.7668 - loss: 0.5865 - val_auc: 0.4719 - val_loss: 0.8682
Epoch 3/100
[1m3201/3201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - auc: 0.7712 - loss: 0.5821 - val_auc: 0.4886 - val_loss: 0.7967
Epoch 4/100
[1m3201/3201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - auc: 0.7758 - loss: 0.5773 - val_auc: 0.4661 - val_loss: 0.8296
Epoch 5/100
[1m3201/3201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - auc: 0.7779 - loss: 0.5752 - val_auc: 0.4796 - val_loss: 0.8376
Epoch 6/100
[1m3201/3201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - auc: 0.7825 - loss: 0.5708 - val_auc: 0.4722 - val_loss: 0.8542
Epoch 1/100
[1m6474/6474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m