In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
class StockDataset(Dataset):
    def __init__(self, df, seq_length=60):
        self.data = []
        self.labels = []
        self.seq_length = seq_length

        tickers = df['Ticker'].unique()

        for ticker in tickers:
            stock_data = df[df['Ticker'] == ticker].reset_index(drop=True)

            # Ensure no missing values
            stock_data.fillna(0, inplace=True)

            # Select features: Add recommended features
            features = stock_data[
                [
                    'Open', 'High', 'Low', 'Close', 'Volume',  # Basic features
                    'RSI', 'MACD', 'Signal_Line', 'Bollinger_Width',  # Technical indicators
                    'Day_of_Week', 'Month',  # Time-based features
                    'Lag1_Open', 'Lag2_Open', 'Lag1_Volume',
                    'Average_sentiment', 'news_day',  # Lagged features
                    'I-A', 'I-CT', 'I-RD', 'I-DC', 'I-DI', 'I-GI', 'I-NC',
                    'I-RSS', 'I-SD', 'I-SR', 'I-SS', 'O', 'I-GC' # Business Event Indicator variables
                ]
            ].values

            # Extract the target: Movement (binary)
            movements = stock_data['Movement'].values

            # Create sliding windows for sequences
            for i in range(len(features) - seq_length):
                self.data.append(features[i:i + seq_length])  # Sequence of features
                self.labels.append(movements[i + seq_length])  # Target for next day

        # Convert to numpy arrays for better handling
        self.data = np.array(self.data)
        self.labels = np.array(self.labels)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Return sequence data and label as tensors
        return (
            torch.tensor(self.data[idx], dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

In [3]:
# LSTM model for binary classification
class StockLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(StockLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, 1)  # Single output for binary classification
    
    def forward(self, x):
        h_0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        c_0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        out, _ = self.lstm(x, (h_0, c_0))  # LSTM output
        hidden_state = out[:, -1, :]  # Use the last hidden state
        return self.fc(hidden_state)  # Return raw logits


In [4]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device, patience=5):
    best_val_loss = float('inf')
    no_improve_epochs = 0  # Tracks epochs without improvement

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device).unsqueeze(1).float()
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            train_loss += loss.item()
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0
        correct_predictions = 0
        total_predictions = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device).unsqueeze(1).float()
                outputs = model(inputs)
                val_loss += criterion(outputs, targets).item()
                
                # Compute accuracy
                predictions = (torch.sigmoid(outputs) > 0.5).float()
                correct_predictions += (predictions == targets).sum().item()
                total_predictions += targets.size(0)

        val_loss /= len(val_loader)
        accuracy = correct_predictions / total_predictions

        # Print progress
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {accuracy:.4f}")

        # Scheduler step
        scheduler.step(val_loss)

        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve_epochs = 0
            torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
        else:
            no_improve_epochs += 1

        # Early stopping
        if no_improve_epochs >= patience:
            print("Early stopping triggered. Stopping training.")
            break

In [5]:
def init_weights(m):
    if isinstance(m, nn.Linear) or isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'bias' in name:
                nn.init.zeros_(param.data)

In [6]:
def evaluate_model(model, data_loader, device):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            predictions = (torch.sigmoid(outputs) > 0.5).float()
            
            y_true.extend(targets.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    
    # Print classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    
    # Print confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))


In [7]:
# Hyperparameters
input_size = 29  # Number of input features (OHLCV, sentiment, news_day, event indicators)
hidden_size = 96
num_layers = 2
seq_length = 60
batch_size = 64
num_epochs = 20
learning_rate = 0.0005

In [8]:
# Data preparation
df = pd.read_csv('sp100_ohlcv_news_sentiment_events_lora.csv')  # Load your dataframe

# Calculate Movement: 1 if next day's Open is greater, 0 otherwise
df['Movement'] = (df['Open'].shift(-1) > df['Open']).astype(int)
df = df.iloc[:-1]  # Drop the last row

In [9]:
# Function to calculate 5-day EMA
def calculate_ema(series, span=5):
    return series.ewm(span=span, adjust=False).mean()

In [10]:
# Apply 5-day EMA to Average_sentiment
df['Sentiment_EMA'] = calculate_ema(df['Average_sentiment'])

# Fill missing Average_sentiment values (where news_day == 0) with the EMA
df.loc[df['news_day'] == 0, 'Average_sentiment'] = df.loc[df['news_day'] == 0, 'Sentiment_EMA']

# Drop the temporary Sentiment_EMA column
df.drop(columns=['Sentiment_EMA'], inplace=True)

In [11]:
df[(df['news_day'] == 0) & df['Average_sentiment'] != 0]

Unnamed: 0.1,Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Average_sentiment,news_day,...,I-DI,I-GI,I-NC,I-RSS,I-SD,I-SR,I-SS,O,I-GC,Movement
396,396,AAPL,2022-06-02,147.83,151.27,146.86,151.21,72348055,-3.328278e-01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
397,397,AAPL,2022-06-01,149.90,151.74,147.68,148.71,74286635,-3.328278e-01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
398,398,AAPL,2022-05-31,149.07,150.66,146.84,148.84,103718416,-3.328278e-01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
399,399,AAPL,2022-05-27,145.39,149.68,145.26,149.64,90978503,-3.328278e-01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
400,400,AAPL,2022-05-26,137.39,144.34,137.14,143.78,90601548,-3.328278e-01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151911,151911,XOM,2018-01-09,86.94,87.15,86.65,86.77,7870756,-2.898843e-40,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
151912,151912,XOM,2018-01-08,86.70,87.15,86.60,87.14,10549116,-2.898843e-40,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
151913,151913,XOM,2018-01-05,86.75,86.88,85.71,86.75,11003133,-2.898843e-40,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
151914,151914,XOM,2018-01-04,86.79,87.22,86.43,86.82,10840055,-2.898843e-40,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [12]:
def calculate_rsi(data, window=14):
    delta = data['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

In [13]:
df['RSI'] = calculate_rsi(df)
df['RSI'].fillna(50, inplace=True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['RSI'].fillna(50, inplace=True)


In [14]:
df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()
df['MACD'] = df['EMA_12'] - df['EMA_26']
df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()
df.drop(columns=['EMA_12', 'EMA_26'], inplace=True)

In [15]:
def calculate_bollinger_bands(data, window=20, k=2):
    ma = data['Close'].rolling(window=window).mean()
    std = data['Close'].rolling(window=window).std()
    data['Upper_Band'] = ma + k * std
    data['Lower_Band'] = ma - k * std
    data['Bollinger_Width'] = data['Upper_Band'] - data['Lower_Band']

In [16]:
calculate_bollinger_bands(df)
df.drop(columns=['Upper_Band', 'Lower_Band'], inplace=True)  # Keep only Bollinger_Width

In [17]:
# Temporal features
df['Day_of_Week'] = pd.to_datetime(df['Date']).dt.dayofweek
df['Month'] = pd.to_datetime(df['Date']).dt.month

In [18]:
# Lagged features
df['Lag1_Open'] = df['Open'].shift(1)
df['Lag2_Open'] = df['Open'].shift(2)
df['Lag1_Volume'] = df['Volume'].shift(1)

In [19]:
print(df[['RSI', 'MACD', 'Lag1_Open', 'Lag2_Open', 'Lag1_Volume']].describe())

                 RSI           MACD      Lag1_Open      Lag2_Open  \
count  151916.000000  151916.000000  151915.000000  151914.000000   
mean       47.737625      -0.004999     218.758783     218.759652   
std        17.400732      19.269516     370.899669     370.900736   
min         0.000000    -308.825915       4.270000       4.270000   
25%        35.130930      -1.749317      64.530000      64.530000   
50%        47.540525      -0.242395     121.540000     121.540000   
75%        59.916194       1.001487     220.770000     220.770000   
max       100.000000     959.923798    3744.000000    3744.000000   

        Lag1_Volume  
count  1.519150e+05  
mean   1.127710e+07  
std    1.833312e+07  
min    9.734500e+04  
25%    2.770560e+06  
50%    5.284210e+06  
75%    1.107402e+07  
max    4.010487e+08  


In [20]:
# Normalize numerical features to improve learning
scaler = MinMaxScaler()
# Select numerical features to normalize (include new features)
numerical_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI', 'MACD', 'Signal_Line', 'Bollinger_Width', 'Lag1_Open', 'Lag2_Open', 'Lag1_Volume']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [21]:
train_df = df[df['Date'] < '2023-01-01']
test_df = df[df['Date'] >= '2023-01-01']

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['Movement']),
    y=train_df['Movement']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

print("Class Weights:", class_weights)

Class Weights: tensor([0.9550, 1.0494], device='cuda:0')


In [24]:
# Split train_df into train and validation sets
train_split = int(len(train_df) * 0.9)
train_data = train_df.iloc[:train_split]
val_data = train_df.iloc[train_split:]

train_dataset = StockDataset(train_data, seq_length)
val_dataset = StockDataset(val_data, seq_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [25]:
# Model, Loss, Optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StockLSTM(input_size, hidden_size, num_layers).to(device)
model.apply(init_weights)
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights[1])
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', patience=3, factor=0.1, verbose=True
)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)



tensor(0.)

In [27]:
# Initial training on 2018-2022 data
print("Training the model on 2018-2022 data...")
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=100,
    device=device
)
torch.save(model.state_dict(), 'trained_model_events_lora.pth')

Training the model on 2018-2022 data...
Epoch [1/100], Train Loss: 0.5827, Val Loss: 0.5853, Val Accuracy: 0.6965
Epoch [2/100], Train Loss: 0.5818, Val Loss: 0.5858, Val Accuracy: 0.6968
Epoch [3/100], Train Loss: 0.5821, Val Loss: 0.5856, Val Accuracy: 0.6973
Epoch [4/100], Train Loss: 0.5813, Val Loss: 0.5849, Val Accuracy: 0.6969
Epoch [5/100], Train Loss: 0.5814, Val Loss: 0.5849, Val Accuracy: 0.6963
Epoch [6/100], Train Loss: 0.5807, Val Loss: 0.5848, Val Accuracy: 0.6982
Epoch [7/100], Train Loss: 0.5813, Val Loss: 0.5847, Val Accuracy: 0.6978
Epoch [8/100], Train Loss: 0.5800, Val Loss: 0.5844, Val Accuracy: 0.6987
Epoch [9/100], Train Loss: 0.5803, Val Loss: 0.5835, Val Accuracy: 0.7023
Epoch [10/100], Train Loss: 0.5795, Val Loss: 0.5840, Val Accuracy: 0.7005
Epoch [11/100], Train Loss: 0.5786, Val Loss: 0.5835, Val Accuracy: 0.6985
Epoch [12/100], Train Loss: 0.5791, Val Loss: 0.5831, Val Accuracy: 0.7009
Epoch [13/100], Train Loss: 0.5780, Val Loss: 0.5828, Val Accuracy: 0

In [28]:
# Evaluate on validation/test set
evaluate_model(model, val_loader, device)

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      6296
           1       0.69      0.67      0.68      5711

    accuracy                           0.70     12007
   macro avg       0.70      0.70      0.70     12007
weighted avg       0.70      0.70      0.70     12007

Confusion Matrix:
[[4584 1712]
 [1860 3851]]


In [None]:
model = StockLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
model.load_state_dict(torch.load('trained_model_events_lora.pth'))
model.to(device)
model.eval()  # Set model to evaluation mode

StockLSTM(
  (lstm): LSTM(29, 96, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=96, out_features=1, bias=True)
)

In [29]:
def rolling_window_predictions_with_finetuning(
    model, test_df, seq_length, device, optimizer, criterion, fine_tune_steps=1
):
    """
    Perform rolling window predictions with dynamic model weight updates.

    Args:
        model (nn.Module): Trained LSTM model.
        test_df (pd.DataFrame): DataFrame containing test data.
        seq_length (int): Length of the input sequence for LSTM.
        device (torch.device): Device (CPU/GPU) for model computation.
        optimizer (torch.optim.Optimizer): Optimizer for fine-tuning the model.
        criterion (torch.nn.Module): Loss function for fine-tuning.
        fine_tune_steps (int): Number of gradient steps per update.

    Returns:
        predictions (list): List of predicted probabilities for each day.
        actuals (list): List of actual Movement values.
    """
    predictions = []
    actuals = []

    model.train()  # Switch to training mode for fine-tuning

    for i in range(len(test_df) - seq_length - 1):
        # Extract the rolling window
        window_data = test_df.iloc[i:i + seq_length]

        # Select features for the rolling window
        features = window_data[
                [
                    'Open', 'High', 'Low', 'Close', 'Volume',  # Basic features
                    'RSI', 'MACD', 'Signal_Line', 'Bollinger_Width',  # Technical indicators
                    'Day_of_Week', 'Month',  # Time-based features
                    'Lag1_Open', 'Lag2_Open', 'Lag1_Volume',
                    'Average_sentiment', 'news_day',  # Lagged features
                    'I-A', 'I-CT', 'I-RD', 'I-DC', 'I-DI', 'I-GI', 'I-NC',
                    'I-RSS', 'I-SD', 'I-SR', 'I-SS', 'O', 'I-GC' # Business Event Indicator variables
                ]
        ].values

        # Convert to PyTorch tensor
        x = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)

        # Predict the next day's Movement probability
        with torch.no_grad():
            output = model(x)
            probability = torch.sigmoid(output).item()
            predictions.append(probability)

        # Get the actual value for the next day
        actual = test_df['Movement'].iloc[i + seq_length]
        actuals.append(actual)

        # Fine-tune the model using the current data point
        label = torch.tensor([[actual]], dtype=torch.float32).to(device)

        # Take multiple fine-tuning steps (optional)
        for _ in range(fine_tune_steps):
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

    return predictions, actuals

In [45]:
seq_length = 60  # Rolling window size
fine_tune_steps = 1  # Number of gradient steps per update
predictions, actuals = rolling_window_predictions_with_finetuning(
    model, test_df, seq_length, device, optimizer, criterion, fine_tune_steps
)

# Convert probabilities to binary predictions
threshold = 0.5  # Adjust threshold as needed
predicted_classes = [1 if p > threshold else 0 for p in predictions]

In [46]:
print("Accuracy:", accuracy_score(actuals, predicted_classes))
print("Classification Report:")
print(classification_report(actuals, predicted_classes))
print("Confusion Matrix:")
print(confusion_matrix(actuals, predicted_classes))

Accuracy: 0.5253086664814006
Classification Report:
              precision    recall  f1-score   support

           0       0.53      1.00      0.69     13232
           1       0.00      0.00      0.00     11957

    accuracy                           0.53     25189
   macro avg       0.26      0.50      0.34     25189
weighted avg       0.28      0.53      0.36     25189

Confusion Matrix:
[[13232     0]
 [11957     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
