# LSTM Using Pytorch

You may need to update `PARENT_FOLDER` and `COLAB_BASE` (defined in the first code cell) to match your own environment.

Install Ollama and download the `llama3.2` model before running the RAG cells.

In [19]:
# Global paths for both local (Mac) and Google Colab.
# Either base directory can be overridden without editing the notebook by setting
# the NLP_PARENT_FOLDER / NLP_COLAB_BASE environment variables before the kernel
# starts; when they are unset (or empty) the original locations are used.
import os

# os.path.join(path, "") guarantees a trailing separator. switch_file_path()
# swaps one base for the other by plain string replacement, so both bases must
# end the same way or the rewritten path would gain/lose a separator.
PARENT_FOLDER = os.path.join(
    os.environ.get("NLP_PARENT_FOLDER")
    or "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/",
    "",
)
COLAB_BASE = os.path.join(
    os.environ.get("NLP_COLAB_BASE")
    or "/content/gdrive/MyDrive/Assignments/Advanced NLP/Assignments/data files/organized/",
    "",
)

In [2]:
import torch
import torch.nn as nn

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class StockLSTM(nn.Module):
    """LSTM regressor that maps a feature sequence to a single scalar.

    The network is an ``nn.LSTM`` (``batch_first=True``) followed by a linear
    layer applied to the output of the last time step only.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer to predict stock percentage change
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, hidden=None):
        """Run the sequence through the LSTM and regress from the last step.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
            hidden: Optional ``(h_0, c_0)`` tuple as returned by
                ``init_hidden``. When omitted, ``nn.LSTM`` starts from zero
                states, which is equivalent to passing ``init_hidden(batch)``.

        Returns:
            ``(output, hidden)`` where ``output`` has shape (batch, 1) and
            ``hidden`` is the final ``(h_n, c_n)`` tuple from the LSTM.
        """
        output, hidden = self.lstm(x, hidden)

        # Only the last time step feeds the prediction head.
        output = self.fc(output[:, -1, :])

        return output, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` states for ``batch_size`` sequences.

        The states are created with the same device and dtype as the model's
        own parameters, so they stay valid after ``.to(...)`` / ``.cuda()``
        and do not depend on any module-level ``device`` variable.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        h_0 = reference.new_zeros(shape)
        c_0 = reference.new_zeros(shape)
        return (h_0, c_0)

# Setting Up FinBERT

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load FinBERT model and tokenizer
# `model_name` is passed to both loaders so the tokenizer and the encoder come
# from the same pretrained checkpoint.
model_name = "yiyanghkust/finbert-pretrain"
# The module-level `tokenizer` and `model` are read by get_average_embedding().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_average_embedding(sentences):
    """Compute the mean FinBERT [CLS] embedding over a collection of sentences.

    Each non-blank sentence is encoded on its own with the module-level
    ``tokenizer`` / ``model`` (truncated to 512 tokens) and the first-token
    vector of the last hidden state is collected; the result is the
    element-wise mean of those vectors.

    Args:
        sentences: Iterable of strings. Blank or whitespace-only entries are
            skipped, because splitting text on "." routinely produces empty
            fragments whose embedding would only add noise to the average.

    Returns:
        A 1-D NumPy array of length ``model.config.hidden_size``. When there is
        nothing to embed, a zero vector of that length is returned instead of
        the NaN scalar that ``np.mean`` produces for an empty array.
    """
    cls_vectors = []

    for sentence in sentences:
        if not sentence or not sentence.strip():
            continue

        # Keep the inputs on the same device as the encoder.
        inputs = tokenizer(
            sentence, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(model.device)

        # Forward pass to get hidden states (inference only, no gradients)
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract the [CLS] token embedding of the single encoded sentence;
        # .cpu() makes the NumPy conversion valid when the model is on a GPU.
        cls_vectors.append(outputs.last_hidden_state[0, 0, :].detach().cpu().numpy())

    if not cls_vectors:
        return np.zeros(model.config.hidden_size, dtype=np.float32)

    # Stack to (num_sentences, hidden_size) and average over sentences
    return np.mean(np.stack(cls_vectors), axis=0)

  from .autonotebook import tqdm as notebook_tqdm


# RAG Pipeline Demo

In [None]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.packs.sentence_window_retriever import SentenceWindowRetrieverPack as SentenceWindowRetriever
from llama_index.core.node_parser import SentenceWindowNodeParser

# Needed so the sentence window stored by the node parser is actually used at
# query time (see the query-engine step below).
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# ✅ Load the LLM Model
llm = Ollama(
    model="llama3.2",
    context_window=4096,
    request_timeout=600.0,
    temperature=0.1
)

# ✅ Load the embedding model
embedding_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ✅ Configure Settings (llama_index global defaults)
Settings.llm = llm
Settings.embed_model = embedding_model

# ✅ Load documents
file_path = "2001_0000912057-01-006039.txt"
docs = SimpleDirectoryReader(input_files=[file_path]).load_data()

# ✅ Create Node Parser with Sentence Window
# Each node holds one sentence; the surrounding sentences are stored in the
# node metadata under "window".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=1,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

# ✅ Process nodes from documents
nodes = node_parser.get_nodes_from_documents(docs)

# ✅ Create Vector Store Index
index = VectorStoreIndex(nodes)

# ✅ Create Retriever
retriever = index.as_retriever(
    similarity_top_k=3
)

# ✅ Create Query Engine
# Without the post-processor the "window" metadata is never read, so the LLM
# would only see the single retrieved sentences. The post-processor replaces
# each retrieved node's text with its stored window before synthesis; its key
# must match window_metadata_key above.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

# ✅ Function to run queries
def run_rag_query(query_text):
    """Query the module-level ``query_engine`` and echo the exchange.

    Prints the question and the engine's answer to stdout, then returns the
    answer object produced by ``query_engine.query``.
    """
    answer = query_engine.query(query_text)
    print("\n🔹 Query:", query_text)
    # Header and answer on separate lines, exactly as two print calls would.
    print("\n🔹 RAG Response:", answer, sep="\n")
    return answer

# ✅ Demo: ask one question about the filing loaded above and keep the answer
query = "What are the top 3-5 material risk factors highlighted in this 10-K?"
response = run_rag_query(query_text=query)



# Actual RAG Function

In [44]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.packs.sentence_window_retriever import SentenceWindowRetrieverPack as SentenceWindowRetriever
from llama_index.core.node_parser import SentenceWindowNodeParser

# ✅ Embedding model used to vectorise document nodes
embedding_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# ✅ LLM served through Ollama
llm = Ollama(model="llama3.2", context_window=4096, request_timeout=600.0, temperature=0.1)

# ✅ Register both as the llama_index global defaults
Settings.embed_model = embedding_model
Settings.llm = llm

# ✅ Sentence-window node parser shared by run_rag_pipeline():
# the neighbouring sentences are stored under the "window" metadata key and
# the sentence itself under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=1,
)

def run_rag_pipeline(file_path, query_text):
    """
    Runs the RAG pipeline for a given document file path and query.

    The document is loaded, split into sentence-window nodes with the
    module-level ``node_parser``, embedded into a fresh in-memory vector index,
    and queried with the 3 most similar nodes as context. Note that the index
    is rebuilt on every call.

    Args:
        file_path (str): Path to the 10-K or DEF 14A file.
        query_text (str): The query to ask the LLM.

    Returns:
        The response object produced by ``RetrieverQueryEngine.query`` (not a
        plain string). Use ``str(response)`` to obtain the answer text.
    """
    # Function-scope import so this block is self-contained within the cell.
    from llama_index.core.postprocessor import MetadataReplacementPostProcessor

    # ✅ Load document
    docs = SimpleDirectoryReader(input_files=[file_path]).load_data()

    # ✅ Process nodes from document
    nodes = node_parser.get_nodes_from_documents(docs)

    # ✅ Create Vector Store Index
    index = VectorStoreIndex(nodes)

    # ✅ Create Retriever
    retriever = index.as_retriever(similarity_top_k=3)

    # ✅ Create Query Engine
    # The node parser stores each sentence's surrounding window under the
    # "window" metadata key, but nothing read it back: the LLM only ever saw
    # the single retrieved sentences. The post-processor swaps each retrieved
    # node's text for its window before the answer is synthesised.
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=[
            MetadataReplacementPostProcessor(target_metadata_key="window")
        ],
    )

    # ✅ Run the query
    return query_engine.query(query_text)


In [20]:
def switch_file_path(colab_path):
    """
    Map a Google Drive (Colab) file path onto the local data folder.

    The first occurrence of ``COLAB_BASE`` inside ``colab_path`` is swapped for
    ``PARENT_FOLDER``; a path that does not contain ``COLAB_BASE`` is returned
    unchanged.

    Args:
        colab_path (str): File path as recorded under Google Drive in Colab.

    Returns:
        str: The corresponding path under the local parent folder.
    """
    return colab_path.replace(COLAB_BASE, PARENT_FOLDER, 1)

# Complete Pipeline

In [41]:
import os
import random
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

# Get a list of all CSV files in the folder.
# os.listdir() returns names in arbitrary order, so they are sorted here;
# otherwise the 80/20 split below could differ between runs or machines and
# files used for testing in one session could be trained on in another.
csv_folder = os.path.join(PARENT_FOLDER, "stock-data")
csv_files = sorted(file for file in os.listdir(csv_folder) if file.endswith(".csv"))

# Train/Test split: the first 80% of the sorted files train, the rest test
split_index = int(0.8 * len(csv_files))
train_files = csv_files[:split_index]
test_files = csv_files[split_index:]
print(f"Training on {len(train_files)} files and testing on {len(test_files)} files.")

def sample_stock_file(files):
    """Pick one file name from ``files`` at random and return its path inside ``csv_folder``."""
    chosen_name = random.choice(files)
    return os.path.join(csv_folder, chosen_name)

def sample_a_row_with_10K_DEF14A(stock_file, num_trading_days=6):
    """
    Randomly selects a row where either a 10-K or DEF 14A filing exists
    and returns that row along with the next `num_trading_days` rows.

    The filing type (10-K vs. DEF 14A) is chosen with a coin flip first; a row
    is then sampled among the dates that carry a filing of that type.

    A cell counts as "no filing" when it is missing (NaN) or is a numeric zero
    in any representation (``0``, ``0.0``, ``"0"``). Comparing against the
    string ``"0"`` alone is not enough: when a column contains no paths at all
    pandas parses it as numbers, and every row would then look like a filing.

    Args:
        stock_file: Path to a stock CSV with a "Date" column plus "10-K" and
            "DEF 14A" columns.
        num_trading_days: How many rows after the sampled one to include.

    Returns:
        A DataFrame indexed by date, sorted ascending, with up to
        ``num_trading_days + 1`` rows starting at the sampled filing row
        (fewer when the filing is close to the end of the file), or ``None``
        when no row has a filing of the chosen type.
    """
    # Load, index by date and sort so that "next rows" means "later dates".
    stock_data = (
        pd.read_csv(stock_file, parse_dates=["Date"])
        .set_index("Date")
        .sort_index()
    )

    # First determine randomly whether to sample 10-K or DEF 14A
    filing_column = "10-K" if random.choice([True, False]) else "DEF 14A"
    cells = stock_data[filing_column]

    # Paths coerce to NaN (and NaN != 0 is True), numeric zeros stay 0.
    is_present = cells.notna().to_numpy()
    is_nonzero = (pd.to_numeric(cells, errors="coerce") != 0).to_numpy()
    has_filing = is_present & is_nonzero

    # Work with row positions rather than date labels so duplicated dates
    # cannot turn the lookup into a slice or mask.
    candidate_positions = pd.Series(range(len(stock_data)))[has_filing]

    # If no matching row found, return None
    if candidate_positions.empty:
        print("⚠️ No matching rows found.")
        return None

    # Randomly sample a row
    start = int(candidate_positions.sample(1).iloc[0])

    # Sampled row plus the following `num_trading_days` rows
    return stock_data.iloc[start: start + num_trading_days + 1]

def _filing_path(cell):
    """Return the filing path held in a "10-K"/"DEF 14A" cell, or None for NaN / zero placeholders."""
    if pd.isna(cell):
        return None
    text = str(cell).strip()
    try:
        if float(text) == 0:  # covers 0, 0.0, "0", "0.0"
            return None
    except ValueError:
        pass  # not a number, so it is a path
    return text or None


def _rag_response_text(response):
    """Extract the answer text from a RAG response object, falling back to ``str(response)``."""
    for attribute in ("response", "text"):
        value = getattr(response, attribute, None)
        if value:
            return str(value)
    return str(response)


def train(LSTM_Model, stock_file, optimizer, num_trading_days=6):
    """
    Runs one training step of the LSTM on a randomly sampled filing window.

    A window is sampled from ``stock_file``: the filing row followed by up to
    ``num_trading_days`` later rows. Every row except the last is used as input
    and the model is trained to predict the "Percentage Change" of the last
    row, so the value being predicted is never part of the input.

    Differences from the earlier implementation, all of them bug fixes:
      * The loss compared a (1, 1) prediction with a (1, seq_len, 1) target
        through broadcasting; prediction and target now have the same shape.
      * The target day was included in the input features.
      * The RAG embedding was normalised column-wise together with the stock
        features. It is identical on every row, so each column had zero
        variance and the whole embedding was turned into zeros. It is now
        appended after normalisation.
      * Input tensors are created on the model's device.
      * Filing cells holding NaN or any spelling of zero are treated as
        "no filing", and blank sentence fragments are not embedded.

    Args:
        LSTM_Model: The LSTM model.
        stock_file: Path to the stock data CSV file.
        optimizer: The optimizer for training the model.
        num_trading_days: Number of rows after the filing row in the window;
            with a full window this is also the number of input time steps.

    Returns:
        loss_value: The training loss as a float, or ``None`` when no usable
        window could be sampled.
    """
    # Sample a row with 10-K or DEF 14A
    stock_data = sample_a_row_with_10K_DEF14A(stock_file, num_trading_days)

    # If no valid data is found, skip training
    if stock_data is None or stock_data.empty:
        print("⚠️ No valid row found for training. Skipping...")
        return None

    # At least two input rows plus one target row are needed for the
    # per-window statistics below to be meaningful.
    if len(stock_data) < 3:
        print("⚠️ Sampled window is too short for training. Skipping...")
        return None

    history = stock_data.iloc[:-1]  # input time steps
    # Extract input features (ignore categorical columns)
    features = history.drop(columns=["10-K", "DEF 14A", "Change"]).to_numpy(dtype=np.float64)
    pct_history = history["Percentage Change"].to_numpy(dtype=np.float64)
    next_pct_change = float(stock_data["Percentage Change"].iloc[-1])

    # Enhance features with RAG embeddings; the first row carries the filing path
    first_row = stock_data.iloc[0]
    ten_k_path = _filing_path(first_row["10-K"])
    def_14a_path = _filing_path(first_row["DEF 14A"])

    if ten_k_path is not None:
        response = run_rag_pipeline(
            switch_file_path(ten_k_path),
            "Summarize the most critical financial risks and uncertainties outlined in this 10-K filing."
        )
    elif def_14a_path is not None:
        response = run_rag_pipeline(
            switch_file_path(def_14a_path),
            "Summarize the key executive compensation decisions and governance changes disclosed in this DEF 14A filing."
        )
    else:
        response = None  # No filing path available

    # Convert RAG response into a single embedding; zero vector when no text.
    # 768 is the width used for the zero fallback in the original code and must
    # match the embedding width returned by get_average_embedding.
    rag_vector = np.zeros(768)
    if response is not None:
        rag_sentences = [
            fragment.strip()
            for fragment in _rag_response_text(response).split(".")
            if fragment.strip()
        ]
        if rag_sentences:
            rag_vector = np.asarray(get_average_embedding(rag_sentences), dtype=np.float64)

    # Normalize the stock features per column over the input window
    features = (features - features.mean(axis=0)) / (features.std(axis=0) + 1e-8)  # Avoid division by zero

    # Express the target in the same normalised units as the
    # "Percentage Change" input column, using input-window statistics only.
    target = (next_pct_change - pct_history.mean()) / (pct_history.std() + 1e-8)

    # Repeat the (un-normalised) embedding for each time step and append it
    rag_embedding = np.tile(rag_vector, (features.shape[0], 1))
    features = np.hstack((features, rag_embedding))

    # Convert to PyTorch tensors on the model's own device
    model_device = next(LSTM_Model.parameters()).device
    features_tensor = torch.tensor(
        features, dtype=torch.float32, device=model_device
    ).unsqueeze(0)  # (batch, seq_len, input_size)
    target_tensor = torch.tensor(
        [[target]], dtype=torch.float32, device=model_device
    )  # (batch, 1), same shape as the model output

    # Initialize hidden states
    hidden = tuple(state.to(model_device) for state in LSTM_Model.init_hidden(batch_size=1))

    # Forward pass
    output, _ = LSTM_Model(features_tensor, hidden)

    # Compute loss
    loss = nn.MSELoss()(output, target_tensor)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

Training on 397 files and testing on 100 files.


# Train

In [None]:
import os
import torch
import torch.nn as nn
import random
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# Create Checkpoint Folder if it doesn't exist
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# ✅ Define Model & Optimizer
input_size = 785  # (Stock prices & Volume) + (Yield) + (CPI and Inflation) + (RAG embedding)
hidden_size = 64
num_epochs = 100  # Change this for more training
num_trading_days = 6  # Lookback window
num_iterations = 10000  # Number of training steps (sampled files) per epoch

# ✅ Initialize Model (Load from checkpoint if exists)
model_LSTM = StockLSTM(input_size, hidden_size).to(device)

# Load from checkpoint if available.
# map_location lets a checkpoint written on a GPU machine be restored on a
# CPU-only one (and vice versa) instead of failing inside torch.load.
checkpoint_path = os.path.join(checkpoint_dir, "lstm_final.pth")
if os.path.exists(checkpoint_path):
    model_LSTM.load_state_dict(torch.load(checkpoint_path, map_location=device))
    print(f"🔍 Loaded model checkpoint: {checkpoint_path}")

# ✅ Define Optimizer
optimizer = torch.optim.Adam(model_LSTM.parameters(), lr=0.001)

# ✅ Training Loop
losses = []  # average loss per epoch, used for the plot below
for epoch in range(num_epochs):
    print(f"\n🚀 Epoch {epoch + 1}/{num_epochs}")

    epoch_loss = 0
    num_samples = 0

    # Draw the files for this epoch. Each draw is already independent and
    # random, so no extra shuffle is needed.
    stock_files = [sample_stock_file(train_files) for _ in range(num_iterations)]

    for stock_file in tqdm(stock_files, desc=f"Training Epoch {epoch + 1}"):
        loss = train(model_LSTM, stock_file, optimizer, num_trading_days=num_trading_days)

        if loss is not None:  # train() returns None when the sample was skipped
            epoch_loss += loss
            num_samples += 1

    # Compute Average Loss for Epoch
    avg_loss = epoch_loss / max(num_samples, 1)  # Avoid division by zero
    losses.append(avg_loss)
    print(f"✅ Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

    # 🔹 Save Model Checkpoint Every 2 Epochs (after epochs 1, 3, 5, ...)
    if epoch % 2 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f"lstm_epoch_{epoch + 1}.pth")
        torch.save(model_LSTM.state_dict(), checkpoint_path)
        print(f"📌 Model saved: {checkpoint_path}")

# ✅ Final Model Save (this is the file the resume step above looks for)
final_model_path = os.path.join(checkpoint_dir, "lstm_final.pth")
torch.save(model_LSTM.state_dict(), final_model_path)
print(f"🎯 Training Completed! Final model saved: {final_model_path}")

# ✅ Plot the Loss Curve
plt.figure(figsize=(12, 6))
plt.plot(losses, label="Training Loss", color='blue')
plt.title("LSTM Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()  # Display the plot


🚀 Epoch 1/100


Training Epoch 1:   0%|          | 0/10000 [00:00<?, ?it/s]