# LSTM Using Pytorch

You may need to update parent_folder and colab_base as you see fit

Download Ollama first to access llama3.2

In [1]:
# Global paths for both local (Mac) and Google Colab
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/"
COLAB_BASE = "/content/gdrive/MyDrive/Assignments/Advanced NLP/Assignments/data files/organized/"

# RAG Function

In [None]:
import numpy as np
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import SimpleDirectoryReader

# ✅ Initialize Embedding Model (Same as Before)
embedding_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
Settings.embed_model = embedding_model  # Apply globally

# ✅ Initialize Sentence Window Parser (Used for Retrieval)
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=1,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

def build_vector_index(file_path):
    """
    Build a vector index from a given document.

    Args:
        file_path (str): Path to the document.

    Returns:
        VectorStoreIndex: Built vector store index.
    """
    # ✅ Load document
    docs = SimpleDirectoryReader(input_files=[file_path]).load_data()

    # ✅ Process nodes from document
    nodes = node_parser.get_nodes_from_documents(docs)

    # ✅ Create Vector Store Index
    index = VectorStoreIndex(nodes)

    return index

def retrieve_avg_embedding(query_text, vector_index, top_k=3):
    """
    Retrieve top-K relevant passages and compute their average embedding.

    Args:
        query_text (str): The query to search in the vector database.
        vector_index (VectorStoreIndex): Prebuilt vector store index.
        top_k (int): Number of retrieved passages to consider.

    Returns:
        np.ndarray: The averaged embedding of retrieved contexts.
    """
    # Retrieve top-K most similar documents
    retriever = vector_index.as_retriever(similarity_top_k=top_k)
    retrieved_docs = retriever.retrieve(query_text)

    # Extract embeddings of retrieved docs
    retrieved_embeddings = np.array([
        doc.embedding for doc in retrieved_docs if doc.embedding is not None
    ])

    # Handle case where no embeddings were retrieved
    if retrieved_embeddings.size == 0:
        print("⚠️ No valid embeddings found. Returning zero vector.")
        return np.zeros((embedding_model.get_embedding_dimension(),))

    # Compute the average embedding
    avg_embedding = np.mean(retrieved_embeddings, axis=0)

    return avg_embedding # shape: (384, )

# ✅ Example Usage
# file_path = "path/to/your/10-K_or_DEF14A_file.txt"
# query_text = "Summarize key financial risks."

# # Build index
# vector_index = build_vector_index(file_path)

# # Retrieve average embedding
# avg_embedding = retrieve_avg_embedding(query_text, vector_index)
# print("Avg Embedding Shape:", avg_embedding.shape)


In [9]:
def switch_file_path(colab_path):
    """
    Converts a file path from Google Drive (Colab) to a local path.

    Args:
        colab_path (str): The file path from Google Drive in Colab.

    Returns:
        str: The equivalent local path.
    """
    local_path = colab_path.replace(COLAB_BASE, PARENT_FOLDER, 1)
    return local_path

# Train

In [None]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# ✅ Define Parent Folder for Stock Data
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data/"

# ✅ Get list of all CSV files in stock data folder
csv_files = [os.path.join(PARENT_FOLDER, file) for file in os.listdir(PARENT_FOLDER) if file.endswith(".csv")]

# ✅ Train/Test Split (80% train, 20% test)
train_files = csv_files[:int(0.7 * len(csv_files))]
test_files = csv_files[int(0.7 * len(csv_files))]
print(f"Training on {len(train_files)} files and testing on {len(test_files)} files.")

# ✅ Define Function to Load & Preprocess Stock Data
def load_and_preprocess_data(stock_file):
    # Check if file is empty
    if os.stat(stock_file).st_size == 0:
        return None, None

    df = pd.read_csv(stock_file, parse_dates=["Date"])
    df.set_index("Date", inplace=True)
    df.sort_index(inplace=True)

    # ✅ Select Features
    features = df[["Close", "High", "Low", "Open", "Volume", "Percentage Change", "CPI", "Inflation",
                   "3 Mo", "6 Mo", "1 Yr", "2 Yr", "3 Yr", "5 Yr", "7 Yr", "10 Yr", "20 Yr"]]

    # ✅ Shift the Target (Predict next day's percentage change)
    df["Target"] = df["Percentage Change"].shift(-1)

    # ✅ Drop rows with NaN values
    df.dropna(inplace=True)

    return features, df["Target"]

# ✅ Define GRU Model
class StockGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(StockGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, hidden):
        output, hidden = self.gru(x, hidden)
        output = self.fc(output[:, -1, :])  # Take last timestep output
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

# ✅ Device Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Prepare Data Function
def prepare_data(stock_files, sequence_length=10):
    X, y = [], []
    
    for stock_file in tqdm(stock_files, desc="Processing Stock Data"):
        if not os.path.isfile(stock_file):  # ✅ Ensure it's a file
            continue  # Skip directories

        features, target = load_and_preprocess_data(stock_file)

        if features is None or target is None:
            continue
        features, target = features.values, target.values

        for i in range(len(features) - sequence_length):
            if i + sequence_length >= len(target):  # Ensure index is within bounds
                break
            X.append(features[i:i + sequence_length])
            y.append(target[i + sequence_length])


    # Convert X and y to NumPy arrays with explicit dtype
    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    # Convert to PyTorch tensors
    X_tensor = torch.from_numpy(X).to(device)
    y_tensor = torch.from_numpy(y).to(device)

    return X_tensor, y_tensor

# ✅ Prepare Training & Testing Data
sequence_length = 10
X_train, y_train = prepare_data(train_files, sequence_length)
X_test, y_test = prepare_data(test_files, sequence_length)

# ✅ Initialize GRU Model
input_size = X_train.shape[2]
hidden_size = 64
gru_model = StockGRU(input_size, hidden_size).to(device)

# ✅ Define Loss, Optimizer, and Training Parameters
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001)
num_epochs = 100
batch_size = 32
training_losses = []  # Store losses

# ✅ Training Loop
for epoch in range(num_epochs):
    print(f"🔵 Epoch [{epoch+1}/{num_epochs}]")
    total_loss = 0
    counter = 0
    hidden = gru_model.init_hidden(batch_size)

    for i in tqdm(range(0, len(X_train), batch_size), desc="training batches"):
        batch_X = X_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]

        if len(batch_X) < batch_size:
            continue
        
        # If batch_X have nan values then continue
        if torch.isnan(batch_X).any():
            continue

        hidden = hidden.detach()

        # Forward Pass
        output, hidden = gru_model(batch_X, hidden)

        # If batch_y have nan values then continue
        if torch.isnan(batch_y).any():
            continue

        loss = loss_fn(output.squeeze(), batch_y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        counter += 1

    avg_loss = total_loss / counter
    training_losses.append(avg_loss)  # Store loss
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# ✅ Save Model
torch.save(gru_model.state_dict(), "stock_gru_model.pth")

# ✅ Plot & Save Training Loss
plt.figure(figsize=(8, 5))
plt.plot(range(1, num_epochs + 1), training_losses, marker="o", linestyle="-", color="blue")
plt.xlabel("Epoch")
plt.ylabel("Training Loss (MSE)")
plt.title("Training Loss Over Epochs")
plt.grid(True, linestyle="--", alpha=0.6)
plt.savefig("training_loss.png")

# ✅ Evaluation on Test Set
gru_model.eval()
with torch.no_grad():
    test_hidden = gru_model.init_hidden(X_test.size(0))
    y_pred, _ = gru_model(X_test, test_hidden)
    y_pred = y_pred.cpu().numpy().flatten()
    y_test = y_test.cpu().numpy().flatten()

    # ✅ Calculate Performance Metrics
    mae = np.mean(np.abs(y_pred - y_test))
    mse = np.mean((y_pred - y_test) ** 2)
    rmse = np.sqrt(mse)

    print("\n🔹 **Test Set Performance Metrics** 🔹")
    print(f"📉 Mean Absolute Error (MAE): {mae:.4f}")
    print(f"📊 Mean Squared Error (MSE): {mse:.4f}")
    print(f"📈 Root Mean Squared Error (RMSE): {rmse:.4f}")

    # ✅ Select a Random Test Example for Visualization
    random_idx = np.random.randint(0, len(y_test) - 20)  # Pick a random starting point
    actual_values = y_test[random_idx:random_idx + 20]
    predicted_values = y_pred[random_idx:random_idx + 20]

    # ✅ Plot & Save Actual vs Predicted
    plt.figure(figsize=(8, 5))
    plt.plot(actual_values, label="Actual", marker="o", linestyle="-", color="green")
    plt.plot(predicted_values, label="Predicted", marker="x", linestyle="--", color="red")
    plt.xlabel("Time Step")
    plt.ylabel("Stock Percentage Change")
    plt.title("Actual vs Predicted Stock Percentage Change")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.6)
    plt.savefig("actual_vs_pred.png")


Training on 347 files and testing on 108 files.


Processing Stock Data: 100%|██████████| 347/347 [00:06<00:00, 52.91it/s]
Processing Stock Data: 100%|██████████| 108/108 [00:00<00:00, 118026.27it/s]


🔵 Epoch [1/100]


training batches: 100%|██████████| 59992/59992 [02:56<00:00, 340.57it/s]


Epoch [1/100], Loss: 5.1649
🔵 Epoch [2/100]


training batches: 100%|██████████| 59992/59992 [02:51<00:00, 348.85it/s]


Epoch [2/100], Loss: 5.1641
🔵 Epoch [3/100]


training batches: 100%|██████████| 59992/59992 [02:51<00:00, 349.35it/s]


Epoch [3/100], Loss: 5.1637
🔵 Epoch [4/100]


training batches: 100%|██████████| 59992/59992 [03:02<00:00, 329.23it/s]


Epoch [4/100], Loss: 5.1631
🔵 Epoch [5/100]


training batches: 100%|██████████| 59992/59992 [03:06<00:00, 322.44it/s]


Epoch [5/100], Loss: 5.1627
🔵 Epoch [6/100]


training batches: 100%|██████████| 59992/59992 [03:09<00:00, 315.84it/s]


Epoch [6/100], Loss: 5.1624
🔵 Epoch [7/100]


training batches: 100%|██████████| 59992/59992 [03:05<00:00, 323.43it/s]


Epoch [7/100], Loss: 5.1624
🔵 Epoch [8/100]


training batches:  31%|███       | 18587/59992 [00:56<02:10, 317.23it/s]