# LSTM Using Pytorch

Parent Folder:

/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized

Install Ollama and pull the `llama3.2` model (`ollama pull llama3.2`) before running the RAG cells below.

In [7]:
import torch
import torch.nn as nn

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class StockLSTM(nn.Module):
    """LSTM regressor that maps a feature sequence to one value per sample.

    The sequence is run through an ``nn.LSTM`` built with
    ``batch_first=True`` and the output of the last time step is projected
    to a single number by a linear layer (intended as the predicted stock
    percentage change).

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layers; batch_first=True means inputs are (batch, seq, feature).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer to predict stock percentage change
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, hidden=None):
        """Run the LSTM over ``x`` and predict from the final time step.

        Args:
            x: Input tensor laid out as (batch, sequence, input_size).
            hidden: Optional ``(h_0, c_0)`` tuple such as the one returned by
                :meth:`init_hidden`. When omitted, ``nn.LSTM`` starts from
                zero states.

        Returns:
            tuple: ``(output, hidden)`` where ``output`` has shape
            (batch, 1) and ``hidden`` is the ``(h_n, c_n)`` tuple returned by
            the LSTM.
        """
        # LSTM forward pass
        output, hidden = self.lstm(x, hidden)

        # Take the last output step for prediction
        output = self.fc(output[:, -1, :])

        return output, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h_0, c_0)`` states for ``batch_size`` samples.

        The states are created on the same device and with the same dtype as
        the model's own parameters, so they stay valid after the model is
        moved or cast, instead of relying on a module-level ``device``.
        """
        reference = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_size)
        h_0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        c_0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return (h_0, c_0)

# Smoke test: push one random batch through the model and check the output shape.
batch_size = 16
sequence_length = 7  # 7 days of past data as input
hidden_size = 64
input_size = 1 + 3 + 768  # Example: Stock price, 3 economic indicators, context vector

model = StockLSTM(input_size, hidden_size)
model = model.to(device)
hidden = model.init_hidden(batch_size)

# Random stand-in features laid out as (batch_size, sequence_length, input_size)
sample_input = torch.randn(batch_size, sequence_length, input_size)
sample_input = sample_input.to(device)
output, hidden = model(sample_input, hidden)

print("Output Shape:", output.shape)  # Expected: (batch_size, 1)


Output Shape: torch.Size([16, 1])


# Setting Up FinBERT

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load FinBERT model and tokenizer from the Hugging Face checkpoint named below.
# NOTE: this rebinds the module-level name `model`; the LSTM example earlier
# in this file also assigns `model`, so that binding is replaced once this
# cell has run.
model_name = "yiyanghkust/finbert-pretrain"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_average_embedding(sentences):
    """Compute the mean FinBERT [CLS] embedding over a set of sentences.

    Each sentence is encoded on its own with the module-level ``tokenizer``
    and ``model``. The first-token ([CLS]) vector of the last hidden state is
    used as the sentence embedding, and the embeddings are averaged
    element-wise.

    Args:
        sentences: Iterable of strings. A single string is treated as one
            sentence instead of being iterated character by character.

    Returns:
        numpy.ndarray: 1-D array holding the mean embedding.

    Raises:
        ValueError: If ``sentences`` is empty (the mean would otherwise be
            NaN).
    """
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)
    if not sentences:
        raise ValueError("sentences must contain at least one sentence")

    # Run the inputs on whatever device the model currently lives on.
    model_device = next(model.parameters()).device

    embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = inputs.to(model_device)

        # Forward pass to get hidden states (no gradients needed for inference)
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract [CLS] token embedding; .cpu() keeps .numpy() valid on GPU too
        cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()
        embeddings.append(cls_embedding)

    # Stack to (num_sentences, hidden) and average over the sentence axis
    return np.mean(np.stack(embeddings), axis=0)

# Example usage: embed a few finance-related sentences and average them
sentences = [
    "The company's revenue increased by 10% last quarter.",
    "Market trends indicate strong growth in the AI sector.",
    "Risk factors include inflation and supply chain disruptions.",
]

# Compute the element-wise mean of the per-sentence [CLS] embeddings
avg_embedding = get_average_embedding(sentences)

print("Averaged FinBERT Sentence Embedding Shape:", avg_embedding.shape)  # Expected: (768,)


Averaged FinBERT Sentence Embedding Shape: (768,)


# RAG Pipeline Demo

In [None]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.packs.sentence_window_retriever import SentenceWindowRetrieverPack as SentenceWindowRetriever
from llama_index.core.node_parser import SentenceWindowNodeParser

# ✅ Load the LLM through Ollama (the model tag used here is llama3.2)
llm = Ollama(
    model="llama3.2",
    context_window=4096,
    request_timeout=60.0,
    temperature=0.7
)

# ✅ Load the embedding model used to vectorise nodes and queries
embedding_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ✅ Configure Settings so LlamaIndex components pick up this LLM and embedder
Settings.llm = llm
Settings.embed_model = embedding_model

# ✅ Load documents (the path is relative to the current working directory)
# file_path = "/content/drive/MyDrive/Advanced NLP/Assignments/data files/organized/10-K/0000001800/2001_0000912057-01-006039.txt"
file_path = "2001_0000912057-01-006039.txt"
docs = SimpleDirectoryReader(input_files=[file_path]).load_data()

# ✅ Create Node Parser with Sentence Window
# Each node's surrounding window is stored under the "window" metadata key and
# the sentence itself under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=1,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

# ✅ Process nodes from documents
nodes = node_parser.get_nodes_from_documents(docs)

# ✅ Create Vector Store Index
index = VectorStoreIndex(nodes)

# ✅ Create Retriever returning the 3 most similar nodes per query
retriever = index.as_retriever(
    similarity_top_k=3
)

# ✅ Create Query Engine
# NOTE(review): the engine is built from the retriever alone; nothing in this
# cell reads the "window" metadata back. Confirm whether a
# MetadataReplacementPostProcessor was intended here.
query_engine = RetrieverQueryEngine(retriever=retriever)

# ✅ Function to run queries
def run_rag_query(query_text):
    """Query the module-level ``query_engine`` and echo the exchange.

    Prints the query followed by the engine's answer, then returns the
    response object produced by the engine.
    """
    answer = query_engine.query(query_text)
    print("\n🔹 Query:", query_text)
    # One call emits the header line and then the answer on its own line.
    print("\n🔹 RAG Response:", answer, sep="\n")
    return answer

# ✅ Example usage: ask one question about the 10-K loaded earlier in this cell
query = "What are the top 3-5 material risk factors highlighted in this 10-K?"
response = run_rag_query(query)



# Actual RAG Function

In [None]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.packs.sentence_window_retriever import SentenceWindowRetrieverPack as SentenceWindowRetriever
from llama_index.core.node_parser import SentenceWindowNodeParser

# ✅ Load the LLM through Ollama (the model tag used here is llama3.2)
llm = Ollama(
    model="llama3.2",
    context_window=4096,
    request_timeout=60.0,
    temperature=0.7
)

# ✅ Load the embedding model used to vectorise nodes and queries
embedding_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# ✅ Configure Settings so LlamaIndex components pick up this LLM and embedder
Settings.llm = llm
Settings.embed_model = embedding_model

# ✅ Create Node Parser with Sentence Window (Used in Function)
# Each node's surrounding window is stored under the "window" metadata key and
# the sentence itself under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=1,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

def run_rag_pipeline(file_path, query_text):
    """
    Runs the RAG pipeline for a given document file path and query.

    The document is loaded, split into sentence-window nodes with the
    module-level ``node_parser``, embedded into a fresh vector index, and
    queried. Before the answer is synthesised, each retrieved sentence is
    replaced by the window stored in its "window" metadata, so the LLM sees
    the surrounding context rather than a single isolated sentence.

    The index is rebuilt on every call. The query and the response are also
    printed to stdout.

    Args:
        file_path (str): Path to the 10-K or DEF 14A file.
        query_text (str): The query to ask the LLM.

    Returns:
        The response object returned by the query engine (not a plain
        string); printing it, as done below, shows the answer text.
    """
    # Imported locally so this function works without changes to the import cell.
    from llama_index.core.postprocessor import MetadataReplacementPostProcessor

    # ✅ Load document
    docs = SimpleDirectoryReader(input_files=[file_path]).load_data()

    # ✅ Process nodes from document
    nodes = node_parser.get_nodes_from_documents(docs)

    # ✅ Create Vector Store Index
    index = VectorStoreIndex(nodes)

    # ✅ Create Retriever
    retriever = index.as_retriever(
        similarity_top_k=3
    )

    # ✅ Create Query Engine
    # The key must match the parser's window_metadata_key ("window"); without
    # this post-processor the stored sentence windows are never used.
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=[
            MetadataReplacementPostProcessor(target_metadata_key="window")
        ],
    )

    # ✅ Run the query
    response = query_engine.query(query_text)

    print("\n🔹 Query:", query_text)
    print("\n🔹 RAG Response:")
    print(response)

    return response

# ✅ Example usage
# file_path = "2001_0000912057-01-006039.txt"
# query = "What are the top 3-5 material risk factors highlighted in this 10-K?"
# response = run_rag_pipeline(file_path, query)


In [None]:
def switch_file_path(colab_path, *, colab_base=None, mac_base=None):
    """
    Converts a file path from Google Drive (Colab) to a local Mac path.

    If ``colab_path`` starts with ``colab_base``, that prefix is replaced by
    ``mac_base``. Otherwise a warning is printed and the path is returned
    unchanged.

    Args:
        colab_path (str | os.PathLike): The file path from Google Drive in Colab.
        colab_base (str, optional): Prefix to replace. Defaults to the
            project's Colab data directory.
        mac_base (str, optional): Replacement prefix. Defaults to the
            project's local Google Drive data directory.

    Returns:
        str: The equivalent local path for Mac, or the original path if it
        does not start with ``colab_base``.
    """
    import os  # local import keeps this function self-contained

    if colab_base is None:
        colab_base = "/content/drive/MyDrive/Advanced NLP/Assignments/data files/organized"
    if mac_base is None:
        mac_base = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized"

    # Accept pathlib-style objects as well as plain strings.
    colab_path = os.fspath(colab_path)

    if not colab_path.startswith(colab_base):
        print("⚠️ Warning: Path does not match expected Colab structure.")
        return colab_path  # Return original path if no match

    # Swap only the leading prefix; the rest of the path is kept verbatim.
    return mac_base + colab_path[len(colab_base):]

# ✅ Example Usage: translate one Colab path and show the converted result
colab_file = "/content/drive/MyDrive/Advanced NLP/Assignments/data files/organized/10-K/0000001800/2001_0000912057-01-006039.txt"
mac_file = switch_file_path(colab_file)

print("Converted Mac File Path:", mac_file)

Converted Mac File Path: /Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/10-K/0000001800/2001_0000912057-01-006039.txt


# Complete Pipeline