# GRU Using Pytorch

In [1]:
import torch
import torch.nn as nn

# GRU Model Definition
class GRUStockModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRUStockModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])  # Take last time step output
        return out

You may need to update parent_folder and colab_base as you see fit

Download Ollama first to access llama3.2

In [1]:
# Global paths for both local (Mac) and Google Colab
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/"
COLAB_BASE = "/content/gdrive/MyDrive/Assignments/Advanced NLP/Assignments/data files/organized/"

In [1]:
def switch_file_path(google_drive_path):
    """
    Convert Google Drive file paths to local file paths.
    
    Args:
        google_drive_path (str): File path stored in the CSV (Google Drive format).
    
    Returns:
        str: Converted local file path.
    """
    google_prefix = "/content/gdrive/MyDrive/Assignments"  # Incorrect Google Drive path
    local_prefix = "/Users/colbywang/Google Drive/我的云端硬盘"  # Correct local prefix

    if google_drive_path.startswith(google_prefix):
        local_path = google_drive_path.replace(google_prefix, local_prefix)
        return local_path
    
    return google_drive_path  # If not found, return as is


# Train

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from glob import glob
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm

# Path to stock data folder
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data"
all_files = glob(os.path.join(PARENT_FOLDER, "*.csv"))

# Hyperparameters
sequence_length = 7  # Lookback window
N = 1  # Predict trend in next N days
batch_size = 32
epochs = 20
hidden_size = 128
num_layers = 2
learning_rate = 0.001
threshold = 0.02  # 2% change threshold for "Stable"
train_ratio = 0.7  # 70% of files for training, 30% for testing

# Split stock files into training and testing sets
train_files, test_files = train_test_split(all_files, test_size=1-train_ratio, random_state=42)
print(f"Training on {len(train_files)} stock files, Testing on {len(test_files)} stock files")

def create_labels(df):
    """
    Create classification labels for stock price movement.
    
    Args:
        df (DataFrame): Stock data
        N (int): Number of days in the future to compute price movement
        threshold (float): Percentage change threshold to classify Up or Down

    Returns:
        Series: Labels (0 = Stable, 1 = Up, 2 = Down)
    """
    df['pct change'] = df['Close'].pct_change(N)
    df['Close_diff_pct'] = df['pct change'].shift(-N)

    # Default to "Stable"
    df['Label'] = 0  

    # Up (if change > threshold)
    df.loc[df['Close_diff_pct'] > threshold, 'Label'] = 1  

    # Down (if change < -threshold)
    df.loc[df['Close_diff_pct'] < -threshold, 'Label'] = 2  

    return df['Label']

def create_sequences(df, labels, sequence_length, N=5):
    """
    Create sequences for GRU training.

    Args:
        df (DataFrame): Stock features
        labels (Series): Classification labels
        sequence_length (int): Length of input sequence
        N (int): Lookahead period for labels

    Returns:
        np.ndarray: X (features)
        np.ndarray: y (labels)
    """
    df = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]
    X, y = [], []

    # Ensure labels align with future price movement
    for i in range(len(df) - sequence_length - N):
        X.append(df.values[i:i + sequence_length])  # Sequence of input features
        y.append(labels.iloc[i + sequence_length])  # Label for the next movement

    return np.array(X), np.array(y)

# Load and preprocess all training stock data
all_X_train, all_y_train = [], []

for file in tqdm(train_files, desc="Loading training files"):
    # If file is empty skip
    if os.stat(file).st_size == 0:
        continue
    
    df = pd.read_csv(file)
    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation'}.issubset(df.columns):
        continue  # Skip if missing columns

    df = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]

    labels = create_labels(df)
    X, y = create_sequences(df, labels, sequence_length)

    all_X_train.append(X)
    all_y_train.append(y)

# Concatenate all training sequences
X_train = np.concatenate(all_X_train, axis=0)
y_train = np.concatenate(all_y_train, axis=0)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# Create PyTorch DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize Model
input_size = 7  # ['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']
output_size = 3  # 3 classes: Up, Down, Stable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRUStockModel(input_size, hidden_size, num_layers, output_size).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Model
train_losses = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    for batch_X, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        # If batch_X has nan skip
        if torch.isnan(batch_X).any():
            continue
        
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Save Training Loss Plot
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label="Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.legend()
plt.savefig("train_loss.png")
plt.close()

print("\n✅ Training complete. Training loss visualization saved.")


In [None]:
# Save Model
torch.save(model.state_dict(), "stock_gru_model.pth")
print("✅ Model saved as 'stock_model.pth'.")

✅ Model saved as 'stock_model.pth'.


# Test

In [None]:
import os
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ✅ Define Parent Folder for Stock Data
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data/"

# ✅ Get list of test files
csv_files = [os.path.join(PARENT_FOLDER, file) for file in os.listdir(PARENT_FOLDER) if file.endswith(".csv")]
test_files = csv_files[int(0.7 * len(csv_files)):]

# ✅ Load GRU Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GRUStockModel(input_size=7, hidden_size=64, num_layers=2, output_size=3).to(device)
model.load_state_dict(torch.load("stock_gru_model.pth"))
model.eval()  # Set model to evaluation mode

# ✅ Initialize Metrics Storage
all_actuals, all_predictions = [], []

# ✅ Process Each Test Stock
for test_stock in tqdm(test_files, desc="stock files"):
    # If file is empty skip
    if os.stat(test_stock).st_size == 0:
        continue

    df_test = pd.read_csv(test_stock)
    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation'}.issubset(df.columns):
        continue  # Skip if missing columns

    # Select Relevant Features
    df_test = df_test[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]

    # Generate Labels
    labels_test = create_labels(df_test)

    # Create Sequences
    X_test, y_test = create_sequences(df_test, labels_test, sequence_length=7)
    X_test, y_test = torch.tensor(X_test, dtype=torch.float32).to(device), torch.tensor(y_test, dtype=torch.long).to(device)

    # ✅ Run Inference
    with torch.no_grad():
        for i in range(len(X_test)):
            X_sample = X_test[i].unsqueeze(0)  # Add batch dimension

            if torch.isnan(X_sample).any():
                continue

            y_actual = y_test[i].item()
            output = model(X_sample)
            _, predicted = torch.max(output, 1)
            
            # Store results
            all_actuals.append(y_actual)
            all_predictions.append(predicted.item())

# ✅ Compute Classification Metrics
accuracy = accuracy_score(all_actuals, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(all_actuals, all_predictions, average="weighted", zero_division=0)

# ✅ Print Results
print("\n🔹 **Overall Test Set Performance Metrics** 🔹")
print(f"✅ Accuracy: {accuracy:.4f}")
print(f"📏 Precision: {precision:.4f}")
print(f"📡 Recall: {recall:.4f}")
print(f"⚖️ F1-Score: {f1:.4f}")


stock files: 100%|██████████| 150/150 [06:29<00:00,  2.60s/it]



🔹 **Overall Test Set Performance Metrics** 🔹
✅ Accuracy: 0.7716
📏 Precision: 0.6080
📡 Recall: 0.7716
⚖️ F1-Score: 0.6726


# Train with Embedding

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from glob import glob
from tqdm import tqdm

# ✅ Path to stock data folder
stock_folder = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data"
all_files = glob(os.path.join(stock_folder, "*.csv"))

# ✅ Hyperparameters
sequence_length = 7  # Lookback window
N = 1  # Predict trend in next N days
batch_size = 32
epochs = 20
hidden_size = 128
num_layers = 2
learning_rate = 0.001
threshold = 0.02  # 2% change threshold for "Stable"
train_ratio = 0.7  # 70% of files for training, 30% for testing
embedding_dim = 384  # SEC Filing Embedding Size

# ✅ Split stock files into training and testing sets
train_files, test_files = train_test_split(all_files, test_size=1-train_ratio, random_state=42)
print(f"Training on {len(train_files)} stock files, Testing on {len(test_files)} stock files")

def create_labels(df):
    """Create classification labels for stock price movement."""
    df['pct change'] = df['Close'].pct_change(N)
    df['Close_diff_pct'] = df['pct change'].shift(-N)
    df['Label'] = 0  
    df.loc[df['Close_diff_pct'] > threshold, 'Label'] = 1  
    df.loc[df['Close_diff_pct'] < -threshold, 'Label'] = 2  
    return df['Label']

def create_sequences(df, labels, filing_embeddings, sequence_length):
    """
    Create sequences for GRU training including SEC filing embeddings.

    Args:
        df (DataFrame): Stock features.
        labels (Series): Classification labels.
        filing_embeddings (np.ndarray): Precomputed embeddings for filings.
        sequence_length (int): Length of input sequence.

    Returns:
        np.ndarray: X (features including filing embeddings)
        np.ndarray: y (labels)
    """
    stock_features = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]
    X, y = [], []

    for i in range(len(stock_features) - sequence_length - N):
        stock_seq = stock_features.values[i:i + sequence_length]  # (seq_len, 7)
        filing_seq = filing_embeddings[i:i + sequence_length]  # (seq_len, 384)

        combined_seq = np.hstack((stock_seq, filing_seq))  # (seq_len, 391)
        X.append(combined_seq)
        y.append(labels.iloc[i + sequence_length])

    return np.array(X), np.array(y)

def load_filing_embeddings(csv_file):
    """
    Load SEC filing embeddings from CSV.

    Args:
        csv_file (str): Path to stock CSV file.

    Returns:
        np.ndarray: Filing embeddings for each row.
    """
    df = pd.read_csv(csv_file)

    # ✅ Check if embedding column exists
    if "embedding" not in df.columns:
        print(f"⚠️ No embeddings found in {csv_file}, using zero vectors.")
        return np.zeros((len(df), embedding_dim))

    # ✅ Convert stored string embeddings to numpy array
    return np.array(df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=",")))

# ✅ Load and preprocess all training stock data
all_X_train, all_y_train = [], []

for file in tqdm(train_files, desc="Loading training files"):
    if os.stat(file).st_size == 0:
        continue
    
    df = pd.read_csv(file)
    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation', 'embedding'}.issubset(df.columns):
        continue

    df = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation', 'embedding']]
    
    labels = create_labels(df)
    filing_embeddings = load_filing_embeddings(file)  # Load embeddings from CSV
    X, y = create_sequences(df, labels, filing_embeddings, sequence_length)

    all_X_train.append(X)
    all_y_train.append(y)

# ✅ Concatenate all training sequences
X_train = np.concatenate(all_X_train, axis=0)  # (total_samples, seq_len, 391)
y_train = np.concatenate(all_y_train, axis=0)

# ✅ Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# ✅ Create PyTorch DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# ✅ Initialize Model
input_size = 391  # Stock features (7) + SEC embeddings (384)
output_size = 3  # 3 classes: Up, Down, Stable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRUStockModel(input_size, hidden_size, num_layers, output_size).to(device)

# ✅ Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# ✅ Train Model
train_losses = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    for batch_X, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        if torch.isnan(batch_X).any():
            continue
        
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# ✅ Save Training Loss Plot
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label="Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.legend()
plt.savefig("train_loss_with_filings.png")
plt.close()

print("\n✅ Training complete. Training loss visualization saved.")


In [None]:
# Save Model
torch.save(model.state_dict(), "stock_model_rag.pth")
print("✅ Model saved as 'stock_model_rag.pth'.")

# Training a Doc2Vec

In [None]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
import random

# ✅ Paths
stock_folder = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data"
all_stock_files = [os.path.join(stock_folder, f) for f in os.listdir(stock_folder) if f.endswith(".csv")]

# ✅ Extract SEC Filing Paths from Stock CSVs
sec_files = []
for csv_file in tqdm(all_stock_files, desc="Extracting SEC Filings"):
    if os.stat(csv_file).st_size == 0:
        continue
    
    df = pd.read_csv(csv_file)

    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation'}.issubset(df.columns):
        continue  # Skip if missing columns
    
    if "10-K" in df.columns and "DEF 14A" in df.columns:
        # Must be string and not 0
        sec_files.extend(df["10-K"].astype(str).tolist())
        sec_files.extend(df["DEF 14A"].astype(str).tolist())

# ✅ Convert Paths (add tqdm)
sec_files = [switch_file_path(f) for f in tqdm(sec_files, desc="switch") if f != "0"]

print(f"✅ Found {len(sec_files)} SEC filings for training.")

# ✅ Shuffle the file list
random.shuffle(sec_files)

# Get 900 of the files for training
sec_files = sec_files[:900]
print(f"✅ Using {len(sec_files)} SEC filings for training.")

# ✅ Load SEC filings into TaggedDocument format
def load_filing(file_path):
    """Loads and tokenizes an SEC filing."""
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text.split()  # Tokenized as words

# ✅ Convert filings to Gensim's TaggedDocument format
documents = [TaggedDocument(words=load_filing(f), tags=[i]) for i, f in enumerate(tqdm(sec_files, desc="Loading Filings"))]

# ✅ Define & Train Doc2Vec Model
model = Doc2Vec(vector_size=384,  # Match SEC embedding size
                window=10,
                min_count=5,
                workers=4,  # Adjust based on CPU cores
                epochs=20,
                dm=1)  # PV-DM (Distributed Memory)

# ✅ Build Vocabulary
model.build_vocab(documents)
print("✅ Vocabulary built. Training started...")

# ✅ Train Model
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)


Extracting SEC Filings: 100%|██████████| 497/497 [04:47<00:00,  1.73it/s]
switch: 100%|██████████| 5561498/5561498 [00:00<00:00, 8187885.12it/s]


✅ Found 19779 SEC filings for training.
✅ Using 900 SEC filings for training.


Loading Filings:  23%|██▎       | 206/900 [02:58<07:24,  1.56it/s]IOStream.flush timed out
Loading Filings:  48%|████▊     | 433/900 [06:55<07:14,  1.07it/s]  IOStream.flush timed out
Loading Filings:  73%|███████▎  | 660/900 [11:29<02:59,  1.34it/s]  IOStream.flush timed out
Loading Filings:  99%|█████████▊| 887/900 [16:30<00:09,  1.36it/s]  IOStream.flush timed out
Loading Filings: 100%|██████████| 900/900 [17:01<00:00,  1.14s/it]


✅ Vocabulary built. Training started...


OSError: 523612032 requested and 0 written

In [5]:
# ✅ Save Model
model.save("sec_doc2vec.model")
print("✅ Training complete! Model saved as 'sec_doc2vec.model'.")

✅ Training complete! Model saved as 'sec_doc2vec.model'.
