# GRU Using Pytorch

In [7]:
import torch
import torch.nn as nn

# GRU Model Definition
class GRUStockModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GRUStockModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])  # Take last time step output
        return out

You may need to update parent_folder and colab_base as you see fit

Download Ollama first to access llama3.2

In [1]:
# Global paths for both local (Mac) and Google Colab
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/"
COLAB_BASE = "/content/gdrive/MyDrive/Assignments/Advanced NLP/Assignments/data files/organized/"

In [2]:
def switch_file_path(google_drive_path):
    """
    Convert Google Drive file paths to local file paths.
    
    Args:
        google_drive_path (str): File path stored in the CSV (Google Drive format).
    
    Returns:
        str: Converted local file path.
    """
    google_prefix = "/content/gdrive/MyDrive/Assignments"  # Incorrect Google Drive path
    local_prefix = "/Users/colbywang/Google Drive/我的云端硬盘"  # Correct local prefix

    if google_drive_path.startswith(google_prefix):
        local_path = google_drive_path.replace(google_prefix, local_prefix)
        return local_path
    
    return google_drive_path  # If not found, return as is


# Train

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from glob import glob
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm

# Path to stock data folder
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data"
all_files = glob(os.path.join(PARENT_FOLDER, "*.csv"))

# Hyperparameters
sequence_length = 7  # Lookback window
N = 1  # Predict trend in next N days
batch_size = 32
epochs = 20
hidden_size = 128
num_layers = 2
learning_rate = 0.001
threshold = 0.02  # 2% change threshold for "Stable"
train_ratio = 0.7  # 70% of files for training, 30% for testing

# Split stock files into training and testing sets
train_files, test_files = train_test_split(all_files, test_size=1-train_ratio, random_state=42)
print(f"Training on {len(train_files)} stock files, Testing on {len(test_files)} stock files")

def create_labels(df):
    """
    Create classification labels for stock price movement.
    
    Args:
        df (DataFrame): Stock data
        N (int): Number of days in the future to compute price movement
        threshold (float): Percentage change threshold to classify Up or Down

    Returns:
        Series: Labels (0 = Stable, 1 = Up, 2 = Down)
    """
    df['pct change'] = df['Close'].pct_change(N)
    df['Close_diff_pct'] = df['pct change'].shift(-N)

    # Default to "Stable"
    df['Label'] = 0  

    # Up (if change > threshold)
    df.loc[df['Close_diff_pct'] > threshold, 'Label'] = 1  

    # Down (if change < -threshold)
    df.loc[df['Close_diff_pct'] < -threshold, 'Label'] = 2  

    return df['Label']

def create_sequences(df, labels, sequence_length, N=5):
    """
    Create sequences for GRU training.

    Args:
        df (DataFrame): Stock features
        labels (Series): Classification labels
        sequence_length (int): Length of input sequence
        N (int): Lookahead period for labels

    Returns:
        np.ndarray: X (features)
        np.ndarray: y (labels)
    """
    df = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]
    X, y = [], []

    # Ensure labels align with future price movement
    for i in range(len(df) - sequence_length - N):
        X.append(df.values[i:i + sequence_length])  # Sequence of input features
        y.append(labels.iloc[i + sequence_length])  # Label for the next movement

    return np.array(X), np.array(y)

# Load and preprocess all training stock data
all_X_train, all_y_train = [], []

for file in tqdm(train_files, desc="Loading training files"):
    # If file is empty skip
    if os.stat(file).st_size == 0:
        continue
    
    df = pd.read_csv(file)
    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation'}.issubset(df.columns):
        continue  # Skip if missing columns

    df = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]

    labels = create_labels(df)
    X, y = create_sequences(df, labels, sequence_length)

    all_X_train.append(X)
    all_y_train.append(y)

# Concatenate all training sequences
X_train = np.concatenate(all_X_train, axis=0)
y_train = np.concatenate(all_y_train, axis=0)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

# Create PyTorch DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize Model
input_size = 7  # ['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']
output_size = 3  # 3 classes: Up, Down, Stable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRUStockModel(input_size, hidden_size, num_layers, output_size).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Model
train_losses = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    for batch_X, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        # If batch_X has nan skip
        if torch.isnan(batch_X).any():
            continue
        
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Save Training Loss Plot
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label="Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss")
plt.legend()
plt.savefig("train_loss.png")
plt.close()

print("\n✅ Training complete. Training loss visualization saved.")


In [None]:
# Save Model
torch.save(model.state_dict(), "stock_gru_model.pth")
print("✅ Model saved as 'stock_model.pth'.")

✅ Model saved as 'stock_model.pth'.


# Test

In [None]:
import os
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ✅ Define Parent Folder for Stock Data
PARENT_FOLDER = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data/"

# ✅ Get list of test files
csv_files = [os.path.join(PARENT_FOLDER, file) for file in os.listdir(PARENT_FOLDER) if file.endswith(".csv")]
test_files = csv_files[int(0.7 * len(csv_files)):]

# ✅ Load GRU Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GRUStockModel(input_size=7, hidden_size=64, num_layers=2, output_size=3).to(device)
model.load_state_dict(torch.load("stock_gru_model.pth"))
model.eval()  # Set model to evaluation mode

# ✅ Initialize Metrics Storage
all_actuals, all_predictions = [], []

# ✅ Process Each Test Stock
for test_stock in tqdm(test_files, desc="stock files"):
    # If file is empty skip
    if os.stat(test_stock).st_size == 0:
        continue

    df_test = pd.read_csv(test_stock)
    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation'}.issubset(df.columns):
        continue  # Skip if missing columns

    # Select Relevant Features
    df_test = df_test[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']]

    # Generate Labels
    labels_test = create_labels(df_test)

    # Create Sequences
    X_test, y_test = create_sequences(df_test, labels_test, sequence_length=7)
    X_test, y_test = torch.tensor(X_test, dtype=torch.float32).to(device), torch.tensor(y_test, dtype=torch.long).to(device)

    # ✅ Run Inference
    with torch.no_grad():
        for i in range(len(X_test)):
            X_sample = X_test[i].unsqueeze(0)  # Add batch dimension

            if torch.isnan(X_sample).any():
                continue

            y_actual = y_test[i].item()
            output = model(X_sample)
            _, predicted = torch.max(output, 1)
            
            # Store results
            all_actuals.append(y_actual)
            all_predictions.append(predicted.item())

# ✅ Compute Classification Metrics
accuracy = accuracy_score(all_actuals, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(all_actuals, all_predictions, average="weighted", zero_division=0)

# ✅ Print Results
print("\n🔹 **Overall Test Set Performance Metrics** 🔹")
print(f"✅ Accuracy: {accuracy:.4f}")
print(f"📏 Precision: {precision:.4f}")
print(f"📡 Recall: {recall:.4f}")
print(f"⚖️ F1-Score: {f1:.4f}")


stock files: 100%|██████████| 150/150 [06:29<00:00,  2.60s/it]



🔹 **Overall Test Set Performance Metrics** 🔹
✅ Accuracy: 0.7716
📏 Precision: 0.6080
📡 Recall: 0.7716
⚖️ F1-Score: 0.6726


# Train with a Doc2Vec Embedding

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from glob import glob
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec

# ✅ Load Pretrained Doc2Vec Model
doc2vec_model = Doc2Vec.load("sec_doc2vec.model")
embedding_dim = doc2vec_model.vector_size  # Get embedding size from Doc2Vec

# ✅ Path to stock data folder
stock_folder = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data"
all_files = glob(os.path.join(stock_folder, "*.csv"))

# ✅ Hyperparameters
sequence_length = 7  # Lookback window
N = 1  # Predict trend in next N days
batch_size = 32
epochs = 20
hidden_size = 128
num_layers = 2
learning_rate = 0.001
threshold = 0.02  # 2% change threshold for "Stable"
train_ratio = 0.7  # 70% of files for training, 30% for testing
save_every_files = 2  # ✅ Save model every 2 stock CSVs

# ✅ Split stock files into training and testing sets
train_files, test_files = train_test_split(all_files, test_size=1-train_ratio, random_state=42)
print(f"Training on {len(train_files)} stock files, Testing on {len(test_files)} stock files")

def create_labels(df):
    """Create classification labels for stock price movement."""
    df['pct change'] = df['Close'].pct_change(N)
    df['Close_diff_pct'] = df['pct change'].shift(-N)
    df['Label'] = 0  
    df.loc[df['Close_diff_pct'] > threshold, 'Label'] = 1  
    df.loc[df['Close_diff_pct'] < -threshold, 'Label'] = 2  
    return df['Label']

def get_filing_embedding(file_path):
    """Retrieves embedding for a single SEC filing, ensuring shape consistency."""
    embedding = np.zeros((embedding_dim,), dtype=np.float32)  # Default zero vector
    if os.path.exists(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            embedding = doc2vec_model.infer_vector(text.split())
        except Exception as e:
            print(f"⚠️ Error retrieving embedding for {file_path}: {e}")

    # ✅ Ensure embedding has correct shape
    if embedding.shape != (embedding_dim,):
        print(f"⚠️ Bad embedding shape: {embedding.shape} for {file_path}. Using zero vector.")
        embedding = np.zeros((embedding_dim,), dtype=np.float32)

    return embedding

def create_sequences(df, labels, filing_embeddings, sequence_length):
    """Create sequences for GRU training including SEC filing embeddings."""
    stock_features = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation']].values
    X, y = [], []

    for i in range(len(stock_features) - sequence_length - N):
        stock_seq = stock_features[i:i + sequence_length]  # (seq_len, 7)
        filing_seq = filing_embeddings[i:i + sequence_length]  # (seq_len, embedding_dim)

        # ✅ Fix potential missing SEC embeddings
        if filing_seq.shape != (sequence_length, embedding_dim):
            print(f"⚠️ Mismatch in SEC embeddings for sequence at index {i}. Using zero vectors.")
            filing_seq = np.zeros((sequence_length, embedding_dim), dtype=np.float32)

        combined_seq = np.hstack((stock_seq, filing_seq))  # (seq_len, 7 + embedding_dim)
        X.append(combined_seq)
        y.append(labels.iloc[i + sequence_length])

    return np.array(X, dtype=np.float32), np.array(y, dtype=np.int64)  # ✅ Convert lists to NumPy first!

# ✅ Initialize Model (GRU model is assumed to be already defined)
input_size = 7 + embedding_dim  # Stock features (7) + SEC embeddings (Doc2Vec)
output_size = 3  # 3 classes: Up, Down, Stable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GRUStockModel(input_size, hidden_size, num_layers, output_size).to(device)  # Use your existing model

# ✅ Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# ✅ Training Loop (Train Per-File, Save Every 2 Files)
train_losses = []
file_count = 0

for epoch in range(epochs):
    print(f"\n🟢 Epoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0

    for file in tqdm(train_files, desc=f"Processing Training Files"):
        if os.stat(file).st_size == 0:
            continue

        df = pd.read_csv(file)
        if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation', '10-K', 'DEF 14A'}.issubset(df.columns):
            continue

        df = df[['Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation', '10-K', 'DEF 14A']]
        labels = create_labels(df)

        # ✅ Retrieve SEC Filing Embeddings **Only for This File**
        filing_embeddings = np.array([
            get_filing_embedding(switch_file_path(str(row["10-K"]))) if row["10-K"] != "0"
            else get_filing_embedding(switch_file_path(str(row["DEF 14A"]))) if row["DEF 14A"] != "0"
            else np.zeros((embedding_dim,), dtype=np.float32)
            for _, row in df.iterrows()
        ])

        # ✅ Generate sequences
        X, y = create_sequences(df, labels, filing_embeddings, sequence_length)

        # ✅ Convert NumPy arrays to tensors **efficiently**
        X_train_tensor = torch.from_numpy(X).float().to(device)
        y_train_tensor = torch.from_numpy(y).long().to(device)

        # ✅ Create DataLoader (Batch Within the File)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # ✅ Train Per-File (Batch Training within the File)
        file_loss = 0
        for batch_X, batch_y in train_loader:
            # If batch_X has nan skip
            if torch.isnan(batch_X).any():
                continue

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            file_loss += loss.item()

        total_loss += file_loss / len(train_loader)
        file_count += 1

        # ✅ Save Model Every 2 Files
        if file_count % save_every_files == 0 or file_count == len(train_files):
            torch.save(model.state_dict(), f"stock_gru_d2v.pth")
            print(f"✅ Saved model checkpoint after {file_count} files!")

    # ✅ Save Model After Each Epoch
    torch.save(model.state_dict(), f"stock_gru_d2v.pth")
    avg_loss = total_loss / len(train_files)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f} ✅ Model Saved!")

# ✅ Save Final Model
torch.save(model.state_dict(), "stock_gru_final.pth")
print("\n✅ Training complete! Final model saved.")


Training on 347 stock files, Testing on 150 stock files

🟢 Epoch 1/20


Processing Training Files:   0%|          | 1/347 [01:44<10:02:32, 104.49s/it]

✅ File trained. Loss: 107.7894


Processing Training Files:   1%|          | 2/347 [03:02<8:29:52, 88.67s/it]  

✅ File trained. Loss: 171.4454
✅ Saved model checkpoint after 2 files!


Processing Training Files:   1%|          | 3/347 [04:16<7:51:50, 82.30s/it]

✅ File trained. Loss: 106.9434


Processing Training Files:   1%|          | 4/347 [04:53<6:08:12, 64.41s/it]

✅ File trained. Loss: 54.1054
✅ Saved model checkpoint after 4 files!


Processing Training Files:   1%|▏         | 5/347 [06:36<7:25:03, 78.08s/it]

✅ File trained. Loss: 74.7414


Processing Training Files:   2%|▏         | 6/347 [07:50<7:17:32, 76.99s/it]

✅ File trained. Loss: 150.2065
✅ Saved model checkpoint after 6 files!


Processing Training Files:   2%|▏         | 7/347 [09:11<7:23:33, 78.28s/it]

✅ File trained. Loss: 136.4386


Processing Training Files:   2%|▏         | 8/347 [10:40<7:41:41, 81.72s/it]

✅ File trained. Loss: 140.3327
✅ Saved model checkpoint after 8 files!


Processing Training Files:   3%|▎         | 9/347 [12:01<7:37:30, 81.21s/it]

✅ File trained. Loss: 127.9178


Processing Training Files:   3%|▎         | 10/347 [13:18<7:29:24, 80.01s/it]

✅ File trained. Loss: 115.8131
✅ Saved model checkpoint after 10 files!


Processing Training Files:   3%|▎         | 11/347 [14:11<6:41:29, 71.70s/it]

✅ File trained. Loss: 109.1487


Processing Training Files:   3%|▎         | 12/347 [15:10<6:19:54, 68.04s/it]

✅ File trained. Loss: 106.7449
✅ Saved model checkpoint after 12 files!


Processing Training Files:   4%|▎         | 13/347 [15:48<5:28:09, 58.95s/it]

✅ File trained. Loss: 51.0613


Processing Training Files:   4%|▍         | 14/347 [15:49<3:49:29, 41.35s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 14 files!


Processing Training Files:   4%|▍         | 15/347 [16:45<4:13:08, 45.75s/it]

✅ File trained. Loss: nan


Processing Training Files:   5%|▍         | 16/347 [18:20<5:33:53, 60.52s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 16 files!


Processing Training Files:   5%|▍         | 17/347 [19:39<6:03:45, 66.14s/it]

✅ File trained. Loss: nan


Processing Training Files:   5%|▌         | 18/347 [21:01<6:28:16, 70.81s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 18 files!


Processing Training Files:   5%|▌         | 19/347 [22:15<6:32:49, 71.86s/it]

✅ File trained. Loss: nan


Processing Training Files:   6%|▌         | 20/347 [23:30<6:36:49, 72.81s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 20 files!


Processing Training Files:   6%|▌         | 21/347 [24:18<5:55:13, 65.38s/it]

✅ File trained. Loss: nan


Processing Training Files:   6%|▋         | 22/347 [25:28<6:02:01, 66.84s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 22 files!


Processing Training Files:   7%|▋         | 23/347 [26:41<6:10:01, 68.52s/it]

✅ File trained. Loss: nan


Processing Training Files:   7%|▋         | 24/347 [27:24<5:27:59, 60.93s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 24 files!


Processing Training Files:   7%|▋         | 25/347 [28:09<5:00:57, 56.08s/it]

✅ File trained. Loss: nan


Processing Training Files:   7%|▋         | 26/347 [28:48<4:32:15, 50.89s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 26 files!


Processing Training Files:   8%|▊         | 27/347 [29:12<3:49:29, 43.03s/it]

✅ File trained. Loss: nan


Processing Training Files:   8%|▊         | 28/347 [30:26<4:38:26, 52.37s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 28 files!


Processing Training Files:   8%|▊         | 29/347 [31:47<5:22:19, 60.82s/it]

✅ File trained. Loss: nan


Processing Training Files:   9%|▊         | 30/347 [32:56<5:34:54, 63.39s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 30 files!


Processing Training Files:   9%|▉         | 31/347 [33:34<4:53:40, 55.76s/it]

✅ File trained. Loss: nan


Processing Training Files:   9%|▉         | 32/347 [34:06<4:14:22, 48.45s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 32 files!


Processing Training Files:  10%|▉         | 33/347 [35:01<4:23:31, 50.35s/it]

✅ File trained. Loss: nan


Processing Training Files:  10%|▉         | 34/347 [36:18<5:04:27, 58.36s/it]

✅ File trained. Loss: nan
✅ Saved model checkpoint after 34 files!


Processing Training Files:  10%|█         | 35/347 [37:30<5:24:42, 62.44s/it]

✅ File trained. Loss: nan


Processing Training Files:  10%|█         | 35/347 [37:36<5:35:13, 64.47s/it]


KeyboardInterrupt: 

# Training a Doc2Vec

In [5]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
import random

# ✅ Paths
stock_folder = "/Users/colbywang/Google Drive/我的云端硬盘/Advanced NLP/Assignments/data files/organized/stock-data"
all_stock_files = [os.path.join(stock_folder, f) for f in os.listdir(stock_folder) if f.endswith(".csv")]
model_save_path = "sec_doc2vec.model"

batch_size = 100  # Number of filings to process at once

# ✅ Extract SEC Filing Paths from Stock CSVs
sec_files = []
for csv_file in tqdm(all_stock_files, desc="Extracting SEC Filings"):
    if os.stat(csv_file).st_size == 0:
        continue
    
    df = pd.read_csv(csv_file)

    if not {'Close', 'High', 'Low', 'Open', 'Volume', 'CPI', 'Inflation'}.issubset(df.columns):
        continue  # Skip if missing columns
    
    if "10-K" in df.columns and "DEF 14A" in df.columns:
        # Must be string and not 0
        sec_files.extend(df["10-K"].astype(str).tolist())
        sec_files.extend(df["DEF 14A"].astype(str).tolist())

# ✅ Convert Paths (add tqdm)
sec_files = [switch_file_path(f) for f in tqdm(sec_files, desc="switch") if f != "0"]

print(f"✅ Found {len(sec_files)} SEC filings for training.")

# ✅ Shuffle the file list
random.shuffle(sec_files)

# Get 900 of the files for training
sec_files = sec_files[:1000]
print(f"✅ Using {len(sec_files)} SEC filings for training.")

# ✅ Load SEC filings into TaggedDocument format
def load_filing(file_path):
    """Loads and tokenizes an SEC filing."""
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    return text.split()  # Tokenized as words

# ✅ Initialize Model (Adjust parameters as needed)
model = Doc2Vec(
    vector_size=384,
    window=10,
    min_count=5,
    workers=max(1, os.cpu_count() - 2),  # Prevent CPU overload
    epochs=20,
    dm=1  # PV-DM mode
)

# ✅ Process in Batches and Save After Each Batch
for i in tqdm(range(0, len(sec_files), batch_size), desc="batch proc"):
    batch_docs = [
        TaggedDocument(words=load_filing(file), tags=[str(j)])
        for j, file in enumerate(sec_files[i:i + batch_size])
    ]

    # ✅ If first batch, build vocab
    if i == 0:
        model.build_vocab(batch_docs)
    else:
        model.build_vocab(batch_docs, update=True)  # Update existing vocab

    # ✅ Train on batch
    model.train(batch_docs, total_examples=len(batch_docs), epochs=model.epochs)

    # ✅ Save after each batch
    model.save(model_save_path)
    print(f"✅ Saved progress after batch {i // batch_size + 1}")

print("✅ Training complete! Final model saved.")

Extracting SEC Filings: 100%|██████████| 497/497 [00:06<00:00, 76.45it/s]
switch: 100%|██████████| 5561498/5561498 [00:00<00:00, 7983377.01it/s]


✅ Found 19779 SEC filings for training.
✅ Using 1000 SEC filings for training.




✅ Saved progress after batch 1




✅ Saved progress after batch 2




✅ Saved progress after batch 3




✅ Saved progress after batch 4




✅ Saved progress after batch 5




✅ Saved progress after batch 6




✅ Saved progress after batch 7




✅ Saved progress after batch 8




✅ Saved progress after batch 9


batch proc: 100%|██████████| 10/10 [43:15<00:00, 259.54s/it]

✅ Saved progress after batch 10
✅ Training complete! Final model saved.



