## Notice
The EmCyclopedia is a proof of concept. Emma's Encyclopedia from HuggingFace wikipedia corpus. The Worst (Soon-To-Be) Neural Net of All time.

By the time it is complete it will have anywhere from 3 to 30 million parameters. (Eek!)

Specifically this "v3.00" file exists to **finalize the project from dummy training pipeline**. What you are seeing is the 11th version of EmCyclopedia.

## How to use it?
First run the cell below to install all dependencies. This may take 5-10 minutes. (A long time!)

After that the dummy training pipeline cell will do the rest.

No GUI sorry. Maybe later.

Params for generation need to be added again once I actually have text generation

## TODOs

- Fix dummy training pipeline.
- Implement backpropagation. (Eek!)
- Add control for top P, min token, max token (instead of prior forced token output).

## Tentative Pipeline

- Use spaCy for word vectors. It returns 300-dimensional vectors so I do not have to count words like a caveman. But that's a lotta dimensions so I so scared.
- Training from scratch.

## Things to look into
- spaCy documentation
- PyTorch, what it do! Yop yop yop yop.

In [1]:
!pip install datasets
!pip install tqdm
!python -m spacy download en_core_web_md
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [35]:
from datasets import load_dataset
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import time

print("Loading Wikitext dataset...")
ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True)
print("Yop yop! Wikipedia dataset loaded.")

nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

# --- AGGRESSIVE SAMPLING ---
small_ds = ds.take(10000)  # Take only 1000 examples
# --- END AGGRESSIVE SAMPLING ---

def tokenize_text(text, word_to_index):
  tokens = [token.text.lower() for token in nlp(text) if token.is_alpha]
  return [word_to_index.get(token, 0) for token in tokens]

class emCyclopedia_LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, hidden_dim=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)  # Predict next word

    def forward(self, input_ids):
        """
        Forward pass of the model.

        Args:
            input_ids: Tensor of shape (batch_size, sequence_length) containing the word indices.

        Returns:
            Tensor of shape (batch_size, sequence_length, vocab_size) containing the logits.
        """
        embedded = self.embedding(input_ids)  # (batch_size, sequence_length, embedding_dim)
        lstm_out, _ = self.lstm(embedded)     # (batch_size, sequence_length, hidden_dim)
        logits = self.fc(lstm_out)          # (batch_size, sequence_length, vocab_size)
        return logits

    def generate(self, input_ids, top_p=0.9, temperature=1.0, max_length=50):
        generated_tokens = []
        for _ in range(max_length):
            logits = self.forward(input_ids)  # Get raw scores
            logits = logits[:, -1, :]  # Focus on the last token
            logits = logits / temperature  # Apply temperature scaling
            probs = F.softmax(logits, dim=-1)  # Convert to probabilities
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)

            # Top-p nucleus filtering
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
            cutoff_index = (cumulative_probs > top_p).nonzero(as_tuple=True)[1][0].item()
            filtered_probs = sorted_probs[:, :cutoff_index + 1] #remember to index both dimensions!
            filtered_indices = sorted_indices[:, :cutoff_index + 1]

            # Sample from the filtered distribution
            next_token_index = torch.multinomial(filtered_probs, 1).item() #extract the index first!
            next_token = filtered_indices[0, next_token_index]  # Now index correctly.
            generated_tokens.append(next_token.item()) #append the item
            input_ids = torch.cat([input_ids, torch.tensor([[next_token.item()]], dtype=torch.long).to(input_ids.device)], dim=1)  # Append new token, and make sure its the item!

        return generated_tokens

class WikiDataset(Dataset):
    def __init__(self, data, word_to_index, max_examples=10000, window_size=3):
        self.data = []
        self.window_size = window_size #store the window size
        count = 0
        for example in data:
            text = example['text']
            tokenized = tokenize_text(text, word_to_index)
            # Create input/target pairs
            for i in range(0, len(tokenized) - self.window_size, 1): #iterate correctly
                context = torch.tensor(tokenized[i : i + self.window_size ], dtype=torch.long)
                target  = torch.tensor(tokenized[i + 1: i + self.window_size + 1], dtype=torch.long) #get all the targets
                count += 1
                self.data.append((context, target))
                if count >= max_examples:
                  return

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# --- INSERT THIS BLOCK ---
print("Tokenizing and building vocabulary...")
all_words = []
total_examples = 0

for example in small_ds:
    start_time = time.time()
    total_examples += 1
    text = example['text']
    tokens = [token.text.lower() for token in nlp(text) if token.is_alpha]
    all_words.extend(tokens)
    end_time = time.time()
    if total_examples % 100 == 0:  # Frequent updates
        print(f"Processed {total_examples} examples. Current all_words size: {len(all_words)}. Time per example: {(end_time - start_time):.4f} seconds")

print(f"Tokenization complete. Total examples processed: {total_examples}")
# --- END OF INSERTED BLOCK ---

# Create vocabulary
word_counts = Counter(all_words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_size = len(vocab)
word_to_index = {word: index for index, word in enumerate(vocab)}
# --- AGGRESSIVE SAMPLING (WikiDataset) ---
train_dataset = WikiDataset(small_ds, word_to_index, max_examples=10000, window_size=100)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# --- END AGGRESSIVE SAMPLING ---

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = emCyclopedia_LSTM(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Loading Wikitext dataset...
Yop yop! Wikipedia dataset loaded.
Tokenizing and building vocabulary...
Processed 100 examples. Current all_words size: 5425. Time per example: 0.0039 seconds
Processed 200 examples. Current all_words size: 8325. Time per example: 0.0036 seconds
Processed 300 examples. Current all_words size: 9798. Time per example: 0.0288 seconds
Processed 400 examples. Current all_words size: 13202. Time per example: 0.0002 seconds
Processed 500 examples. Current all_words size: 18099. Time per example: 0.0390 seconds
Processed 600 examples. Current all_words size: 22528. Time per example: 0.0037 seconds
Processed 700 examples. Current all_words size: 28383. Time per example: 0.0002 seconds
Processed 800 examples. Current all_words size: 34369. Time per example: 0.0033 seconds
Processed 900 examples. Current all_words size: 39099. Time per example: 0.0001 seconds
Processed 1000 examples. Current all_words size: 43955. Time per example: 0.0094 seconds
Processed 1100 exampl

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

valid_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="validation", streaming=True)
valid_dataset = WikiDataset(valid_ds, word_to_index, max_examples=10000, window_size=100) # Use the same parameters as training
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)  # shuffle=False for evaluation
train_losses = []
valid_losses = []
# --- Assuming you have your model, optimizer, criterion, train_loader, and valid_loader defined ---

def train_model(model, train_loader, valid_loader, epochs=100, patience=10): # Added patience
    model.train()
    best_valid_loss = float('inf')  # Initialize best validation loss
    patience_counter = 0 # counter for early stopping

    for epoch in range(epochs):
        # --- Training Phase ---
        model.train()  # Set the model to training mode
        total_train_loss = 0
        for batch_num, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            if (batch_num + 1) % 10 == 0:  # Print progress every 10 batches
                print(f"Epoch {epoch+1}, Batch {batch_num+1}, Loss: {loss.item():.4f}")
        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

        # --- Validation Phase ---
        model.eval()  # Set the model to evaluation mode
        total_valid_loss = 0
        with torch.no_grad():  # Disable gradient calculation during validation
            for inputs, targets in valid_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
                total_valid_loss += loss.item()

        avg_valid_loss = total_valid_loss / len(valid_loader)
        valid_losses.append(avg_valid_loss)
        print(f"Epoch {epoch+1}, Valid Loss: {avg_valid_loss:.4f}")

        # --- Early Stopping ---
        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            patience_counter = 0
            # Save the best model (optional, but recommended)
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break # Stop if it has not gotten better.

# --- END AGGRESSIVE TRAINING AND VALIDATION ---
train_model(model, train_loader, valid_loader, epochs=100, patience=10)

Epoch 1, Batch 10, Loss: 6.6539
Epoch 1, Batch 20, Loss: 6.5807
Epoch 1, Batch 30, Loss: 6.7634
Epoch 1, Batch 40, Loss: 6.6724
Epoch 1, Batch 50, Loss: 6.6151
Epoch 1, Batch 60, Loss: 6.6746
Epoch 1, Batch 70, Loss: 6.6645
Epoch 1, Batch 80, Loss: 6.5450
Epoch 1, Batch 90, Loss: 6.7582
Epoch 1, Batch 100, Loss: 6.6609
Epoch 1, Batch 110, Loss: 6.4986
Epoch 1, Batch 120, Loss: 6.6664
Epoch 1, Batch 130, Loss: 6.7067
Epoch 1, Batch 140, Loss: 6.5908
Epoch 1, Batch 150, Loss: 6.7142
Epoch 1, Batch 160, Loss: 6.5322
Epoch 1, Batch 170, Loss: 6.5708
Epoch 1, Batch 180, Loss: 6.5127
Epoch 1, Batch 190, Loss: 6.7598
Epoch 1, Batch 200, Loss: 6.5875
Epoch 1, Batch 210, Loss: 6.7177
Epoch 1, Batch 220, Loss: 6.5282
Epoch 1, Batch 230, Loss: 6.5499
Epoch 1, Batch 240, Loss: 6.5765
Epoch 1, Batch 250, Loss: 6.4745
Epoch 1, Batch 260, Loss: 6.7351
Epoch 1, Batch 270, Loss: 6.6152
Epoch 1, Batch 280, Loss: 6.5499
Epoch 1, Batch 290, Loss: 6.5228
Epoch 1, Batch 300, Loss: 6.4134
Epoch 1, Batch 310,

In [None]:
import matplotlib.pyplot as plt

epochs = range(1, num_epochs + 1)

plt.plot(epochs, train_losses, marker='o', linestyle='-', label='Training Loss')
plt.plot(epochs, valid_losses, marker='x', linestyle='-', label='Validation Loss')

plt.title('Training and Validation Loss vs. Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()  # Add a legend to differentiate the lines
plt.show()

In [34]:
# Example usage (after training)
model.eval()  # Set the model to evaluation mode

# Start with a seed sequence (e.g., "The cat sat")
seed_text = "The cat sat"
seed_tokens = tokenize_text(seed_text, word_to_index)
input_ids = torch.tensor([seed_tokens], dtype=torch.long).to(device)

# Generate text
generated_tokens = model.generate(input_ids, max_length=50)

# Convert tokens back to words
generated_text = " ".join([vocab[token] for token in generated_tokens])

print(f"Seed text: {seed_text}")
print(f"Generated text: {generated_text}")

Seed text: The cat sat
Generated text: fresno both returned from previous entries along with valkyria chronicles ii while it retained the standard features of the series of linear missions gradually unlocked as maps that remain unaltered unless otherwise dictated by the darcsen heavy weapons specialist who seeks revenge against the valkyria who destroyed her home and


In [None]:
torch.save(model.state_dict(), "emcyclopedia_v3.pt")
print("Model saved.")

In [None]:
model.load_state_dict(torch.load("emcyclopedia_v3.pt"))
model.eval()  # Put it in evaluation mode
print("Model loaded.")

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Hyperparameters
hidden_size = 128  # Size of hidden layer
num_layers = 2     # Number of RNN layers
seq_length = 30    # Number of characters per training sequence
batch_size = 64    # Batch size
learning_rate = 0.002
num_epochs = 10

# Sample text data (replace with actual Wikipedia text)
text = "The quick brown fox jumps over the lazy dog. " * 100
chars = list(set(text))  # Unique characters
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for ch, i in char_to_idx.items()}
vocab_size = len(chars)

# Convert text to numerical format
data = torch.tensor([char_to_idx[ch] for ch in text], dtype=torch.long)

def get_batches(data, seq_length, batch_size):
    n = (len(data) - 1) // (seq_length * batch_size)
    for i in range(0, n * batch_size * seq_length, seq_length):
        x = data[i:i+seq_length]
        y = data[i+1:i+seq_length+1]
        yield x.view(batch_size, -1), y.view(batch_size, -1)

# Define RNN model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h):
        x = self.embed(x)
        out, h = self.rnn(x, h)
        out = self.fc(out.reshape(out.size(0) * out.size(1), -1))
        return out, h

# Initialize model, loss, and optimizer
model = RNNModel(vocab_size, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    h = torch.zeros(num_layers, batch_size, hidden_size)  # Initialize hidden state
    for x, y in get_batches(data, seq_length, batch_size):
        optimizer.zero_grad()
        output, h = model(x, h.detach())
        loss = criterion(output, y.view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Generate text
def generate_text(start_str, length=100):
    model.eval()
    input_seq = torch.tensor([char_to_idx[ch] for ch in start_str], dtype=torch.long).unsqueeze(0)
    h = torch.zeros(num_layers, 1, hidden_size)
    result = start_str
    with torch.no_grad():
        for _ in range(length):
            output, h = model(input_seq, h)
            pred_idx = torch.argmax(output[-1]).item()
            result += idx_to_char[pred_idx]
            input_seq = torch.tensor([[pred_idx]])
    return result

print(generate_text("The quick ", 200))


RuntimeError: shape '[64, -1]' is invalid for input of size 30