In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:

In [None]:
# Build the token -> index vocabulary from the most frequent corpus words
def build_vocab(dataset, max_vocab_size, stop_words):
    """Map special tokens and the most frequent corpus words to integer ids.

    Each record's ``'text'`` field is lower-cased and tokenized; only purely
    alphabetic tokens that are not in ``stop_words`` are counted.

    Args:
        dataset: Iterable of records exposing a ``'text'`` string.
        max_vocab_size: Number of most frequent words to keep. The four
            special tokens are added on top of this, so the result can hold
            up to ``max_vocab_size + 4`` entries.
        stop_words: Container of tokens to ignore.

    Returns:
        dict: token -> index, with ``[PAD]``, ``[UNK]``, ``[CLS]``, ``[SEP]``
        at indices 0-3 followed by the kept words in frequency order.
    """
    specials = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
    frequencies = Counter()
    for record in dataset:
        for word in word_tokenize(record['text'].lower()):
            if word.isalpha() and word not in stop_words:
                frequencies[word] += 1
    ranked_words = [word for word, _count in frequencies.most_common(max_vocab_size)]
    return {word: index for index, word in enumerate(specials + ranked_words)}

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from datasets import load_dataset
from collections import Counter
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources used below. `word_tokenize` relies on the Punkt
# sentence tokenizer; newer NLTK releases look its data up under 'punkt_tab'
# rather than 'punkt', so both ids are requested to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Seed every random number generator this notebook touches (NumPy, PyTorch and
# the stdlib `random` module) so repeated runs are comparable.
np.random.seed(42)
torch.manual_seed(42)
random.seed(42)

# Hyperdimensional computing primitives plus the "Entropix" noise step
class HDComputing:
    """Elementary operations on hypervectors of a fixed dimensionality.

    Attributes are plain values set in ``__init__``:
      * ``dim`` - length of every hypervector produced by this object.
      * ``random_state`` - private ``np.random.RandomState`` used for sampling.
      * ``entropy_weight`` - scale applied to the noise in ``apply_entropy``.
    """

    def __init__(self, dim, seed=None, entropy_weight=0.1):
        # A dedicated RandomState (rather than the global NumPy generator) is
        # what every sampling method below draws from.
        self.random_state = np.random.RandomState(seed)
        self.dim = dim
        self.entropy_weight = entropy_weight

    def random_hv(self):
        """Sample a length-``dim`` vector whose entries are drawn from {-1, 1}."""
        return self.random_state.choice([-1, 1], size=self.dim)

    def superpose(self, hvs):
        """Bundle hypervectors: element-wise sign of their sum along axis 0.

        Positions where the sum is exactly zero come out as 0.
        """
        return np.sign(np.sum(hvs, axis=0))

    def bind(self, hv1, hv2):
        """Bind two hypervectors by element-wise multiplication."""
        return hv1 * hv2

    def permute(self, hv, shifts=1):
        """Cyclically shift ``hv`` by ``shifts`` positions (``np.roll``)."""
        return np.roll(hv, shifts)

    def apply_entropy(self, hv):
        """Add scaled random +/-1 noise to ``hv`` and take the element-wise sign.

        One fresh noise vector is drawn from ``random_state`` on every call.

        NOTE(review): when every entry of ``hv`` is +/-1 and
        ``abs(entropy_weight) < 1`` the noise term cannot flip any sign, so the
        result has the same values as ``hv`` - confirm whether a stronger
        modulation was intended.
        """
        noise = self.random_hv()
        perturbed = hv + self.entropy_weight * noise
        return np.sign(perturbed)

# Torch Dataset that converts AG News records into (hypervector, label) pairs
class AGNewsDataset(Dataset):
    """Encode each record's text as a hypervector on access.

    Args:
        data: Indexable collection of records with ``'text'`` and ``'label'``.
        vocab: Token -> index mapping (stored on the instance; not read by
            the methods of this class).
        token_hvs: Token -> hypervector mapping passed to ``encode_sequence``.
        hd: Hypervector helper passed to ``encode_sequence``.
        max_seq_len: Maximum number of filtered tokens kept per text, not
            counting the ``[CLS]``/``[SEP]`` markers.
        stop_words: Container of tokens to drop.
    """

    def __init__(self, data, vocab, token_hvs, hd, max_seq_len, stop_words):
        self.data, self.vocab = data, vocab
        self.token_hvs, self.hd = token_hvs, hd
        self.max_seq_len, self.stop_words = max_seq_len, stop_words

    def __len__(self):
        """Number of records in the wrapped data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(float32 sequence hypervector, int64 label)`` for ``idx``."""
        record = self.data[idx]
        text, label = record['text'], record['label']

        # Keep alphabetic, non-stopword tokens, then truncate.
        words = [
            word
            for word in word_tokenize(text.lower())
            if word.isalpha() and word not in self.stop_words
        ]
        # The markers are added after truncation, so the encoded sequence can
        # be up to max_seq_len + 2 tokens long.
        framed = ['[CLS]', *words[:self.max_seq_len], '[SEP]']

        encoded = encode_sequence(framed, self.token_hvs, self.hd)
        features = torch.tensor(encoded, dtype=torch.float32)
        target = torch.tensor(label, dtype=torch.long)
        return features, target

# Assign one entropy-modulated random hypervector to every vocabulary token
def create_token_hvs(vocab, dim, hd):
    """Build a token -> hypervector table.

    Args:
        vocab: Iterable of tokens (iterating a dict yields its keys).
        dim: Not used here; the vector length comes from ``hd``.
        hd: Object providing ``random_hv()`` and ``apply_entropy(hv)``.

    Returns:
        dict: each token mapped to ``hd.apply_entropy(hd.random_hv())``,
        generated in the iteration order of ``vocab``.
    """
    table = {}
    for token in vocab:
        base_hv = hd.random_hv()
        table[token] = hd.apply_entropy(base_hv)
    return table

# Encode a token sequence as a single position-aware hypervector
def encode_sequence(tokens, token_hvs, hd):
    """Combine token hypervectors into one sequence hypervector.

    The hypervector of the token at position ``i`` is cyclically shifted by
    ``i`` via ``hd.permute`` and the shifted vectors are summed; the result is
    the element-wise sign of that sum (0 where the sum is exactly zero, which
    includes every position when ``tokens`` is empty).

    Args:
        tokens: Sequence of token strings.
        token_hvs: Mapping token -> hypervector of length ``hd.dim``.
        hd: Object providing ``dim`` and ``permute(hv, shifts=...)``.

    Returns:
        numpy.ndarray: float array of length ``hd.dim``.

    Raises:
        KeyError: if a token is missing from ``token_hvs`` and the table has
            no ``'[UNK]'`` entry to fall back on.
    """
    sequence_hv = np.zeros(hd.dim)
    for position, token in enumerate(tokens):
        token_hv = token_hvs.get(token)
        if token_hv is None:
            # Resolve the fallback only when it is needed, so a table without
            # '[UNK]' still works for sequences made of known tokens.
            token_hv = token_hvs['[UNK]']
        sequence_hv += hd.permute(token_hv, shifts=position)
    return np.sign(sequence_hv)

# Feed-forward classifier operating on sequence hypervectors
class HDCNNClassifier(nn.Module):
    """Single-hidden-layer network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        dim: Size of the input feature vector.
        num_classes: Number of output classes.
    """

    def __init__(self, dim, num_classes):
        super().__init__()
        hidden_units = 512
        self.fc1 = nn.Linear(dim, hidden_units)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)
        self.fc2 = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities (log-softmax over the last dim)."""
        hidden = self.dropout(self.activation(self.fc1(x)))
        logits = self.fc2(hidden)
        # Log-softmax (not plain softmax) is returned, which is the input
        # format nn.NLLLoss expects.
        return nn.functional.log_softmax(logits, dim=-1)

# Entry point: build the HDC pipeline, train the classifier, report test accuracy
def main():
    """Train ``HDCNNClassifier`` on AG News hypervectors and print test accuracy.

    Steps: load and shuffle the ``ag_news`` splits, build the vocabulary from
    the training split, create one hypervector per token, train for a fixed
    number of epochs with Adam and ``nn.NLLLoss`` (printing the mean batch loss
    per epoch), then print the accuracy on the test split.
    """
    # Hyperparameters
    dim = 5000
    max_vocab_size = 5000
    max_seq_len = 50
    batch_size = 128
    num_epochs = 5
    learning_rate = 0.001
    num_classes = 4

    hd = HDComputing(dim, seed=42, entropy_weight=0.2)
    stop_words = set(stopwords.words('english'))

    # Data: the vocabulary and token hypervectors come from the training split
    # only and are shared by both datasets.
    splits = load_dataset('ag_news')
    train_data = splits['train'].shuffle(seed=42)
    test_data = splits['test'].shuffle(seed=42)

    vocab = build_vocab(train_data, max_vocab_size, stop_words)
    token_hvs = create_token_hvs(vocab, dim, hd)

    train_loader = DataLoader(
        AGNewsDataset(train_data, vocab, token_hvs, hd, max_seq_len, stop_words),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        AGNewsDataset(test_data, vocab, token_hvs, hd, max_seq_len, stop_words),
        batch_size=batch_size,
        shuffle=False,
    )

    # Model, loss and optimizer; NLLLoss consumes the model's log-softmax output.
    model = HDCNNClassifier(dim, num_classes)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Training
    for epoch in range(1, num_epochs + 1):
        model.train()
        running_loss = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        mean_loss = running_loss / len(train_loader)
        print(f'Epoch [{epoch}/{num_epochs}], Loss: {mean_loss:.4f}')

    # Evaluation on the test split (dropout disabled, no gradients tracked)
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            _, predicted = torch.max(model(inputs).data, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')


# Run the full pipeline when this cell/script is executed directly.
if __name__ == '__main__':
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch [1/5], Loss: 1.0738
Epoch [2/5], Loss: 0.8788
Epoch [3/5], Loss: 0.7366
Epoch [4/5], Loss: 0.6200
Epoch [5/5], Loss: 0.5368
Test Accuracy: 62.70%
