In [None]:
# Install the third-party packages imported in the next cell (`datasets`, `tiktoken`).
# NOTE(review): versions are not pinned, so a fresh run may pick up different releases.
!pip install -q datasets
!pip install -q tiktoken

In [None]:
from datasets import load_dataset
# Load the dataset by its Hub identifier; later cells index `ds` by split name.
ds = load_dataset("asahi417/multi-domain-document-classification")

import torch
from torch.utils.data import Dataset , DataLoader

import tiktoken
# GPT-2 encoding; its `n_vocab` is used below to size the token embedding table.
tokenizer = tiktoken.get_encoding('gpt2')

In [None]:
# Arguments: hyperparameters shared by the dataset and the model cells below.
max_length = 90  # token ids kept per text (truncate/pad length); also the model's num_inputs
num_outputs = 4  # width of the output layer; assumes the labels take 4 distinct values - TODO confirm
vocab_size = tokenizer.n_vocab  # number of rows in the token embedding table
emd_dim = 15  # embedding dimension for both token and position embeddings

In [None]:
# Inspect the loaded dataset object before indexing into it.
print(ds)

In [None]:
# Raw texts and their labels, both taken from the 'test' split.
# NOTE(review): only this one split is used as the full data pool - confirm that is intended.
X, y = ds['test']['text'], ds['test']['label']

In [None]:
# Peek at the first ten texts and labels as a quick sanity check.
X[:10],y[:10]

In [None]:
# TODO: try increasing max_length

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the examples, stratifying on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [None]:
# Number of examples in the training and held-out partitions.
len(X_train),len(X_test)

In [None]:
# Define Data Class
class dataset(Dataset):
    """Map-style dataset of fixed-length token-id tensors paired with labels.

    Every text is tokenized once at construction time, truncated to
    ``max_length`` token ids and right-padded with ``pad_token_id`` so that
    each feature tensor has exactly ``max_length`` entries.

    Args:
        X: Sized, iterable collection of raw text strings.
        y: Labels, one per text; converted with ``torch.tensor``.
        tokenizer: Object exposing ``encode(text)`` that returns a list of ints.
        max_length: Number of token ids kept per example.
        pad_token_id: Id appended to sequences shorter than ``max_length``.
            Defaults to ``0``, the value that used to be hard-coded.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y, tokenizer, max_length, pad_token_id=0):
        # Fail early instead of silently dropping texts or hitting an
        # IndexError halfway through tokenization.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = pad_token_id
        self.labels = torch.tensor(y)
        self.features = []
        for text in X:
            tokens = self.tokenizer.encode(text)[:self.max_length]
            missing = self.max_length - len(tokens)
            if missing > 0:
                tokens = tokens + [self.pad_token_id] * missing
            # Explicit dtype keeps the tensor usable as embedding indices
            # even when the token list is empty.
            self.features.append(torch.tensor(tokens, dtype=torch.long))

    def __getitem__(self, index):
        """Return the ``(token_ids, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)
# Tokenize both partitions up front with the shared tokenizer and sequence length.
train_ds = dataset(X=X_train, y=y_train, tokenizer=tokenizer, max_length=max_length)
test_ds = dataset(X=X_test, y=y_test, tokenizer=tokenizer, max_length=max_length)

In [None]:
# Data Loader
# Seed torch's global RNG before building the shuffling loader.
torch.manual_seed(123)
train_loader = DataLoader(
    train_ds,
    batch_size=100,
    shuffle=True,
    num_workers=0,
    drop_last=True,  # skip a final batch that has fewer than 100 examples
)

In [None]:
# Building My NeuralNetwork
class NeuralNetwork(torch.nn.Module):
    """Mean-pooled embedding classifier.

    Token ids are embedded, a learned position embedding is added, the
    result is averaged over the sequence dimension and passed through a
    small fully connected network that produces one logit per class.

    Args:
        num_inputs: Maximum sequence length (rows of the position table).
        num_outputs: Number of logits produced per example.
        vocab_size: Number of rows in the token embedding table.
        emd_dim: Embedding dimension for tokens and positions.
    """

    def __init__(self, num_inputs, num_outputs, vocab_size, emd_dim):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.vocab_size = vocab_size
        self.emd_dim = emd_dim

        self.tok_emb = torch.nn.Embedding(self.vocab_size, self.emd_dim)
        self.pos_emb = torch.nn.Embedding(self.num_inputs, self.emd_dim)

        self.layers = torch.nn.Sequential(

            # 1st hidden layer
            torch.nn.Linear(self.emd_dim, 30),
            torch.nn.ReLU(),

            # 2nd hidden layer
            torch.nn.Linear(30, 20),
            torch.nn.ReLU(),

            # output layer
            torch.nn.Linear(20, self.num_outputs)
        )

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor whose last dimension is the sequence length;
               the training code passes ``[batch_size, num_inputs]``.

        Returns:
            Tensor of logits, ``[batch_size, num_outputs]`` for 2-D input.

        Raises:
            ValueError: If the sequence is longer than ``num_inputs``.
        """
        seq_len = x.shape[-1]
        if seq_len > self.num_inputs:
            raise ValueError(
                f"sequence length {seq_len} exceeds num_inputs={self.num_inputs}"
            )
        # Build the position indices from the input itself: same length as
        # the actual sequence and on the same device as `x`, so the model
        # also works after `.to(device)` and with shorter batches.
        positions = torch.arange(seq_len, device=x.device)

        tok_embeds = self.tok_emb(x)
        pos_embeds = self.pos_emb(positions)
        x = tok_embeds + pos_embeds

        x = x.mean(dim=1)  # Shape: [batch_size, emd_dim]

        logits = self.layers(x)
        return logits

In [None]:
# Training
import torch.nn.functional as F

# Re-seed so parameter initialisation does not depend on earlier cells.
torch.manual_seed(123)
model = NeuralNetwork(
    num_inputs=max_length,
    num_outputs=num_outputs,
    vocab_size=vocab_size,
    emd_dim=emd_dim,
)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

num_epochs = 200
for i in range(num_epochs):
    model.train()
    for batch, (features, labels) in enumerate(train_loader):
        # Forward pass and loss for this mini-batch.
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print loss and progress (only on every 10th epoch).
        if (i + 1) % 10 != 0:
            continue
        print(f"Epoch {i + 1:03d}/{num_epochs:03d}"
              f" | Batch {batch + 1:03d}/{len(train_loader):03d}"
              f" | Loss: {loss.item():.4f}")

In [None]:
# Prediction Accuracy
def compute_accuracy(model, loader):
    """Return the fraction of examples in ``loader`` that ``model`` gets right.

    The model is switched to evaluation mode and left in it; gradients are
    disabled for the whole pass.

    Args:
        model: Callable module mapping a batch of features to logits with
            the class dimension at ``dim=1``.
        loader: Iterable yielding ``(features, labels)`` batches.

    Returns:
        Accuracy as a Python float in ``[0, 1]``, or ``nan`` when the loader
        yields no examples.
    """
    correct = 0
    total_examples = 0
    model.eval()
    with torch.no_grad():
        # Use the batch yielded by *this* loader; the loop targets must match
        # the names used in the body, otherwise stale globals get scored.
        for features, labels in loader:
            logits = model(features)
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == labels).sum().item()
            total_examples += len(labels)
    if total_examples == 0:
        # Accuracy over zero examples is undefined; avoid a ZeroDivisionError.
        return float("nan")
    return correct / total_examples

In [None]:
torch.manual_seed(123)
test_loader = DataLoader(
    test_ds,
    batch_size=500,
    shuffle=False,  # iterate the held-out examples in stored order
    num_workers=0,
)

In [None]:
# Score the trained model with compute_accuracy over the held-out loader and display the result.
accuracy_test = compute_accuracy(model,test_loader)
accuracy_test

In [None]:
# Same measurement over the training loader. That loader was built with
# shuffle=True and drop_last=True, so an incomplete final batch is not iterated.
accuracy_train = compute_accuracy(model,train_loader)
accuracy_train