In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
# Load the AG News dataset
agnews_train = load_dataset("fancyzhx/ag_news", split="train")
agnews_test = load_dataset("fancyzhx/ag_news", split="test")

In [3]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [4]:
# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

In [5]:
tokenized_train = agnews_train.map(tokenize_function, batched=True)
tokenized_test = agnews_test.map(tokenize_function, batched=True)

In [6]:
# Convert datasets to PyTorch Dataset
class AGNewsDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # item = {key: torch.tensor(self.encodings[key][idx]) for key in self.encodings if key != 'label'}
        item = {key: torch.tensor(self.encodings[key][idx]) for key in self.encodings if key != 'label'}
        item = {key: torch.tensor(self.encodings[key][idx]) for key in self.encodings.features if key != 'label'}
        item['labels'] = torch.tensor(self.encodings['label'][idx])
        return item

In [7]:
train_dataset = AGNewsDataset(tokenized_train)
test_dataset = AGNewsDataset(tokenized_test)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [8]:
# Check the number of samples in the training dataset
print(f"Number of samples in the training dataset: {len(train_dataset)}")

# Print the first sample
print("First sample in the training dataset:")
print(train_dataset[0])

# Print a few samples
for i in range(5):
    print(f"Sample {i}:")
    print(train_dataset[i])

# Convert tokenized IDs back to text
def decode_example(example):
    input_ids = example['input_ids'].tolist()
    return tokenizer.decode(input_ids, skip_special_tokens=True)

# Print a few decoded samples
for i in range(5):
    print(f"Sample {i}:")
    decoded_text = decode_example(train_dataset[i])
    print(f"Text: {decoded_text}")
    print(f"Label: {train_dataset[i]['labels'].item()}")

Number of samples in the training dataset: 120000
First sample in the training dataset:


ValueError: invalid literal for int() with base 10: 'text'

In [8]:
# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [9]:
# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)



In [10]:
# Training loop
model.train()
for epoch in range(3):  # Number of epochs
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/7500 [00:00<?, ?it/s]

TypeError: new(): invalid data type 'str'

In [None]:
# Evaluation loop
model.eval()
all_preds = []
all_labels = []
for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

In [None]:
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")