## Import Libraries

In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [None]:
!pip install transformers



In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_scheduler
from datasets import load_dataset
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


## Load The Data

In [2]:
# Load the dataset from Hugging Face
dataset = load_dataset("tweet_eval", "sentiment")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]


In [3]:
classes_names = train_dataset.features['label'].names
classes_names

['negative', 'neutral', 'positive']

## Preprocess the Data

### Calculate the Lengths of All Texts

In [4]:
# Calculate the lengths of all texts in the dataset
train_lengths = [len(text.split()) for text in train_dataset['text']]
val_lengths = [len(text.split()) for text in val_dataset['text']]
test_lengths = [len(text.split()) for text in test_dataset['text']]

In [5]:
import numpy as np

# Calculate the 98th percentile length
max_len_train = np.percentile(train_lengths, 98)
max_len_val = np.percentile(val_lengths, 98)
max_len_test = np.percentile(test_lengths, 98)

# Get the maximum length to cover 98% of all data
max_len = int(max(max_len_train, max_len_val, max_len_test))

print(f'98th percentile of sequence lengths - Train: {max_len_train}, Validation: {max_len_val}, Test: {max_len_test}')
print(f'Selected max_len for padding: {max_len}')


98th percentile of sequence lengths - Train: 28.0, Validation: 29.0, Test: 25.0
Selected max_len for padding: 29


### Use the Calculated max_length in the Tokenizer

In [6]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to tokenize the text with calculated max_length
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=max_len)

# Apply the preprocessing to the datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Set the format of the datasets to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


### Create DataLoaders

In [7]:
# Function to create DataLoaders
def create_dataloader(dataset, batch_size=32):
    return DataLoader(dataset, shuffle=True, batch_size=batch_size)

# Setup batch_size
BATCH_SIZE = 32

# Turn data into iterables (batches)
train_dataloader = DataLoader(dataset=train_dataset,
                              shuffle=True,
                              batch_size=BATCH_SIZE)

val_dataloader = DataLoader(dataset=val_dataset,
                              shuffle=False,
                              batch_size=BATCH_SIZE)

test_dataloader = DataLoader(dataset=test_dataset,
                              shuffle=False,
                              batch_size=BATCH_SIZE)


## Initialize the Model

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [9]:
# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Move the model to GPU if available
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Define Optimizer and Loss Function

In [10]:
# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



## Training Loop

In [11]:
# Import tqdm for progress bar
from tqdm.auto import tqdm

# Set seed
torch.manual_seed(42)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Function to train the model
def train_model(model, train_dataloader, val_dataloader, optimizer, lr_scheduler, num_epochs, device):
    model.train()
    for epoch in tqdm(range(num_epochs)):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            # Calculate accuracy
            preds = torch.argmax(outputs.logits, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += labels.size(0)

        avg_train_loss = total_loss / len(train_dataloader)
        train_accuracy = total_correct / total_samples
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

        # Evaluate on validation set
        model.eval()
        val_loss = 0
        val_correct = 0
        val_samples = 0

        with torch.no_grad():
            for batch in val_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(outputs.logits, dim=1)
                val_correct += (preds == labels).sum().item()
                val_samples += labels.size(0)

        avg_val_loss = val_loss / len(val_dataloader)
        val_accuracy = val_correct / val_samples
        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        model.train()

# Train the model
train_model(model, train_dataloader, val_dataloader, optimizer, lr_scheduler, num_epochs, device)


  0%|          | 0/3 [00:00<?, ?it/s]