In [1]:
# Install necessary libraries
!pip install transformers


Collecting transformers
  Downloading transformers-4.35.1-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
# Import required libraries
from google.colab import files
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# Clear CUDA cache
torch.cuda.empty_cache()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Define the DebertaClassifier class for binary classification
class DebertaBinaryClassifier(nn.Module):
    def __init__(self):
        super(DebertaBinaryClassifier, self).__init__()
        self.deberta = AutoModel.from_pretrained('microsoft/deberta-large')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(1024, 1)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        # Model forward pass
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs.last_hidden_state
        cls_representation = last_hidden_state[:, 0, :]
        cls_representation = self.dropout(cls_representation)
        logits = self.fc(cls_representation)
        logits = logits.squeeze(dim=1)
        if labels is not None:
            # Compute binary cross-entropy loss
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())
            return loss
        return logits


In [5]:
# Modify the DebertaTrainer class for binary classification
class DebertaBinaryTrainer:
    def __init__(self, model, train_dataloader, val_dataloader, test_dataloader, learning_rate=2e-5, eps=1e-8,
                 weight_decay=0.01, betas=(0.9, 0.999), num_epochs=3, warmup_prop=0.1):
        # Initialize trainer parameters
        self.model = model
        self.model.cuda()
        self.learning_rate = learning_rate
        self.eps = eps
        self.weight_decay = weight_decay
        self.betas = betas
        self.num_epochs = num_epochs
        self.warmup_prop = warmup_prop
        self.train_dataloader = train_dataloader
        self.validation_dataloader = val_dataloader
        self.test_dataloader = test_dataloader

        # Use PyTorch implementation of AdamW optimizer
        self.optimizer = optim.AdamW(self.model.parameters(), lr=self.learning_rate, eps=self.eps,
                                     weight_decay=self.weight_decay, betas=self.betas)

        # Learning rate scheduler with warm-up
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=len(self.train_dataloader) * self.num_epochs * self.warmup_prop,
                                                         num_training_steps=len(self.train_dataloader) * self.num_epochs)


In [6]:
# Modify the DebertaTrainer class for binary classification
class DebertaBinaryTrainer:
    def __init__(self, model, train_dataloader, val_dataloader, test_dataloader, learning_rate=2e-5, eps=1e-8,
                 weight_decay=0.01, betas=(0.9, 0.999), num_epochs=3, warmup_prop=0.1):
        # Initialize trainer parameters
        self.model = model
        self.model.cuda()
        self.learning_rate = learning_rate
        self.eps = eps
        self.weight_decay = weight_decay
        self.betas = betas
        self.num_epochs = num_epochs
        self.warmup_prop = warmup_prop
        self.train_dataloader = train_dataloader
        self.validation_dataloader = val_dataloader
        self.test_dataloader = test_dataloader

        # Use PyTorch implementation of AdamW optimizer
        self.optimizer = optim.AdamW(self.model.parameters(), lr=self.learning_rate, eps=self.eps,
                                     weight_decay=self.weight_decay, betas=self.betas)

        # Learning rate scheduler with warm-up
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=len(self.train_dataloader) * self.num_epochs * self.warmup_prop,
                                                         num_training_steps=len(self.train_dataloader) * self.num_epochs)

    def _train_epoch(self, epoch):
        # Train the model for one epoch
        self.model.train()
        train_loss = 0
        num_train_steps = 0

        train_iterator = tqdm(self.train_dataloader, desc="Training")

        for step, batch in enumerate(train_iterator):
            # Move input data to device
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            # Zero the gradients
            self.model.zero_grad()

            # Forward pass and compute loss
            loss = self.model(input_ids, attention_mask=attention_masks, labels=labels)

            # Backward pass and optimization step
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()

            # Update training loss
            train_loss += loss.item()
            num_train_steps += 1

            # Update progress bar
            train_iterator.set_description(f"Epoch {epoch + 1} - Train loss: {train_loss / num_train_steps:.3f}")

        # Compute average training loss
        train_loss /= num_train_steps
        print(f"Epoch {epoch + 1} - Training loss: {train_loss:.3f}")

    def evaluate(self):
        # Evaluate the model on the validation set
        self.model.eval()
        val_loss = 0
        num_val_steps = 0
        all_predictions = []
        all_labels = []

        for batch in tqdm(self.validation_dataloader, desc="Validation"):
            # Move input data to device
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            with torch.no_grad():
                # Forward pass
                logits = self.model(input_ids, attention_mask=attention_masks)

            # Compute validation loss
            loss = self.model(input_ids, attention_mask=attention_masks, labels=labels)
            val_loss += loss.item()
            num_val_steps += 1

            # Collect predictions and labels
            all_predictions.extend(torch.round(torch.sigmoid(logits)).detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

        # Compute average validation loss
        val_loss /= num_val_steps
        print(f"Validation loss: {val_loss:.3f}")

        # Compute macro F1 score
        macro_f1 = f1_score(all_labels, all_predictions, average='macro')
        print(f"Validation Macro F1 score: {macro_f1:.3f}")

        return all_predictions, all_labels

    def train(self):
        # Train the model for multiple epochs
        best_val_loss = float('inf')
        for epoch in range(self.num_epochs):
            self._train_epoch(epoch)
            val_predictions, val_labels = self.evaluate()
            val_loss = f1_score(val_labels, val_predictions)
            print(f"Epoch {epoch + 1}: Validation F1 score: {val_loss:.3f}")
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.model.state_dict(), "model.pt")
                print(f"Saved model at epoch {epoch + 1}:")

    def test(self):
        # Test the model on the test set
        self.model.load_state_dict(torch.load("model.pt"))
        self.model.eval()
        predictions = []

        for batch in tqdm(self.test_dataloader, desc="Testing"):
            # Move input data to device
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)

            with torch.no_grad():
                # Forward pass and compute sigmoid
                logits = torch.sigmoid(self.model(input_ids, attention_mask=attention_masks))

            # Collect predictions
            batch_predictions = torch.round(logits).cpu().numpy()
            predictions.extend(batch_predictions)

        return predictions

In [15]:
class DebertaDataset:
    def __init__(self, sentences, labels=None, tokenizer_name='microsoft/deberta-large', batch_size=4):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.batch_size = batch_size

        self.input_ids, self.attention_masks, self.labels = self._prepare_data(sentences, labels)

        self.train_dataloader = self._create_train_dataloader()

    def _prepare_data(self, sentences, labels):
        input_ids = []
        attention_masks = []

        if labels is not None:
            labels = torch.tensor(labels)

        for sent in sentences:
            encoded_dict = self.tokenizer.encode_plus(
                sent,
                add_special_tokens=True,
                max_length=128,
                pad_to_max_length=True,
                return_attention_mask=True,
                truncation=True,
                return_tensors='pt',
            )
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])

        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        if labels is None:
            labels = torch.zeros(len(sentences), dtype=torch.float)  # Modify this for binary classification

        return input_ids, attention_masks, labels

    def _create_train_dataloader(self):
        dataset = TensorDataset(self.input_ids, self.attention_masks, self.labels)

        dataloader = DataLoader(
            dataset,
            sampler=RandomSampler(dataset),
            batch_size=self.batch_size
        )

        return dataloader


In [8]:
# Upload data files
upload1 = files.upload()
upload2 = files.upload()
upload3 = files.upload()

Saving train_data.csv to train_data.csv


Saving val_data.csv to val_data.csv


Saving test_data.csv to test_data.csv


In [10]:
# Load your data
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('val_data.csv')
test_df = pd.read_csv("test_data.csv")

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
train_df['labels'] = label_encoder.fit_transform(train_df['label_sexist'])
train_df.rename(columns={'text': 'input'}, inplace=True)
train_df = train_df[['input', 'labels']]
train_df = train_df[:2000]

val_df['labels'] = label_encoder.fit_transform(val_df['label_sexist'])
val_df.rename(columns={'text': 'input'}, inplace=True)
val_df = val_df[['input', 'labels']]

test_df['labels'] = label_encoder.fit_transform(test_df['label_sexist'])
test_df.rename(columns={'text': 'input'}, inplace=True)
test_df = test_df[['input', 'labels']]



# Get sentences and labels from training, validation, and test data
sentences_train = train_df['input'].tolist()
labels_train = train_df['labels'].tolist()

sentences_val = val_df['input'].tolist()
labels_val = val_df['labels'].tolist()

sentences_test = test_df['input'].tolist()
labels_test = test_df['labels'].tolist()



In [16]:
# Create DebertaDataset objects
train_dataset = DebertaDataset(sentences_train, labels_train, tokenizer_name='microsoft/deberta-large', batch_size=4)
val_dataset = DebertaDataset(sentences_val, labels_val, tokenizer_name='microsoft/deberta-large', batch_size=4)
test_dataset = DebertaDataset(sentences_test, tokenizer_name='microsoft/deberta-large', batch_size=4)

# Create a DebertaBinaryClassifier object
binary_classifier = DebertaBinaryClassifier()

# Create a DebertaBinaryTrainer object
binary_trainer = DebertaBinaryTrainer(model=binary_classifier,
                                     train_dataloader=train_dataset.train_dataloader,
                                     val_dataloader=val_dataset.train_dataloader,  # Use the train_dataloader for validation
                                     test_dataloader=test_dataset.train_dataloader,  # Use the train_dataloader for testing
                                     num_epochs=10)



In [None]:
# Train the model
binary_trainer.train()

# Get predictions on the test set
test_predictions = binary_trainer.test()

# Evaluate the model on the test set
true_labels = [label for _, _, label in binary_trainer.test_dataloader.dataset]
macro_f1 = f1_score(true_labels, test_predictions, average='macro')
print(f"Test Macro F1 score: {macro_f1:.3f}")

Epoch 1 - Train loss: 0.654: 100%|██████████| 500/500 [02:02<00:00,  4.08it/s]


Epoch 1 - Training loss: 0.654


Validation: 100%|██████████| 500/500 [01:08<00:00,  7.34it/s]


Validation loss: 0.593
Validation Macro F1 score: 0.686
Epoch 1: Validation F1 score: 0.578
Saved model at epoch 1:


Epoch 2 - Train loss: 0.660: 100%|██████████| 500/500 [01:58<00:00,  4.22it/s]


Epoch 2 - Training loss: 0.660


Validation: 100%|██████████| 500/500 [01:08<00:00,  7.26it/s]


Validation loss: 0.517
Validation Macro F1 score: 0.702
Epoch 2: Validation F1 score: 0.512
Saved model at epoch 2:


Epoch 3 - Train loss: 0.500:  42%|████▏     | 210/500 [00:50<01:15,  3.85it/s]