In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader, Dataset
import torch
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# Load dataset
file_path = '/content/drive/MyDrive/labeled_data_cleaned.csv'  # Update with your file path
data = pd.read_csv(file_path)

# Drop redundant column if it exists
if 'Unnamed: 0' in data.columns:
    data.drop(columns=['Unnamed: 0'], inplace=True)

# Clean tweets
def clean_tweet(tweet):
    tweet = re.sub(r"http\\S+", "", tweet)  # Remove URLs
    tweet = re.sub(r"[^A-Za-z0-9\\s]", "", tweet)  # Remove special characters
    tweet = re.sub(r"\\s+", " ", tweet).strip()  # Remove extra spaces
    return tweet

data['tweet'] = data['corrected_tweet'].apply(clean_tweet)

# Step 2: Split dataset
train_data, temp_data = train_test_split(data, test_size=0.3, stratify=data['class'], random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['class'], random_state=42)

# Step 3: Balance the dataset (Oversampling Hate Speech class)
hate_speech = train_data[train_data['class'] == 0]
train_data_balanced = pd.concat([
    train_data,
    resample(hate_speech, replace=True, n_samples=1000, random_state=42)
])

# Step 4: Tokenize data
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

def tokenize_data(data):
    return tokenizer(
        data['tweet'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=64,
        return_tensors='pt'
    )

train_encodings = tokenize_data(train_data_balanced)
val_encodings = tokenize_data(val_data)
test_encodings = tokenize_data(test_data)

# Step 5: Define Dataset class
class HateSpeechDataset(Dataset):
    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels.values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_texts['input_ids'][idx],
            'attention_mask': self.tokenized_texts['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = HateSpeechDataset(train_encodings, train_data_balanced['class'])
val_dataset = HateSpeechDataset(val_encodings, val_data['class'])
test_dataset = HateSpeechDataset(test_encodings, test_data['class'])

# Step 6: Prepare DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Step 7: Define the model
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=3)

# Step 8: Define optimizer, scheduler, and class weights
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 10
)

class_weights = torch.tensor([3.0, 1.0, 2.0], dtype=torch.float).to('cuda')
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Step 9: Define the evaluate_model function
def evaluate_model(dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_preds, all_labels

# Training with Mixed Precision and Early Stopping
scaler = GradScaler()
accumulation_steps = 2
epochs = 10
best_val_f1 = 0
early_stopping_patience = 3
patience_counter = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch + 1}/{epochs}")

    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        with autocast():
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, labels)
            loss = loss / accumulation_steps

        scaler.scale(loss).backward()

        if (batch_idx + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * accumulation_steps

    scheduler.step()
    print(f"Average training loss: {total_loss / len(train_dataloader):.4f}")

    # Validation
    val_preds, val_labels = evaluate_model(val_dataloader)
    val_report = classification_report(
        val_labels,
        val_preds,
        target_names=['Hate Speech', 'Offensive', 'Neutral'],
        output_dict=True
    )
    val_f1 = val_report['Hate Speech']['f1-score']
    print(f"Validation F1 for Hate Speech: {val_f1:.4f}")

    # Early Stopping
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_deberta_model.pth")
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= early_stopping_patience:
        print("Early stopping triggered.")
        break

# Step 10: Load Best Model and Test
model.load_state_dict(torch.load("best_deberta_model.pth"))
test_preds, test_labels = evaluate_model(test_dataloader)
print("Test Metrics:")
print(classification_report(test_labels, test_preds, target_names=['Hate Speech', 'Offensive', 'Neutral']))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1/10


  with autocast():
100%|██████████| 54/54 [00:11<00:00,  4.65it/s]


Average training loss: 0.8206


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation F1 for Hate Speech: 0.1013
Epoch 2/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.92it/s]


Average training loss: 0.6043


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation F1 for Hate Speech: 0.1013
Epoch 3/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.97it/s]


Average training loss: 0.3872


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation F1 for Hate Speech: 0.1515
Epoch 4/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.85it/s]


Average training loss: 0.3035


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation F1 for Hate Speech: 0.2143
Epoch 5/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.83it/s]


Average training loss: 0.2435
Validation F1 for Hate Speech: 0.3077
Epoch 6/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.81it/s]


Average training loss: 0.2099
Validation F1 for Hate Speech: 0.2222
Epoch 7/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.71it/s]


Average training loss: 0.1211
Validation F1 for Hate Speech: 0.2000
Epoch 8/10


  with autocast():
100%|██████████| 54/54 [00:09<00:00,  5.57it/s]


Average training loss: 0.1017
Validation F1 for Hate Speech: 0.2727
Early stopping triggered.


  model.load_state_dict(torch.load("best_deberta_model.pth"))


Test Metrics:
              precision    recall  f1-score   support

 Hate Speech       0.50      0.11      0.18         9
   Offensive       0.82      0.94      0.87       114
     Neutral       0.53      0.33      0.41        27

    accuracy                           0.78       150
   macro avg       0.62      0.46      0.49       150
weighted avg       0.75      0.78      0.75       150

