In [2]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [17]:
import torch
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch.optim as optim
from transformers import get_scheduler
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
from peft import LoraConfig, TaskType, get_peft_model
import bitsandbytes as bnb
from accelerate import Accelerator
from transformers import BitsAndBytesConfig
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [18]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

# 4-bit NF4 quantization of the base model; compute and storage dtypes are
# both bfloat16, matching the torch_dtype passed to from_pretrained below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)
base_model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=6,
    # The dataset yields one float vector of six label flags per comment, so
    # state the multi-label objective explicitly instead of relying on it
    # being inferred from the label dtype at the first forward pass.
    problem_type="multi_label_classification",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)

# LoRA adapters on top of the quantized base model.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=4,  # Lowered rank for faster training
    lora_alpha=16,
    lora_dropout=0.1,
)

model = get_peft_model(base_model, peft_config)

if torch.cuda.is_available():
    device = torch.device("cuda")
    model.to(device)

# Accelerator for mixed precision and multi-GPU support.
# The mode is "bf16" so autocast agrees with the bfloat16 dtypes configured
# above (the previous "fp16" setting mixed two different half-precision types).
# NOTE: the mixed-precision mode cannot be changed once an Accelerator exists
# in this kernel; re-running this cell with a different mode raises
# "ValueError: AcceleratorState has already been initialized". Restart the
# kernel before changing it.
accelerator = Accelerator(mixed_precision="bf16")
# The model is intentionally NOT passed to accelerator.prepare() here: it is
# prepared once, together with the data loader and optimizer, in a later cell.

# Print trainable parameters
model.print_trainable_parameters()

# Optimizer: only hand over the parameters that are actually trained; the
# frozen base weights never receive gradients.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_parameters, lr=5e-5)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: AcceleratorState has already been initialized and cannot be changed, restart your runtime completely and pass `mixed_precision='fp16'` to `Accelerator()`.

In [9]:
class ToxicCommentDataset(Dataset):
    """Map-style dataset pairing tokenized comments with their label vectors.

    Args:
        encodings: Mapping from feature name to an indexable container of
            per-example values; ``encodings[key][idx]`` must return the
            features of example ``idx``.
        labels: One row of label values per example. May be a pandas
            DataFrame (converted with ``to_numpy()``) or any nested
            list / array that ``torch.tensor`` can convert.

    Each item is a dict holding every key of ``encodings`` plus ``"labels"``,
    a ``torch.float`` tensor with the label row of that example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Kept as passed in so existing code reading ``dataset.labels`` still works.
        self.labels = labels
        # Convert the labels to a float tensor once, instead of doing a pandas
        # row lookup and a tensor construction on every __getitem__ call.
        # This is a snapshot: later changes to ``labels`` are not reflected.
        label_values = labels.to_numpy() if hasattr(labels, "to_numpy") else labels
        self._label_tensor = torch.tensor(label_values, dtype=torch.float)

    def __len__(self):
        """Return the number of examples (rows of ``labels``)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and float label vector of example ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self._label_tensor[idx]
        return item

In [10]:
# Read the training data from the working directory.
df = pd.read_csv("train.csv")

# Names of the six target columns used as labels.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# X: the raw comment text (a Series); y: the label columns (a DataFrame).
X = df["comment_text"]
y = df[label_columns]

In [11]:
# Tokenize the text
def tokenize_text(texts, max_length=128):
    """Tokenize a pandas Series of strings with the module-level ``tokenizer``.

    Args:
        texts: Series of input texts; converted to a plain list before encoding.
        max_length: Upper bound on the encoded length; longer inputs are truncated.

    Returns:
        The tokenizer output with PyTorch tensors. With ``padding=True`` the
        sequences are padded to the longest one in the call (which is at most
        ``max_length`` because of truncation), not unconditionally to
        ``max_length``.
    """
    encode_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    raw_texts = texts.tolist()
    return tokenizer(raw_texts, **encode_options)

# Encode every comment in one call.
tokenized_texts = tokenize_text(X)

In [12]:
# Wrap the encoded comments and their labels, then batch them for training.
batch_size = 128
dataset = ToxicCommentDataset(tokenized_texts, y)

# Loader settings: reshuffle every epoch, load with 7 worker processes that
# are kept alive between epochs, and use pinned host memory.
loader_options = {
    "batch_size": batch_size,
    "shuffle": True,
    "num_workers": 7,
    "pin_memory": True,
    "persistent_workers": True,
}
data_loader = DataLoader(dataset, **loader_options)



In [13]:
# configuring hf accelerate
# Hand the data loader, model and optimizer to the Accelerator and rebind the
# three names to the objects it returns (same order as passed in).
# NOTE: because the names are rebound in place, re-running this cell passes the
# already-prepared objects to prepare() again; run it once per kernel session.
data_loader, model, optimizer = accelerator.prepare(data_loader, model, optimizer)

In [14]:
# Scheduler
# One scheduler step is taken per batch, so the schedule spans
# (number of epochs) x (batches per epoch) steps in total.
epochs = 3
steps_per_epoch = len(data_loader)
training_steps = epochs * steps_per_epoch

# Cosine schedule with no warmup phase.
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_training_steps=training_steps,
    num_warmup_steps=0,
)

In [15]:
# Progress bar with one tick per training step (training_steps in total).
# It is not iterated over; the training loop advances it with update(1).
progress_bar = tqdm(range(training_steps))


  0%|          | 0/2494 [00:00<?, ?it/s]

In [16]:
# Multi-label training loop.
#
# Each example carries a vector of independent label flags, so a prediction is
# made per label by thresholding the sigmoid of its logit. Taking argmax over
# the logits would yield a single class index per example, which cannot be
# compared against multi-hot label vectors by the sklearn metrics below.
#
# NOTE: the metrics are computed on the training batches while the model is in
# train mode; they describe training fit, not held-out performance.
prediction_threshold = 0.5  # sigmoid probability above which a label is predicted

try:
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0
        all_predictions = []
        all_labels = []

        model.train()
        for batch in data_loader:
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss

            # Backward pass
            accelerator.backward(loss)

            # Update parameters
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Store predictions and labels as integer multi-hot rows.
            # The logits are cast to float32 first so the result converts to
            # NumPy regardless of the precision the model computed in.
            with torch.no_grad():
                probabilities = torch.sigmoid(outputs.logits.float())
                predictions = (probabilities > prediction_threshold).int()
            labels = batch["labels"].int()

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # Update progress bar
            progress_bar.update(1)

            # Accumulate loss
            total_loss += loss.item()

        # Average loss for the epoch
        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch + 1} - Average Loss: {avg_loss:.4f}")

        # Compute metrics. For multi-label targets, accuracy_score is the
        # exact-match ratio (all labels of an example must be correct).
        # zero_division=0 scores a label with no predicted/true positives as 0
        # without emitting a warning.
        accuracy = accuracy_score(all_labels, all_predictions)
        f1 = f1_score(all_labels, all_predictions, average="weighted", zero_division=0)
        precision = precision_score(all_labels, all_predictions, average="weighted", zero_division=0)
        recall = recall_score(all_labels, all_predictions, average="weighted", zero_division=0)

        print(f"Epoch {epoch + 1} - Accuracy: {accuracy:.4f}")
        print(f"Epoch {epoch + 1} - F1 Score: {f1:.4f}")
        print(f"Epoch {epoch + 1} - Precision: {precision:.4f}")
        print(f"Epoch {epoch + 1} - Recall: {recall:.4f}")
finally:
    # Close the bar even when training is interrupted or fails part-way.
    progress_bar.close()


Epoch 1/1


KeyboardInterrupt: 