In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Charger les données
file_path = "/kaggle/input/datasetes/Arabic.xlsx"  # Mets le bon chemin si nécessaire
df = pd.read_excel(file_path)

# Convertir les labels en 0 et 1
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})

# Séparer en train (80%) et test (20%)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["processed_comment"], df["Hate speech"], test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

# Séparer temp en validation (10%) et test (10%)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

# Charger le tokenizer d'AraBERT
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization des textes avec padding et truncation
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

# Vérifier la taille des ensembles
print(f"Train : {len(train_texts)} échantillons")
print(f"Validation : {len(val_texts)} échantillons")
print(f"Test : {len(test_texts)} échantillons")


tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Train : 10714 échantillons
Validation : 1339 échantillons
Test : 1340 échantillons


In [2]:
!pip install farasapy arabert


Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl.metadata (8.9 kB)
Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl.metadata (16 kB)
Collecting emoji==1.4.2 (from arabert)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186455 sha256=5d9ea76e25fcf65b9558ae85ad929749cc73a776bf70b53ae97dff72009ef09f
  Stored in directory: /root/.cache/pip/wheels/10/f0/fd/4813b1177405693e

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

# Définir une classe Dataset pour PyTorch
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

# Créer les datasets
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)
test_dataset = HateSpeechDataset(test_encodings, test_labels)

# Définir les DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Vérifier la taille des DataLoaders
print(f"Taille train_loader : {len(train_loader)} batches")
print(f"Taille val_loader : {len(val_loader)} batches")
print(f"Taille test_loader : {len(test_loader)} batches")


Taille train_loader : 670 batches
Taille val_loader : 84 batches
Taille test_loader : 84 batches


In [4]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

# Vérifier si GPU disponible
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("GPU")
print(f"Utilisation de {device}")

# Charger AraBERT pour la classification binaire
model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv02", num_labels=2)
model.to(device)

# Définir l'optimiseur et la fonction de perte
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Scheduler pour ajuster le taux d'apprentissage
num_training_steps = len(train_loader) * 3  # 3 époques
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Fonction d'entraînement
def train_epoch(model, train_loader):
    model.train()
    total_loss, correct = 0, 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        loss = loss_fn(outputs.logits, batch["labels"])
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        correct += (outputs.logits.argmax(1) == batch["labels"]).sum().item()

    return total_loss / len(train_loader), correct / len(train_dataset)

# Fonction d'évaluation
def evaluate(model, val_loader):
    model.eval()
    total_loss, correct = 0, 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            outputs = model(**batch)
            loss = loss_fn(outputs.logits, batch["labels"])

            total_loss += loss.item()
            correct += (outputs.logits.argmax(1) == batch["labels"]).sum().item()

    return total_loss / len(val_loader), correct / len(val_dataset)

# Entraînement du modèle
num_epochs = 3
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader)
    val_loss, val_acc = evaluate(model, val_loader)

    print(f"Époque {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
    print("-" * 50)

# Sauvegarder le modèle entraîné
torch.save(model.state_dict(), "arabert_hate_speech.pth")
print("Modèle sauvegardé ! ✅")


Utilisation de cuda


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Époque 1/3
Train Loss: 0.5682 | Train Acc: 0.6962
Val Loss: 0.5479 | Val Acc: 0.7334
--------------------------------------------------
Époque 2/3
Train Loss: 0.4409 | Train Acc: 0.7947
Val Loss: 0.5522 | Val Acc: 0.7431
--------------------------------------------------
Époque 3/3
Train Loss: 0.3242 | Train Acc: 0.8628
Val Loss: 0.5638 | Val Acc: 0.7588
--------------------------------------------------
Modèle sauvegardé ! ✅


In [5]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Define the model name (same as used during training)
model_name = "aubmindlab/bert-base-arabertv02"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model architecture (must match the one used during training)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels if needed

# Load saved weights
model.load_state_dict(torch.load("arabert_hate_speech.pth", map_location=torch.device('cpu')))

# Set model to evaluation mode
model.eval()

def predict(text):
    """Function to test the model on new input text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return "Hate Speech" if predicted_class == 1 else "Not Hate Speech"

# Example Test Cases
test_sentences = [
     "لن تجد أي خير في هؤلاء، فهذه عقلية موروثة عن أجدادهم.",  # Should return "Not Hate Speech"
     "ما كاين حتى فرق بين الشاوي، القبائلي، المزابيي، ولا التارقي، كلنا خاوة",  # Should return "Hate Speech"
]

for sentence in test_sentences:
    print(f"Text: {sentence} -> Prediction: {predict(sentence)}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("arabert_hate_speech.pth", map_location=torch.device('cpu')))


Text: لن تجد أي خير في هؤلاء، فهذه عقلية موروثة عن أجدادهم. -> Prediction: Hate Speech
Text: ما كاين حتى فرق بين الشاوي، القبائلي، المزابيي، ولا التارقي، كلنا خاوة -> Prediction: Not Hate Speech


In [6]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.utils.data import Dataset, DataLoader

# =======================
# 1️⃣ Load and Preprocess Data
# =======================
file_path = "/kaggle/input/datasetes/Arabic.xlsx"  # Change this if necessary
df = pd.read_excel(file_path)

# Convert labels to numeric values
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})

# Encode topics as numbers (Label Encoding)
topics = df["Topic"].unique()
topic_to_id = {topic: i for i, topic in enumerate(topics)}
df["Topic"] = df["Topic"].map(topic_to_id)

# Split into train (80%), validation (10%), and test (10%)
train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
    df["processed_comment"], df["Hate speech"], df["Topic"], test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
    temp_texts, temp_labels, temp_topics, test_size=0.5, random_state=42, stratify=temp_labels
)

# =======================
# 2️⃣ Tokenization
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize text with truncation and padding
def tokenize_texts(texts):
    return tokenizer(list(texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

# =======================
# 3️⃣ Custom Dataset Class
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, hate_speech_labels, topic_labels):
        self.encodings = encodings
        self.hate_speech_labels = hate_speech_labels
        self.topic_labels = topic_labels

    def __len__(self):
        return len(self.hate_speech_labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}  # Fix warning
        item["hate_speech_labels"] = torch.tensor(self.hate_speech_labels.iloc[idx], dtype=torch.long)
        item["topic_labels"] = torch.tensor(self.topic_labels.iloc[idx], dtype=torch.long)
        return item

# Create datasets
train_dataset = HateSpeechDataset(train_encodings, train_labels, train_topics)
val_dataset = HateSpeechDataset(val_encodings, val_labels, val_topics)
test_dataset = HateSpeechDataset(test_encodings, test_labels, test_topics)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# =======================
# 4️⃣ Define Model
# =======================
class MultiTaskModel(nn.Module):
    def __init__(self, model_name, num_topics):
        super(MultiTaskModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.hate_speech_classifier = nn.Linear(self.bert.config.hidden_size, 2)  # Hate speech (binary)
        self.topic_classifier = nn.Linear(self.bert.config.hidden_size, num_topics)  # Topic classification

    def forward(self, input_ids, attention_mask, token_type_ids=None, hate_speech_labels=None, topic_labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token

        hate_speech_logits = self.hate_speech_classifier(pooled_output)
        topic_logits = self.topic_classifier(pooled_output)

        loss = None
        if hate_speech_labels is not None and topic_labels is not None:
            hate_speech_loss = nn.CrossEntropyLoss()(hate_speech_logits, hate_speech_labels)
            topic_loss = nn.CrossEntropyLoss()(topic_logits, topic_labels)
            loss = hate_speech_loss + topic_loss  # Combined loss

        return loss, hate_speech_logits, topic_logits

# Load Model
num_topics = len(topics)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiTaskModel(model_name, num_topics).to(device)

# =======================
# 5️⃣ Training Setup
# =======================
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# =======================
# 6️⃣ Training and Evaluation
# =======================
def train_epoch(model, train_loader):
    model.train()
    total_loss, correct_hate_speech, correct_topic = 0, 0, 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, hate_speech_logits, topic_logits = model(**batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        correct_hate_speech += (hate_speech_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
        correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(train_loader), correct_hate_speech / len(train_dataset), correct_topic / len(train_dataset)

def evaluate(model, val_loader):
    model.eval()
    total_loss, correct_hate_speech, correct_topic = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, hate_speech_logits, topic_logits = model(**batch)

            total_loss += loss.item()
            correct_hate_speech += (hate_speech_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
            correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(val_loader), correct_hate_speech / len(val_dataset), correct_topic / len(val_dataset)


# =======================
# 7️⃣ Train Model
# =======================
num_epochs = 3
for epoch in range(num_epochs):
    train_loss, train_acc_hate, train_acc_topic = train_epoch(model, train_loader)
    val_loss, val_acc_hate, val_acc_topic = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Hate Speech Acc: {train_acc_hate:.4f} | Topic Acc: {train_acc_topic:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Hate Speech Acc: {val_acc_hate:.4f} | Topic Acc: {val_acc_topic:.4f}")
    print("-" * 50)

# =======================
# 8️⃣ Save Model
# =======================
from datetime import datetime
# Get current timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Define filename with timestamp
model_path = f"/kaggle/working/arabert_hate_speech_topics_{timestamp}.pth"

torch.save(model.state_dict(), model_path)

print("Model saved! ✅")



Epoch 1/3
Train Loss: 1.7574 | Hate Speech Acc: 0.7053 | Topic Acc: 0.6173
Val Loss: 1.3887 | Hate Speech Acc: 0.7192 | Topic Acc: 0.7237
--------------------------------------------------
Epoch 2/3
Train Loss: 1.1201 | Hate Speech Acc: 0.7871 | Topic Acc: 0.7869
Val Loss: 1.2686 | Hate Speech Acc: 0.7356 | Topic Acc: 0.7453
--------------------------------------------------
Epoch 3/3
Train Loss: 0.8338 | Hate Speech Acc: 0.8343 | Topic Acc: 0.8566
Val Loss: 1.2762 | Hate Speech Acc: 0.7431 | Topic Acc: 0.7633
--------------------------------------------------
Model saved! ✅


In [7]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.utils.data import Dataset, DataLoader

# =======================
# 1️⃣ Load and Preprocess Data
# =======================
file_path = "/kaggle/input/datasetes/Arabic.xlsx"
df = pd.read_excel(file_path)

# Convert labels to numeric values
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})

# Encode topics as numbers
topics = df["Topic"].unique()
topic_to_id = {topic: i for i, topic in enumerate(topics)}
df["Topic"] = df["Topic"].map(topic_to_id)

# Split data
train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
    df["processed_comment"], df["Hate speech"], df["Topic"],
    test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
    temp_texts, temp_labels, temp_topics,
    test_size=0.5, random_state=42, stratify=temp_labels
)

# =======================
# 2️⃣ Tokenizer
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# =======================
# 3️⃣ Dataset Class (with on-the-fly tokenization ✅)
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, texts, hate_speech_labels, topic_labels, tokenizer, max_length=128):
        self.texts = texts.tolist()
        self.hate_speech_labels = hate_speech_labels.tolist()
        self.topic_labels = topic_labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.hate_speech_labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Remove batch dimension
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["hate_speech_labels"] = torch.tensor(self.hate_speech_labels[idx], dtype=torch.long)
        item["topic_labels"] = torch.tensor(self.topic_labels[idx], dtype=torch.long)
        return item

# =======================
# 4️⃣ DataLoaders
# =======================
train_dataset = HateSpeechDataset(train_texts, train_labels, train_topics, tokenizer)
val_dataset = HateSpeechDataset(val_texts, val_labels, val_topics, tokenizer)
test_dataset = HateSpeechDataset(test_texts, test_labels, test_topics, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# =======================
# 5️⃣ Model Definition
# =======================
class MultiTaskModel(nn.Module):
    def __init__(self, model_name, num_topics):
        super(MultiTaskModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.hate_speech_classifier = nn.Linear(self.bert.config.hidden_size, 2)
        self.topic_classifier = nn.Linear(self.bert.config.hidden_size, num_topics)

    def forward(self, input_ids, attention_mask, token_type_ids=None, hate_speech_labels=None, topic_labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]

        hate_speech_logits = self.hate_speech_classifier(pooled_output)
        topic_logits = self.topic_classifier(pooled_output)

        loss = None
        if hate_speech_labels is not None and topic_labels is not None:
            hate_speech_loss = nn.CrossEntropyLoss()(hate_speech_logits, hate_speech_labels)
            topic_loss = nn.CrossEntropyLoss()(topic_logits, topic_labels)
            loss = hate_speech_loss + topic_loss

        return loss, hate_speech_logits, topic_logits

# Model initialization
num_topics = len(topics)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiTaskModel(model_name, num_topics).to(device)

# =======================
# 6️⃣ Optimizer and Scheduler
# =======================




       


# =======================
# 5️⃣ Training Setup
# =======================
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# =======================
# 6️⃣ Training and Evaluation
# =======================
def train_epoch(model, train_loader):
    model.train()
    total_loss, correct_hate_speech, correct_topic = 0, 0, 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, hate_speech_logits, topic_logits = model(**batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        correct_hate_speech += (hate_speech_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
        correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(train_loader), correct_hate_speech / len(train_dataset), correct_topic / len(train_dataset)

def evaluate(model, val_loader):
    model.eval()
    total_loss, correct_hate_speech, correct_topic = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, hate_speech_logits, topic_logits = model(**batch)

            total_loss += loss.item()
            correct_hate_speech += (hate_speech_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
            correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(val_loader), correct_hate_speech / len(val_dataset), correct_topic / len(val_dataset)


# =======================
# 7️⃣ Train Model
# =======================
num_epochs = 3
for epoch in range(num_epochs):
    train_loss, train_acc_hate, train_acc_topic = train_epoch(model, train_loader)
    val_loss, val_acc_hate, val_acc_topic = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Hate Speech Acc: {train_acc_hate:.4f} | Topic Acc: {train_acc_topic:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Hate Speech Acc: {val_acc_hate:.4f} | Topic Acc: {val_acc_topic:.4f}")
    print("-" * 50)

# =======================
# 8️⃣ Save Model
# =======================
from datetime import datetime
# Get current timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Define filename with timestamp
model_path = f"/kaggle/working/arabert_hate_speech_topics_better{timestamp}.pth"

torch.save(model.state_dict(), model_path)

print("Model saved! ✅")




Epoch 1/3
Train Loss: 1.7273 | Hate Speech Acc: 0.7049 | Topic Acc: 0.6295
Val Loss: 1.3833 | Hate Speech Acc: 0.7162 | Topic Acc: 0.7222
--------------------------------------------------
Epoch 2/3
Train Loss: 1.1088 | Hate Speech Acc: 0.7877 | Topic Acc: 0.7867
Val Loss: 1.2632 | Hate Speech Acc: 0.7476 | Topic Acc: 0.7625
--------------------------------------------------
Epoch 3/3
Train Loss: 0.8328 | Hate Speech Acc: 0.8348 | Topic Acc: 0.8569
Val Loss: 1.2671 | Hate Speech Acc: 0.7438 | Topic Acc: 0.7647
--------------------------------------------------
Model saved! ✅


In [8]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import numpy as np
import json
from typing import Dict, Tuple

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# =======================
# 1️⃣ Enhanced Data Loading and Preprocessing
# =======================
def load_and_preprocess_data(file_path: str) -> Tuple:
    """Load and preprocess data with enhanced validation"""
    try:
        df = pd.read_excel(file_path)
        
        # Validate data structure
        required_columns = {'processed_comment', 'Hate speech', 'Topic'}
        if not required_columns.issubset(df.columns):
            raise ValueError(f"Missing required columns. Needed: {required_columns}")
        
        # Enhanced label processing
        df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})
        
        # Robust topic encoding
        topics = df["Topic"].unique()
        topic_to_id = {topic: i for i, topic in enumerate(topics)}
        df["Topic"] = df["Topic"].map(topic_to_id)
        
        # Stratified split with validation
        train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
            df["processed_comment"], df["Hate speech"], df["Topic"], 
            test_size=0.2, random_state=42, stratify=df[["Hate speech", "Topic"]]
        )
        
        val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
            temp_texts, temp_labels, temp_topics, 
            test_size=0.5, random_state=42, stratify=temp_labels
        )
        
        return (train_texts, train_labels, train_topics, 
                val_texts, val_labels, val_topics,
                test_texts, test_labels, test_topics,
                len(topics))
    
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise

file_path = "/kaggle/input/datasetes/Arabic.xlsx"
(train_texts, train_labels, train_topics, 
 val_texts, val_labels, val_topics,
 test_texts, test_labels, test_topics,
 num_topics) = load_and_preprocess_data(file_path)

# =======================
# 2️⃣ Enhanced Tokenization
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_texts(texts: pd.Series) -> Dict[str, torch.Tensor]:
    """Tokenize texts with enhanced settings"""
    return tokenizer(
        texts.tolist(),  # Convert to list for better performance
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=False  # Not needed for AraBERT
    )

train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

# =======================
# 3️⃣ Enhanced Dataset Class
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, encodings: Dict[str, torch.Tensor], 
                 hate_speech_labels: pd.Series, 
                 topic_labels: pd.Series):
        self.encodings = encodings
        self.hate_speech_labels = hate_speech_labels.values  # Convert to numpy for faster access
        self.topic_labels = topic_labels.values

    def __len__(self) -> int:
        return len(self.hate_speech_labels)

    def __getitem__(self, idx: int) -> Dict:
        return {
            'input_ids': self.encodings['input_ids'][idx].clone(),
            'attention_mask': self.encodings['attention_mask'][idx].clone(),
            'hate_speech_labels': torch.tensor(self.hate_speech_labels[idx], dtype=torch.long),
            'topic_labels': torch.tensor(self.topic_labels[idx], dtype=torch.long)
        }

# Create datasets with validation
train_dataset = HateSpeechDataset(train_encodings, train_labels, train_topics)
val_dataset = HateSpeechDataset(val_encodings, val_labels, val_topics)
test_dataset = HateSpeechDataset(test_encodings, test_labels, test_topics)

# Enhanced DataLoaders with optimized settings
batch_size = 32  # Increased batch size for better GPU utilization
num_workers = 4 if torch.cuda.is_available() else 2

train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    pin_memory=True,
    num_workers=num_workers,
    persistent_workers=True
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size, 
    shuffle=False, 
    pin_memory=True,
    num_workers=num_workers,
    persistent_workers=True
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False,
    pin_memory=True,
    num_workers=num_workers
)

# =======================
# 4️⃣ Advanced Hybrid Model Architecture
# =======================
class EnhancedHybridModel(nn.Module):
    def __init__(self, model_name: str, num_topics: int):
        super().__init__()
        # BERT backbone
        self.bert = AutoModel.from_pretrained(model_name)
        self.embedding_dim = self.bert.config.hidden_size
        
        # Enhanced CNN architecture with explicit padding
        self.conv_layers = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(self.embedding_dim, 256, kernel_size=2, padding=1),
                nn.ReLU(),
                nn.Dropout(0.3)
            ),
            nn.Sequential(
                nn.Conv1d(256, 128, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.Dropout(0.3)
            ),
            nn.Sequential(
                nn.Conv1d(128, 64, kernel_size=4, padding=2),
                nn.ReLU(),
                nn.Dropout(0.3)
            )
        ])
        
        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1, bias=False)
        )
        
        # Layer normalization
        self.layer_norm = nn.LayerNorm(64)
        
        # Classifiers with proper initialization
        self.hate_classifier = self._init_classifier(64, 2)
        self.topic_classifier = self._init_classifier(64, num_topics)
        
    def _init_classifier(self, in_features: int, out_features: int) -> nn.Module:
        """Initialize classifier with Xavier initialization"""
        layer = nn.Linear(in_features, out_features)
        nn.init.xavier_uniform_(layer.weight)
        nn.init.zeros_(layer.bias)
        return layer
    
    def forward(self, input_ids: torch.Tensor, 
                attention_mask: torch.Tensor,
                hate_speech_labels: torch.Tensor = None,
                topic_labels: torch.Tensor = None) -> Tuple:
        # BERT embeddings
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = bert_outputs.last_hidden_state.permute(0, 2, 1)  # (batch, emb_dim, seq_len)
        
        # CNN processing
        x = embeddings
        for conv_layer in self.conv_layers:
            x = conv_layer(x)
        x = x.permute(0, 2, 1)  # (batch, seq_len, features)
        
        # Attention mechanism
        attention_scores = F.softmax(self.attention(x), dim=1)
        x = torch.sum(attention_scores * x, dim=1)
        x = self.layer_norm(x)
        
        # Classification
        hate_logits = self.hate_classifier(x)
        topic_logits = self.topic_classifier(x)
        
        # Loss calculation if labels provided
        loss = None
        if hate_speech_labels is not None and topic_labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            hate_loss = loss_fn(hate_logits, hate_speech_labels)
            topic_loss = loss_fn(topic_logits, topic_labels)
            loss = hate_loss + 0.7 * topic_loss  # Weighted loss
            
        return loss, hate_logits, topic_logits

# Initialize model with device placement
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EnhancedHybridModel(model_name, num_topics).to(device)

# =======================
# 5️⃣ Optimized Training Setup
# =======================
# Optimizer with gradient clipping
optimizer = AdamW(
    model.parameters(),
    lr=3e-5,
    weight_decay=0.01,
    eps=1e-8
)

# Learning rate scheduler
num_epochs = 10  # Increased epochs with early stopping
num_training_steps = num_epochs * len(train_loader)
warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps
)

# Training monitoring
best_val_loss = float('inf')
best_val_f1 = 0.0
patience = 4
epochs_without_improvement = 0

# =======================
# 6️⃣ Enhanced Training Loop
# =======================
def compute_metrics(logits: torch.Tensor, labels: torch.Tensor) -> Dict:
    """Compute multiple metrics for evaluation"""
    preds = torch.argmax(logits, dim=1).cpu().numpy()
    labels = labels.cpu().numpy()
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }

for epoch in range(num_epochs):
    # Training phase
    model.train()
    total_train_loss = 0
    train_hate_correct = 0
    train_topic_correct = 0
    
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, hate_logits, topic_logits = model(**batch)
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()
        
        total_train_loss += loss.item()
        train_hate_correct += (hate_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
        train_topic_correct += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()
    
    # Validation phase
    model.eval()
    total_val_loss = 0
    val_hate_correct = 0
    val_topic_correct = 0
    all_hate_preds = []
    all_hate_labels = []
    
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, hate_logits, topic_logits = model(**batch)
            
            total_val_loss += loss.item()
            val_hate_correct += (hate_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
            val_topic_correct += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()
            
            # Store predictions for F1 calculation
            all_hate_preds.extend(hate_logits.argmax(1).cpu().numpy())
            all_hate_labels.extend(batch["hate_speech_labels"].cpu().numpy())
    
    # Calculate metrics
    train_loss = total_train_loss / len(train_loader)
    train_hate_acc = train_hate_correct / len(train_dataset)
    train_topic_acc = train_topic_correct / len(train_dataset)
    
    val_loss = total_val_loss / len(val_loader)
    val_hate_acc = val_hate_correct / len(val_dataset)
    val_topic_acc = val_topic_correct / len(val_dataset)
    val_hate_f1 = f1_score(all_hate_labels, all_hate_preds, average='weighted')
    
    # Print statistics
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Hate Acc: {train_hate_acc:.4f} | Topic Acc: {train_topic_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Hate Acc: {val_hate_acc:.4f} | Topic Acc: {val_topic_acc:.4f} | Hate F1: {val_hate_f1:.4f}")
    
    # Early stopping and model checkpointing
    if val_hate_f1 > best_val_f1:
        best_val_f1 = val_hate_f1
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), "best_model.pth")
        print("Saved new best model")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"\nEarly stopping after {epoch + 1} epochs!")
            break

# Load best model for testing
model.load_state_dict(torch.load("best_model.pth", map_location=device))

# =======================
# 7️⃣ Comprehensive Evaluation
# =======================
def evaluate_model(model: nn.Module, data_loader: DataLoader) -> Dict:
    """Evaluate model on given data loader"""
    model.eval()
    all_hate_preds = []
    all_hate_labels = []
    all_topic_preds = []
    all_topic_labels = []
    
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            _, hate_logits, topic_logits = model(**batch)
            
            all_hate_preds.extend(hate_logits.argmax(1).cpu().numpy())
            all_hate_labels.extend(batch["hate_speech_labels"].cpu().numpy())
            all_topic_preds.extend(topic_logits.argmax(1).cpu().numpy())
            all_topic_labels.extend(batch["topic_labels"].cpu().numpy())
    
    # Hate speech metrics
    hate_metrics = {
        'classification_report': classification_report(
            all_hate_labels, all_hate_preds, 
            target_names=["No Hate", "Hate"],
            output_dict=True
        ),
        'accuracy': accuracy_score(all_hate_labels, all_hate_preds),
        'f1': f1_score(all_hate_labels, all_hate_preds, average='weighted')
    }
    
    # Topic metrics
    topic_metrics = {
        'classification_report': classification_report(
            all_topic_labels, all_topic_preds,
            output_dict=True
        ),
        'accuracy': accuracy_score(all_topic_labels, all_topic_preds),
        'f1': f1_score(all_topic_labels, all_topic_preds, average='weighted')
    }
    
    return {
        'hate_speech': hate_metrics,
        'topic': topic_metrics
    }

print("\nTest Set Evaluation:")
test_results = evaluate_model(model, test_loader)

# Print formatted results
print("\nHate Speech Performance:")
print(f"Accuracy: {test_results['hate_speech']['accuracy']:.4f}")
print(f"F1 Score: {test_results['hate_speech']['f1']:.4f}")
print("\nClassification Report:")
print(classification_report(
    test_results['hate_speech']['classification_report']['No Hate']['support'],
    test_results['hate_speech']['classification_report']['Hate']['support'],
    target_names=["No Hate", "Hate"]
))

print("\nTopic Classification Performance:")
print(f"Accuracy: {test_results['topic']['accuracy']:.4f}")
print(f"F1 Score: {test_results['topic']['f1']:.4f}")

# =======================
# 8️⃣ Enhanced Model Saving
# =======================
def save_model_with_metadata(model: nn.Module, 
                           tokenizer: AutoTokenizer, 
                           metrics: Dict,
                           config: Dict) -> None:
    """Save model with comprehensive metadata"""
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    model_path = f"arabert_hate_speech_model_{timestamp}"
    
    try:
        # Save model weights
        torch.save(model.state_dict(), f"{model_path}.pth")
        
        # Save tokenizer
        tokenizer.save_pretrained(model_path)
        
        # Save full configuration
        full_config = {
            "model_config": {
                "model_name": model_name,
                "num_topics": num_topics,
                "architecture": str(model),
                "input_size": 128  # Based on tokenizer max_length
            },
            "training_config": {
                "batch_size": batch_size,
                "learning_rate": 3e-5,
                "weight_decay": 0.01,
                "epochs": num_epochs
            },
            "performance_metrics": metrics
        }
        
        with open(f"{model_path}_config.json", "w") as f:
            json.dump(full_config, f, indent=4)
        
        print(f"\nModel successfully saved to {model_path}")
        
    except Exception as e:
        print(f"\nError saving model: {str(e)}")

# Prepare metrics for saving
final_metrics = {
    'test_performance': test_results,
    'best_validation': {
        'loss': best_val_loss,
        'hate_f1': best_val_f1
    }
}

# Save the model
save_model_with_metadata(
    model=model,
    tokenizer=tokenizer,
    metrics=final_metrics,
    config={
        'model_name': model_name,
        'num_topics': num_topics
    }
)


Epoch 1/10
Train Loss: 1.7813 | Hate Acc: 0.6471 | Topic Acc: 0.4614
Val Loss: 1.3166 | Hate Acc: 0.7140 | Topic Acc: 0.6639 | Hate F1: 0.7143
Saved new best model

Epoch 2/10
Train Loss: 1.1185 | Hate Acc: 0.7562 | Topic Acc: 0.7270
Val Loss: 1.0511 | Hate Acc: 0.7677 | Topic Acc: 0.7580 | Hate F1: 0.7678
Saved new best model

Epoch 3/10
Train Loss: 0.7743 | Hate Acc: 0.8270 | Topic Acc: 0.8250
Val Loss: 1.0929 | Hate Acc: 0.7633 | Topic Acc: 0.7692 | Hate F1: 0.7636

Epoch 4/10
Train Loss: 0.5056 | Hate Acc: 0.8927 | Topic Acc: 0.8948
Val Loss: 1.1659 | Hate Acc: 0.7618 | Topic Acc: 0.7640 | Hate F1: 0.7616

Epoch 5/10
Train Loss: 0.3102 | Hate Acc: 0.9422 | Topic Acc: 0.9410
Val Loss: 1.3614 | Hate Acc: 0.7588 | Topic Acc: 0.7558 | Hate F1: 0.7588

Epoch 6/10
Train Loss: 0.1782 | Hate Acc: 0.9694 | Topic Acc: 0.9688
Val Loss: 1.4597 | Hate Acc: 0.7491 | Topic Acc: 0.7797 | Hate F1: 0.7490

Early stopping after 6 epochs!


  model.load_state_dict(torch.load("best_model.pth", map_location=device))



Test Set Evaluation:

Hate Speech Performance:
Accuracy: 0.7612
F1 Score: 0.7614

Classification Report:


TypeError: Expected sequence or array-like, got <class 'int'>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import pandas as pd

# =======================
# 1️⃣ Load Model & Tokenizer
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load dataset to dynamically extract topics (ensure this is the correct path)
df = pd.read_excel("/kaggle/input/datasetes/Arabic.xlsx")  # Update this with your actual file path

# Dynamically extract topics and create a mapping
topics = df["Topic"].unique()  # Assuming 'Topic' column contains the topic names
topic_to_id = {topic: i for i, topic in enumerate(topics)}

# Define the same model structure
class MultiTaskModel(nn.Module):
    def __init__(self, model_name, num_topics):
        super(MultiTaskModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.hate_speech_classifier = nn.Linear(self.bert.config.hidden_size, 2)  # Hate speech (yes/no)
        self.topic_classifier = nn.Linear(self.bert.config.hidden_size, num_topics)  # Topic classification

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.last_hidden_state[:, 0, :]  # Use [CLS] token

        hate_speech_logits = self.hate_speech_classifier(pooled_output)
        topic_logits = self.topic_classifier(pooled_output)

        return hate_speech_logits, topic_logits

# Load model
num_topics = len(topics)  # Dynamically adjust to the number of topics in the dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiTaskModel(model_name, num_topics).to(device)
model.load_state_dict(torch.load("/kaggle/working/arabert_hate_speech_model_2025-04-05_17-34-23.pth", map_location=device))
model.eval()

# =======================
# 2️⃣ Function to Test Model
# =======================
def predict(text):
    # Tokenize input
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)

    # Get predictions
    with torch.no_grad():
        hate_speech_logits, topic_logits = model(**inputs)

    # Convert logits to predictions
    hate_speech_pred = torch.argmax(hate_speech_logits, dim=1).item()
    topic_pred = torch.argmax(topic_logits, dim=1).item()

    # Map predictions to labels (using topic_to_id for dynamic mapping)
    hate_speech_label = "Hate Speech" if hate_speech_pred == 1 else "Not Hate Speech"
    topic_label = [k for k, v in topic_to_id.items() if v == topic_pred][0]  # Get the topic label from topic_to_id

    return hate_speech_label, topic_label

# =======================
# 3️⃣ Test on New Text
# =======================
test_text = "كلام تاع واحد لا علاقة بالعلم والله انه لدجال لعنه الله"  # Change this text
hs_label, topic_label = predict(test_text)

print(f"🔹 Input Text: {test_text}")
print(f"✅ Hate Speech: {hs_label}")
print(f"✅ Topic: {topic_label}")


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.utils.data import Dataset, DataLoader
from datetime import datetime

# =======================
# 1️⃣ Load and Preprocess Data
# =======================
file_path = "/kaggle/input/datasetes/Arabic.xlsx"
df = pd.read_excel(file_path)

# Map labels
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})
topics = df["Topic"].unique()
topic_to_id = {topic: i for i, topic in enumerate(topics)}
df["Topic"] = df["Topic"].map(topic_to_id)

# Split dataset
train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
    df["processed_comment"], df["Hate speech"], df["Topic"], test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
    temp_texts, temp_labels, temp_topics, test_size=0.5, random_state=42, stratify=temp_labels
)

# =======================
# 2️⃣ Tokenization
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_texts(texts):
    return tokenizer(list(texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

# =======================
# 3️⃣ Custom Dataset
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, hate_speech_labels, topic_labels):
        self.encodings = encodings
        self.hate_speech_labels = hate_speech_labels
        self.topic_labels = topic_labels

    def __len__(self):
        return len(self.hate_speech_labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["hate_speech_labels"] = torch.tensor(self.hate_speech_labels.iloc[idx], dtype=torch.long)
        item["topic_labels"] = torch.tensor(self.topic_labels.iloc[idx], dtype=torch.long)
        return item

train_dataset = HateSpeechDataset(train_encodings, train_labels, train_topics)
val_dataset = HateSpeechDataset(val_encodings, val_labels, val_topics)
test_dataset = HateSpeechDataset(test_encodings, test_labels, test_topics)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# =======================
# 4️⃣ CNN-BERT Hybrid Model
# =======================
class CNNBERTMultiTaskModel(nn.Module):
    def __init__(self, model_name, num_topics, num_filters=100, filter_sizes=[2,3,4]):
        super(CNNBERTMultiTaskModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=self.bert.config.hidden_size,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(0.3)
        self.fc_shared = nn.Linear(num_filters * len(filter_sizes), 256)
        self.hate_speech_classifier = nn.Linear(256, 2)
        self.topic_classifier = nn.Linear(256, num_topics)

    def forward(self, input_ids, attention_mask, token_type_ids=None, hate_speech_labels=None, topic_labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        x = x.permute(0, 2, 1)  # [batch_size, hidden_size, seq_len]

        conv_outputs = [torch.relu(conv(x)).max(dim=2)[0] for conv in self.convs]
        x = torch.cat(conv_outputs, dim=1)  # [batch_size, num_filters * len(filter_sizes)]

        x = self.dropout(x)
        x = self.fc_shared(x)

        hate_speech_logits = self.hate_speech_classifier(x)
        topic_logits = self.topic_classifier(x)

        loss = None
        if hate_speech_labels is not None and topic_labels is not None:
            hate_loss = nn.CrossEntropyLoss()(hate_speech_logits, hate_speech_labels)
            topic_loss = nn.CrossEntropyLoss()(topic_logits, topic_labels)
            loss = hate_loss + topic_loss

        return loss, hate_speech_logits, topic_logits

# =======================
# 5️⃣ Train & Eval Functions
# =======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_topics = len(topics)
model = CNNBERTMultiTaskModel(model_name, num_topics).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

def train_epoch(model, loader):
    model.train()
    total_loss, correct_hate, correct_topic = 0, 0, 0

    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, hate_logits, topic_logits = model(**batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        correct_hate += (hate_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
        correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(loader), correct_hate / len(loader.dataset), correct_topic / len(loader.dataset)

def evaluate(model, loader):
    model.eval()
    total_loss, correct_hate, correct_topic = 0, 0, 0

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, hate_logits, topic_logits = model(**batch)

            total_loss += loss.item()
            correct_hate += (hate_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
            correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(loader), correct_hate / len(loader.dataset), correct_topic / len(loader.dataset)

# =======================
# 6️⃣ Training Loop
# =======================
for epoch in range(3):
    train_loss, train_acc_hate, train_acc_topic = train_epoch(model, train_loader)
    val_loss, val_acc_hate, val_acc_topic = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}/3")
    print(f"Train Loss: {train_loss:.4f} | Hate Acc: {train_acc_hate:.4f} | Topic Acc: {train_acc_topic:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Hate Acc: {val_acc_hate:.4f} | Topic Acc: {val_acc_topic:.4f}")
    print("-" * 60)

# =======================
# 7️⃣ Save the Model
# =======================
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
torch.save(model.state_dict(), f"/kaggle/working/cnn_bert_multitask_{timestamp}.pth")
print("✅ Model saved successfully!")


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from transformers import AutoModelForMaskedLM

from torch.utils.data import Dataset, DataLoader
from datetime import datetime

# =======================
# 1️⃣ Load and Preprocess Data
# =======================
file_path = "/kaggle/input/datasetes/Arabic.xlsx"
df = pd.read_excel(file_path)

# Map labels
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})
topics = df["Topic"].unique()
topic_to_id = {topic: i for i, topic in enumerate(topics)}
df["Topic"] = df["Topic"].map(topic_to_id)

# Split dataset
train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
    df["processed_comment"], df["Hate speech"], df["Topic"], test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
    temp_texts, temp_labels, temp_topics, test_size=0.5, random_state=42, stratify=temp_labels
)

# =======================
# 2️⃣ Tokenization
# =======================
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02-twitter")
model = AutoModelForMaskedLM.from_pretrained("aubmindlab/bert-base-arabertv02-twitter")

def tokenize_texts(texts):
    return tokenizer(list(texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

# =======================
# 3️⃣ Custom Dataset
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, hate_speech_labels, topic_labels):
        self.encodings = encodings
        self.hate_speech_labels = hate_speech_labels
        self.topic_labels = topic_labels

    def __len__(self):
        return len(self.hate_speech_labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["hate_speech_labels"] = torch.tensor(self.hate_speech_labels.iloc[idx], dtype=torch.long)
        item["topic_labels"] = torch.tensor(self.topic_labels.iloc[idx], dtype=torch.long)
        return item

train_dataset = HateSpeechDataset(train_encodings, train_labels, train_topics)
val_dataset = HateSpeechDataset(val_encodings, val_labels, val_topics)
test_dataset = HateSpeechDataset(test_encodings, test_labels, test_topics)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# =======================
# 4️⃣ CNN-BERT Hybrid Model
# =======================
class CNNBERTMultiTaskModel(nn.Module):
    def __init__(self, model_name, num_topics, num_filters=100, filter_sizes=[2,3,4]):
        super(CNNBERTMultiTaskModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=self.bert.config.hidden_size,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(0.3)
        self.fc_shared = nn.Linear(num_filters * len(filter_sizes), 256)
        self.hate_speech_classifier = nn.Linear(256, 2)
        self.topic_classifier = nn.Linear(256, num_topics)

    def forward(self, input_ids, attention_mask, token_type_ids=None, hate_speech_labels=None, topic_labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        x = x.permute(0, 2, 1)  # [batch_size, hidden_size, seq_len]

        conv_outputs = [torch.relu(conv(x)).max(dim=2)[0] for conv in self.convs]
        x = torch.cat(conv_outputs, dim=1)  # [batch_size, num_filters * len(filter_sizes)]

        x = self.dropout(x)
        x = self.fc_shared(x)

        hate_speech_logits = self.hate_speech_classifier(x)
        topic_logits = self.topic_classifier(x)

        loss = None
        if hate_speech_labels is not None and topic_labels is not None:
            hate_loss = nn.CrossEntropyLoss()(hate_speech_logits, hate_speech_labels)
            topic_loss = nn.CrossEntropyLoss()(topic_logits, topic_labels)
            loss = hate_loss + topic_loss

        return loss, hate_speech_logits, topic_logits

# =======================
# 5️⃣ Train & Eval Functions
# =======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_topics = len(topics)
model = CNNBERTMultiTaskModel(model_name, num_topics).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

def train_epoch(model, loader):
    model.train()
    total_loss, correct_hate, correct_topic = 0, 0, 0

    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, hate_logits, topic_logits = model(**batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        correct_hate += (hate_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
        correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(loader), correct_hate / len(loader.dataset), correct_topic / len(loader.dataset)

def evaluate(model, loader):
    model.eval()
    total_loss, correct_hate, correct_topic = 0, 0, 0

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss, hate_logits, topic_logits = model(**batch)

            total_loss += loss.item()
            correct_hate += (hate_logits.argmax(1) == batch["hate_speech_labels"]).sum().item()
            correct_topic += (topic_logits.argmax(1) == batch["topic_labels"]).sum().item()

    return total_loss / len(loader), correct_hate / len(loader.dataset), correct_topic / len(loader.dataset)

# =======================
# 6️⃣ Training Loop
# =======================
for epoch in range(3):
    train_loss, train_acc_hate, train_acc_topic = train_epoch(model, train_loader)
    val_loss, val_acc_hate, val_acc_topic = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}/3")
    print(f"Train Loss: {train_loss:.4f} | Hate Acc: {train_acc_hate:.4f} | Topic Acc: {train_acc_topic:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Hate Acc: {val_acc_hate:.4f} | Topic Acc: {val_acc_topic:.4f}")
    print("-" * 60)

# =======================
# 7️⃣ Save the Model
# =======================
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
torch.save(model.state_dict(), f"/kaggle/working/cnn_bert_multitask_{timestamp}.pth")
print("✅ Model saved successfully!")


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# =======================
# 1️⃣ Load and Preprocess Data
# =======================
file_path = "/kaggle/input/datasetes/Arabic.xlsx"
df = pd.read_excel(file_path)

# Convert labels to numeric values
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})

# Encode topics as numbers
topics = df["Topic"].unique()
topic_to_id = {topic: i for i, topic in enumerate(topics)}
df["Topic"] = df["Topic"].map(topic_to_id)

# Split into train (80%), validation (10%), and test (10%)
train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
    df["processed_comment"], df["Hate speech"], df["Topic"], test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
    temp_texts, temp_labels, temp_topics, test_size=0.5, random_state=42, stratify=temp_labels
)

# =======================
# 2️⃣ Tokenization
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_texts(texts):
    return tokenizer(list(texts), truncation=True, padding=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

# =======================
# 3️⃣ Custom Dataset Class
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, hate_speech_labels, topic_labels):
        self.encodings = encodings
        self.hate_speech_labels = hate_speech_labels
        self.topic_labels = topic_labels

    def __len__(self):
        return len(self.hate_speech_labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["hate_speech_labels"] = torch.tensor(self.hate_speech_labels.iloc[idx], dtype=torch.long)
        item["topic_labels"] = torch.tensor(self.topic_labels.iloc[idx], dtype=torch.long)
        return item

# Create datasets and DataLoaders
train_dataset = HateSpeechDataset(train_encodings, train_labels, train_topics)
val_dataset = HateSpeechDataset(val_encodings, val_labels, val_topics)
test_dataset = HateSpeechDataset(test_encodings, test_labels, test_topics)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# =======================
# 4️⃣ Define Model
# =======================
class MultiTaskModel(nn.Module):
    def __init__(self, model_name, num_topics):
        super(MultiTaskModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.hate_speech_classifier = nn.Linear(self.bert.config.hidden_size, 2)
        self.topic_classifier = nn.Linear(self.bert.config.hidden_size, num_topics)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        hate_speech_logits = self.hate_speech_classifier(pooled_output)
        topic_logits = self.topic_classifier(pooled_output)
        return hate_speech_logits, topic_logits

# =======================
# 5️⃣ Training Setup
# =======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiTaskModel(model_name, num_topics=len(topics)).to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

loss_fn = nn.CrossEntropyLoss()

# =======================
# 6️⃣ Training Loop
# =======================
def train(model, dataloader):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        hate_speech_labels = batch["hate_speech_labels"].to(device)
        topic_labels = batch["topic_labels"].to(device)

        optimizer.zero_grad()
        hate_speech_logits, topic_logits = model(input_ids, attention_mask)

        loss_hate_speech = loss_fn(hate_speech_logits, hate_speech_labels)
        loss_topic = loss_fn(topic_logits, topic_labels)
        loss = loss_hate_speech + loss_topic

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

# =======================
# 7️⃣ Evaluation Function
# =======================
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    correct_hate_speech = 0
    correct_topic = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            hate_speech_labels = batch["hate_speech_labels"].to(device)
            topic_labels = batch["topic_labels"].to(device)

            hate_speech_logits, topic_logits = model(input_ids, attention_mask)

            loss_hate_speech = loss_fn(hate_speech_logits, hate_speech_labels)
            loss_topic = loss_fn(topic_logits, topic_labels)
            loss = loss_hate_speech + loss_topic

            total_loss += loss.item()

            hate_speech_preds = torch.argmax(hate_speech_logits, dim=1)
            topic_preds = torch.argmax(topic_logits, dim=1)

            correct_hate_speech += (hate_speech_preds == hate_speech_labels).sum().item()
            correct_topic += (topic_preds == topic_labels).sum().item()
            total_samples += hate_speech_labels.size(0)

    avg_loss = total_loss / len(dataloader)
    hate_speech_acc = correct_hate_speech / total_samples
    topic_acc = correct_topic / total_samples

    return avg_loss, hate_speech_acc, topic_acc

# =======================
# 8️⃣ Run Training & Evaluation
# =======================
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader)
    val_loss, val_hate_speech_acc, val_topic_acc = evaluate(model, val_loader)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Hate Speech Acc: {val_hate_speech_acc:.4f} | Topic Acc: {val_topic_acc:.4f}")

# =======================
# 9️⃣ Final Test Evaluation
# =======================
test_loss, test_hate_speech_acc, test_topic_acc = evaluate(model, test_loader)
print(f"\nTest Loss: {test_loss:.4f} | Hate Speech Acc: {test_hate_speech_acc:.4f} | Topic Acc: {test_topic_acc:.4f}")



In [None]:
!pip install nlpaug
!pip install transformers
!pip install torch
!pip install scikit-learn


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from sklearn.metrics import accuracy_score

# =======================
# 1️⃣ Load and Preprocess Data
# =======================
file_path = "/kaggle/input/datasetes/Arabic.xlsx"
df = pd.read_excel(file_path)

# Convert labels to numeric values
df["Hate speech"] = df["Hate speech"].map({"yes": 1, "no": 0})

# Encode topics as numbers
topics = df["Topic"].unique()
topic_to_id = {topic: i for i, topic in enumerate(topics)}
df["Topic"] = df["Topic"].map(topic_to_id)

# =======================
# Data Augmentation Function
# =======================
aug = naw.SynonymAug(aug_src='wordnet', lang='arb')

def augment_text(text):
    try:
        return aug.augment(text)
    except:
        return text

# Apply augmentation to positive class (Hate speech = 1)
augmented_rows = df[df["Hate speech"] == 1].copy()
augmented_rows["processed_comment"] = augmented_rows["processed_comment"].apply(augment_text)

# Concatenate the original data with augmented data
df = pd.concat([df, augmented_rows]).reset_index(drop=True)

# =======================
# Split Data
# =======================
train_texts, temp_texts, train_labels, temp_labels, train_topics, temp_topics = train_test_split(
    df["processed_comment"], df["Hate speech"], df["Topic"], test_size=0.2, random_state=42, stratify=df["Hate speech"]
)

val_texts, test_texts, val_labels, test_labels, val_topics, test_topics = train_test_split(
    temp_texts, temp_labels, temp_topics, test_size=0.5, random_state=42, stratify=temp_labels
)

# =======================
# Tokenization
# =======================
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_texts(texts):
    return tokenizer(list(texts), truncation=True, padding=True, max_length=256, return_tensors="pt")

train_encodings = tokenize_texts(train_texts)
val_encodings = tokenize_texts(val_texts)
test_encodings = tokenize_texts(test_texts)

# =======================
# Custom Dataset Class
# =======================
class HateSpeechDataset(Dataset):
    def __init__(self, encodings, hate_speech_labels, topic_labels):
        self.encodings = encodings
        self.hate_speech_labels = hate_speech_labels
        self.topic_labels = topic_labels

    def __len__(self):
        return len(self.hate_speech_labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["hate_speech_labels"] = torch.tensor(self.hate_speech_labels.iloc[idx], dtype=torch.long)
        item["topic_labels"] = torch.tensor(self.topic_labels.iloc[idx], dtype=torch.long)
        return item

train_dataset = HateSpeechDataset(train_encodings, train_labels, train_topics)
val_dataset = HateSpeechDataset(val_encodings, val_labels, val_topics)
test_dataset = HateSpeechDataset(test_encodings, test_labels, test_topics)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# =======================
# Model Definition
# =======================
class HateSpeechClassifier(nn.Module):
    def __init__(self, model_name):
        super(HateSpeechClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HateSpeechClassifier(model_name).to(device)

# =======================
# Training Setup
# =======================
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_epochs = 5
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

loss_fn = nn.CrossEntropyLoss()
save_path = "/kaggle/working/arabert_hate_speech_best.pth"
best_val_loss = float("inf")

# =======================
# Training Loop with Early Stopping
# =======================
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(batch["input_ids"], batch["attention_mask"])
        loss = loss_fn(outputs, batch["hate_speech_labels"])

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(batch["input_ids"], batch["attention_mask"])
            loss = loss_fn(outputs, batch["hate_speech_labels"])
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1} - Validation Loss: {avg_val_loss:.4f}")

    # Save best model if validation loss improves
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)
        print(f"✅ Model saved to {save_path}")

# =======================
# Evaluation on Test Set
# =======================
model.load_state_dict(torch.load(save_path))  # Load best model
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(batch["input_ids"], batch["attention_mask"])
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["hate_speech_labels"].cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"📊 Test Accuracy: {accuracy * 100:.2f}%")
