In [1]:
!pip install transformers



In [2]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
import torch

def load_translation_model(src_lang, tgt_lang):
    """Load the Helsinki-NLP OPUS-MT checkpoint for one translation direction.

    Args:
        src_lang: Source-language code inserted into the checkpoint name (e.g. 'en').
        tgt_lang: Target-language code inserted into the checkpoint name (e.g. 'fr').

    Returns:
        A ``(model, tokenizer)`` tuple, both obtained through ``from_pretrained``.
    """
    checkpoint = "Helsinki-NLP/opus-mt-{}-{}".format(src_lang, tgt_lang)
    # Tokenizer first, then the model weights — same fetch order as before.
    tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    return MarianMTModel.from_pretrained(checkpoint), tokenizer

def translate_text(texts, model, tokenizer, device='cpu', batch_size=8):
    """Translate a sequence of texts in batches.

    Blank strings (empty or whitespace-only) are returned unchanged instead of
    being sent to the model: there is nothing to translate, and whatever the
    model generates for an empty source would be noise in the output.

    Args:
        texts: Iterable of input strings; it is materialised into a list.
        model: Object exposing ``to(device)`` and ``generate(**inputs)``.
        tokenizer: Callable producing model inputs for a batch of strings and
            exposing ``batch_decode``.
        device: Device passed to ``model.to`` and to the tokenized batch.
        batch_size: Number of texts translated per ``generate`` call.

    Returns:
        A list with one entry per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    # Start from the inputs themselves so blank entries keep their position and value.
    translations = list(texts)
    # Positions that actually need the model; non-string items are left for the
    # tokenizer to accept or reject, exactly as before.
    pending = [pos for pos, text in enumerate(texts)
               if not isinstance(text, str) or text.strip()]

    model = model.to(device)
    for start in range(0, len(pending), batch_size):
        positions = pending[start:start + batch_size]
        batch = [texts[pos] for pos in positions]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model.generate(**inputs)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # Scatter the decoded batch back to the original positions.
        for pos, translated in zip(positions, decoded):
            translations[pos] = translated
    return translations

def backtranslate_texts(texts, device='cpu', src_lang='en', pivot_lang='fr', batch_size=8):
    """Paraphrase texts by translating them to a pivot language and back.

    Args:
        texts: Input strings in ``src_lang``.
        device: Device forwarded to ``translate_text`` for both passes.
        src_lang: Language code of the inputs (default 'en').
        pivot_lang: Language code of the intermediate translation (default 'fr').
        batch_size: Batch size forwarded to ``translate_text`` for both passes.

    Returns:
        The list returned by the second ``translate_text`` pass, i.e. the
        round-tripped texts in ``src_lang``.
    """
    # Load both directions before translating so a missing checkpoint fails
    # before any translation work has been done.
    forward_model, forward_tokenizer = load_translation_model(src_lang, pivot_lang)
    backward_model, backward_tokenizer = load_translation_model(pivot_lang, src_lang)

    # Step 1: source -> pivot
    pivot_texts = translate_text(texts, forward_model, forward_tokenizer, device, batch_size)

    # Step 2: pivot -> source
    return translate_text(pivot_texts, backward_model, backward_tokenizer, device, batch_size)

# Load dataset
dataset_path = '/content/labeled_data_cleaned_whole.csv'
data = pd.read_csv(dataset_path)

# Clean missing values: NaN tweets become empty strings and every entry is a str
data['corrected_tweet'] = data['corrected_tweet'].fillna('').astype(str)

# Separate the hate speech class (class 0)
class_0 = data[data['class'] == 0]

# Skip blank tweets (including the NaN rows filled above): an empty source has
# nothing to paraphrase, so anything generated for it would be appended as a
# noisy class-0 sample.
class_0_texts = [text for text in class_0['corrected_tweet'].tolist() if text.strip()]

# Augment class 0 with backtranslation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
augmented_texts = backtranslate_texts(class_0_texts, device)

# Create a new DataFrame for augmented data
augmented_class_0 = pd.DataFrame({
    'corrected_tweet': augmented_texts,
    'class': [0] * len(augmented_texts)
})

# Combine augmented data with the original dataset; ignore_index avoids
# duplicate index labels between the original and the augmented rows.
augmented_data = pd.concat([data, augmented_class_0], ignore_index=True)
# Shuffle with a fixed seed so the saved file is reproducible
augmented_data = augmented_data.sample(frac=1, random_state=42)

# Save the augmented dataset
augmented_dataset_path = '/content/augmented_dataset.csv'
augmented_data.to_csv(augmented_dataset_path, index=False)
print(f"Augmented dataset saved to {augmented_dataset_path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Augmented dataset saved to /content/augmented_dataset.csv


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss

# Load Dataset
# Read the file written by the augmentation cell above; the previous path
# ('/content/augmented_dataset_back_translation.csv') is never produced by this notebook.
dataset_path = '/content/augmented_dataset.csv'
data = pd.read_csv(dataset_path)

# Handle missing values
data['corrected_tweet'] = data['corrected_tweet'].fillna('').astype(str)

# Separate texts and labels
texts = data['corrected_tweet'].values
labels = data['class'].values

# Train-Test Split
# Stratify so each split keeps the class proportions of the full dataset.
# NOTE(review): the augmented rows were added before this split, so a
# back-translated copy of a validation tweet can land in the training set —
# consider splitting first and augmenting only the training portion.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def tokenize_texts(texts, max_length=64):
    """Encode texts with the module-level ``tokenizer`` as PyTorch tensors.

    Args:
        texts: Iterable of strings; converted to a list before encoding.
        max_length: Length every sequence is padded or truncated to
            (64, a reduced sequence length, by default).

    Returns:
        Whatever the tokenizer call returns for the given options.
    """
    encode_options = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenizer(list(texts), **encode_options)

# Encode both splits up front (training first, then validation)
train_encodings, val_encodings = [
    tokenize_texts(split) for split in (train_texts, val_texts)
]

# Convert Labels to Tensors, rebinding the same two names
train_labels, val_labels = map(torch.tensor, (train_labels, val_labels))

class HateSpeechDataset(Dataset):
    """Dataset pairing per-sample slices of an encodings mapping with labels."""

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label sequence."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus its label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = column[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a dataset object
train_dataset = HateSpeechDataset(train_encodings, train_labels)
val_dataset = HateSpeechDataset(val_encodings, val_labels)

# Pretrained mBERT with a 3-way classification head
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Make every encoder parameter trainable
model.bert.requires_grad_(True)

# Per-class loss weights, indexed by class id (0, 1, 2)
class_weights = torch.tensor([5.0, 1.0, 2.0])

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model called with the batch inputs.
            inputs: Batch mapping; its "labels" entry is popped before the forward pass.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        # The weights must sit on the same device as the logits.
        weights = class_weights.to(outputs.logits.device)
        loss = CrossEntropyLoss(weight=weights)(outputs.logits, targets)
        if return_outputs:
            return loss, outputs
        return loss

# Training Arguments with Increased Epochs
training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): recent transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,  # Increased epochs
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Mixed precision only when a GPU is present, so the cell also runs on a
    # CPU-only runtime instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
)


def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (per-class scores) and ``label_ids``.

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1'.
    """
    # Take the argmax and the precision/recall/F1 triple once instead of
    # recomputing them for every dictionary entry.
    predicted = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        pred.label_ids, predicted, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(pred.label_ids, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


# Trainer with Weighted Loss
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score the model on the validation split
trainer.train()
results = trainer.evaluate()
print("Evaluation Results:", results)

# Persist the model first and the tokenizer second, into the same directory
save_dir = './mbert_hate_speech'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Predict on Validation Set
val_outputs = trainer.predict(val_dataset)
preds = val_outputs.predictions.argmax(-1)

# Reporting imports: `plt` is used below but was never imported in this notebook
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Fix the label ids explicitly so the report and the matrix always have three
# rows, even if a class happens to be absent from the validation predictions.
class_ids = [0, 1, 2]
class_names = ['Hate Speech', 'Offensive', 'Neutral']

# Classification Report
print("Classification Report:")
print(classification_report(val_labels, preds, labels=class_ids, target_names=class_names, zero_division=0))

# Confusion Matrix (each row normalised over the true class)
cm_normalized = confusion_matrix(val_labels, preds, labels=class_ids, normalize='true')
# Pass the matrix computed above; the previous code referenced an undefined `cm`.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=class_names)
disp.plot(cmap='viridis', xticks_rotation='vertical')
plt.title("Confusion Matrix")
plt.show()
