In [None]:
!pip install transformers peft datasets torch evaluate

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Co

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Read the training split. Rows that pandas cannot parse are dropped silently
# (on_bad_lines='skip'), so the row count may be smaller than the file's line count.
train_data = pd.read_csv(
    'tamil_offensive_speech_train.csv',
    encoding='utf-8',
    on_bad_lines='skip',
)
print("Dataset loaded successfully:")
print(train_data.head())

# Wrap the frame as a Hugging Face Dataset for the later cells.
train_dataset = Dataset.from_pandas(train_data)

Dataset loaded successfully:
   label                                            comment
0      0                  omg that bgm make me goosebumb...
1      0         neraya neraya neraya neraya neraya neraya.
2      0  thalaivar mersal look .semma massss thalaiva ....
3      0  paaaa... repeat mode.... adra adra adraaaaa......
4      0  epaa ena panaporam... sweet sapade poram... aw...


In [None]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from datasets import Dataset

# Extract the text and label columns. Column access returns a whole column in
# one call instead of building a dict for every row, as row-wise iteration does.
X_train = np.array(list(train_dataset["comment"])).reshape(-1, 1)  # 2-D (n_samples, 1) for fit_resample
y_train = np.array(list(train_dataset["label"]))

# Randomly duplicate minority-class rows; the fixed random_state keeps the
# duplicated rows the same from run to run.
ros = RandomOverSampler(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Convert back to a Hugging Face Dataset.
# NOTE: this rebinds `train_dataset`; from here on it holds the oversampled data.
train_dataset = Dataset.from_dict({
    "comment": X_resampled.ravel().tolist(),  # back to a flat list of strings
    "label": y_resampled.tolist(),
})

# Class distribution after oversampling (expected to be balanced).
print(Counter(y_resampled))



Counter({0: 21226, 1: 21226})


In [None]:
# Read the validation split. Unparseable rows are dropped silently
# (on_bad_lines='skip').
val_data = pd.read_csv(
    'tamil_offensive_speech_val.csv',
    encoding='utf-8',
    on_bad_lines='skip',
)
print("Dataset loaded successfully:")
print(val_data.head())

# Wrap the frame as a Hugging Face Dataset for the later cells.
val_dataset = Dataset.from_pandas(val_data)

Dataset loaded successfully:
   label                                            comment
0      0  annan thambingalam poondhu vilayada poranga🤩🤩....
1      1                  ha ha ha ha appo naiya thebingala
2      0  manitha samuthaayam amaipil irunthu intha pada...
3      1  otha virundhalikku porandhavan tha jaadhi ah t...
4      1  ama padichavan vanda mattum vote potruvanungee...


In [None]:
# Spot-check one row of the (oversampled) training set: a dict with the
# 'comment' text and its integer 'label'.
print(train_dataset[3])


{'comment': 'paaaa... repeat mode.... adra adra adraaaaa..... vera level... vare va....', 'label': 0}


In [None]:
# Load the tokenizer for the google/muril-base-cased checkpoint; the same
# checkpoint name is used when the classification model is created later.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

def tokenize_function(examples):
    """Tokenize the "comment" field of one example or of a batch of examples.

    examples: mapping with a "comment" entry that is either a single value
        (when mapped with batched=False) or a list of values (batched=True).

    Any comment that is not a string (e.g. None) is replaced by "" before
    tokenization. Every sequence is truncated/padded to exactly 128 tokens.
    Returns whatever the module-level `tokenizer` returns for that input.
    """
    def _as_text(value):
        # Non-string entries become an empty string.
        return value if isinstance(value, str) else ""

    raw = examples["comment"]
    texts = [_as_text(item) for item in raw] if isinstance(raw, list) else _as_text(raw)

    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)



# Tokenize the training set in batches, matching how the validation set is mapped.
# `tokenize_function` already accepts a list of comments, and because every
# sequence is padded/truncated to a fixed length the per-example result is the
# same as with batched=False. Batching only makes the map faster.
tamil_0_tokenized = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/42452 [00:00<?, ? examples/s]

In [None]:
# Tokenize the validation set; batched=True passes lists of comments to
# tokenize_function instead of one comment at a time.
tamil_1_tokenized = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6969 [00:00<?, ? examples/s]

In [None]:
# Base encoder with a 2-way classification head. The head is not part of the
# checkpoint, so it starts from freshly initialised weights (see the warning
# printed below).
model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

# LoRA adapter settings for sequence classification: rank-16 adapters on the
# attention query/key/value projections, no bias parameters trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value"],
    task_type=TaskType.SEQ_CLS,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn as nn

def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) pair.

    eval_pred: two-item sequence; the first item holds per-class scores with
        one row per example, the second the reference labels.

    Returns a dict with the single key "accuracy".

    Note: a later cell in this notebook defines `compute_metrics` again
    (adding macro F1); once that cell runs, it replaces this definition.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)  # highest-scoring class per row
    return {"accuracy": accuracy_score(gold, predicted)}

In [None]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss built on top of cross-entropy.

    The per-sample loss is ``alpha_t * (1 - p_t) ** gamma * CE``, where ``p_t``
    is the softmax probability the model assigns to the true class and
    ``alpha_t`` is the optional weight of that class.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        """
        alpha: Optional per-class weights indexed by class id. May be a list,
            tuple, NumPy array or 1-D tensor; non-tensors are converted to a
            float tensor. None disables class weighting.
        gamma: Focusing parameter (higher = more focus on hard examples).
        reduction: 'mean' or 'sum'. Any other value (e.g. 'none') makes
            forward() return the unreduced per-sample losses.
        """
        super(FocalLoss, self).__init__()
        if alpha is not None and not torch.is_tensor(alpha):
            # The docstring allows plain sequences, but gather() needs a tensor.
            alpha = torch.as_tensor(alpha, dtype=torch.float)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Return the focal loss for `logits` (N, C) against class ids `targets` (N,)."""
        ce_loss = F.cross_entropy(logits, targets, reduction='none')  # per-sample CE
        pt = torch.exp(-ce_loss)  # probability of the true class, since CE = -log(p_t)
        focal_loss = (1 - pt) ** self.gamma * ce_loss  # down-weight easy examples

        # Apply alpha weighting if provided
        if self.alpha is not None:
            # Align device and dtype with the loss so a CPU/GPU or float/double
            # mismatch between the weights and the logits cannot raise.
            alpha = self.alpha.to(device=ce_loss.device, dtype=ce_loss.dtype)
            alpha_t = alpha.gather(0, targets.reshape(-1)).reshape(ce_loss.shape)
            focal_loss = focal_loss * alpha_t  # out-of-place: safe for autograd

        # Reduction method
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [None]:
# Count how many training examples carry each label id.
labels = np.asarray(list(train_dataset["label"]))
class_counts = np.bincount(labels)
if (class_counts == 0).any():
    # A label id without examples would produce an infinite weight and NaN losses.
    raise ValueError(
        f"Every label id from 0 to {len(class_counts) - 1} needs at least one example; "
        f"got counts {class_counts.tolist()}"
    )

# Inverse-frequency class weights, normalised to sum to 1.
# NOTE: if the oversampling cell has already balanced `train_dataset`, the counts
# are equal and these weights are uniform, so alpha only rescales the loss.
# Fall back to the CPU when no GPU is available instead of hard-coding "cuda".
class_weights = torch.tensor(1.0 / class_counts, dtype=torch.float).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
class_weights = class_weights / class_weights.sum()
loss_fn = FocalLoss(alpha=class_weights, gamma=2.0)

In [None]:
from transformers import Trainer, TrainingArguments


# Wrap the base classifier with the LoRA adapters described by `lora_config`
# and report how many parameters remain trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# NOTE: these rebind `train_dataset` / `val_dataset` to the tokenized datasets;
# cells run after this one see the tokenized versions under the old names.
train_dataset = tamil_0_tokenized
val_dataset = tamil_1_tokenized


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    eval_pred: two-item sequence; the first item holds per-class scores with
        one row per example, the second the reference labels.

    Returns a dict with the keys "accuracy" and "f1_score".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)  # highest-scoring class per row

    return {
        "accuracy": accuracy_score(gold, predicted),
        # Macro averaging gives every class the same weight regardless of its size.
        "f1_score": f1_score(gold, predicted, average="macro"),
    }


class CustomTrainer(Trainer):
    """Trainer that optimises a focal loss instead of the model's own loss."""

    def __init__(self, gamma=2.0, *args, **kwargs):
        """
        gamma: Focusing parameter of the focal loss used by compute_loss().
            Pass it by keyword: it precedes *args, so a positional first
            argument would be taken as gamma.
        *args, **kwargs: forwarded unchanged to Trainer.__init__.
        """
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        # Build a trainer-owned loss so that `gamma` actually takes effect.
        # Previously the value was stored but the notebook-level `loss_fn`, with
        # its own gamma, was always used. Class weights and reduction are still
        # taken from that notebook-level `loss_fn`.
        self.focal_loss = FocalLoss(
            alpha=loss_fn.alpha,
            gamma=gamma,
            reduction=loss_fn.reduction,
        )

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on `inputs` and return the focal loss on its logits.

        "labels" is removed from `inputs` before the forward pass, so the
        model does not compute its own loss. Returns (loss, outputs) when
        return_outputs is true, otherwise just the loss. `num_items_in_batch`
        is accepted for compatibility with the Trainer API and is not used.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = self.focal_loss(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

# Hyperparameters plus the evaluation / checkpoint schedule for the run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    evaluation_strategy="epoch",   # evaluate once per epoch
    save_strategy="epoch",         # checkpoint on the same schedule
    load_best_model_at_end=True,
)

# Assemble the focal-loss trainer around the LoRA-wrapped model.
trainer = CustomTrainer(
    gamma=3.0,  # handed to CustomTrainer.__init__
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training.
trainer.train()

# Write the trained model to disk.
trainer.save_model("./tamil-comment-classifier")

# Function for inference
def classify_comment(text):
    """Classify one comment with the fine-tuned model.

    text: the comment string to classify.

    Returns "Offensive" when the predicted class id is 1, otherwise
    "Not Offensive". As a side effect the PEFT model is moved to the GPU when
    one is available (else the CPU) and left in eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare the same object that runs the forward pass. The original moved
    # and eval()-ed the bare `model` but then called `peft_model`.
    peft_model.to(device)
    peft_model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():  # inference only; no gradients needed
        logits = peft_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return "Offensive" if predicted_class == 1 else "Not Offensive"

# Smoke-test the classifier on one Tamil-script comment (roughly "this is a good post").
# NOTE(review): the training rows shown by head() above are Latin-script;
# confirm the model is expected to handle Tamil-script input as well.
sample_comment = "இது ஒரு நல்ல பதிவு"
prediction = classify_comment(sample_comment)
print(f"Comment: {sample_comment}")
print(f"Prediction: {prediction}")



trainable params: 886,274 || all params: 238,444,036 || trainable%: 0.3717


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.0625,0.05965,0.774717,0.737733
2,0.0526,0.062587,0.778447,0.744477
3,0.0475,0.053305,0.814034,0.769796
