In [None]:
!pip install transformers peft datasets torch evaluate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Read the training split into pandas, show a preview, then wrap it as a
# Hugging Face Dataset. Malformed CSV rows are skipped rather than raising.
train_csv_path = '/content/malayalam_offensive_speech_train.csv'
train_data = pd.read_csv(
    train_csv_path,
    encoding='utf-8',
    on_bad_lines='skip',
)
print("Dataset loaded successfully:")
print(train_data.head())
train_dataset = Dataset.from_pandas(train_data)

Dataset loaded successfully:
   label                                            comment
0      0  നൈസിൽ കമെന്റ് വായിക്കാൻ വന്ന ദിലീഷ് പോത്തന് ലൈ...
1      0  സച്ചിയുടെ script നല്ല കഥ കിട്ടിയാൽ making കൊണ്...
2      0         track mattan parayunnathu enna style anu 🧡
3      0        ഒരു amen touch തോന്നിയത് എനിക്ക് മാത്രം ആണോ
4      0  spr item rajuettan nalla making lalettante kar...


In [None]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from datasets import Dataset

# Rebalance the training split with random oversampling.
# fit_resample expects a 2-D feature matrix, so the comments are packed into a
# single-column array; the labels stay 1-D.
X_train = np.array([row["comment"] for row in train_dataset]).reshape(-1, 1)
y_train = np.array([row["label"] for row in train_dataset])

ros = RandomOverSampler(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Rebuild the Dataset from the resampled columns (this replaces train_dataset).
resampled_columns = {
    "comment": X_resampled.flatten().tolist(),
    "label": y_resampled.tolist(),
}
train_dataset = Dataset.from_dict(resampled_columns)

# Show the per-class counts after resampling.
print(Counter(y_resampled))



Counter({np.int64(0): 12734, np.int64(1): 12734})


In [None]:
# Read the validation split into pandas, show a preview, then wrap it as a
# Hugging Face Dataset. Malformed CSV rows are skipped rather than raising.
val_csv_path = '/content/malayalam_offensive_speech_val.csv'
val_data = pd.read_csv(
    val_csv_path,
    encoding='utf-8',
    on_bad_lines='skip',
)
print("Dataset loaded successfully:")
print(val_data.head())
val_dataset = Dataset.from_pandas(val_data)

Dataset loaded successfully:
   label                                            comment
0      0        അണ്ണാ ഈ പദത്തിലെങ്കിലും ആയ ടോവിനോയെ പൂട്ടണം
1      0  idivett trailer . mammukka ennum njangalde swa...
2      0             entammo ikka powli powli ...heavy item
3      0        മസ്സാണ് മനസുമാണ് എല്ലാം ആണ് love u ഏട്ടാ ..
4      0  eppozhum abhinayikkan mohanlalum mamootyumea u...


In [None]:
# Load the tokenizer published with the google/muril-base-cased checkpoint;
# the same checkpoint name is used for the classification model further down.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

def tokenize_function(examples):
    """Tokenize the "comment" field of a single example or of a batch.

    Works for both `Dataset.map(..., batched=False)` (the field is one value)
    and `batched=True` (the field is a list). Any entry that is not a string
    is replaced by an empty string before tokenization. Every sequence is
    truncated and padded to exactly 128 tokens.
    """
    def _as_text(value):
        # Non-string values are tokenized as an empty string.
        return value if isinstance(value, str) else ""

    raw = examples["comment"]
    if isinstance(raw, list):
        cleaned = [_as_text(item) for item in raw]
    else:
        cleaned = _as_text(raw)

    return tokenizer(cleaned, padding='max_length', truncation=True, max_length=128)

# Tokenize the training split in batches, the same way the validation split is
# tokenized. tokenize_function accepts both single examples and batches, and
# every sequence is padded to a fixed length, so the resulting columns are the
# same as with per-example mapping; only the mapping speed changes.
malayalam_0_tokenized = train_dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Map:   0%|          | 0/25468 [00:00<?, ? examples/s]

In [None]:
# Tokenize the validation split in batches with the same tokenize_function
# that is applied to the training split.
malayalam_1_tokenized = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3312 [00:00<?, ? examples/s]

In [None]:
# Load MuRIL with a 2-way sequence-classification head. The head
# (classifier.weight / classifier.bias) is not part of the checkpoint and is
# freshly initialized, as the load message reports.
model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

# LoRA adapter settings: rank-16 adapters with scaling alpha=32 and 10% dropout
# on the attention query/key/value projections; no bias terms are trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value"],
    task_type=TaskType.SEQ_CLS,
)

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn as nn

def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Returns the same keys as the `compute_metrics` defined in the training
    cell further down ("accuracy" and macro-averaged "f1_score"), so the two
    definitions agree regardless of which cell was executed last.

    Args:
        eval_pred: pair `(logits, labels)`; logits are indexed as
            (example, class) and labels hold the class ids.

    Returns:
        dict with "accuracy" and "f1_score".
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit for each example.
    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    return {"accuracy": acc, "f1_score": f1}

In [None]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    Per-sample loss is ``(1 - p_t) ** gamma * CE`` where ``CE`` is the
    cross-entropy of the sample and ``p_t = exp(-CE)`` is the probability the
    model assigns to the true class. With ``alpha`` set, each sample's loss is
    additionally multiplied by the weight of its target class.

    Args:
        alpha: optional per-class weights indexed by class id. May be a 1-D
            tensor or any sequence of numbers; ``None`` disables weighting.
        gamma: focusing exponent; ``0`` reduces to (weighted) cross-entropy.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        # Accept plain Python sequences / numpy arrays as class weights.
        if alpha is not None and not torch.is_tensor(alpha):
            alpha = torch.as_tensor(alpha, dtype=torch.float)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Return the focal loss for `logits` (N, C) and class ids `targets` (N)."""
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            # alpha is a plain attribute, so it does not follow the module
            # across devices; align it with the loss tensor before gathering.
            alpha = self.alpha.to(device=focal_loss.device, dtype=focal_loss.dtype)
            alpha_t = alpha.gather(0, targets.view(-1))
            # Out-of-place multiply keeps the autograd graph free of in-place ops.
            focal_loss = focal_loss * alpha_t

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [None]:
# Build the class-weighted focal loss used for training.
# Reading the label column directly avoids decoding every row of the dataset.
labels = np.asarray(list(train_dataset["label"]))
class_counts = np.bincount(labels)

# Inverse-frequency weights, normalized to sum to 1. A class id with no
# examples gets weight 0 instead of producing a division by zero.
# NOTE(review): if this runs after the oversampling cell, the classes are
# already balanced, so the weights come out equal — confirm this is intended.
inverse_freq = np.where(class_counts > 0, 1.0 / np.maximum(class_counts, 1), 0.0)

# Use the GPU when one is available instead of assuming CUDA is present.
loss_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = torch.tensor(inverse_freq, dtype=torch.float, device=loss_device)
class_weights = class_weights / class_weights.sum()
loss_fn = FocalLoss(alpha=class_weights, gamma=2.0)

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

In [None]:
from transformers import Trainer, TrainingArguments


# Attach the LoRA adapters to the base model and report how many parameters
# remain trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# From this point on, train_dataset / val_dataset refer to the tokenized splits.
train_dataset, val_dataset = malayalam_0_tokenized, malayalam_1_tokenized


def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for a (logits, labels) pair."""
    logits, labels = eval_pred
    # Predicted class = index of the largest logit for each example.
    predictions = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_score": f1_score(labels, predictions, average="macro"),
    }


class CustomTrainer(Trainer):
    """Trainer that optimizes a focal loss instead of the model's built-in loss.

    Args:
        gamma: focusing exponent of the focal loss used by this trainer.
        *args, **kwargs: forwarded unchanged to `Trainer`.

    The class weights (and reduction) are taken from the module-level
    `loss_fn`; only the exponent is overridden, so the `gamma` passed here is
    the one that actually takes effect during training.
    """

    def __init__(self, gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        # Previously `gamma` was stored but never used and the global loss_fn
        # (with its own gamma) was applied. Build a loss that honors it.
        self.focal_loss = FocalLoss(
            alpha=loss_fn.alpha,
            gamma=gamma,
            reduction=loss_fn.reduction,
        )

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and score its logits with the focal loss.

        Returns `(loss, outputs)` when `return_outputs` is true, otherwise
        just the loss. `num_items_in_batch` is accepted for compatibility with
        the Trainer call signature and is not used.
        """
        labels = inputs["labels"]
        # Leave the caller's batch dict intact; the model is called without
        # labels so it does not compute its own loss.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        loss = self.focal_loss(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss


# Training configuration: evaluate and checkpoint once per epoch, log every
# 10 steps, and reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-4,
    num_train_epochs=9,
    per_device_train_batch_size=64,
    # Evaluation is forward-only, so it can use the training batch size
    # instead of the smaller library default.
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    # `eval_steps` was dropped: it only applies to the "steps" strategy and
    # was ignored with per-epoch evaluation.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="steps",
    logging_steps=10,
    gradient_accumulation_steps=1,
    # The PEFT wrapper hides the base model's signature, so Trainer warns that
    # it cannot infer the label column; name it explicitly.
    label_names=["labels"],
)

# Assemble the trainer: LoRA-wrapped model, tokenized splits, metrics callback.
# gamma is passed by keyword because it is CustomTrainer's first parameter.
trainer = CustomTrainer(
    gamma=3.0,
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


# Run the fine-tuning loop configured by training_args.
trainer.train()

# Persist the trainer's model to a local directory.
trainer.save_model("./malayalam-comment-classifier")

def classify_comment(text):
    """Classify a single comment with the fine-tuned model.

    Args:
        text: the comment to classify (one string).

    Returns:
        "Offensive" when the predicted class id is 1, otherwise
        "Not Offensive".
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Prepare the same object that runs the forward pass. Previously `model`
    # was moved and put in eval mode while `peft_model` was the one called.
    peft_model.to(device)
    peft_model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        logits = peft_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return "Offensive" if predicted_class == 1 else "Not Offensive"





trainable params: 886,274 || all params: 238,444,036 || trainable%: 0.3717


  super().__init__(*args, **kwargs)
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manvaylabmail[0m ([33manvaylabmail-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.0729,0.075199,0.785326,0.530055
2,0.0511,0.048308,0.878019,0.617109
3,0.0382,0.03371,0.924215,0.69253
4,0.026,0.026165,0.944143,0.730405
5,0.0209,0.031363,0.927838,0.699546


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.0729,0.075199,0.785326,0.530055
2,0.0511,0.048308,0.878019,0.617109
3,0.0382,0.03371,0.924215,0.69253
4,0.026,0.026165,0.944143,0.730405
5,0.0209,0.031363,0.927838,0.699546
6,0.0252,0.030818,0.934179,0.712674
7,0.0207,0.023049,0.954408,0.757123
8,0.0172,0.021259,0.957729,0.764551
9,0.0164,0.02267,0.95471,0.75803


In [None]:
import os

model_save_path = "./malayalam_model"
tokenizer_save_path = "./malayalam_tokenizer"

os.makedirs(model_save_path, exist_ok=True)
os.makedirs(tokenizer_save_path, exist_ok=True)

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

!zip -r malayalam_model.zip malayalam_model
!zip -r malayalam_tokenizer.zip malayalam_tokenizer

print("Model and tokenizer zipped. Download them from the file browser.")

Model saved to: ./malayalam_model
Tokenizer saved to: ./malayalam_tokenizer
  adding: malayalam_model/ (stored 0%)
  adding: malayalam_model/config.json (deflated 49%)
  adding: malayalam_model/model.safetensors (deflated 7%)
  adding: malayalam_tokenizer/ (stored 0%)
  adding: malayalam_tokenizer/special_tokens_map.json (deflated 42%)
  adding: malayalam_tokenizer/tokenizer.json (deflated 70%)
  adding: malayalam_tokenizer/vocab.txt (deflated 61%)
  adding: malayalam_tokenizer/tokenizer_config.json (deflated 75%)
Model and tokenizer zipped. Download them from the file browser.
