In [None]:
!pip install transformers peft datasets torch evaluate



In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Training split. Rows the CSV parser cannot read are skipped (on_bad_lines='skip')
# rather than raising, so the loaded frame may be shorter than the file.
train_csv_path = '/content/kannada_dataset_0.csv'
train_data = pd.read_csv(train_csv_path, encoding='utf-8', on_bad_lines='skip')

print("Dataset loaded successfully:")
print(train_data.head())

# Hugging Face `Dataset` view of the same frame, used by the cells below.
train_dataset = Dataset.from_pandas(train_data)

Dataset loaded successfully:
   label                                            comment
0      0  tik tok alli jagala madtidralla adra baggenu o...
1      0                          movie rerelease madi plss
2      0  amazon prime alli bittidira....yella manele no...
3      0  guru sure news nanu tik tok dawn lod madeda ya...
4      0  ಸುದೀಪ್ ಸರ್ ಅಂಡ್ ದರ್ಶನ್ ಸರ್ ಅವರಿಗೆ ಇರೋ ಫ್ಯಾನ್ಸ್...


In [None]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from datasets import Dataset


# Balance the classes by randomly duplicating minority-class rows.
# RandomOverSampler expects a 2-D feature matrix, so the comment column is
# reshaped to (n_samples, 1) before resampling and flattened again afterwards.
X_train = np.array(list(train_dataset["comment"])).reshape(-1, 1)
y_train = np.array(list(train_dataset["label"]))

ros = RandomOverSampler(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Rebuild the training dataset from the resampled arrays; from here on
# `train_dataset` is the balanced set.
train_dataset = Dataset.from_dict(
    {
        "comment": X_resampled.flatten().tolist(),
        "label": y_resampled.tolist(),
    }
)

print(Counter(y_resampled))



Counter({np.int64(0): 3544, np.int64(1): 3544})


In [None]:
# Validation split, read with the same options as the training split
# (unparseable rows are skipped, not raised). It is not resampled.
val_csv_path = '/content/kannada_dataset_1.csv'
val_data = pd.read_csv(val_csv_path, encoding='utf-8', on_bad_lines='skip')

print("Dataset loaded successfully:")
print(val_data.head())

val_dataset = Dataset.from_pandas(val_data)

Dataset loaded successfully:
   label                                            comment
0      0                                     100 days pakka
1      0  sari nivu video na roast madi adre madvaga pub...
2      0                        krishana shapa tatteleebeku
3      0  ಕನ್ನಡ ಇವತ್ತು ರಾಷ್ಟ್ರೀಯ ಮತ್ತು ಅಂತಾರಾಷ್ಟ್ರೀಯ ಮಟ್...
4      0  ಕೇಳಿ ಕಾದಿರುವ ಭಾಂದವರೇnಭುವಿಯಲ್ಲಿ ಅವನ ಅರಿತವರೆnಯಾರ...


In [None]:
# Tokenizer for the "google/muril-base-cased" checkpoint; the classifier cell
# loads the model from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")

def tokenize_function(examples):
    """Tokenize the ``comment`` field of a single example or of a batch.

    ``examples["comment"]`` may be one value or a list of values. Any value
    that is not a ``str`` is replaced by an empty string before tokenization.
    The output is padded and truncated to exactly 128 tokens.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the cleaned text.
    """

    def _text_or_blank(value):
        # Non-string entries are tokenized as empty text.
        return value if isinstance(value, str) else ""

    raw = examples["comment"]
    if isinstance(raw, list):
        cleaned = [_text_or_blank(item) for item in raw]
    else:
        cleaned = _text_or_blank(raw)

    return tokenizer(cleaned, padding='max_length', truncation=True, max_length=128)



# Tokenize the (oversampled) training set in batches, matching how the
# validation set is mapped. `tokenize_function` accepts lists, and every
# example is padded to the same fixed max_length, so the result is the same
# as row-by-row mapping.
kannada_0_tokenized = train_dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/7088 [00:00<?, ? examples/s]

In [None]:
# Tokenize the validation set in batches; `tokenize_function` receives a list
# of comments per call when batched=True.
kannada_1_tokenized = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/586 [00:00<?, ? examples/s]

In [None]:
# Base encoder with a 2-class sequence-classification head.
base_checkpoint = "google/muril-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint, num_labels=2)

# LoRA settings: rank-16 adapters (alpha 32, dropout 0.1) attached to the
# modules named "query", "key" and "value"; bias terms are not adapted.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value"],
    task_type=TaskType.SEQ_CLS,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn as nn

def compute_metrics(eval_pred):
    """Return the accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class is the arg-max over axis 1 of the logits.

    NOTE(review): a later cell defines ``compute_metrics`` again (accuracy
    plus macro F1). That later definition is the one bound when the trainer
    is constructed.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

In [None]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    The per-sample loss is ``(1 - p_t) ** gamma * CE``, where ``CE`` is the
    cross-entropy of the sample and ``p_t = exp(-CE)``. It is optionally
    scaled by a per-class weight looked up from ``alpha``.

    Args:
        alpha: Optional per-class weights indexed by class id. May be a
            tensor or any sequence convertible to one (for example a list);
            ``None`` disables class weighting.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        # Accept plain sequences as well as tensors so callers do not have to
        # build (and place) a tensor themselves.
        if alpha is not None and not torch.is_tensor(alpha):
            alpha = torch.as_tensor(alpha, dtype=torch.float)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Compute the focal loss.

        Args:
            logits: Unnormalised class scores, as accepted by
                ``F.cross_entropy``.
            targets: Integer class indices, as accepted by
                ``F.cross_entropy``.

        Returns:
            A scalar tensor for ``'mean'`` / ``'sum'`` reduction, otherwise a
            tensor of per-sample losses.
        """
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            # Look the weights up on the logits' device so a CPU-resident
            # `alpha` works with GPU batches (and vice versa). Indexing keeps
            # the shape of `targets`, so it lines up with `focal_loss`.
            alpha_t = self.alpha.to(logits.device)[targets]
            focal_loss = focal_loss * alpha_t

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [None]:
# Inverse-frequency class weights for the focal loss, normalised to sum to 1.
labels = np.array(list(train_dataset["label"]))
class_counts = np.bincount(labels)

# Place the weights on the GPU when one is available and fall back to the CPU
# otherwise, instead of assuming CUDA is present.
loss_device = "cuda" if torch.cuda.is_available() else "cpu"
class_weights = torch.tensor(1.0 / class_counts, dtype=torch.float).to(loss_device)
class_weights = class_weights / class_weights.sum()
loss_fn = FocalLoss(alpha=class_weights, gamma=2.0)

In [None]:
from transformers import Trainer, TrainingArguments


# Wrap the base classifier with the adapters described by `lora_config` and
# print how many parameters are trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# From here on `train_dataset` / `val_dataset` refer to the tokenized datasets
# produced by the `.map(tokenize_function, ...)` cells.
train_dataset = kannada_0_tokenized
val_dataset = kannada_1_tokenized


def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for a ``(logits, labels)`` pair.

    The predicted class is the arg-max over axis 1 of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(np.asarray(scores), axis=1)

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_score": f1_score(gold, predicted, average="macro"),
    }


class CustomTrainer(Trainer):
    """``Trainer`` that optimises a class-weighted focal loss.

    Args:
        gamma: Focusing exponent of the focal loss used by this trainer.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, gamma=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma
        # Honour the `gamma` passed to this trainer. Previously it was stored
        # but the loss always came from the module-level `loss_fn` with that
        # object's own gamma. The class weights and reduction are still taken
        # from `loss_fn`.
        self.focal_loss = FocalLoss(
            alpha=loss_fn.alpha,
            gamma=gamma,
            reduction=loss_fn.reduction,
        )

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and score its logits with the focal loss.

        ``num_items_in_batch`` is accepted but not used here.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        # Labels are removed from `inputs` before the forward pass, so the
        # loss is computed only here, from the returned logits.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = self.focal_loss(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss


# Training configuration: evaluate and checkpoint once per epoch so the best
# checkpoint can be reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # NOTE(review): evaluation is scheduled per epoch above, so this step
    # interval looks unused - confirm before relying on it.
    eval_steps=50,
    logging_strategy="steps",
    logging_steps=10,
    gradient_accumulation_steps=1,
    # The Trainer warns "No label_names provided" for the PEFT-wrapped model,
    # so the label input is named explicitly.
    label_names=["labels"],
)


trainer = CustomTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    gamma=3.0,
)


trainer.train()


# NOTE(review): the output directory says "malayalam" while the data files
# loaded above are named "kannada_dataset_*" - confirm the intended name.
trainer.save_model("./malayalam-comment-classifier")


def classify_comment(text):
    """Classify one comment with the LoRA-adapted model.

    Args:
        text: The comment to classify.

    Returns:
        ``"Offensive"`` when the predicted class index is 1, otherwise
        ``"Not Offensive"``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare the same object that performs the forward pass. Previously the
    # base `model` was moved and put in eval mode while inference ran through
    # `peft_model`.
    peft_model.to(device)
    peft_model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        logits = peft_model(**inputs).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return "Offensive" if predicted_class == 1 else "Not Offensive"





trainable params: 886,274 || all params: 238,444,036 || trainable%: 0.3717


  super().__init__(*args, **kwargs)
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33manvaylabmail[0m ([33manvaylabmail-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.0843,0.087802,0.564846,0.56039
2,0.0808,0.085014,0.663823,0.644386
3,0.0769,0.080184,0.71843,0.689454
4,0.0747,0.079565,0.716724,0.688637
5,0.0714,0.078974,0.716724,0.690018
6,0.0693,0.074981,0.737201,0.700961
7,0.0681,0.076211,0.733788,0.703964
8,0.0677,0.073284,0.75256,0.716497
9,0.0685,0.072418,0.750853,0.712513
10,0.0656,0.073076,0.745734,0.707858


In [None]:
import os

model_save_path = "./malayalam_model"
tokenizer_save_path = "./malayalam_tokenizer"

os.makedirs(model_save_path, exist_ok=True)
os.makedirs(tokenizer_save_path, exist_ok=True)

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

!zip -r malayalam_model.zip malayalam_model
!zip -r malayalam_tokenizer.zip malayalam_tokenizer

print("Model and tokenizer zipped. Download them from the file browser.")

Model saved to: ./malayalam_model
Tokenizer saved to: ./malayalam_tokenizer
  adding: malayalam_model/ (stored 0%)
  adding: malayalam_model/config.json (deflated 49%)
  adding: malayalam_model/model.safetensors (deflated 7%)
  adding: malayalam_tokenizer/ (stored 0%)
  adding: malayalam_tokenizer/special_tokens_map.json (deflated 42%)
  adding: malayalam_tokenizer/tokenizer.json (deflated 70%)
  adding: malayalam_tokenizer/vocab.txt (deflated 61%)
  adding: malayalam_tokenizer/tokenizer_config.json (deflated 75%)
Model and tokenizer zipped. Download them from the file browser.
