In [1]:
%pip install transformers datasets scikit-learn accelerate pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Extra packages installed on top of the core dependencies.
# NOTE(review): presumably required by the slow tokenizers that the training
# loop loads with use_fast=False — confirm which models need each package.
%pip install tiktoken protobuf
%pip install sentencepiece


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import gc

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the labelled sentences and store an integer-encoded copy of the string
# "label" column under "labels"; `le` keeps the fitted string -> int mapping.
data = pd.read_csv("final_dataset_4000.csv")
le = LabelEncoder()
data["labels"] = le.fit_transform(data["label"])

In [3]:
# Plain-text preview of the first five rows, including the new "labels" column.
print(data.head(n=5))

   Unnamed: 0                                            konkani     label  \
0           0                                     हागवणूय लागता.  negative   
1           1  फळांनी आनी भाजयांनी जावपी चरबी कुडी खातीर उपेग...  positive   
2           2  पूण ह्या स्वतंत्रतायेचे तात्पर्य हें न्हय की व...  positive   
3           3  गुदडींत तांबड्या बिहारा संदर्भांत ही गजाल खरी ...  positive   
4           4                                    स्वामी शनी आसा.   neutral   

   labels  
0       0  
1       2  
2       2  
3       2  
4       1  


In [4]:
# Keep only the text column and its integer label.
data_final = data.loc[:, ["konkani", "labels"]]

In [5]:
# Convert the frame to a Hugging Face Dataset and carve out a shuffled,
# seeded 80/20 train/test split in one expression.
dataset = Dataset.from_pandas(data_final).train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42,
)


In [6]:
def compute_metrics(eval_pred):
    """Score a batch of predictions for the Trainer.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with ``'accuracy'`` and the weighted-average ``'f1'`` of the
        arg-max class (taken over the last axis of the logits) against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(references, predicted_classes)
    weighted_f1 = f1_score(references, predicted_classes, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

In [7]:
# Hyperparameters passed to every Trainer built from `training_args`.
training_config = {
    "output_dir": "./results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 4,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "report_to": "none",  # No wandb
}
training_args = TrainingArguments(**training_config)


In [None]:
# Hugging Face model identifiers; the training loop loads each one with
# from_pretrained and fine-tunes them one after another.
model_names = [
    "ai4bharat/indic-bert",
    "google/muril-base-cased",
    "bert-base-multilingual-cased",
    "ibraheemmoosa/xlmindic-base-multiscript",
    "ibraheemmoosa/xlmindic-base-uniscript"
]

# Filled by the training loop with (model_name, eval_result) tuples.
results = []

In [9]:
# Fine-tune and evaluate every checkpoint in `model_names` on the same split,
# recording one (model_name, eval_result) tuple per model in `results`.
# NOTE(review): all runs share `training_args`, so they also share its
# output_dir; anything the Trainer saves there is not separated per model.
for model_name in model_names:
    print(f"Training {model_name}...")

    # Drop the previous iteration's model/trainer *before* loading the next
    # checkpoint so two models are never resident at once. The last trained
    # model and trainer stay bound after the loop finishes.
    model = trainer = None
    gc.collect()

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # Size the classification head from the fitted label encoder rather than
    # a hard-coded 3, so it follows the classes actually present in the data.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(le.classes_)
    )

    # Tokenize dataset
    def preprocess_function(examples):
        """Tokenize a batch of "konkani" texts, truncated/padded to 128 tokens."""
        return tokenizer(examples["konkani"], truncation=True, padding="max_length", max_length=128)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_result = trainer.evaluate()

    # `results` is created in another cell, so re-running this cell used to
    # append duplicates. Replace any earlier entry for this model instead.
    results[:] = [entry for entry in results if entry[0] != model_name]
    results.append((model_name, eval_result))

Training ai4bharat/indic-bert...


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3200/3200 [00:00<00:00, 5543.68 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 6678.85 examples/s]


Step,Training Loss
500,1.0984
1000,1.0471
1500,0.9151


Training google/muril-base-cased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3200/3200 [00:00<00:00, 4680.32 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 4743.83 examples/s]


Step,Training Loss
500,1.0851
1000,0.9763
1500,0.8572


Training bert-base-multilingual-cased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3200/3200 [00:00<00:00, 4223.85 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 4170.41 examples/s]


Step,Training Loss
500,1.0929
1000,0.9193
1500,0.6859


In [13]:
# Print each model's raw evaluation dict followed by its headline metrics.
print("\nComparison of models:")
metric_rows = (
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("Loss", "eval_loss"),
)
for model_name, eval_result in results:
    print("eval_result:", eval_result)
    print(f"Model: {model_name}")
    for metric_title, metric_key in metric_rows:
        print(f"{metric_title}: {eval_result[metric_key]:.4f}")
    print("-" * 30)


Comparison of models:
eval_result: {'eval_loss': 1.0089890956878662, 'eval_accuracy': 0.53125, 'eval_f1': 0.5292116352280318, 'eval_runtime': 3.4883, 'eval_samples_per_second': 229.336, 'eval_steps_per_second': 28.667, 'epoch': 4.0}
Model: ai4bharat/indic-bert
Accuracy: 0.5312
F1 Score: 0.5292
Loss: 1.0090
------------------------------
eval_result: {'eval_loss': 0.9235853552818298, 'eval_accuracy': 0.6075, 'eval_f1': 0.6064303548823888, 'eval_runtime': 3.4138, 'eval_samples_per_second': 234.345, 'eval_steps_per_second': 29.293, 'epoch': 4.0}
Model: google/muril-base-cased
Accuracy: 0.6075
F1 Score: 0.6064
Loss: 0.9236
------------------------------
eval_result: {'eval_loss': 1.0604640245437622, 'eval_accuracy': 0.5725, 'eval_f1': 0.5732235038615129, 'eval_runtime': 3.4044, 'eval_samples_per_second': 234.992, 'eval_steps_per_second': 29.374, 'epoch': 4.0}
Model: bert-base-multilingual-cased
Accuracy: 0.5725
F1 Score: 0.5732
Loss: 1.0605
------------------------------
