## Importing Required Dependencies

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score

## Loading MuRIL Model and Tokenizer

In [None]:
model_name = "google/muril-base-cased"

# Seed PyTorch before the model is constructed. The 2-label classification head
# requested below is newly initialised on top of the base checkpoint, so without
# a fixed seed its starting weights (and therefore every downstream metric)
# change on each kernel restart.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
print('Loading model')
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary classification
print('Model Loaded')

# This notebook is written for CPU-only execution (see fp16=False in the
# training arguments), so the device is pinned rather than auto-detected.
device = torch.device("cpu")
model.to(device)
print('device check done')

## Tamil dataset and Preprocessing

In [3]:
# Toy corpus: each Tamil clinical note is paired with its sentiment label
# (1 = Positive, 0 = Negative). English glosses are given in the comments.
labelled_notes = [
    ("சிகிச்சை நன்றாக வேலை செய்தது", 1),            # Treatment worked well
    ("மருந்து கொடுத்த பிறகு வலி அதிகரித்தது", 0),   # Pain increased after medication
    ("மருத்துவமனை சேவை சிறப்பாக இருந்தது", 1),      # Hospital service was excellent
    ("காய்ச்சல் குறையவில்லை", 0),                   # Fever did not reduce
    ("மருத்துவர்கள் கவனமாக பராமரித்தனர்", 1),       # Doctors cared attentively
]

# Column-oriented view of the same pairs, in the layout pandas expects.
data = {
    "text": [note for note, _ in labelled_notes],
    "label": [label for _, label in labelled_notes],
}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# Hold out the final row for evaluation; everything before it is training data.
split_at = 4
train_dataset = dataset.select(range(split_at))
eval_dataset = dataset.select(range(split_at, len(dataset)))

In [4]:
# Preprocess Tamil text
def preprocess_function(examples):
    """Tokenize a batch of examples for use with ``Dataset.map(batched=True)``.

    Args:
        examples: Mapping with a "text" entry holding the batch of raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch. Every
        sequence is truncated/padded to exactly 128 tokens, so rows can be
        stacked into a batch without a padding-aware collator.
    """
    # No return_tensors here: the mapped output is written back into the
    # dataset's column storage, so building framework tensors at this point
    # is wasted work. Tensor conversion is left to the batching step.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Tokenize each split (dropping the raw "text" column) and rename the target
# column from "label" to "labels", the argument name Hugging Face
# sequence-classification models use for their targets.
encoded_train = (
    train_dataset
    .map(preprocess_function, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)
encoded_eval = (
    eval_dataset
    .map(preprocess_function, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A two-item ``(logits, labels)`` pair. The class scores are
            expected along the last axis of ``logits``.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    raw_scores, gold = eval_pred
    # The predicted class is the index of the highest score per example.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return {"accuracy": accuracy_score(gold, predicted_classes)}

print(' 1 okay')

# Training arguments for a tiny CPU-only fine-tuning run.
# With 4 training rows, batch size 1 and 2 accumulation steps, each epoch is
# 2 optimizer steps, i.e. 12 steps over 6 epochs.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Forward slashes resolve correctly on Windows and POSIX alike; the former
    # backslash literals produced a directory literally named ".\..." off Windows.
    output_dir="./muril_tamil_output",
    num_train_epochs=6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=2,
    weight_decay=0.01,
    logging_dir="./muril_tamil_logs",
    logging_steps=1,  # log every optimizer step; the run is only a handful of steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=2,
    fp16=False,  # CPU-only
)

print(' 2 okay')

# Build the Trainer that ties together the model, the run configuration,
# both encoded splits and the accuracy callback. Training itself is started
# in a later cell.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=encoded_eval,
    train_dataset=encoded_train,
)

print(' 3 okay')

 1 okay
 2 okay




 3 okay


## Training Model

In [9]:
# Run the fine-tuning loop; the return value is kept so the final training
# summary stays inspectable after the cell finishes.
print("Starting MuRIL fine-tuning for Tamil sentiment classification...")
train_output = trainer.train()
print("Fine-tuning completed!")

Starting MuRIL fine-tuning for Tamil sentiment classification...


  0%|          | 0/12 [00:00<?, ?it/s]

{'loss': 0.69, 'grad_norm': 0.19738243520259857, 'learning_rate': 2.5e-05, 'epoch': 0.5}
{'loss': 0.6939, 'grad_norm': 0.14221253991127014, 'learning_rate': 5e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6943386793136597, 'eval_accuracy': 0.0, 'eval_runtime': 1.0004, 'eval_samples_per_second': 1.0, 'eval_steps_per_second': 1.0, 'epoch': 1.0}
{'loss': 0.6922, 'grad_norm': 0.12722145020961761, 'learning_rate': 4.5e-05, 'epoch': 1.5}
{'loss': 0.6925, 'grad_norm': 0.14881107211112976, 'learning_rate': 4e-05, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.693554699420929, 'eval_accuracy': 0.0, 'eval_runtime': 0.7289, 'eval_samples_per_second': 1.372, 'eval_steps_per_second': 1.372, 'epoch': 2.0}
{'loss': 0.6904, 'grad_norm': 0.13887521624565125, 'learning_rate': 3.5e-05, 'epoch': 2.5}
{'loss': 0.6908, 'grad_norm': 0.18203774094581604, 'learning_rate': 3e-05, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6931657791137695, 'eval_accuracy': 0.0, 'eval_runtime': 0.8644, 'eval_samples_per_second': 1.157, 'eval_steps_per_second': 1.157, 'epoch': 3.0}
{'loss': 0.69, 'grad_norm': 0.21788634359836578, 'learning_rate': 2.5e-05, 'epoch': 3.5}
{'loss': 0.6894, 'grad_norm': 0.16752812266349792, 'learning_rate': 2e-05, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.693187415599823, 'eval_accuracy': 0.0, 'eval_runtime': 0.7104, 'eval_samples_per_second': 1.408, 'eval_steps_per_second': 1.408, 'epoch': 4.0}
{'loss': 0.6888, 'grad_norm': 0.18691694736480713, 'learning_rate': 1.5e-05, 'epoch': 4.5}
{'loss': 0.6876, 'grad_norm': 0.17849470674991608, 'learning_rate': 1e-05, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6934930086135864, 'eval_accuracy': 0.0, 'eval_runtime': 1.0519, 'eval_samples_per_second': 0.951, 'eval_steps_per_second': 0.951, 'epoch': 5.0}
{'loss': 0.6845, 'grad_norm': 0.21268273890018463, 'learning_rate': 5e-06, 'epoch': 5.5}
{'loss': 0.6871, 'grad_norm': 0.1666143834590912, 'learning_rate': 0.0, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6934459805488586, 'eval_accuracy': 0.0, 'eval_runtime': 1.0962, 'eval_samples_per_second': 0.912, 'eval_steps_per_second': 0.912, 'epoch': 6.0}
{'train_runtime': 397.7181, 'train_samples_per_second': 0.06, 'train_steps_per_second': 0.03, 'train_loss': 0.6897701869408289, 'epoch': 6.0}
Fine-tuning completed!


## Saving trained model

In [25]:
# Persist the fine-tuned weights and the tokenizer files into one directory so
# they can be reloaded together.
# NOTE(review): the backslash literal is Windows-style; the cell that reloads
# the model uses the same literal, so the two must be changed together.
save_dir = r".\muril_tamil_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [2]:
#Inference Function
def predict_sentiment(text, model, tokenizer):
    """Classify one piece of text as Positive or Negative.

    Args:
        text: The input to classify. If the tokenizer produces several rows
            (e.g. a list of strings), only the first row is reported.
        model: A torch module whose output exposes ``.logits`` with the two
            class scores on the last axis (index 1 = positive).
        tokenizer: Callable that turns ``text`` into a mapping of tensors.

    Returns:
        ``(sentiment, positive_probability)`` where ``sentiment`` is
        ``"Positive"`` or ``"Negative"`` and ``positive_probability`` is the
        softmax probability of class 1 as a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Follow the model's own device instead of a notebook-level global, so the
    # function works no matter which cell defined (or redefined) `device`.
    target_device = next(model.parameters()).device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # Inference must run with dropout disabled; a model that has just been
    # trained is still in training mode. The previous mode is restored after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    probs = torch.softmax(outputs.logits, dim=-1)
    # Take the label and the probability from the same row. An argmax over the
    # flattened tensor would mix rows whenever more than one is present.
    first_row = probs[0]
    sentiment = "Positive" if first_row.argmax().item() == 1 else "Negative"
    return sentiment, first_row[1].item()  # Return sentiment and positive probability

## Loading fine-tuned model

In [4]:
# Reload the saved artefacts from disk so inference does not depend on the
# in-memory objects left over from training.
device = torch.device("cpu")
model_dir = r".\muril_tamil_model"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

In [24]:
# Example predictions: each entry is (Tamil note, English gloss).
example_notes = [
    ("சிகிச்சை சரியாக வேலை செய்யவில்லை", "Treatment didn't work well"),
    (
        "சிகிச்சை நன்றாக வேலை செய்தது, மருத்துவர்கள் மிகவும் உதவினார்கள்",
        "Treatment worked well, doctors helped a lot",
    ),
]

for number, (note, gloss) in enumerate(example_notes, start=1):
    label, positive_prob = predict_sentiment(note, loaded_model, loaded_tokenizer)
    # A blank line separates consecutive examples, but none precedes the first.
    if number > 1:
        print()
    print(f"Example {number}")
    print(f"Text: {note}, ({gloss})")
    print(f"Sentiment: {label}, Positive Probability: {positive_prob:.3f}")

Example 1
Text: சிகிச்சை சரியாக வேலை செய்யவில்லை, (Treatment didn't work well)
Sentiment: Negative, Positive Probability: 0.500

Example 2
Text: சிகிச்சை நன்றாக வேலை செய்தது, மருத்துவர்கள் மிகவும் உதவினார்கள், (Treatment worked well, doctors helped a lot)
Sentiment: Positive, Positive Probability: 0.501
