In [None]:
# Import necessary libraries
!pip install transformers torch pandas scikit-learn openpyxl
!pip install git+https://github.com/csebuetnlp/normalizer # Required for BanglaBERT normalization

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from normalizer import normalize # BanglaBERT-specific normalization
import os

# **BanglaBERT**

In [None]:
# Define dataset path
dataset_path = '/content/sampled_10_percent_data.xlsx' # Update if needed

# Load and preprocess the dataset
def load_dataset(file_path):
    """Read the Excel sheet and return comment texts with integer-encoded tags.

    Rows with a missing comment or tag are dropped first; otherwise
    ``astype(str)`` would turn a missing comment into a literal placeholder
    string that is then trained on as if it were real text.

    Args:
        file_path: path of the Excel file; it must have 'comments' and 'tag' columns.

    Returns:
        (texts, encoded_labels, label_encoder): the list of comment strings, the
        integer codes produced by ``LabelEncoder.fit_transform``, and the fitted encoder.
    """
    df = pd.read_excel(file_path)
    df = df.dropna(subset=['comments', 'tag'])

    texts = df['comments'].astype(str).tolist()
    labels = df['tag'].tolist()

    # LabelEncoder assigns codes in sorted order of the distinct tag values, so the
    # mapping depends on the data; print it rather than assuming a fixed order.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    label_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}

    print(f"Dataset size: {len(texts)} samples")
    print(f"Label mapping: {label_mapping}")
    return texts, encoded_labels, label_encoder

# Custom Dataset class for PyTorch
class BanglaSentimentDataset(Dataset):
    """Torch dataset that normalizes and tokenizes one comment per access.

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to
    1-D) and 'labels' (a long tensor built from the matching label).
    """

    def __init__(self, texts, labels, tokenizer, max_length=64): # Reduced max_length
        # Only references are stored; normalization and tokenization happen lazily.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # BanglaBERT expects text passed through the csebuetnlp normalizer.
        normalized = normalize(self.texts[idx])
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # every sample is padded to the same length
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(normalized, **tokenizer_kwargs)

        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: precision is undefined for a class that is never predicted.
    # Scoring it as 0 explicitly yields the same numbers as the default but without
    # an UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to train and evaluate BanglaBERT
def train_and_evaluate(train_dataset, eval_dataset, output_dir,
                       model_name='csebuetnlp/banglabert', num_labels=3):
    """Fine-tune a sequence-classification checkpoint with Trainer and evaluate it.

    Args:
        train_dataset: dataset handed to Trainer as ``train_dataset``.
        eval_dataset: dataset handed to Trainer as ``eval_dataset``.
        output_dir: directory passed to TrainingArguments as ``output_dir``.
        model_name: Hugging Face model id to fine-tune (defaults to BanglaBERT).
        num_labels: size of the classification head (default 3: Positive, Negative, Neutral).

    Returns:
        The metrics returned by ``trainer.evaluate()`` after training.
    """
    # No tokenizer is loaded here: nothing in this function uses one.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Trainer places the model on the available device itself, so no manual .to(device).
    # fp16 mixed precision needs CUDA; hard-coding fp16=True makes TrainingArguments
    # fail on a CPU-only runtime, so enable it only when a GPU is present.
    use_fp16 = torch.cuda.is_available()

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4, # Reduced batch size
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
        warmup_steps=200, # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to='none', # Disable Weights & Biases
        fp16=use_fp16 # Mixed precision only when CUDA is available
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating...")
    eval_results = trainer.evaluate()

    return eval_results

# Main execution
def main():
    """Run the BanglaBERT pipeline: load, split, tokenize, fine-tune, report, back up."""
    import shutil  # local import: only needed for the optional Drive backup below

    # Load dataset
    texts, encoded_labels, label_encoder = load_dataset(dataset_path)

    # Split dataset (80/20 hold-out with a fixed seed).
    # NOTE(review): the split is not stratified; consider stratify=encoded_labels if
    # the classes are imbalanced -- confirm every class has at least two samples first.
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(
        texts, encoded_labels, test_size=0.2, random_state=42
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained('csebuetnlp/banglabert')

    # Prepare datasets
    train_dataset = BanglaSentimentDataset(train_texts, train_labels, tokenizer)
    eval_dataset = BanglaSentimentDataset(eval_texts, eval_labels, tokenizer)

    # Train and evaluate
    output_dir = './results_banglabert'
    eval_results = train_and_evaluate(train_dataset, eval_dataset, output_dir)

    # Print results
    print("\n=== BanglaBERT Results ===")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1-Score: {eval_results['eval_f1']:.4f}")

    # Save results to Google Drive, but only if Drive is actually mounted; the
    # previous unconditional `cp` failed with "No such file or directory" otherwise.
    drive_dir = '/content/drive/MyDrive'
    if os.path.isdir(drive_dir):
        backup_dir = os.path.join(drive_dir, os.path.basename(output_dir))
        shutil.copytree(output_dir, backup_dir, dirs_exist_ok=True)
        print(f"Results copied to {backup_dir}")
    else:
        print(f"Google Drive is not mounted at {drive_dir}; skipping backup of {output_dir}.")

if __name__ == "__main__":
    main()

Dataset size: 6148 samples


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5373,0.525953,0.821951,0.795793,0.821951,0.808525
2,0.3074,0.591004,0.825203,0.803753,0.825203,0.811858
3,0.426,0.65768,0.828455,0.803199,0.828455,0.815047


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating...



=== BanglaBERT Results ===
Accuracy: 0.8285
Precision: 0.8032
Recall: 0.8285
F1-Score: 0.8150
cp: cannot create directory '/content/drive/MyDrive/': No such file or directory


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Bundle the training output directory into a single archive (shell command via `!`).
!zip -r banglabert_model.zip ./results_banglabert
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import files
# Ask Colab to download the archive created above.
files.download("banglabert_model.zip")


  adding: results_banglabert/ (stored 0%)
  adding: results_banglabert/checkpoint-1845/ (stored 0%)
  adding: results_banglabert/checkpoint-1845/model.safetensors (deflated 7%)
  adding: results_banglabert/checkpoint-1845/scheduler.pt (deflated 56%)
  adding: results_banglabert/checkpoint-1845/rng_state.pth (deflated 24%)
  adding: results_banglabert/checkpoint-1845/config.json (deflated 55%)
  adding: results_banglabert/checkpoint-1845/training_args.bin (deflated 51%)
  adding: results_banglabert/checkpoint-1845/trainer_state.json (deflated 81%)
  adding: results_banglabert/checkpoint-1845/optimizer.pt (deflated 24%)
  adding: results_banglabert/checkpoint-1230/ (stored 0%)
  adding: results_banglabert/checkpoint-1230/model.safetensors (deflated 7%)
  adding: results_banglabert/checkpoint-1230/scheduler.pt (deflated 55%)
  adding: results_banglabert/checkpoint-1230/rng_state.pth (deflated 24%)
  adding: results_banglabert/checkpoint-1230/config.json (deflated 55%)
  adding: results_ba

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **mBERT**

In [None]:
# Define dataset path
dataset_path = '/content/sampled_10_percent_data.xlsx' # Update if needed

# Load and preprocess the dataset
def load_dataset(file_path):
    """Read the Excel sheet and return comment texts with integer-encoded tags.

    Rows with a missing comment or tag are dropped first; otherwise
    ``astype(str)`` would turn a missing comment into a literal placeholder
    string that is then trained on as if it were real text.

    Args:
        file_path: path of the Excel file; it must have 'comments' and 'tag' columns.

    Returns:
        (texts, encoded_labels, label_encoder): the list of comment strings, the
        integer codes produced by ``LabelEncoder.fit_transform``, and the fitted encoder.
    """
    df = pd.read_excel(file_path)
    df = df.dropna(subset=['comments', 'tag'])

    texts = df['comments'].astype(str).tolist()
    labels = df['tag'].tolist()

    # LabelEncoder assigns codes in sorted order of the distinct tag values, so the
    # mapping depends on the data; print it rather than assuming a fixed order.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    label_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}

    print(f"Dataset size: {len(texts)} samples")
    print(f"Label mapping: {label_mapping}")
    return texts, encoded_labels, label_encoder

# Custom Dataset class for PyTorch
class BanglaSentimentDataset(Dataset):
    """Torch dataset that tokenizes one raw comment per access (no normalization).

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to
    1-D) and 'labels' (a long tensor built from the matching label).
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        # Only references are stored; tokenization happens lazily in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # every sample is padded to the same length
            truncation=True,
            return_tensors='pt',
        )
        # mBERT receives the raw text as stored.
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: precision is undefined for a class that is never predicted.
    # Scoring it as 0 explicitly yields the same numbers as the default but without
    # an UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to train and evaluate mBERT
def train_and_evaluate(train_dataset, eval_dataset, output_dir,
                       model_name='bert-base-multilingual-cased', num_labels=3):
    """Fine-tune a sequence-classification checkpoint with Trainer and evaluate it.

    Args:
        train_dataset: dataset handed to Trainer as ``train_dataset``.
        eval_dataset: dataset handed to Trainer as ``eval_dataset``.
        output_dir: directory passed to TrainingArguments as ``output_dir``.
        model_name: Hugging Face model id to fine-tune (defaults to mBERT).
        num_labels: size of the classification head (default 3: Positive, Negative, Neutral).

    Returns:
        The metrics returned by ``trainer.evaluate()`` after training.
    """
    # No tokenizer is loaded here: nothing in this function uses one.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Trainer places the model on the available device itself, so no manual .to(device).
    # fp16 mixed precision needs CUDA; hard-coding fp16=True makes TrainingArguments
    # fail on a CPU-only runtime, so enable it only when a GPU is present.
    use_fp16 = torch.cuda.is_available()

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4, # Reduced batch size
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
        warmup_steps=200, # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to='none', # Disable Weights & Biases
        fp16=use_fp16 # Mixed precision only when CUDA is available
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating...")
    eval_results = trainer.evaluate()

    return eval_results

# Main execution
def main():
    """Run the mBERT pipeline end to end: load, split, tokenize, fine-tune, report."""
    corpus, targets, label_encoder = load_dataset(dataset_path)

    # 80/20 hold-out; the fixed seed makes the partition repeatable.
    split = train_test_split(corpus, targets, test_size=0.2, random_state=42)
    train_texts, eval_texts, train_labels, eval_labels = split

    # One tokenizer instance is shared by both dataset wrappers.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    train_dataset = BanglaSentimentDataset(train_texts, train_labels, tokenizer)
    eval_dataset = BanglaSentimentDataset(eval_texts, eval_labels, tokenizer)

    eval_results = train_and_evaluate(train_dataset, eval_dataset, './results_mbert')

    # Report the final evaluation metrics.
    print("\n=== mBERT Results ===")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1-Score: {eval_results['eval_f1']:.4f}")


if __name__ == "__main__":
    main()

Dataset size: 6148 samples


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5916,0.638748,0.770732,0.745703,0.770732,0.757586
2,0.4617,0.529939,0.793496,0.769314,0.793496,0.779205
3,0.4008,0.679903,0.804878,0.778562,0.804878,0.791437


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== mBERT Results ===
Accuracy: 0.8049
Precision: 0.7786
Recall: 0.8049
F1-Score: 0.7914


# **XLM-R**

In [None]:
# Install required libraries
!pip install transformers torch pandas scikit-learn openpyxl tqdm

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Define dataset path
dataset_path = '/content/sampled_10_percent_data.xlsx' # Update if needed

# Load and preprocess the dataset
def load_dataset(file_path):
    """Read the Excel sheet and return comment texts with integer-encoded tags.

    Rows with a missing comment or tag are dropped first; otherwise
    ``astype(str)`` would turn a missing comment into a literal placeholder
    string that is then trained on as if it were real text.

    Args:
        file_path: path of the Excel file; it must have 'comments' and 'tag' columns.

    Returns:
        (texts, encoded_labels, label_encoder): the list of comment strings, the
        integer codes produced by ``LabelEncoder.fit_transform``, and the fitted encoder.
    """
    df = pd.read_excel(file_path)
    df = df.dropna(subset=['comments', 'tag'])

    texts = df['comments'].astype(str).tolist()
    labels = df['tag'].tolist()

    # LabelEncoder assigns codes in sorted order of the distinct tag values, so the
    # mapping depends on the data; print it rather than assuming a fixed order.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    label_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}

    print(f"Dataset size: {len(texts)} samples")
    print(f"Label mapping: {label_mapping}")
    return texts, encoded_labels, label_encoder

# Custom Dataset class for PyTorch
class BanglaSentimentDataset(Dataset):
    """Torch dataset that tokenizes one raw comment per access (no normalization).

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to
    1-D) and 'labels' (a long tensor built from the matching label).
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        # Only references are stored; tokenization happens lazily in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # every sample is padded to the same length
            truncation=True,
            return_tensors='pt',
        )
        # XLM-R receives the raw text as stored.
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: precision is undefined for a class that is never predicted.
    # Scoring it as 0 explicitly yields the same numbers as the default but without
    # an UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to train and evaluate XLM-R
def train_and_evaluate(train_dataset, eval_dataset, output_dir,
                       model_name='xlm-roberta-base', num_labels=3):
    """Fine-tune a sequence-classification checkpoint with Trainer and evaluate it.

    Args:
        train_dataset: dataset handed to Trainer as ``train_dataset``.
        eval_dataset: dataset handed to Trainer as ``eval_dataset``.
        output_dir: directory passed to TrainingArguments as ``output_dir``.
        model_name: Hugging Face model id to fine-tune (defaults to XLM-R base).
        num_labels: size of the classification head (default 3: Positive, Negative, Neutral).

    Returns:
        The metrics returned by ``trainer.evaluate()`` after training.
    """
    # No tokenizer is loaded here: nothing in this function uses one.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Trainer places the model on the available device itself, so no manual .to(device).
    # fp16 mixed precision needs CUDA; hard-coding fp16=True makes TrainingArguments
    # fail on a CPU-only runtime, so enable it only when a GPU is present.
    use_fp16 = torch.cuda.is_available()

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4, # Reduced batch size
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
        warmup_steps=200, # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to='none', # Disable Weights & Biases
        fp16=use_fp16 # Mixed precision only when CUDA is available
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating...")
    eval_results = trainer.evaluate()

    return eval_results

# Main execution
def main():
    """Run the XLM-R pipeline end to end: load, split, tokenize, fine-tune, report."""
    corpus, targets, label_encoder = load_dataset(dataset_path)

    # 80/20 hold-out; the fixed seed makes the partition repeatable.
    split = train_test_split(corpus, targets, test_size=0.2, random_state=42)
    train_texts, eval_texts, train_labels, eval_labels = split

    # One tokenizer instance is shared by both dataset wrappers.
    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
    train_dataset = BanglaSentimentDataset(train_texts, train_labels, tokenizer)
    eval_dataset = BanglaSentimentDataset(eval_texts, eval_labels, tokenizer)

    eval_results = train_and_evaluate(train_dataset, eval_dataset, './results_xlmr')

    # Report the final evaluation metrics.
    print("\n=== XLM-R Results ===")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1-Score: {eval_results['eval_f1']:.4f}")


if __name__ == "__main__":
    main()

Dataset size: 6148 samples


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5557,0.731275,0.770732,0.746654,0.770732,0.757183
2,0.5279,0.609058,0.784553,0.758985,0.784553,0.771183


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5557,0.731275,0.770732,0.746654,0.770732,0.757183
2,0.5279,0.609058,0.784553,0.758985,0.784553,0.771183
3,0.5738,0.657824,0.796748,0.770777,0.796748,0.783247


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== XLM-R Results ===
Accuracy: 0.7967
Precision: 0.7708
Recall: 0.7967
F1-Score: 0.7832


# **MuRIL**

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os

In [None]:
# Define dataset path
dataset_path = '/content/sampled_10_percent_data.xlsx' # Update if needed

# Load and preprocess the dataset
def load_dataset(file_path):
    """Read the Excel sheet and return comment texts with integer-encoded tags.

    Rows with a missing comment or tag are dropped first; otherwise
    ``astype(str)`` would turn a missing comment into a literal placeholder
    string that is then trained on as if it were real text.

    Args:
        file_path: path of the Excel file; it must have 'comments' and 'tag' columns.

    Returns:
        (texts, encoded_labels, label_encoder): the list of comment strings, the
        integer codes produced by ``LabelEncoder.fit_transform``, and the fitted encoder.
    """
    df = pd.read_excel(file_path)
    df = df.dropna(subset=['comments', 'tag'])

    texts = df['comments'].astype(str).tolist()
    labels = df['tag'].tolist()

    # LabelEncoder assigns codes in sorted order of the distinct tag values, so the
    # mapping depends on the data; print it rather than assuming a fixed order.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    label_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}

    print(f"Dataset size: {len(texts)} samples")
    print(f"Label mapping: {label_mapping}")
    return texts, encoded_labels, label_encoder

# Custom Dataset class for PyTorch
class BanglaSentimentDataset(Dataset):
    """Torch dataset that tokenizes one raw comment per access (no normalization).

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to
    1-D) and 'labels' (a long tensor built from the matching label).
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        # Only references are stored; tokenization happens lazily in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # every sample is padded to the same length
            truncation=True,
            return_tensors='pt',
        )
        # MuRIL receives the raw text as stored.
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: precision is undefined for a class that is never predicted.
    # Scoring it as 0 explicitly yields the same numbers as the default but without
    # an UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to train and evaluate MuRIL
def train_and_evaluate(train_dataset, eval_dataset, output_dir,
                       model_name='google/muril-base-cased', num_labels=3):
    """Fine-tune a sequence-classification checkpoint with Trainer and evaluate it.

    Args:
        train_dataset: dataset handed to Trainer as ``train_dataset``.
        eval_dataset: dataset handed to Trainer as ``eval_dataset``.
        output_dir: directory passed to TrainingArguments as ``output_dir``.
        model_name: Hugging Face model id to fine-tune (defaults to MuRIL base).
        num_labels: size of the classification head (default 3: Positive, Negative, Neutral).

    Returns:
        The metrics returned by ``trainer.evaluate()`` after training.
    """
    # No tokenizer is loaded here: nothing in this function uses one.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Trainer places the model on the available device itself, so no manual .to(device).
    # fp16 mixed precision needs CUDA; hard-coding fp16=True makes TrainingArguments
    # fail on a CPU-only runtime, so enable it only when a GPU is present.
    use_fp16 = torch.cuda.is_available()

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4, # Reduced batch size
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
        warmup_steps=200, # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to='none', # Disable Weights & Biases
        fp16=use_fp16 # Mixed precision only when CUDA is available
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating...")
    eval_results = trainer.evaluate()

    return eval_results

# Main execution
def main():
    """Run the MuRIL pipeline: load, split, tokenize, fine-tune, report, back up."""
    import shutil  # local import: only needed for the optional Drive backup below

    # Load dataset
    texts, encoded_labels, label_encoder = load_dataset(dataset_path)

    # Split dataset (80/20 hold-out with a fixed seed).
    # NOTE(review): the split is not stratified; consider stratify=encoded_labels if
    # the classes are imbalanced -- confirm every class has at least two samples first.
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(
        texts, encoded_labels, test_size=0.2, random_state=42
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained('google/muril-base-cased')

    # Prepare datasets
    train_dataset = BanglaSentimentDataset(train_texts, train_labels, tokenizer)
    eval_dataset = BanglaSentimentDataset(eval_texts, eval_labels, tokenizer)

    # Train and evaluate
    output_dir = './results_muril'
    eval_results = train_and_evaluate(train_dataset, eval_dataset, output_dir)

    # Print results
    print("\n=== MuRIL Results ===")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1-Score: {eval_results['eval_f1']:.4f}")

    # Save results to Google Drive, but only if Drive is actually mounted; the
    # previous unconditional `cp` failed with "No such file or directory" otherwise.
    drive_dir = '/content/drive/MyDrive'
    if os.path.isdir(drive_dir):
        backup_dir = os.path.join(drive_dir, os.path.basename(output_dir))
        shutil.copytree(output_dir, backup_dir, dirs_exist_ok=True)
        print(f"Results copied to {backup_dir}")
    else:
        print(f"Google Drive is not mounted at {drive_dir}; skipping backup of {output_dir}.")

if __name__ == "__main__":
    main()

Dataset size: 6148 samples


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5764,0.608363,0.797561,0.771894,0.797561,0.784476
2,0.3932,0.575613,0.821138,0.794501,0.821138,0.807236
3,0.4052,0.602223,0.821138,0.794434,0.821138,0.807565


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== MuRIL Results ===
Accuracy: 0.8211
Precision: 0.7944
Recall: 0.8211
F1-Score: 0.8076
cp: cannot create directory '/content/drive/MyDrive/': No such file or directory


# **IndicBERT**

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os

In [None]:
# Define dataset path
dataset_path = '/content/sampled_10_percent_data.xlsx' # Update if needed

# Load and preprocess the dataset
def load_dataset(file_path):
    """Read the Excel sheet and return comment texts with integer-encoded tags.

    Rows with a missing comment or tag are dropped first; otherwise
    ``astype(str)`` would turn a missing comment into a literal placeholder
    string that is then trained on as if it were real text.

    Args:
        file_path: path of the Excel file; it must have 'comments' and 'tag' columns.

    Returns:
        (texts, encoded_labels, label_encoder): the list of comment strings, the
        integer codes produced by ``LabelEncoder.fit_transform``, and the fitted encoder.
    """
    df = pd.read_excel(file_path)
    df = df.dropna(subset=['comments', 'tag'])

    texts = df['comments'].astype(str).tolist()
    labels = df['tag'].tolist()

    # LabelEncoder assigns codes in sorted order of the distinct tag values, so the
    # mapping depends on the data; print it rather than assuming a fixed order.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    label_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}

    print(f"Dataset size: {len(texts)} samples")
    print(f"Label mapping: {label_mapping}")
    return texts, encoded_labels, label_encoder

# Custom Dataset class for PyTorch
class BanglaSentimentDataset(Dataset):
    """Torch dataset that tokenizes one raw comment per access (no normalization).

    Each item is a dict with 'input_ids', 'attention_mask' (both flattened to
    1-D) and 'labels' (a long tensor built from the matching label).
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        # Only references are stored; tokenization happens lazily in __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # every sample is padded to the same length
            truncation=True,
            return_tensors='pt',
        )
        # IndicBERT receives the raw text as stored.
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: precision is undefined for a class that is never predicted.
    # Scoring it as 0 explicitly yields the same numbers as the default but without
    # an UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to train and evaluate IndicBERT
def train_and_evaluate(train_dataset, eval_dataset, output_dir,
                       model_name='ai4bharat/indic-bert', num_labels=3):
    """Fine-tune a sequence-classification checkpoint and report eval metrics.

    Args:
        train_dataset: Dataset used for training; handed to ``Trainer`` as-is.
        eval_dataset: Dataset evaluated after every epoch and once more after
            training has finished.
        output_dir: Directory where ``Trainer`` writes its checkpoints.
        model_name: Hugging Face Hub id of the checkpoint to fine-tune.
            Defaults to IndicBERT.
        num_labels: Number of target classes for the classification head
            (3 = Positive, Negative, Neutral).

    Returns:
        dict: The metrics returned by ``trainer.evaluate()``; the values
        produced by ``compute_metrics`` appear under ``eval_``-prefixed keys.
    """
    # Only the model is loaded here: the datasets handed in are expected to
    # yield ready-made tensors, so no tokenizer is needed in this function.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Move model to GPU if available
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    model.to(device)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4, # Reduced batch size
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
        warmup_steps=200, # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to='none', # Disable Weights & Biases
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # would make the CPU fallback above unusable.
        fp16=use_cuda
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model (Trainer shows its own progress bar)
    print("Starting training...")
    trainer.train()

    # Evaluate the model
    print("Evaluating...")
    eval_results = trainer.evaluate()

    return eval_results

# Main execution
def main():
    """Run the IndicBERT experiment end to end and print its eval metrics."""
    # Read the workbook; the fitted encoder is returned but not used here.
    texts, encoded_labels, _encoder = load_dataset(dataset_path)

    # Hold out 20% of the samples for evaluation, with a fixed seed.
    split = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)
    train_texts, eval_texts, train_labels, eval_labels = split

    # Both datasets share one tokenizer instance.
    tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
    train_dataset, eval_dataset = (
        BanglaSentimentDataset(subset_texts, subset_labels, tokenizer)
        for subset_texts, subset_labels in (
            (train_texts, train_labels),
            (eval_texts, eval_labels),
        )
    )

    # Fine-tune, then collect the final evaluation metrics.
    eval_results = train_and_evaluate(train_dataset, eval_dataset, './results_indicbert')

    # Report the metrics with four decimals.
    print("\n=== IndicBERT Results ===")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1-Score: {eval_results['eval_f1']:.4f}")

# Entry point: run the whole IndicBERT pipeline when this code is executed as
# the main module, but not when it is imported.
if __name__ == "__main__":
    main()

Dataset size: 6148 samples


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6305,0.721856,0.696748,0.674322,0.696748,0.683758


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6305,0.721856,0.696748,0.674322,0.696748,0.683758
2,0.6351,0.614573,0.754472,0.734713,0.754472,0.738991
3,0.6134,0.612471,0.773171,0.748154,0.773171,0.759563


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== IndicBERT Results ===
Accuracy: 0.7732
Precision: 0.7482
Recall: 0.7732
F1-Score: 0.7596


# **DistilBERT Multilingual**

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import os

In [None]:
# Location of the Excel workbook that main() hands to load_dataset(); the
# workbook must provide the 'comments' and 'tag' columns read there.
dataset_path = '/content/sampled_10_percent_data.xlsx' # Update if needed

# Load and preprocess the dataset
def load_dataset(file_path):
    """Read the labelled comments workbook and integer-encode its tags.

    Rows whose ``comments`` or ``tag`` cell is empty are dropped before
    encoding. Without this, ``astype(str)`` would turn a missing comment into
    the literal text ``"nan"`` and feed it to the model as a real sample.

    Args:
        file_path: Path to an Excel workbook with ``comments`` and ``tag``
            columns.

    Returns:
        tuple: ``(texts, encoded_labels, label_encoder)`` where ``texts`` is a
        list of strings, ``encoded_labels`` holds one integer class id per
        text, and ``label_encoder`` is the fitted encoder (its ``classes_``
        attribute maps ids back to the original tags).
    """
    df = pd.read_excel(file_path)

    # Keep only fully labelled rows (see docstring for why).
    labelled = df.dropna(subset=['comments', 'tag'])
    texts = labelled['comments'].astype(str).tolist()
    labels = labelled['tag'].tolist()

    # LabelEncoder numbers the classes in sorted order of the unique tag
    # values, not in a hand-picked order; check label_encoder.classes_ for the
    # actual id -> tag mapping.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    return texts, encoded_labels, label_encoder

# Custom Dataset class for PyTorch
class BanglaSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one comment per lookup.

    Every item is a dict holding ``input_ids``, ``attention_mask`` and
    ``labels`` tensors. Texts are passed to the tokenizer unchanged (no
    normalization step for DistilBERT).
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        # Store the raw inputs; tokenization is deferred to __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # One sample per text.
        return len(self.texts)

    def __getitem__(self, idx):
        # Ask the tokenizer to pad/truncate to max_length and return tensors.
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # flatten() collapses each returned tensor to one dimension.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores); the argmax over the last axis
            of ``predictions`` is taken as the predicted class.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` scores.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # zero_division=0 yields the same scores as the default ("warn" also
    # substitutes 0) but stops an UndefinedMetricWarning from being emitted on
    # every evaluation in which some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Function to train and evaluate DistilBERT Multilingual
def train_and_evaluate(train_dataset, eval_dataset, output_dir,
                       model_name='distilbert-base-multilingual-cased', num_labels=3):
    """Fine-tune a sequence-classification checkpoint and report eval metrics.

    Args:
        train_dataset: Dataset used for training; handed to ``Trainer`` as-is.
        eval_dataset: Dataset evaluated after every epoch and once more after
            training has finished.
        output_dir: Directory where ``Trainer`` writes its checkpoints.
        model_name: Hugging Face Hub id of the checkpoint to fine-tune.
            Defaults to multilingual DistilBERT.
        num_labels: Number of target classes for the classification head
            (3 = Positive, Negative, Neutral).

    Returns:
        dict: The metrics returned by ``trainer.evaluate()``; the values
        produced by ``compute_metrics`` appear under ``eval_``-prefixed keys.
    """
    # Only the model is loaded here: the datasets handed in are expected to
    # yield ready-made tensors, so no tokenizer is needed in this function.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    # Move model to GPU if available
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    model.to(device)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4, # Reduced batch size
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
        warmup_steps=200, # Reduced warmup steps
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        report_to='none', # Disable Weights & Biases
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # would make the CPU fallback above unusable.
        fp16=use_cuda
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model (Trainer shows its own progress bar)
    print("Starting training...")
    trainer.train()

    # Evaluate the model
    print("Evaluating...")
    eval_results = trainer.evaluate()

    return eval_results

# Main execution
def main():
    """Run the DistilBERT Multilingual experiment end to end and print its eval metrics."""
    # Read the workbook; the fitted encoder is returned but not used here.
    texts, encoded_labels, _encoder = load_dataset(dataset_path)

    # Hold out 20% of the samples for evaluation, with a fixed seed.
    split = train_test_split(texts, encoded_labels, test_size=0.2, random_state=42)
    train_texts, eval_texts, train_labels, eval_labels = split

    # Both datasets share one tokenizer instance.
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    train_dataset, eval_dataset = (
        BanglaSentimentDataset(subset_texts, subset_labels, tokenizer)
        for subset_texts, subset_labels in (
            (train_texts, train_labels),
            (eval_texts, eval_labels),
        )
    )

    # Fine-tune, then collect the final evaluation metrics.
    eval_results = train_and_evaluate(train_dataset, eval_dataset, './results_distilbert')

    # Report the metrics with four decimals.
    print("\n=== DistilBERT Multilingual Results ===")
    print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
    print(f"Precision: {eval_results['eval_precision']:.4f}")
    print(f"Recall: {eval_results['eval_recall']:.4f}")
    print(f"F1-Score: {eval_results['eval_f1']:.4f}")

# Entry point: run the whole DistilBERT Multilingual pipeline when this code is
# executed as the main module, but not when it is imported.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4835,0.638862,0.76748,0.742429,0.76748,0.754734
2,0.4676,0.555348,0.793496,0.769634,0.793496,0.779235
3,0.3573,0.712135,0.810569,0.784092,0.810569,0.797077


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== DistilBERT Multilingual Results ===
Accuracy: 0.8106
Precision: 0.7841
Recall: 0.8106
F1-Score: 0.7971
