<a href="https://colab.research.google.com/github/coder7475/sentiment_analysis_bangla/blob/main/BanglaBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BanglaBert Training

In [1]:
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create output directory
os.makedirs("models/baseline_models/banglabert_baseline", exist_ok=True)
logging.info("Output directory created: banglabert_baseline")

In [2]:
# Define paths
data_dir = "text_representation/"
files = {
    'input_ids': f"{data_dir}bert_input_ids.npy",
    'attention_masks': f"{data_dir}bert_attention_masks.npy",
    'labels_train': f"{data_dir}labels_train.csv",
    'labels_val': f"{data_dir}labels_val.csv",
    'split_indices': f"{data_dir}split_indices.csv"
}

In [3]:
files

{'input_ids': 'text_representation/bert_input_ids.npy',
 'attention_masks': 'text_representation/bert_attention_masks.npy',
 'labels_train': 'text_representation/labels_train.csv',
 'labels_val': 'text_representation/labels_val.csv',
 'split_indices': 'text_representation/split_indices.csv'}

In [4]:
# prompt: dowload text_representation folder from here: https://github.com/coder7475/sentiment_analysis_bangla/tree/main/text_representation

!wget https://github.com/coder7475/sentiment_analysis_bangla/raw/main/text_representation/bert_attention_masks.npy -P text_representation/
!wget https://github.com/coder7475/sentiment_analysis_bangla/raw/main/text_representation/bert_input_ids.npy -P text_representation/
!wget https://github.com/coder7475/sentiment_analysis_bangla/raw/main/text_representation/labels_train.csv -P text_representation/
!wget https://github.com/coder7475/sentiment_analysis_bangla/raw/main/text_representation/labels_val.csv -P text_representation/
!wget https://github.com/coder7475/sentiment_analysis_bangla/raw/main/text_representation/split_indices.csv -P text_representation/

--2025-06-18 09:34:20--  https://github.com/coder7475/sentiment_analysis_bangla/raw/main/text_representation/bert_attention_masks.npy
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/coder7475/sentiment_analysis_bangla/main/text_representation/bert_attention_masks.npy [following]
--2025-06-18 09:34:20--  https://raw.githubusercontent.com/coder7475/sentiment_analysis_bangla/main/text_representation/bert_attention_masks.npy
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7928960 (7.6M) [application/octet-stream]
Saving to: ‘text_representation/bert_attention_masks.npy’


2025-06-18 09:34:21 (79.1 M

In [5]:
# Check file existence
for name, path in files.items():
    if not os.path.exists(path):
        logging.error(f"Missing file: {path}")
        raise FileNotFoundError(f"Missing file: {path}")


In [6]:
import numpy as np
import pandas as pd

# Load data
input_ids = np.load(files['input_ids'])
attention_masks = np.load(files['attention_masks'])

y_train = pd.read_csv(files['labels_train'], encoding='utf-8')['Label'].values
y_val = pd.read_csv(files['labels_val'], encoding='utf-8')['Label'].values

logging.info("Data loaded successfully")

In [7]:
# Load Split indices
split_df = pd.read_csv(files['split_indices'])
train_idx = split_df[split_df['Split'] == 'Train']['Index'].values
val_idx = split_df[split_df['Split'] == 'Val']['Index'].values

In [8]:
train_idx

array([2384, 5908, 6026, ..., 1990, 7527, 7718])

In [9]:
import torch

# Create Custom PyTorch Dataset for sentiment analysis
# that returns input_ids, attention_mask, and labels as tensors
class SentimentDataset(torch.utils.data.Dataset):
    # Constructor
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    # Methods
    def __len__(self): return len(self.labels)  # length of labels - dataset

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [10]:
# Prepare datasets
train_dataset = SentimentDataset(input_ids[train_idx], attention_masks[train_idx], y_train)
val_dataset = SentimentDataset(input_ids[val_idx], attention_masks[val_idx], y_val)

logging.info("Datasets prepared")

In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Initialize model
model = AutoModelForSequenceClassification.from_pretrained(
        "sagorsarker/bangla-bert-base", num_labels=3
    )

logging.info("BanglaBERT model initialized")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from sklearn.metrics import precision_recall_fscore_support

# Metrics computer
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {'f1': f1}

In [21]:
# Training arguments
training_args = TrainingArguments(
        output_dir="./banglabert_results",
        num_train_epochs=10,  # Increased to find optimal epochs
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,  # Keep only last 3 checkpoints
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_on_each_node=True,
        logging_strategy="epoch",
    )

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
logging.info("BanglaBERT training completed")

Epoch,Training Loss,Validation Loss,F1
1,0.2133,1.831763,0.568034
2,0.3881,1.307234,0.585798
3,0.2953,1.649559,0.551225
4,0.2166,1.857975,0.568547
5,0.1748,2.114043,0.589016
6,0.1486,2.402265,0.567921
7,0.1224,2.544892,0.567182
8,0.1044,2.690192,0.577315
9,0.0837,2.744723,0.569236
10,0.0706,2.819317,0.575391


In [24]:
trainer.save_model("/content/drive/MyDrive/models/baseline_models/banglabert_baseline")
logging.info("BanglaBERT model saved: models/baseline_models/banglabert_baseline")