In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive so the dataset CSV stored there can be read.
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/Bengali Dataset.csv'
data = pd.read_csv(file_path)

# Model inputs come from the 'Text' column, targets from the 'Humor' column.
texts = data['Text'].tolist()
labels = data['Humor'].tolist()

# Map every distinct label to an integer class id.
# The distinct labels are sorted before being numbered: iterating a bare
# set() has no guaranteed order (string hashes are randomised per process),
# so without sorting the same label could get a different id in every kernel
# session. Sorting by (type name, value) keeps the natural order within one
# type and never compares values of different types with each other.
distinct_labels = sorted(set(labels), key=lambda value: (type(value).__name__, value))
label_dict = {label: idx for idx, label in enumerate(distinct_labels)}
numeric_labels = [label_dict[label] for label in labels]

# Hold out 10% of the data for testing, then 10% of what remains for
# validation. Both splits use a fixed random_state so they are repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, numeric_labels, test_size=0.1, random_state=42
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Tokenizer matching the 'bert-base-multilingual-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the train and test texts with identical settings: truncation and
# padding enabled, at most 128 tokens, results returned as PyTorch tensors.
# NOTE(review): val_texts / val_labels are not encoded in the code visible
# here -- confirm whether the validation split is used elsewhere.
train_encodings = tokenizer(
    train_texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
test_encodings = tokenizer(
    test_texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Replace the integer label lists with tensors so they can be sliced per batch.
train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)

# Seed PyTorch before the model is built. The classification head is not part
# of the pretrained checkpoint and is initialised randomly, and the network
# contains dropout layers, so without a seed every run starts from different
# weights and reports different numbers. 42 matches the random_state used for
# the data splits.
torch.manual_seed(42)

# Pretrained multilingual BERT with a classification head sized to the number
# of distinct labels.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_dict))

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
# Training hyperparameters.
epochs = 20
batch_size = 64
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)

# One entry per epoch: fraction of training samples classified correctly.
train_accuracy_values = []
num_train_samples = len(train_encodings['input_ids'])

for epoch in range(epochs):
    model.train()
    # Count correct predictions and samples over the whole epoch. Averaging
    # per-batch accuracies instead would give the final, shorter batch the
    # same weight as a full one and bias the reported figure.
    correct_predictions = 0
    samples_seen = 0

    # NOTE(review): batches are taken in the same order every epoch (no
    # shuffling) -- confirm this is intended.
    for start in range(0, num_train_samples, batch_size):
        stop = start + batch_size
        optimizer.zero_grad()
        batch_input = {key: val[start:stop].to(device) for key, val in train_encodings.items()}
        # Named batch_labels rather than labels so the raw label list created
        # when the dataset was loaded is not overwritten by a batch tensor.
        batch_labels = train_labels[start:stop].to(device)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(**batch_input, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accuracy is measured on the same forward pass used for the update,
        # i.e. with the model in training mode.
        predictions = torch.argmax(outputs.logits, dim=1)
        correct_predictions += torch.sum(predictions == batch_labels).item()
        samples_seen += len(predictions)

    # An empty training set yields NaN instead of a ZeroDivisionError.
    epoch_train_accuracy = correct_predictions / samples_seen if samples_seen else float('nan')
    train_accuracy_values.append(epoch_train_accuracy)
    print(f"Epoch {epoch+1} - Train Accuracy: {epoch_train_accuracy}")

# Test Evaluation
# The test set is scored in chunks of batch_size instead of one forward pass
# over everything, so memory use stays bounded however large the test set is.
model.eval()
num_test_samples = len(test_encodings['input_ids'])
batch_predictions = []
with torch.no_grad():
    for start in range(0, num_test_samples, batch_size):
        test_input = {key: val[start:start + batch_size].to(device) for key, val in test_encodings.items()}
        outputs = model(**test_input)
        # Move each batch's predicted class ids back to the CPU right away.
        batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())
predictions = torch.cat(batch_predictions).numpy()

# sklearn works on array-likes; hand it a plain NumPy array of the true labels.
true_labels = test_labels.cpu().numpy()

# Precision, recall and F1 are averaged over classes weighted by class support.
test_accuracy = accuracy_score(true_labels, predictions)
test_precision = precision_score(true_labels, predictions, average='weighted')
test_f1 = f1_score(true_labels, predictions, average='weighted')
test_recall = recall_score(true_labels, predictions, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test F1 Score: {test_f1}")
print(f"Test Recall Score: {test_recall}")


Epoch 1 - Train Accuracy: 0.3967013888888889
Epoch 2 - Train Accuracy: 0.532986111111111
Epoch 3 - Train Accuracy: 0.532986111111111
Epoch 4 - Train Accuracy: 0.5277777777777778
Epoch 5 - Train Accuracy: 0.532986111111111
Epoch 6 - Train Accuracy: 0.48003472222222227
Epoch 7 - Train Accuracy: 0.5434027777777778
Epoch 8 - Train Accuracy: 0.6145833333333334
Epoch 9 - Train Accuracy: 0.59375
Epoch 10 - Train Accuracy: 0.6432291666666666
Epoch 11 - Train Accuracy: 0.625
Epoch 12 - Train Accuracy: 0.6328125
Epoch 13 - Train Accuracy: 0.6614583333333334
Epoch 14 - Train Accuracy: 0.7005208333333334
Epoch 15 - Train Accuracy: 0.7239583333333334
Epoch 16 - Train Accuracy: 0.7421875
Epoch 17 - Train Accuracy: 0.765625
Epoch 18 - Train Accuracy: 0.78125
Epoch 19 - Train Accuracy: 0.7994791666666666
Epoch 20 - Train Accuracy: 0.828125
Test Accuracy: 0.7
Test Precision: 0.7
Test F1 Score: 0.7
Test Recall Score: 0.7
