# Sentiment Analysis using IndoBERT
This notebook fine-tunes IndoBERT on a labelled dataset for a three-class sentiment analysis task (positive, neutral, negative). It was run on Google Colab with a T4 GPU.

## Install dependencies and import libraries

In [1]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloa

In [2]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): the dataset and output paths used later in this notebook live
# directly under /content, not /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer

# NOTE(review): `utils` is a project-local package that the pip cell above does not
# install; it must already be importable from the working directory — TODO confirm
# where it comes from and add a setup step for it.
from utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader
from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn

In [4]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator and the generators of all CUDA devices).

    Args:
        seed: Integer seed handed to each library.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one; per the PyTorch docs this
    # call is silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

def count_param(module, trainable=False):
    """Return the total number of scalar parameters in ``module``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When true, count only parameters whose ``requires_grad``
            flag is set; otherwise count all of them.

    Returns:
        The summed ``numel()`` of the selected parameters.
    """
    selected = module.parameters()
    if trainable:
        # Keep only the parameters that would receive gradients.
        selected = (param for param in selected if param.requires_grad)
    return sum(param.numel() for param in selected)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts
            that each carry an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are
        no parameter groups.
    """
    # Only the first group is consulted; later groups are never read.
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
    """Format a metric mapping as ``NAME:value`` pairs separated by spaces.

    Args:
        metric_dict: Mapping from metric name to a numeric value.

    Returns:
        A single string such as ``'ACC:0.91 F1:0.50'`` with every value
        rendered to two decimal places, in the mapping's iteration order.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [5]:
# Fix the random seed so that the fine-tuning run can be reproduced.
set_seed(2023)

In [7]:
# Pick the compute device once: the GPU when CUDA is available, otherwise the CPU.
# (`torch` is already imported in the import cell at the top of the notebook.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Import model and tokenizer

In [8]:
# Identifier of the pretrained checkpoint shared by the tokenizer, config and model.
MODEL_NAME = 'indobenchmark/indobert-base-p1'

# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): `config` is loaded here but is not passed to the model below and is
# not referenced again in the visible notebook — confirm whether it is still needed.
config = BertConfig.from_pretrained(MODEL_NAME)

# Instantiate the model with a 3-label classification head, without returning
# attention maps or hidden states, then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Import dataset

In [9]:
# Locations of the train / validation / test splits (.tsv files).
# NOTE(review): the files are expected directly under /content, and no cell in this
# notebook downloads or copies them there — presumably they must be uploaded to the
# Colab session before running; TODO confirm.
train_dataset_path = "/content/train_preprocess.tsv"
valid_dataset_path = "/content/valid_preprocess.tsv"
test_dataset_path = "/content/test_preprocess_masked_label.tsv"

In [10]:
# Wrap each split in a DocumentSentimentDataset that shares the tokenizer
# and lowercases the text.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

# Settings common to all three loaders; only the training loader shuffles.
loader_kwargs = dict(max_seq_len=512, batch_size=8, num_workers=2)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

In [11]:
# Label <-> index lookup tables defined on the dataset class.
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL

# Show both mappings, one per line.
print(w2i, i2w, sep='\n')

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Inference before finetuning

In [12]:
# Sanity check: classify one sentence with the model before any fine-tuning.
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one and place them on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: make sure dropout is off and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]

# Class probabilities for the single input, and the index of the top class.
probs = F.softmax(logits, dim=-1).squeeze()
label = logits.argmax(dim=-1).item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : negative (43.330%)


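The result is not what we expect: the sentence is clearly positive, but the model labels it as negative with only about 43% confidence. This is because the classification head is still randomly initialised at this point (see the warning printed when the model was loaded), so the model has to be fine-tuned before its predictions are meaningful.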

## Training
We fine-tune the model for 10 epochs using the Adam optimizer with a learning rate of 0.000003 (3e-6).

In [13]:
# Place the model on the selected device *before* building the optimizer, and use
# the notebook-wide `device` rather than a hard-coded `.cuda()` call so that this
# cell also runs on a CPU-only machine.
model = model.to(device)

# Adam over all model parameters with a learning rate of 3e-6.
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [14]:
# Fine-tune the model, evaluating on the validation set after every epoch.
# NOTE(review): in the recorded run the validation loss is lowest at epoch 2 (0.1774)
# and rises to 0.3076 by epoch 10 while the training loss keeps falling — consider
# early stopping or keeping the best checkpoint.
n_epochs = 10

# The forward helper is given the device as a plain string; `device.type` is
# "cuda" or "cpu", so this cell also runs when no GPU is available.
device_name = device.type

# Training needs autograd even if an earlier cell switched it off globally.
torch.set_grad_enabled(True)

start = time.time()
for epoch in range(n_epochs):
    # ---- Train for one epoch ----
    model.train()

    total_train_loss = 0
    n_train_batches = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for batch_data in train_pbar:
        # Forward pass; the last element of each batch is not handed to the helper.
        loss, batch_hyp, batch_label = forward_sequence_classification(
            model, batch_data[:-1], i2w=i2w, device=device_name)

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        n_train_batches += 1

        # Accumulate predictions and reference labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/n_train_batches, get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/max(n_train_batches, 1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Evaluate on validation ----
    model.eval()

    total_loss = 0
    n_valid_batches = 0
    list_hyp, list_label = [], []

    # Gradients are disabled only for the duration of the validation pass,
    # so no global autograd state leaks out of this cell.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for batch_data in pbar:
            loss, batch_hyp, batch_label = forward_sequence_classification(
                model, batch_data[:-1], i2w=i2w, device=device_name)

            # Calculate total loss
            total_loss += loss.item()
            n_valid_batches += 1

            # Metrics are recomputed over everything seen so far, for the progress bar
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(
                total_loss/n_valid_batches, metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/max(n_valid_batches, 1), metrics_to_string(metrics)))

stop = time.time()
print(f"\n\nTraining time: {stop - start}s")

(Epoch 1) TRAIN LOSS:0.2558 LR:0.00000300: 100%|██████████| 1375/1375 [03:13<00:00,  7.12it/s]


(Epoch 1) TRAIN LOSS:0.2558 ACC:0.91 F1:0.87 REC:0.86 PRE:0.89 LR:0.00000300


VALID LOSS:0.1801 ACC:0.93 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 158/158 [00:08<00:00, 18.08it/s]


(Epoch 1) VALID LOSS:0.1801 ACC:0.93 F1:0.91 REC:0.90 PRE:0.92


(Epoch 2) TRAIN LOSS:0.1306 LR:0.00000300: 100%|██████████| 1375/1375 [03:14<00:00,  7.07it/s]


(Epoch 2) TRAIN LOSS:0.1306 ACC:0.96 F1:0.95 REC:0.94 PRE:0.95 LR:0.00000300


VALID LOSS:0.1774 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93: 100%|██████████| 158/158 [00:08<00:00, 18.09it/s]


(Epoch 2) VALID LOSS:0.1774 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93


(Epoch 3) TRAIN LOSS:0.0828 LR:0.00000300: 100%|██████████| 1375/1375 [03:15<00:00,  7.04it/s]


(Epoch 3) TRAIN LOSS:0.0828 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97 LR:0.00000300


VALID LOSS:0.1892 ACC:0.94 F1:0.91 REC:0.91 PRE:0.91: 100%|██████████| 158/158 [00:08<00:00, 18.18it/s]


(Epoch 3) VALID LOSS:0.1892 ACC:0.94 F1:0.91 REC:0.91 PRE:0.91


(Epoch 4) TRAIN LOSS:0.0501 LR:0.00000300: 100%|██████████| 1375/1375 [03:14<00:00,  7.07it/s]


(Epoch 4) TRAIN LOSS:0.0501 ACC:0.99 F1:0.98 REC:0.98 PRE:0.99 LR:0.00000300


VALID LOSS:0.2486 ACC:0.93 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 158/158 [00:08<00:00, 17.71it/s]


(Epoch 4) VALID LOSS:0.2486 ACC:0.93 F1:0.91 REC:0.91 PRE:0.92


(Epoch 5) TRAIN LOSS:0.0352 LR:0.00000300: 100%|██████████| 1375/1375 [03:14<00:00,  7.07it/s]


(Epoch 5) TRAIN LOSS:0.0352 ACC:0.99 F1:0.99 REC:0.98 PRE:0.99 LR:0.00000300


VALID LOSS:0.2306 ACC:0.93 F1:0.91 REC:0.91 PRE:0.91: 100%|██████████| 158/158 [00:08<00:00, 17.80it/s]


(Epoch 5) VALID LOSS:0.2306 ACC:0.93 F1:0.91 REC:0.91 PRE:0.91


(Epoch 6) TRAIN LOSS:0.0187 LR:0.00000300: 100%|██████████| 1375/1375 [03:14<00:00,  7.05it/s]


(Epoch 6) TRAIN LOSS:0.0187 ACC:1.00 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000300


VALID LOSS:0.2692 ACC:0.93 F1:0.90 REC:0.89 PRE:0.92: 100%|██████████| 158/158 [00:08<00:00, 17.76it/s]


(Epoch 6) VALID LOSS:0.2692 ACC:0.93 F1:0.90 REC:0.89 PRE:0.92


(Epoch 7) TRAIN LOSS:0.0177 LR:0.00000300: 100%|██████████| 1375/1375 [03:14<00:00,  7.08it/s]


(Epoch 7) TRAIN LOSS:0.0177 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000300


VALID LOSS:0.2681 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 158/158 [00:08<00:00, 17.81it/s]


(Epoch 7) VALID LOSS:0.2681 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92


(Epoch 8) TRAIN LOSS:0.0126 LR:0.00000300: 100%|██████████| 1375/1375 [03:15<00:00,  7.05it/s]


(Epoch 8) TRAIN LOSS:0.0126 ACC:1.00 F1:1.00 REC:0.99 PRE:1.00 LR:0.00000300


VALID LOSS:0.2928 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91: 100%|██████████| 158/158 [00:09<00:00, 17.49it/s]


(Epoch 8) VALID LOSS:0.2928 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91


(Epoch 9) TRAIN LOSS:0.0100 LR:0.00000300: 100%|██████████| 1375/1375 [03:15<00:00,  7.03it/s]


(Epoch 9) TRAIN LOSS:0.0100 ACC:1.00 F1:1.00 REC:0.99 PRE:1.00 LR:0.00000300


VALID LOSS:0.3065 ACC:0.93 F1:0.90 REC:0.89 PRE:0.92: 100%|██████████| 158/158 [00:08<00:00, 17.72it/s]


(Epoch 9) VALID LOSS:0.3065 ACC:0.93 F1:0.90 REC:0.89 PRE:0.92


(Epoch 10) TRAIN LOSS:0.0098 LR:0.00000300: 100%|██████████| 1375/1375 [03:15<00:00,  7.05it/s]


(Epoch 10) TRAIN LOSS:0.0098 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000300


VALID LOSS:0.3076 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91: 100%|██████████| 158/158 [00:08<00:00, 17.66it/s]

(Epoch 10) VALID LOSS:0.3076 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91


Training time: 2037.5092387199402s





In [15]:
# Predict a label for every example in the test loader and write the result to pred.txt.
model.eval()

list_hyp = []

# Inference only: disable gradients just for this loop instead of switching
# autograd off globally.
with torch.no_grad():
    pbar = tqdm(test_loader, leave=True, total=len(test_loader))
    for batch_data in pbar:
        # Only the predicted labels are kept; loss and reference labels are discarded.
        # `device.type` is "cuda" or "cpu", matching the device selected earlier.
        _, batch_hyp, _ = forward_sequence_classification(
            model, batch_data[:-1], i2w=i2w, device=device.type)
        list_hyp += batch_hyp

# Save prediction: `reset_index()` turns the row number into an explicit `index` column
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

df.head()

100%|██████████| 63/63 [00:01<00:00, 39.13it/s]


Unnamed: 0,index,label
0,0,negative
1,1,negative
2,2,negative
3,3,negative
4,4,negative


## Inference after finetuning

In [16]:
# Classify the same sentence again, now with the fine-tuned model.
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
# Shape the token ids as a batch of one and place them on the model's device.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: make sure dropout is off and skip autograd bookkeeping,
# independent of the mode or autograd state left behind by earlier cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]

# Class probabilities for the single input, and the index of the top class.
probs = F.softmax(logits, dim=-1).squeeze()
label = logits.argmax(dim=-1).item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (99.980%)


After fine-tuning, the model classifies the same sentence correctly as positive, now with high confidence (99.98%).

## Save the model
Save the fine-tuned model and tokenizer, then load them back and repeat the prediction to confirm that the saved files work.

In [17]:
# Write the fine-tuned model and the tokenizer files to /content so that the
# next cell can reload them from the same directory.
# NOTE(review): presumably /content is the Colab session's local disk and is not
# kept after the session ends — copy the files to the mounted Drive to keep them;
# TODO confirm.
model.save_pretrained("/content")
tokenizer.save_pretrained("/content")

('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.txt',
 '/content/added_tokens.json')

In [18]:
# Reload the saved model and tokenizer from disk to check that they round-trip.
saved_model = BertForSequenceClassification.from_pretrained("/content")
saved_tokenizer = BertTokenizer.from_pretrained("/content")

text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = saved_tokenizer.encode(text)
# The reloaded model is not moved to another device here; the input is simply
# placed on whichever device the model was loaded onto.
subwords = torch.LongTensor(subwords).view(1, -1).to(saved_model.device)

# Inference only: make sure dropout is off and skip autograd bookkeeping.
saved_model.eval()
with torch.no_grad():
    logits = saved_model(subwords)[0]

# Class probabilities for the single input, and the index of the top class.
probs = F.softmax(logits, dim=-1).squeeze()
label = logits.argmax(dim=-1).item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (99.980%)
