In [13]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup

In [14]:
# Load the IndoBERT tokenizer from the checkpoint identifier below
bert_tokenizer = AutoTokenizer.from_pretrained('indonesia-bert-sentiment-classification')

# A sample sentence
sample_text = "Saya sangat senang dengan layanan BCA."

# --- Inspection Step 1: Tokenize ---
# Split the raw string into vocabulary tokens; special tokens are not added here.
tokens = bert_tokenizer.tokenize(sample_text)
print(f"Sentence: {sample_text}")
print(f"Tokens: {tokens}")

# --- Inspection Step 2: Convert to IDs ---
# Map each token string to its integer ID in the tokenizer's vocabulary.
input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
print(f"Input IDs: {input_ids}")

# Calling the tokenizer directly does both steps at once and adds special tokens.
# (`encode_plus` is deprecated in favour of the tokenizer's `__call__`.)
# padding='max_length' pads the result up to max_length=12 IDs, and
# truncation=True cuts longer inputs down to that same length.
encoding = bert_tokenizer(sample_text, max_length=12, padding='max_length', truncation=True)
print("\nFull Encoding (with special tokens [CLS] and [SEP]):")
print(encoding['input_ids'])

Sentence: Saya sangat senang dengan layanan BCA.
Tokens: ['saya', 'sangat', 'senang', 'dengan', 'layanan', 'bca', '.']
Input IDs: [209, 310, 3000, 79, 1629, 8320, 30470]

Full Encoding (with special tokens [CLS] and [SEP]):
[2, 209, 310, 3000, 79, 1629, 8320, 30470, 3, 0, 0, 0]


In [15]:
# Instantiate the sequence-classification model from the checkpoint identifier
# (the same identifier string the tokenizer cell uses).
bert_model = AutoModelForSequenceClassification.from_pretrained(
    'indonesia-bert-sentiment-classification'
)

# Dump the module hierarchy so each layer and its dimensions can be inspected
print(bert_model)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,