In [None]:
!pip install tokenizers==0.20.0
!pip install transformers==4.41.2

Collecting tokenizers==0.20.0
  Downloading tokenizers-0.20.0-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading tokenizers-0.20.0-cp312-none-win_amd64.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ------------------------------- -------- 1.8/2.3 MB 14.3 MB/s eta 0:00:01
   ---------------------------------------- 2.3/2.3 MB 9.4 MB/s eta 0:00:00
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
Successfully installed tokenizers-0.20.0
Collecting transformers==4.41.2
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Using cached tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Using cached tokenizers-0.19.1-cp312-none-win_amd64.whl (

In [36]:
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import Dataset, DatasetDict
from collections.abc import Iterable
import pandas as pd
import torch
import ast

In [40]:
tag_list_path = '../../data/wordlist/tag.txt'
financeNer_path = '../../data/FinanceNewsNER.csv'

# Dataset Prepare

In [41]:
df_financeNer = pd.read_csv(financeNer_path, index_col=0)
# Parsing ulang kolom yang seharusnya berupa list
df_financeNer['tokens'] = df_financeNer['tokens'].apply(ast.literal_eval)
df_financeNer['ner_tags'] = df_financeNer['ner_tags'].apply(ast.literal_eval)

In [42]:
df_financeNer.head(5)

Unnamed: 0,tokens,ner_tags
0,"[Tantangan, Ketimpangan, Ekonomi]","[O, B-ECONOMIC_INDICATOR, I-ECONOMIC_INDICATOR]"
1,"[Doktor, ekonomi, dari, UNU-MERIT, Maastricht,...","[O, O, O, O, O, O, O]"
2,"[Alumni, generasi, pertama, beasiswa, LPDP, ma...","[O, O, O, O, O, I-PRD]"
3,"[Pernah, bekerja, di, ASEAN, Secretariat, Indo...","[O, O, O, B-NOR, O, O, O, O, O, I-NOR]"
4,"[Saat, ini, berkiprah, sebagai, akademisi, pen...","[O, O, O, O, O, O, O, O]"


In [43]:
unique_tags = set(tag for tags in df_financeNer["ner_tags"] for tag in tags)

label_to_id = {"O": 0}
id_to_label = {0: "O"}
current_id = 1

for tag in unique_tags:
    if tag != 'O':
        label_to_id[tag] = current_id
        id_to_label[current_id] = tag
        current_id += 1

In [44]:
df_financeNer['ner_tags'] = [list(map(lambda x: label_to_id.get(x, -1), ner_tag)) for ner_tag in df_financeNer['ner_tags']]

In [46]:
df_train, df_test = train_test_split(df_financeNer,test_size=0.2, random_state=42)

In [47]:
df_train.to_csv('../../data/train.csv')

In [48]:
df_test.to_csv('../../data/test.csv')

# Modelling

## indobenchmark/indobert-base-p1

In [14]:
# Create label mapping
unique_tags = set(tag for tags in df_financeNer["ner_tags"] for tag in tags)
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [None]:
# Load tokenizer and model
model_name = "model_name"
tokenizer = BertTokenizerFast.from_pretrained(model_name, use_fast=True) # Added use_fast=True to load the fast tokenizer
model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(unique_tags))  # Adjust num_labels based on your dataset

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Pad to the maximum length in the batch
        is_split_into_words=True,
        max_length=128,  # Set a maximum sequence length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Check if word_idx is within the bounds of the original label
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    # Handle cases where word_idx is out of bounds
                    label_ids.append(-100)  # Or another appropriate value
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        # Ensure label_ids and input_ids have the same length
        # by padding or truncating label_ids if necessary
        label_ids = label_ids[:len(tokenized_inputs["input_ids"][i])]  # Truncate if longer
        label_ids += [-100] * (len(tokenized_inputs["input_ids"][i]) - len(label_ids))  # Pad if shorter

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(df_financeNer)
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split dataset
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # evaluation_strategy="epoch",  # Sama dengan save_strategy
    # save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    # load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    }
)

# Train and evaluate
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,1.1675
20,0.6759
30,0.4226
40,0.3828
50,0.3544
60,0.3261
70,0.3257
80,0.2694
90,0.2664
100,0.226


KeyError: tensor(0)

In [41]:
from sklearn.metrics import classification_report

# Evaluate on the test set
predictions, labels, _ = trainer.predict(datasets["test"])
predictions = torch.argmax(torch.tensor(predictions), dim=-1)

# Flatten lists to compare label-wise, handling padding tokens correctly
true_tags_flat = []
predicted_tags_flat = []
for label, prediction in zip(labels, predictions):
    for true_label, predicted_label in zip(label, prediction):
        if true_label != -100:  # Only consider non-padding tokens
            true_tags_flat.append(true_label.item())  # Convert tensor to int
            predicted_tags_flat.append(predicted_label.item())  # Convert tensor to int

# Print classification report using IDs directly
print(classification_report(true_tags_flat, predicted_tags_flat))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      9357
           1       0.57      0.26      0.36        77
           2       1.00      0.98      0.99        65
           3       0.52      0.30      0.38        37
           4       0.91      0.82      0.86        62
           5       0.84      0.82      0.83        96
           6       0.96      0.88      0.92        26
           7       1.00      1.00      1.00         1
           8       0.81      0.73      0.77        79
           9       0.93      0.90      0.92        61
          10       0.70      0.60      0.65        35
          11       0.82      0.58      0.68        24
          12       0.81      0.52      0.63        25
          13       0.85      0.38      0.52        29
          14       0.80      0.73      0.76        89
          15       0.83      0.73      0.78        62
          16       0.71      0.67      0.69        18
          17       1.00    

## indobenchmark/indobert-base-p2

In [18]:
import torch

# Check if CUDA is available and set the device
device = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
# Load tokenizer and model
model2_name = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizerFast.from_pretrained(model2_name, use_fast=True)
model = BertForTokenClassification.from_pretrained(model2_name, num_labels=len(unique_tags))

# Ensure the model is moved to the device (GPU or CPU)
model.to(device)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Pad to the maximum length in the batch
        is_split_into_words=True,
        max_length=128,  # Set a maximum sequence length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        label_ids = label_ids[:len(tokenized_inputs["input_ids"][i])]
        label_ids += [-100] * (len(tokenized_inputs["input_ids"][i]) - len(label_ids))  # Pad if shorter

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(df_financeNer)
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split dataset
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})

# Training arguments with CUDA enabled
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    # Add this to ensure CUDA is used if available
    no_cuda=False if device == "cuda" else True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    }
)

# Train and evaluate
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfadhilahhilmi04[0m ([33mfadhilahhilmi04-brawijaya-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.0054
20,0.5053
30,0.4283
40,0.3855
50,0.326
60,0.3214
70,0.2683
80,0.3616
90,0.3291
100,0.3222


TrainOutput(global_step=843, training_loss=0.14200700598537994, metrics={'train_runtime': 207.0769, 'train_samples_per_second': 32.466, 'train_steps_per_second': 4.071, 'total_flos': 439261875984384.0, 'train_loss': 0.14200700598537994, 'epoch': 3.0})

In [22]:
# Evaluate on the test set
predictions, labels, _ = trainer.predict(datasets["test"])
predictions = torch.argmax(torch.tensor(predictions), dim=-1)

# Flatten lists to compare label-wise, handling padding tokens correctly
true_tags_flat = []
predicted_tags_flat = []
for label, prediction in zip(labels, predictions):
    for true_label, predicted_label in zip(label, prediction):
        if true_label != -100:  # Only consider non-padding tokens
            true_tags_flat.append(true_label.item())  # Convert tensor to int
            predicted_tags_flat.append(predicted_label.item())  # Convert tensor to int

# Print classification report using IDs directly
print(classification_report(true_tags_flat, predicted_tags_flat))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      9574
           1       0.83      0.62      0.71        16
           2       0.51      0.33      0.40        66
           3       0.79      0.67      0.72        33
           4       0.90      0.72      0.80        25
           5       0.73      0.73      0.73        30
           6       0.80      0.80      0.80        66
           7       0.62      0.38      0.47        47
           8       0.56      0.38      0.46        47
           9       0.50      0.43      0.46         7
          10       0.86      0.86      0.86        65
          11       0.78      0.47      0.58        15
          12       0.89      0.67      0.76        12
          13       1.00      1.00      1.00        57
          14       0.96      0.95      0.96        84
          15       0.99      0.99      0.99       140
          16       0.91      0.52      0.67        61
          17       0.82    

## indolem/indobert-base-uncased

In [23]:
# Load tokenizer and model
model3_name = "indolem/indobert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model3_name, use_fast=True)
model = BertForTokenClassification.from_pretrained(model3_name, num_labels=len(unique_tags))

# Ensure the model is moved to the device (GPU or CPU)
model.to(device)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Pad to the maximum length in the batch
        is_split_into_words=True,
        max_length=128,  # Set a maximum sequence length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        label_ids = label_ids[:len(tokenized_inputs["input_ids"][i])]
        label_ids += [-100] * (len(tokenized_inputs["input_ids"][i]) - len(label_ids))  # Pad if shorter

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(df_financeNer)
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split dataset
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})

# Training arguments with CUDA enabled
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    # Add this to ensure CUDA is used if available
    no_cuda=False if device == "cuda" else True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    }
)

# Train and evaluate
trainer.train()

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,1.0711
20,0.8309
30,0.6657
40,0.7414
50,0.7145
60,0.6848
70,0.7264
80,0.4769
90,0.6077
100,0.4473


TrainOutput(global_step=843, training_loss=0.29613929329393596, metrics={'train_runtime': 193.1204, 'train_samples_per_second': 34.812, 'train_steps_per_second': 4.365, 'total_flos': 439261875984384.0, 'train_loss': 0.29613929329393596, 'epoch': 3.0})

In [24]:
# Evaluate on the test set
predictions, labels, _ = trainer.predict(datasets["test"])
predictions = torch.argmax(torch.tensor(predictions), dim=-1)

# Flatten lists to compare label-wise, handling padding tokens correctly
true_tags_flat = []
predicted_tags_flat = []
for label, prediction in zip(labels, predictions):
    for true_label, predicted_label in zip(label, prediction):
        if true_label != -100:  # Only consider non-padding tokens
            true_tags_flat.append(true_label.item())  # Convert tensor to int
            predicted_tags_flat.append(predicted_label.item())  # Convert tensor to int

# Print classification report using IDs directly
print(classification_report(true_tags_flat, predicted_tags_flat))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      9357
           1       0.00      0.00      0.00        15
           2       0.33      0.16      0.21        77
           3       0.58      0.28      0.38        25
           4       0.72      0.81      0.76        26
           5       0.68      0.62      0.65        24
           6       0.61      0.71      0.65        89
           7       0.64      0.19      0.29        37
           8       0.38      0.51      0.44        35
           9       0.00      0.00      0.00        11
          10       0.83      0.82      0.83        61
          11       0.25      0.07      0.11        14
          12       0.57      0.22      0.32        18
          13       0.90      0.92      0.91        65
          14       0.86      0.86      0.86        74
          15       0.98      0.98      0.98       121
          16       0.81      0.68      0.74        79
          17       0.75    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## LazarusNLP/NusaBERT-large

In [25]:
# Load tokenizer and model
model4_name = "LazarusNLP/NusaBERT-large"
tokenizer = BertTokenizerFast.from_pretrained(model4_name, use_fast=True)
model = BertForTokenClassification.from_pretrained(model4_name, num_labels=len(unique_tags))

# Ensure the model is moved to the device (GPU or CPU)
model.to(device)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Pad to the maximum length in the batch
        is_split_into_words=True,
        max_length=128,  # Set a maximum sequence length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        label_ids = label_ids[:len(tokenized_inputs["input_ids"][i])]
        label_ids += [-100] * (len(tokenized_inputs["input_ids"][i]) - len(label_ids))  # Pad if shorter

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(df_financeNer)
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split dataset
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})

# Training arguments with CUDA enabled
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    # Add this to ensure CUDA is used if available
    no_cuda=False if device == "cuda" else True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    }
)

# Train and evaluate
trainer.train()

tokenizer_config.json:   0%|          | 0.00/261k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/984k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/28.9k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at LazarusNLP/NusaBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,1.1065
20,0.6904
30,0.4544
40,0.4136
50,0.3588
60,0.364
70,0.3527
80,0.2827
90,0.272
100,0.2369


TrainOutput(global_step=843, training_loss=0.16122567204676772, metrics={'train_runtime': 630.8648, 'train_samples_per_second': 10.657, 'train_steps_per_second': 1.336, 'total_flos': 1561040708622336.0, 'train_loss': 0.16122567204676772, 'epoch': 3.0})

In [26]:
# Evaluate on the test set
predictions, labels, _ = trainer.predict(datasets["test"])
predictions = torch.argmax(torch.tensor(predictions), dim=-1)

# Flatten lists to compare label-wise, handling padding tokens correctly
true_tags_flat = []
predicted_tags_flat = []
for label, prediction in zip(labels, predictions):
    for true_label, predicted_label in zip(label, prediction):
        if true_label != -100:  # Only consider non-padding tokens
            true_tags_flat.append(true_label.item())  # Convert tensor to int
            predicted_tags_flat.append(predicted_label.item())  # Convert tensor to int

# Print classification report using IDs directly
print(classification_report(true_tags_flat, predicted_tags_flat))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9357
           1       0.80      0.53      0.64        15
           2       0.66      0.25      0.36        77
           3       0.81      0.52      0.63        25
           4       0.96      0.85      0.90        26
           5       0.73      0.67      0.70        24
           6       0.77      0.78      0.77        89
           7       0.67      0.22      0.33        37
           8       0.53      0.66      0.59        35
           9       0.60      0.27      0.38        11
          10       0.91      0.84      0.87        61
          11       0.60      0.86      0.71        14
          12       0.62      0.72      0.67        18
          13       0.97      0.98      0.98        65
          14       0.93      0.93      0.93        74
          15       0.99      1.00      1.00       121
          16       0.85      0.63      0.72        79
          17       0.90    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## indobenchmark/indobert-large-p1

In [27]:
# Load tokenizer and model
model5_name = "indobenchmark/indobert-large-p1"
tokenizer = BertTokenizerFast.from_pretrained(model5_name, use_fast=True)
model = BertForTokenClassification.from_pretrained(model5_name, num_labels=len(unique_tags))

# Ensure the model is moved to the device (GPU or CPU)
model.to(device)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Pad to the maximum length in the batch
        is_split_into_words=True,
        max_length=128,  # Set a maximum sequence length
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        label_ids = label_ids[:len(tokenized_inputs["input_ids"][i])]
        label_ids += [-100] * (len(tokenized_inputs["input_ids"][i]) - len(label_ids))  # Pad if shorter

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(df_financeNer)
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split dataset
train_test_split = dataset.train_test_split(test_size=0.2)
datasets = DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})

# Training arguments with CUDA enabled
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    metric_for_best_model="accuracy",
    # Add this to ensure CUDA is used if available
    no_cuda=False if device == "cuda" else True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=lambda p: {
        "accuracy": (p.predictions.argmax(-1) == p.label_ids).mean()
    }
)

# Train and evaluate
trainer.train()

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,1.1552
20,0.7027
30,0.4684
40,0.4107
50,0.3513
60,0.3452
70,0.3663
80,0.2787
90,0.2686
100,0.2248


TrainOutput(global_step=843, training_loss=0.16007480603308016, metrics={'train_runtime': 700.2469, 'train_samples_per_second': 9.601, 'train_steps_per_second': 1.204, 'total_flos': 1561040708622336.0, 'train_loss': 0.16007480603308016, 'epoch': 3.0})

In [28]:
# Evaluate on the test set
predictions, labels, _ = trainer.predict(datasets["test"])
predictions = torch.argmax(torch.tensor(predictions), dim=-1)

# Flatten lists to compare label-wise, handling padding tokens correctly
true_tags_flat = []
predicted_tags_flat = []
for label, prediction in zip(labels, predictions):
    for true_label, predicted_label in zip(label, prediction):
        if true_label != -100:  # Only consider non-padding tokens
            true_tags_flat.append(true_label.item())  # Convert tensor to int
            predicted_tags_flat.append(predicted_label.item())  # Convert tensor to int

# Print classification report using IDs directly
print(classification_report(true_tags_flat, predicted_tags_flat))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9357
           1       0.78      0.47      0.58        15
           2       0.58      0.18      0.28        77
           3       0.88      0.60      0.71        25
           4       0.89      0.92      0.91        26
           5       0.83      0.62      0.71        24
           6       0.82      0.73      0.77        89
           7       0.62      0.22      0.32        37
           8       0.54      0.71      0.62        35
           9       0.75      0.27      0.40        11
          10       0.91      0.84      0.87        61
          11       0.65      0.79      0.71        14
          12       0.76      0.72      0.74        18
          13       0.98      0.98      0.98        65
          14       0.92      0.96      0.94        74
          15       0.99      0.98      0.99       121
          16       0.85      0.70      0.76        79
          17       0.87    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
