# BERT Fine Tuning

# 01 Install & Import Library

In [None]:
!pip install datasets

In [None]:
!pip install transformers datasets

In [None]:
!pip install huggingface_hub

In [None]:
# Authenticate against the Hugging Face Hub through the interactive notebook prompt.
# Type the access token into the prompt only; never paste it into a cell or keep it
# in saved output.
from huggingface_hub import notebook_login
notebook_login()

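Token: [REDACTED — a Hugging Face access token was stored here in plain text; revoke it and enter tokens only through the login prompt or an environment variable]

[REDACTED — a second Hugging Face access token was stored here; revoke it as well]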

# 02 Dataset


In [None]:
from datasets import load_dataset

dataset = load_dataset("Davlan/conll2003_noMISC")

# Show, one per line: the overview of the loaded splits, the first example of
# the "train" split, and the Python type of that example.
print(dataset, dataset["train"][0], type(dataset["train"][0]), sep="\n")

In [None]:
# Display the first example of the "train" split as the cell output.
dataset['train'][0]

In [None]:
import pandas as pd

In [None]:
# Count how often each tag occurs in the "train" split.
# Every entry of the 'ner_tags' column is the list of tags of one sentence.
# Lists are unhashable, so calling value_counts() on the raw column raises
# TypeError; the tags are therefore flattened into one sequence first.
label_data = dataset['train']['ner_tags']
label_counts = pd.Series(
    [tag for sentence_tags in label_data for tag in sentence_tags]
).value_counts()

# Print the label counts
print(label_counts)

In [None]:
# Display the second example (index 1) of the "train" split as the cell output.
dataset['train'][1]

In [None]:
# Per-sentence tag lists of the "train" split.
label_data = dataset['train']['ner_tags']

# Concatenate the per-sentence lists into one flat list of tags.
all_labels = []
for sublist in label_data:
    all_labels.extend(sublist)

# Tally how many times each tag occurs.
label_counts = pd.Series(all_labels).value_counts()

# Last expression of the cell, so the counts are rendered as the cell output.
label_counts

# Preprocessing

In [None]:
# Sanity check: every example of the "train" split must carry exactly one tag
# per token. Iterating the split fetches each example once, instead of indexing
# the dataset twice per iteration.
for i, example in enumerate(dataset['train']):
    words = example['tokens']
    labels = example['ner_tags']
    if len(words) != len(labels):
        print(f"Mismatch found in example {i}:")
        print("Words:", words)
        print("Labels:", labels)
        print("-" * 20)

In [None]:
# Sanity check: every example of the "test" split must carry exactly one tag
# per token. Iterating the split fetches each example once, instead of indexing
# the dataset twice per iteration.
for i, example in enumerate(dataset['test']):
    words = example['tokens']
    labels = example['ner_tags']
    if len(words) != len(labels):
        print(f"Mismatch found in example {i}:")
        print("Words:", words)
        print("Labels:", labels)
        print("-" * 20)

In [None]:
# Collect every distinct tag that occurs in the "train" split.
unique_labels = set()
for seq in dataset["train"]["ner_tags"]:
    unique_labels.update(seq)

# Number the tags in sorted order and keep the lookup in both directions:
# label2id maps tag -> integer id, id2label maps integer id -> tag.
label2id = {}
id2label = {}
for i, label in enumerate(sorted(unique_labels)):
    label2id[label] = i
    id2label[i] = label

In [None]:
# Apply the tag -> id mapping to a batch of examples.
def encode_labels(examples):
    """Add a "labels" column holding the integer ids of a batch's tags.

    Args:
        examples: a batch given as a mapping of column name -> list of rows;
            only the "ner_tags" column is read.

    Returns:
        The same mapping object, with "labels" set to one list of ids per row,
        each tag translated through the module-level ``label2id``.

    Raises:
        KeyError: if a tag is missing from ``label2id``.
    """
    encoded_rows = []
    for seq in examples["ner_tags"]:
        encoded_rows.append([label2id[label] for label in seq])
    examples["labels"] = encoded_rows
    return examples

# Run encode_labels over the dataset in batches; the result gains a "labels" column.
encoded_dataset = dataset.map(encode_labels, batched=True)

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Hold out 10% of the "train" split for validation (fixed seed for reproducibility).
train_df = dataset['train'].to_pandas()

train_data, val_data = train_test_split(
    train_df, test_size=0.1, random_state=42)

# Convert the split DataFrames back to Hugging Face Dataset objects.
# Dataset.from_pandas is the constructor meant for DataFrames (from_dict expects
# a plain dict of columns). preserve_index=False keeps the shuffled pandas index
# from being stored as an extra column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

In [None]:
# Gather the three splits under one name.
# NOTE(review): despite the name, nothing in this mapping has been tokenized yet,
# and the name is reassigned by later cells.
tokenized_datasets = dict(
    train=train_dataset,
    validation=val_dataset,
    test=dataset["test"],  # test split taken unchanged from the loaded dataset
)

In [None]:
# Display the current value of tokenized_datasets as the cell output.
tokenized_datasets

# Load Tokenizer (Tokenizer dan model BERT, dan Tokenisasi Data)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the "bert-base-cased" tokenizer and a token-classification model built on
# the same checkpoint. The classification head is sized from label2id, and both
# label mappings are passed to the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Tokenization with label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: a batch with "tokens" (one list of words per sentence) and
            "labels" (one list of label ids per sentence, one id per word).

    Returns:
        The object returned by the module-level ``tokenizer``, with an added
        "labels" entry: for every sentence, one label per produced token.
        Tokens that map to no word (``word_ids`` yields None) get -100; every
        other token gets the label of the word it belongs to.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(batch_index, word_labels):
        # One label per token of sentence `batch_index`.
        return [
            -100 if word_id is None else word_labels[word_id]
            for word_id in tokenized_inputs.word_ids(batch_index=batch_index)
        ]

    tokenized_inputs["labels"] = [
        align(i, label) for i, label in enumerate(examples["labels"])
    ]
    return tokenized_inputs

# Tokenize encoded_dataset in batches and attach the aligned labels.
tokenized_datasets = encoded_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
## RoBERTa
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the RoBERTa tokenizer.
# add_prefix_space=True is needed because the sentences are fed to the tokenizer
# as lists of words (is_split_into_words=True); it also keeps this cell
# consistent with the other cells that load this tokenizer.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import DatasetDict

# Load the "roberta-base" tokenizer and a token-classification model built on the
# same checkpoint. The classification head is sized from label2id, and both label
# mappings are passed to the model.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True) # prefix space: inputs are passed as pre-split words
model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Tokenize and align labels with the RoBERTa tokenizer
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences (padded) and align the labels.

    Args:
        examples: a batch with "tokens" (one list of words per sentence) and
            "labels" (one list of label ids per sentence, one id per word).

    Returns:
        The object returned by the module-level ``tokenizer`` (called with
        truncation and padding enabled), with an added "labels" entry holding
        one label per produced token. Tokens that map to no word (``word_ids``
        yields None) get -100; every other token gets the label of the word it
        belongs to.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True)
    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        positions = tokenized_inputs.word_ids(batch_index=row)
        # -100 marks positions without a word so they can be ignored downstream.
        aligned.append(
            [word_labels[pos] if pos is not None else -100 for pos in positions]
        )
    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

# Tokenize encoded_dataset in batches and attach the aligned labels.
tokenized_datasets = encoded_dataset.map(tokenize_and_align_labels, batched=True)

# Re-split using ONLY the tokenized "train" split; whatever other splits the map
# produced are no longer referenced after this point.
# Step 1: hold out 20% as "test" (seeded, so the split is reproducible).
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
tokenized_datasets = DatasetDict({
    "train": tokenized_datasets["train"],
    "test": tokenized_datasets["test"]
})
# Step 2: carve 12.5% of the remaining 80% (10% of the original "train" split)
# out as "validation". The final proportions are 70% / 10% / 20%.
tokenized_datasets["train"] = tokenized_datasets["train"].train_test_split(test_size=0.125, seed=42)
tokenized_datasets = DatasetDict({
    "train": tokenized_datasets["train"]["train"],
    "validation": tokenized_datasets["train"]["test"],
    "test": tokenized_datasets["test"]
})

# Define training arguments: evaluate and save a checkpoint once per epoch,
# three epochs, batch size 16 for training and evaluation, nothing pushed to the Hub.
# NOTE(review): the output directory is named "...-pos" although the labels come
# from the 'ner_tags' column; confirm the intended name.
training_args = TrainingArguments(
    output_dir="./roberta-finetuned-pos",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# Create the data collator used to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Create the Trainer: trains on the "train" split and evaluates on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"], # validation split, not test
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.1696,0.062017
2,0.0456,0.051108
3,0.0339,0.04997


TrainOutput(global_step=1845, training_loss=0.07180381247667762, metrics={'train_runtime': 735.819, 'train_samples_per_second': 40.07, 'train_steps_per_second': 2.507, 'total_flos': 2019194595055704.0, 'train_loss': 0.07180381247667762, 'epoch': 3.0})

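[REDACTED — a 40-character hexadecimal string was stored here; it looks like an API key typed into an interactive prompt during training. If so, rotate it and keep it out of saved outputs]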

In [None]:
# Report token-level accuracy, precision, recall and F1 on the test split.

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# The model is evaluated as it is. This cell deliberately does NOT call
# trainer.train() again: re-training here would continue fine-tuning the
# already-trained model and the metrics would no longer describe the run above.

# Predict on the test set
predictions = trainer.predict(tokenized_datasets["test"])
predicted_ids = np.argmax(predictions.predictions, axis=2)
# Gold label ids returned together with the predictions, so both arrays are
# aligned position-for-position.
gold_ids = predictions.label_ids

# Drop every position whose gold label is -100 (ignored tokens).
true_labels = [
    [int(gold) for gold in gold_row if gold != -100]
    for gold_row in gold_ids
]
predicted_labels = [
    [int(pred) for pred, gold in zip(pred_row, gold_row) if gold != -100]
    for pred_row, gold_row in zip(predicted_ids, gold_ids)
]

# Flatten the lists for sklearn metrics
true_labels_flat = [label for sublist in true_labels for label in sublist]
predicted_labels_flat = [label for sublist in predicted_labels for label in sublist]

# Calculate metrics (per-token; precision/recall/F1 are averaged over the
# classes, weighted by each class's support).
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, predicted_labels_flat, average='weighted')

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Epoch,Training Loss,Validation Loss
1,0.0303,0.055821
2,0.0181,0.054065


Epoch,Training Loss,Validation Loss
1,0.0303,0.055821
2,0.0181,0.054065


In [None]:
!pip install datasets
from transformers import AutoTokenizer, DataCollatorForTokenClassification
# Recreate the RoBERTa tokenizer (prefix space enabled) and the collator built on it.
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from datasets import DatasetDict # container for the named splits

# Load the dataset again (unconditionally; this replaces any earlier `dataset`).
# NOTE(review): load_dataset is not imported in this cell; it relies on an earlier import.
dataset = load_dataset("Davlan/conll2003_noMISC")

# Seeded split of the "train" split: 87.5% stays for training, 12.5% is held out.
split_dataset = dataset["train"].train_test_split(test_size=0.125, seed=42)

# Rebuild `dataset` from the new split; the held-out 12.5% becomes "validation".
dataset = DatasetDict({ # overwrites the freshly loaded dataset
    "train": split_dataset["train"],  # 87.5% of the loaded "train" split
    "validation": split_dataset["test"],  # held-out 12.5% of the loaded "train" split
    "test": dataset["test"]
})

In [None]:
from torch.utils.data import DataLoader

# Build a plain PyTorch DataLoader over the training split.
# Unlike Trainer, a raw DataLoader passes EVERY dataset column to the collator.
# Text or ragged columns (e.g. the original tokens and tags) cannot be padded
# into tensors, so only the tokenizer's model inputs and "labels" are kept.
train_dataloader = DataLoader(
    tokenized_datasets["train"].remove_columns(
        [
            column
            for column in tokenized_datasets["train"].column_names
            if column not in {*tokenizer.model_input_names, "labels"}
        ]
    ),
    batch_size=16,
    collate_fn=data_collator,  # the data collator pads each batch
)

In [None]:
# Training configuration for the metric-driven run.
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and save once per epoch; both strategies are set to "epoch"
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, reload the checkpoint that scored best on "accuracy"
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
!pip install evaluate  # Install the evaluate package

In [None]:
from evaluate import load # Import load_metric from evaluate

In [None]:
!pip install seqeval # Install seqeval

In [None]:
import numpy as np
from evaluate import load # loader for evaluation metrics

# Load the "seqeval" metric; compute_metrics reads it through the name `metric`.
metric = load("seqeval")

# Define compute_metrics function
def compute_metrics(pred):
    """Turn raw model outputs into overall precision, recall, F1 and accuracy.

    Args:
        pred: a pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is arg-maxed over axis 2; ``labels`` holds the gold ids,
            with -100 at positions that must be ignored.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken from
        the "overall_*" entries of the module-level ``metric``.

    Both predicted and gold ids are translated through the module-level
    ``id2label`` before they are handed to the metric.
    """
    scores, gold_ids = pred
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (gold id != -100).
        kept = [(p, l) for (p, l) in zip(predicted_row, gold_row) if l != -100]
        true_predictions.append([id2label[p] for (p, _) in kept])
        true_labels.append([id2label[l] for (_, l) in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
from datasets import DatasetDict

# Seeded split of the current "train" split: 87.5% stays for training, 12.5% is held out.
# NOTE(review): `dataset` is overwritten below, so running this after another
# cell that already performed the same split (or running it twice) shrinks
# "train" again and replaces "validation". Confirm this cell is still needed.
split_dataset = dataset["train"].train_test_split(test_size=0.125, seed=42)

# Rebuild `dataset` from the new split; the held-out 12.5% becomes "validation".
dataset = DatasetDict({ # overwrites the previous `dataset`
    "train": split_dataset["train"],  # 87.5% of the previous "train" split
    "validation": split_dataset["test"],  # held-out 12.5% of the previous "train" split
    "test": dataset["test"]
})

In [None]:
# Import the necessary modules and classes from the 'transformers' library
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

# Trainer for the metric-driven run.
# The per-epoch evaluation feeds the best-checkpoint selection configured in
# training_args, so it must run on the validation split. Evaluating on the test
# split here would select the model on test data and bias the final test scores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # keep "test" for the final evaluation only
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the current `trainer`; the returned training summary is
# shown as the cell output.
trainer.train()

In [None]:
# Predict on the test split and decode ids back to labels.
import numpy as np

# predict() yields (raw predictions, label ids, metrics); the metrics are not used here.
raw_predictions, labels, _ = trainer.predict(tokenized_datasets["test"])

# Most likely class id for every token position.
predictions = np.argmax(raw_predictions, axis=2)

# Positions labelled -100 (special / padded tokens) are dropped; the remaining
# ids are translated through id2label, sentence by sentence.
true_labels = []
true_predictions = []
for pred, label in zip(predictions, labels):
    kept_pairs = [(p, l) for (p, l) in zip(pred, label) if l != -100]
    true_labels.append([id2label[l] for (_, l) in kept_pairs])
    true_predictions.append([id2label[p] for (p, _) in kept_pairs])

In [None]:
# Collapse the per-sentence lists into one flat list each, keeping the order.
flat_true_labels = []
for seq in true_labels:
    flat_true_labels.extend(seq)

flat_predicted_labels = []
for seq in true_predictions:
    flat_predicted_labels.extend(seq)

In [None]:
# Confusion matrix and per-class report over the flattened test predictions.
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Rows and columns follow the key order of label2id.
cm = confusion_matrix(
    y_true=flat_true_labels,
    y_pred=flat_predicted_labels,
    labels=list(label2id),
)

# Per-class precision / recall / F1 / support, in the same label order.
print(
    classification_report(
        y_true=flat_true_labels,
        y_pred=flat_predicted_labels,
        labels=list(label2id),
    )
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot the confusion matrix as an annotated heatmap.
# Tick labels follow the key order of label2id, the same order used to build
# `cm`; they are materialised as a list so seaborn receives a plain sequence.
tick_labels = list(label2id.keys())
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=tick_labels, yticklabels=tick_labels, cmap="Blues", ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
# The labels come from the dataset's 'ner_tags' column, so this is NER, not POS tagging.
ax.set_title("Confusion Matrix for NER Tagging")
plt.show()

In [None]:
# prompt: give codes to implement peft using LoRA

!pip install peft
!pip install bitsandbytes

from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=16,
    target_modules=["query", "value"], # Target modules for LoRA
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.TOKEN_CLS
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# The rest of your training code remains the same, using the 'model' with LoRA applied.

# Example (replace with your actual training loop)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # This should work now
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

**************************************************

### BATASSS