In [4]:
# Di dalam notebook 03
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Muat data
df = pd.read_csv('../data/processed/synthetic_data.csv')

# Ubah label kategori menjadi angka
labels = df['category'].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}
df['label'] = df['category'].map(label2id)

# Bagi data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

# Inisialisasi tokenizer dari model pre-trained Indonesia
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Tokenisasi data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Buat class Dataset
class EmergencyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EmergencyDataset(train_encodings, train_labels)
val_dataset = EmergencyDataset(val_encodings, val_labels)

# Muat model
model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1", 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

# Definisikan argumen training
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Definisikan trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Mulai fine-tuning
trainer.train()

# Simpan model
output_dir = '../api/models/classification_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model klasifikasi disimpan di {output_dir}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.1228
20,0.9083
30,0.5653
40,0.2046
50,0.03
60,0.0049
70,0.0017
80,0.0011
90,0.0008
100,0.0007


Model klasifikasi disimpan di ../api/models/classification_model
