In [1]:
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from datasets import Dataset
from bert_score import score
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np

# 1. Fungsi pembersih teks
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s\?\.,]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# 2. Load dan bersihkan data
df = pd.read_csv("data04.csv")
df['Pertanyaan'] = df['Pertanyaan'].astype(str).apply(clean_text)
df['Jawaban'] = df['Jawaban'].astype(str).apply(clean_text)

# 3. Buat pasangan positif-negatif
data_pairs = []
for idx, row in df.iterrows():
    data_pairs.append({'text1': row['Pertanyaan'], 'text2': row['Jawaban'], 'label': 1})
    neg = df.sample(n=1)
    while neg.index[0] == idx:
        neg = df.sample(n=1)
    data_pairs.append({'text1': row['Pertanyaan'], 'text2': neg.iloc[0]['Jawaban'], 'label': 0})

pair_df = pd.DataFrame(data_pairs)
train_df, val_df = train_test_split(pair_df, test_size=0.2, random_state=42)

# 4. Load tokenizer dan model IndoBERT
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Deteksi dan pindahkan model ke GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("✅ Model dijalankan di device:", device)
if torch.cuda.is_available():
    print("🟢 GPU Aktif:", torch.cuda.get_device_name(0))
else:
    print("🟡 GPU tidak tersedia, menggunakan CPU (lambat)")

# 5. Tokenisasi
def tokenize(batch):
    return tokenizer(batch['text1'], batch['text2'], truncation=True, padding='max_length', max_length=256)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
train_dataset = train_dataset.map(tokenize, batched=True).remove_columns(['text1', 'text2'])
val_dataset = val_dataset.map(tokenize, batched=True).remove_columns(['text1', 'text2'])

# 6. TrainingArguments + EarlyStopping + GPU support (fp16)
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=torch.cuda.is_available()  # hanya aktif jika ada GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1))
    }
)

# 7. Jalankan training IndoBERT di GPU
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model dijalankan di device: cuda
🟢 GPU Aktif: NVIDIA GeForce RTX 4060


Map: 100%|██████████| 6894/6894 [00:00<00:00, 9066.47 examples/s] 
Map: 100%|██████████| 1724/1724 [00:00<00:00, 10842.89 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2594,0.187694,0.937355
2,0.1433,0.214626,0.933875
3,0.0922,0.281054,0.948376
4,0.0621,0.281356,0.942575
5,0.0405,0.300416,0.952436
6,0.04,0.329279,0.944896
7,0.0286,0.317605,0.948376


TrainOutput(global_step=3017, training_loss=0.09515880360917853, metrics={'train_runtime': 24207.7887, 'train_samples_per_second': 2.848, 'train_steps_per_second': 0.178, 'total_flos': 6348606654781440.0, 'train_loss': 0.09515880360917853, 'epoch': 7.0})

In [2]:
import os

# 8. Simpan model dan tokenizer yang sudah di-fine-tune
save_path = "./model-data/indobert_qa_finetuned"

# Buat direktori jika belum ada
os.makedirs(save_path, exist_ok=True)

# Simpan model dan tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Model dan tokenizer disimpan di: {save_path}")


✅ Model dan tokenizer disimpan di: ./model-data/indobert_qa_finetuned
