In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# !pip install transformers torch pandas scikit-learn accelerate>=0.26.0

In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [8]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Read dataset
file_path = '/content/drive/MyDrive/ril-or-fek/dataset_final.csv'
df = pd.read_csv(file_path)

# Load Dataset
print("1. Memuat dataset...")
df = pd.read_csv(file_path)
df.dropna(subset=['Clean Narasi', 'hoax'], inplace=True)

df['hoax'] = df['hoax'].astype(int)
print("Dataset dimuat.")

1. Memuat dataset...
Dataset dimuat.


In [9]:
# Menentukan model dari hugging face
MODEL_NAME = "indobenchmark/indobert-base-p1"
print(f"\n2. Mengunduh pre-trained model dan tokenizer untuk '{MODEL_NAME}'...")


2. Mengunduh pre-trained model dan tokenizer untuk 'indobenchmark/indobert-base-p1'...


In [10]:
# Muat tokenizer untuk IndoBert
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
# Muat model IndoBERT
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
print("Model dan tokenizer berhasil diunduh.")

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model dan tokenizer berhasil diunduh.


In [12]:
# Tokenisasi
print("\n3. Memproses dan melakukan tokenisasi pada data...")

file_path = '/content/drive/MyDrive/ril-or-fek/dataset_final.csv'
df = pd.read_csv(file_path)

# Bagi dataframe menjadi train dan validation set
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['hoax']
)


train_texts = train_df['Clean Narasi'].tolist()
train_labels = train_df['hoax'].tolist()
val_texts = val_df['Clean Narasi'].tolist()
val_labels = val_df['hoax'].tolist()

# Lakukan tokenisasi pada set latih dan validasi
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

print("Data berhasil di-tokenisasi.")


3. Memproses dan melakukan tokenisasi pada data...
Data berhasil di-tokenisasi.


In [13]:
# Membuat dataset custom untuk pytorch
import torch

class HoaxDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Membuat objek dataset (data latih dan validasi)
train_dataset = HoaxDataset(train_encodings, train_labels)
val_dataset = HoaxDataset(val_encodings, val_labels)
print("\n4. Custom PyTorch Dataset berhasil dibuat.")


4. Custom PyTorch Dataset berhasil dibuat.


In [14]:
# Argumen untuk training
from transformers import TrainingArguments

print("\n5. Mendefinisikan argumen untuk pelatihan...")

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/ril-or-fek/results',
    logging_dir='/content/drive/MyDrive/ril-or-fek/logs',

    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,

    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",

    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",
)

print("Argumen pelatihan didefinisikan.")


5. Mendefinisikan argumen untuk pelatihan...
Argumen pelatihan didefinisikan.


In [15]:
# Train model dengan trainer
from transformers import Trainer

# Inisialisasi Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print("\n6. Memulai proses fine-tuning...")
# Mulai training
trainer.train()
print("Proses fine-tuning selesai!")


6. Memulai proses fine-tuning...


Epoch,Training Loss,Validation Loss
1,0.0689,0.011819
2,0.0109,0.010801
3,0.0029,0.011844


Proses fine-tuning selesai!


In [16]:
# Simpan model ke drive
final_model_path = '/content/drive/MyDrive/ril-or-fek/model_final_indobert'
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"\nModel final berhasil disimpan secara permanen di: {final_model_path}")


Model final berhasil disimpan secara permanen di: /content/drive/MyDrive/ril-or-fek/model_final_indobert
