In [2]:
import pandas as pd

df1 = pd.read_csv("email_spam_indo.csv")
df2 = pd.read_csv("spam.csv", encoding='latin-1')  # banyak dataset spam pakai latin-1

print("=== email_spam_indo.csv ===")
print(df1.head())
print(df1.columns)
print("\n=== spam.csv ===")
print(df2.head())
print(df2.columns)


=== email_spam_indo.csv ===
  Kategori                                              Pesan
0     spam  Secara alami tak tertahankan identitas perusah...
1     spam  Fanny Gunslinger Perdagangan Saham adalah Merr...
2     spam  Rumah -rumah baru yang luar biasa menjadi muda...
3     spam  4 Permintaan Khusus Pencetakan Warna Informasi...
4     spam  Jangan punya uang, dapatkan CD perangkat lunak...
Index(['Kategori', 'Pesan'], dtype='object')

=== spam.csv ===
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
Index(['Category', 'Message'], dtype='object')


In [3]:
import pandas as pd
import re

# ======================================================
# 1️⃣ Baca dua dataset
# ======================================================
print("📥 Membaca dataset...")

# Dataset 1: email spam bahasa Indonesia
df_indo = pd.read_csv("email_spam_indo.csv")[["Kategori", "Pesan"]]
df_indo.rename(columns={"Kategori": "label_text", "Pesan": "text"}, inplace=True)

# Dataset 2: email/SMS spam internasional
df_en = pd.read_csv("spam.csv", encoding="latin-1")[["Category", "Message"]]
df_en.rename(columns={"Category": "label_text", "Message": "text"}, inplace=True)

# ======================================================
# 2️⃣ Gabungkan dataset
# ======================================================
df = pd.concat([df_indo, df_en], ignore_index=True)
print(f"Total data sebelum cleaning: {len(df)} baris")

# ======================================================
# 3️⃣ Bersihkan teks
# ======================================================
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # hapus URL
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # hapus simbol
    text = re.sub(r"\s+", " ", text).strip()  # hapus spasi berlebih
    return text

df["clean_text"] = df["text"].apply(clean_text)

# ======================================================
# 4️⃣ Konversi label jadi angka
# ======================================================
label_mapping = {
    "ham": 0,      # normal
    "spam": 1
}
df["label"] = df["label_text"].str.lower().map(label_mapping)

# Cek apakah ada label yang belum terkonversi
missing_labels = df[df["label"].isna()]
if len(missing_labels) > 0:
    print("\n⚠️ Ada label yang tidak dikenal:")
    print(missing_labels["label_text"].unique())

# Hapus baris tanpa label valid
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

# ======================================================
# 5️⃣ Simpan hasil bersih
# ======================================================
df_clean = df[["clean_text", "label"]]
df_clean.to_csv("clean_data.csv", index=False)

print("\n✅ Dataset bersih disimpan sebagai clean_data.csv")
print("Distribusi label:")
print(df_clean["label"].value_counts())


📥 Membaca dataset...
Total data sebelum cleaning: 8208 baris

✅ Dataset bersih disimpan sebagai clean_data.csv
Distribusi label:
label
0    6093
1    2115
Name: count, dtype: int64


In [4]:
import pandas as pd

df1 = pd.read_csv("email_spam_indo.csv")
df2 = pd.read_csv("spam.csv", encoding="latin-1")

print("email_spam_indo.csv:", len(df1))
print("spam.csv:", len(df2))

# Cek baris kosong
print("\nBaris kosong di email_spam_indo.csv:", df1.isna().all(axis=1).sum())
print("Baris kosong di spam.csv:", df2.isna().all(axis=1).sum())


email_spam_indo.csv: 2636
spam.csv: 5572

Baris kosong di email_spam_indo.csv: 0
Baris kosong di spam.csv: 0


In [8]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import torch
from sklearn.metrics import accuracy_score, f1_score

# ======================================================
# 1️⃣ LOAD DATASET
# ======================================================
print("📥 Membaca dataset...")
df = pd.read_csv("clean_data.csv")

if "clean_text" not in df.columns or "label" not in df.columns:
    raise ValueError("Dataset harus memiliki kolom 'clean_text' dan 'label'!")

# Pastikan kolom teks tidak kosong dan bertipe string
df["clean_text"] = df["clean_text"].astype(str).fillna("")
df = df[df["clean_text"].str.strip() != ""]

# Pastikan label numerik dan float
df["label"] = pd.to_numeric(df["label"], errors="coerce").fillna(0).astype(int)

print(f"✅ Total data setelah pembersihan: {len(df)}")

# Split train/test (80:20)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# ======================================================
# 2️⃣ TOKENIZER
# ======================================================
print("🔤 Menyiapkan tokenizer IndoBERT...")
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize(batch):
    # pastikan semua elemen batch["clean_text"] adalah string
    texts = [str(x) if not pd.isna(x) else "" for x in batch["clean_text"]]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize, batched=True, remove_columns=["clean_text"], load_from_cache_file=False)

# Format untuk PyTorch
dataset.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "label"]
)

# ======================================================
# 3️⃣ MODEL
# ======================================================
print("🧠 Menyiapkan model IndoBERT untuk klasifikasi email spam...")
num_labels = len(df["label"].unique())
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
print("⚠️ Note: Pesan 'Some weights ... newly initialized' itu normal (lapisan klasifikasi baru).")

# ======================================================
# 4️⃣ CUSTOM COLLATE FN (untuk mencegah dtype error)
# ======================================================
def custom_collate_fn(features):
    batch = DataCollatorWithPadding(tokenizer=tokenizer)(features)
    if "labels" in batch:
        batch["labels"] = batch["labels"].to(torch.long)  # harus long untuk classification
    elif "label" in batch:
        batch["labels"] = batch["label"].to(torch.long)
    return batch

output_dir = "./bert_email_finetuned"
os.makedirs(output_dir, exist_ok=True)

from transformers import TrainingArguments

# coba mode modern dulu, kalau muncul TypeError gunakan fallback
try:
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",   # modern arg
        save_strategy="epoch",         # modern arg
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        save_total_limit=2,
        load_best_model_at_end=False,
        fp16=torch.cuda.is_available(),
    )
    print("✅ Using modern TrainingArguments.")
except TypeError:
    # fallback untuk transformers versi lama
    print("⚠️ TrainingArguments tidak mendukung argumen modern — menggunakan fallback compat mode.")
    # versi lama cenderung memakai do_train/do_eval/save_steps/logging_steps
    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=500,
        save_steps=500,
        fp16=torch.cuda.is_available(),
    )

# ======================================================
# 6️⃣ METRICS
# ======================================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# ======================================================
# 7️⃣ TRAINER
# ======================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=custom_collate_fn
)

# ======================================================
# 8️⃣ TRAINING
# ======================================================
print("🚀 Mulai fine-tuning IndoBERT (Email Spam Detector)...")
trainer.train()

# ======================================================
# 9️⃣ SAVE MODEL
# ======================================================
print("💾 Menyimpan model ke folder", output_dir)
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("\n✅ Fine-tuning selesai! Model siap digunakan untuk deteksi email spam.")


📥 Membaca dataset...
✅ Total data setelah pembersihan: 8208
🔤 Menyiapkan tokenizer IndoBERT...


Map: 100%|██████████| 6566/6566 [00:00<00:00, 10832.39 examples/s]
Map: 100%|██████████| 1642/1642 [00:00<00:00, 11153.47 examples/s]


🧠 Menyiapkan model IndoBERT untuk klasifikasi email spam...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


⚠️ Note: Pesan 'Some weights ... newly initialized' itu normal (lapisan klasifikasi baru).
⚠️ TrainingArguments tidak mendukung argumen modern — menggunakan fallback compat mode.
🚀 Mulai fine-tuning IndoBERT (Email Spam Detector)...




Step,Training Loss
500,0.1488
1000,0.0627
1500,0.0391
2000,0.0101




💾 Menyimpan model ke folder ./bert_email_finetuned

✅ Fine-tuning selesai! Model siap digunakan untuk deteksi email spam.
