# Preprocessing

In [None]:
import pandas as pd

# Load the labeled article dataset and preview its first rows.
df = pd.read_csv("jaklingko_labeled.csv")
df.head()

Unnamed: 0,Title,URL,Date,Year,real_url,article_content,content_cleaned,label
0,Dishub DKI Targetkan Seluruh Angkot Gabung Jak...,https://news.google.com/rss/articles/CBMinAFBV...,10/15/2022 7:00,2022,https://www.tempo.co/arsip/dishub-dki-targetka...,"TEMPO.CO, Jakarta - Dinas Perhubungan DKI Jaka...",tempoco jakarta dinas perhubungan dki jakarta ...,positif
1,"Terkendala Saat Beli Tiket Tarif Integrasi, In...",https://news.google.com/rss/articles/CBMi0gFBV...,8/13/2022 7:00,2022,https://megapolitan.kompas.com/read/2022/08/13...,"JAKARTA, KOMPAS.com - Tarif integrasi untuk mo...",jakarta kompascom tarif integrasi moda transja...,netral
2,Optimalisasi Penerapan Tarif Terintegrasi JakL...,https://news.google.com/rss/articles/CBMiiAFBV...,11/27/2022 8:00,2022,https://www.kompas.id/artikel/optimalisasi-pen...,Di tengah dominasi penggunaan kendaraan pribad...,dominasi penggunaan kendaraan pribadi moda tra...,netral
3,4 Alasan Kita Mesti Berhati-hati Naik Angkot J...,https://news.google.com/rss/articles/CBMiowFBV...,8/24/2022 7:00,2022,https://www.liputan6.com/regional/read/5048592...,"Liputan6.com, Bandung - Angkutan umum mikrotra...",liputancom bandung angkutan mikrotrans pembaya...,negatif
4,"Jajal Tarif Integrasi TransJ-MRT-LRT, Penumpan...",https://news.google.com/rss/articles/CBMitAFBV...,8/12/2022 7:00,2022,https://news.detik.com/berita/d-6231873/jajal-...,Pemprov DKI Jakarta telah menerapkan tarif int...,pemprov dki jakarta menerapkan tarif integrasi...,negatif


In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer

# 2. Drop the columns that are not needed downstream.
#    errors="ignore" skips any listed column that is absent from df.
columns_to_drop = ["URL", "Date", "real_url", "content_cleaned"]
df = df.drop(columns=columns_to_drop, errors="ignore")

# Regex patterns for news-outlet names, each optionally followed by its
# domain suffix. remove_media_names() applies them case-insensitively.
MEDIA_PATTERNS = [
    r"\bkompas(?:\.com)?\b",
    r"\bdetik(?:\.com)?\b",
    r"\btribun(?:news)?(?:\.com)?\b",
    r"\bcnn\s*indonesia(?:\.com)?\b",
    r"\bcnbc\s*indonesia(?:\.com)?\b",
    r"\btempo(?:\.co)?\b",
    r"\bsuara(?:\.com)?\b",
    r"\bokezone(?:\.com)?\b",
    r"\bmerdeka(?:\.com)?\b",
    r"\bliputan6(?:\.com)?\b",
    r"\bkumparan(?:\.com)?\b",
    r"\bidn\s*times(?:\.com)?\b",
    r"\brepublika(?:\.co\.id)?\b",
    r"\bsindonews(?:\.com)?\b",
    r"\bantaranews(?:\.com)?\b",
    r"\bviva(?:\.co\.id)?\b",
    r"\bgrid(?:\.id)?\b",
    r"\bsinpo(?:\.id)?\b"
]

def remove_media_names(text):
    """Replace every match of each MEDIA_PATTERNS regex with a single space.

    The patterns are applied one after another, ignoring case.
    """
    for pattern in MEDIA_PATTERNS:
        text = re.sub(pattern, " ", text, flags=re.IGNORECASE)
    return text


# 3. Light text normalization
def clean_text(text):
    """Lightly normalize one article body before tokenization.

    Steps, in order: lowercase, remove outlet names, remove the "baca juga",
    "advertisement" and "keterangan tertulis" boilerplate, remove "[url]"
    placeholders, split intra-word dashes, replace every character that is not
    a word character, whitespace or one of ``. , ! ?`` with a space, collapse
    whitespace, and trim punctuation left dangling at the start of the text.

    Args:
        text: Raw article text. Missing values (None / NaN / pd.NA) are
            accepted; any other non-string scalar is converted with ``str()``.

    Returns:
        The cleaned text, or "" for a missing value.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # e.g. a numeric cell: normalize its textual form instead of crashing
        text = str(text)

    text = text.lower()

    # Remove outlet names
    text = remove_media_names(text)

    # Remove the "baca juga" ("read also") marker together with whatever
    # follows it up to the next run of three or more letters.
    text = re.sub(r"\bbaca juga\b.*?(?=[a-z]{3,}|$)", " ", text)

    # Remove advertisement / press-release boilerplate
    text = re.sub(r"advertisement", " ", text)
    text = re.sub(r"keterangan tertulis", " ", text)

    # Remove "[url]" placeholders. This has to run BEFORE the character filter
    # below: the filter turns "[" and "]" into spaces, after which the
    # placeholder no longer matches and a stray "url" token is left behind.
    text = re.sub(r"\[url\]", " ", text)

    # Normalize dashes: a dash between two word characters becomes a space
    # (non-jaklingko -> non jaklingko)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)

    # Keep word characters, whitespace and basic punctuation; drop the rest
    text = re.sub(r"[^\w\s\.\,\!\?]", " ", text)

    # Collapse whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Removing an outlet name from a dateline ("tempo.co, jakarta ...") leaves
    # an orphaned ", " in front of the text; drop such leading punctuation.
    text = re.sub(r"^[\s.,!?]+", "", text)

    return text

# 4. Apply the normalization to the article_content column.
#    Missing articles are replaced with "" first: casting NaN straight to str
#    yields the literal text "nan", which would then be cleaned and kept as if
#    it were article content.
df["cleaned_article"] = df["article_content"].fillna("").astype(str).apply(clean_text)

# 5. Initialize the tokenizer (IndoBERT)
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# 6. Tokenize one sample row to inspect the result
sample_index = 0  # index label of the row to inspect
sample_text = df.loc[sample_index, "cleaned_article"]

tokens = tokenizer.tokenize(sample_text)

print("=== Sample Cleaned Text ===")
print(sample_text)
print("\n=== Tokenizer Output ===")
print(tokens)

# ==========================
# 7. Save to a new CSV
# ==========================
# The next cell reads "jaklingko_preprocess.csv", so write to exactly that name.
output_path = "jaklingko_preprocess.csv"
df.to_csv(output_path, index=False)
print(f"\nFile berhasil disimpan sebagai: {output_path}")

=== Sample Cleaned Text ===
, jakarta dinas perhubungan dki jakarta menargetkan seluruh angkutan kota atau angkot tergabung dalam satu sistem transportasi pada 2026. sebab, untuk membawa angkot biasa bergabung dalam sistem jaklingko membutuhkan proses panjang. baca berita dengan sedikit iklan, klik di sini kalau enggak salah 2026 semua sudah tergabung dalam sistem jaklingko. kalau sudah tergabung, maka tidak ada lagi angkot biasa, kata kepala pusat data dan informasi perhubungan anton r. parura kepada saat ditemui di hotel shangri la jakarta, jumat, 14 oktober 2022. scroll ke bawah untuk melanjutkan membaca baca berita dengan sedikit iklan, klik di sini namun, ia belum bisa memastikan lantaran tidak membawa data yang dibutuhkan untuk menjelaskan hal tersebut. kurang lebih segitu, deh, mudah mudahan benar. saya enggak pegang datanya, tapi ada targetnya bahwa tahun sekian sudah semua dalam satu sistem transportasi di jakarta, ujar anton. bandung uji coba angkot listrik rute gunung batu s

In [None]:
# Reload the preprocessed dataset from disk and preview its first rows.
df = pd.read_csv("jaklingko_preprocess.csv")
df.head()

Unnamed: 0,Title,Year,article_content,label,cleaned_article
0,Dishub DKI Targetkan Seluruh Angkot Gabung Jak...,2022,"TEMPO.CO, Jakarta - Dinas Perhubungan DKI Jaka...",positif,", jakarta dinas perhubungan dki jakarta menarg..."
1,"Terkendala Saat Beli Tiket Tarif Integrasi, In...",2022,"JAKARTA, KOMPAS.com - Tarif integrasi untuk mo...",netral,"jakarta, tarif integrasi untuk moda transjakar..."
2,Optimalisasi Penerapan Tarif Terintegrasi JakL...,2022,Di tengah dominasi penggunaan kendaraan pribad...,netral,di tengah dominasi penggunaan kendaraan pribad...
3,4 Alasan Kita Mesti Berhati-hati Naik Angkot J...,2022,"Liputan6.com, Bandung - Angkutan umum mikrotra...",negatif,", bandung angkutan umum mikrotrans dengan pemb..."
4,"Jajal Tarif Integrasi TransJ-MRT-LRT, Penumpan...",2022,Pemprov DKI Jakarta telah menerapkan tarif int...,negatif,pemprov dki jakarta telah menerapkan tarif int...


# Baseline Model BERT

In [None]:
# Label mapping: sentiment name -> integer class id
mapping = {
    "negatif": 0,
    "netral": 1,
    "positif": 2
}

# Skip the mapping when the column already holds class ids, so that re-running
# this cell does not turn every label into NaN.
if not df["label"].isin(list(mapping.values())).all():
    mapped = df["label"].map(mapping)
    # Series.map yields NaN for values missing from the dict; fail loudly
    # instead of handing NaN labels to the training cells.
    if mapped.isna().any():
        unknown = df.loc[mapped.isna(), "label"].unique().tolist()
        raise ValueError(f"Unmapped label values: {unknown}")
    df["label"] = mapped.astype("int64")

print(df["label"].value_counts())

# Persist the result: the following cells load "jaklingko_mapped.csv".
df.to_csv("jaklingko_mapped.csv", index=False)


label
2    121
1    105
0     79
Name: count, dtype: int64


In [None]:
# Reload the dataset with integer labels from disk and preview its first rows.
df = pd.read_csv("jaklingko_mapped.csv")
df.head()

Unnamed: 0,Title,Year,article_content,label,cleaned_article
0,Dishub DKI Targetkan Seluruh Angkot Gabung Jak...,2022,"TEMPO.CO, Jakarta - Dinas Perhubungan DKI Jaka...",2,", jakarta dinas perhubungan dki jakarta menarg..."
1,"Terkendala Saat Beli Tiket Tarif Integrasi, In...",2022,"JAKARTA, KOMPAS.com - Tarif integrasi untuk mo...",1,"jakarta, tarif integrasi untuk moda transjakar..."
2,Optimalisasi Penerapan Tarif Terintegrasi JakL...,2022,Di tengah dominasi penggunaan kendaraan pribad...,1,di tengah dominasi penggunaan kendaraan pribad...
3,4 Alasan Kita Mesti Berhati-hati Naik Angkot J...,2022,"Liputan6.com, Bandung - Angkutan umum mikrotra...",0,", bandung angkutan umum mikrotrans dengan pemb..."
4,"Jajal Tarif Integrasi TransJ-MRT-LRT, Penumpan...",2022,Pemprov DKI Jakarta telah menerapkan tarif int...,0,pemprov dki jakarta telah menerapkan tarif int...


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.optim import AdamW
from tqdm import tqdm

# =========================================================
# 1. Load Dataset
# =========================================================
# Seed PyTorch's global RNG so the newly initialized classifier head and the
# DataLoader shuffle order start from the same state on every run.
SEED = 42
torch.manual_seed(SEED)

df = pd.read_csv("jaklingko_mapped.csv")

# Keep only the model input column and the target column
df = df[["cleaned_article", "label"]]

# Stratified 80/20 train/validation split
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df["label"]
)

# =========================================================
# 2. Tokenizer IndoBERT
# =========================================================
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

MAX_LEN = 256     # tokens per article; longer inputs are truncated
BATCH_SIZE = 8
EPOCHS = 3
LR = 2e-5

# =========================================================
# 3. Dataset Class
# =========================================================
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one article per item.

    Relies on the module-level ``tokenizer`` and ``MAX_LEN``.
    """

    def __init__(self, texts, labels):
        """Store texts and labels as plain Python lists.

        Args:
            texts: Iterable of article strings. Missing entries (None / NaN,
                e.g. an empty cell read back from CSV) become "" and any other
                non-string value is converted with ``str()``, because the
                tokenizer only accepts strings.
            labels: Iterable of integer class ids, aligned with ``texts``.
        """
        self.texts = ["" if pd.isna(t) else str(t) for t in texts]
        self.labels = [int(y) for y in labels]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            Dict with "input_ids" and "attention_mask" (padded or truncated to
            MAX_LEN) and "labels" (scalar long tensor).
        """
        encoded = tokenizer(
            self.texts[idx],
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" adds a leading batch axis of size 1;
            # squeeze(0) drops that axis and nothing else.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split in a dataset; items are tokenized when they are accessed.
train_dataset = SentimentDataset(train_df["cleaned_article"], train_df["label"])
val_dataset   = SentimentDataset(val_df["cleaned_article"], val_df["label"])

# Only the training loader shuffles its examples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# =========================================================
# 4. Load Model
# =========================================================
# Pretrained IndoBERT checkpoint with a 3-class classification head.
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=3
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# =========================================================
# 5. Optimizer & Scheduler
# =========================================================
optimizer = AdamW(model.parameters(), lr=LR)

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (epochs).
total_steps = len(train_loader) * EPOCHS

# Linear learning-rate schedule over total_steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# =========================================================
# 6. Training Loop
# =========================================================
def train_epoch():
    """Run one optimization pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``scheduler`` and
    ``device``; the scheduler is stepped after every batch.

    Returns:
        Mean training loss per batch for this epoch.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(train_loader, desc="Training"):
        # Move the three tensors the model needs onto the training device.
        on_device = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        loss = model(
            on_device["input_ids"],
            attention_mask=on_device["attention_mask"],
            labels=on_device["labels"],
        ).loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(train_loader)

# =========================================================
# 7. Evaluation Loop
# =========================================================
def eval_epoch():
    """Evaluate the module-level ``model`` on ``val_loader``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are macro-averaged over the classes.
    """
    model.eval()
    preds, trues = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            mask      = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask=mask).logits
            predictions = torch.argmax(logits, dim=1)

            preds.extend(predictions.cpu().tolist())
            # Labels are only compared on the host, so they are never moved
            # to the device.
            trues.extend(batch["labels"].tolist())

    # zero_division=0 makes the score of a class that was never predicted (or
    # never present) an explicit 0 instead of relying on the warning default.
    acc  = accuracy_score(trues, preds)
    prec = precision_score(trues, preds, average="macro", zero_division=0)
    rec  = recall_score(trues, preds, average="macro", zero_division=0)
    f1   = f1_score(trues, preds, average="macro", zero_division=0)

    return acc, prec, rec, f1


# =========================================================
# 8. Run Training & Evaluation
# =========================================================
# Train for EPOCHS epochs, validating and reporting after each one.
for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")

    train_loss = train_epoch()
    acc, prec, rec, f1 = eval_epoch()

    # Assemble the report first and emit it with a single print call.
    report_lines = [
        f"\nTrain Loss: {train_loss:.4f}",
        f"Val Acc   : {acc:.4f}",
        f"Val Prec  : {prec:.4f}",
        f"Val Rec   : {rec:.4f}",
        f"Val F1    : {f1:.4f}",
    ]
    print("\n".join(report_lines))

# =========================================================
# 9. Contoh Prediksi
# =========================================================
def predict(text):
    """Predict the sentiment class id for a single text.

    Args:
        text: One input string.

    Returns:
        int class id: the index of the largest logit.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LEN,  # same limit the training data is tokenized with
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        # Reduce over the class axis explicitly. A bare argmax flattens the
        # logits and is only correct for a batch of one; with dim=1 a batch of
        # several texts makes .item() raise instead of returning a wrong id.
        pred = torch.argmax(logits, dim=1).item()
    return pred

contoh = "Layanan ini sering mengalami kendala dalam pemakaiannya."
print("\nPrediksi contoh:", predict(contoh))




pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Epoch 1/3 =====


Training:   0%|          | 0/31 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Training: 100%|██████████| 31/31 [13:50<00:00, 26.78s/it]
Evaluating: 100%|██████████| 8/8 [00:57<00:00,  7.24s/it]



Train Loss: 0.9874
Val Acc   : 0.5738
Val Prec  : 0.6843
Val Rec   : 0.5883
Val F1    : 0.5548

===== Epoch 2/3 =====


Training: 100%|██████████| 31/31 [13:27<00:00, 26.05s/it]
Evaluating: 100%|██████████| 8/8 [00:58<00:00,  7.31s/it]



Train Loss: 0.7918
Val Acc   : 0.6393
Val Prec  : 0.6814
Val Rec   : 0.6181
Val F1    : 0.6114

===== Epoch 3/3 =====


Training: 100%|██████████| 31/31 [13:33<00:00, 26.23s/it]
Evaluating: 100%|██████████| 8/8 [00:57<00:00,  7.18s/it]



Train Loss: 0.6467
Val Acc   : 0.7213
Val Prec  : 0.7369
Val Rec   : 0.7163
Val F1    : 0.7163

Prediksi contoh: 2


# Fine-Tuning BERT (5 epochs, learning rate 1e-5)

In [None]:
# Fine tuning

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.optim import AdamW
from tqdm import tqdm

# =========================================================
# 1. Load Dataset
# =========================================================
# Seed PyTorch's global RNG so the newly initialized classifier head and the
# DataLoader shuffle order start from the same state on every run.
SEED = 42
torch.manual_seed(SEED)

df = pd.read_csv("jaklingko_mapped.csv")

# Keep only the model input column and the target column
df = df[["cleaned_article", "label"]]

# Stratified 80/20 train/validation split
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df["label"]
)

# =========================================================
# 2. Tokenizer IndoBERT
# =========================================================
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

MAX_LEN = 256     # tokens per article; longer inputs are truncated
BATCH_SIZE = 8
EPOCHS = 5
LR = 1e-5

# =========================================================
# 3. Dataset Class
# =========================================================
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one article per item.

    Relies on the module-level ``tokenizer`` and ``MAX_LEN``.
    """

    def __init__(self, texts, labels):
        """Store texts and labels as plain Python lists.

        Args:
            texts: Iterable of article strings. Missing entries (None / NaN,
                e.g. an empty cell read back from CSV) become "" and any other
                non-string value is converted with ``str()``, because the
                tokenizer only accepts strings.
            labels: Iterable of integer class ids, aligned with ``texts``.
        """
        self.texts = ["" if pd.isna(t) else str(t) for t in texts]
        self.labels = [int(y) for y in labels]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            Dict with "input_ids" and "attention_mask" (padded or truncated to
            MAX_LEN) and "labels" (scalar long tensor).
        """
        encoded = tokenizer(
            self.texts[idx],
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" adds a leading batch axis of size 1;
            # squeeze(0) drops that axis and nothing else.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split in a dataset; items are tokenized when they are accessed.
train_dataset = SentimentDataset(train_df["cleaned_article"], train_df["label"])
val_dataset   = SentimentDataset(val_df["cleaned_article"], val_df["label"])

# Only the training loader shuffles its examples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# =========================================================
# 4. Load Model
# =========================================================
# Loaded again from the pretrained IndoBERT checkpoint (not from the model
# trained in the baseline cell), with a 3-class classification head.
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=3
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# =========================================================
# 5. Optimizer & Scheduler
# =========================================================
optimizer = AdamW(model.parameters(), lr=LR)

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (epochs).
total_steps = len(train_loader) * EPOCHS

# Linear learning-rate schedule over total_steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# =========================================================
# 6. Training Loop
# =========================================================
def train_epoch():
    """Run one optimization pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``scheduler`` and
    ``device``; the scheduler is stepped after every batch.

    Returns:
        Mean training loss per batch for this epoch.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(train_loader, desc="Training"):
        # Move the three tensors the model needs onto the training device.
        on_device = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        }

        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        loss = model(
            on_device["input_ids"],
            attention_mask=on_device["attention_mask"],
            labels=on_device["labels"],
        ).loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(train_loader)

# =========================================================
# 7. Evaluation Loop
# =========================================================
def eval_epoch():
    """Evaluate the module-level ``model`` on ``val_loader``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are macro-averaged over the classes.
    """
    model.eval()
    preds, trues = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            mask      = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask=mask).logits
            predictions = torch.argmax(logits, dim=1)

            preds.extend(predictions.cpu().tolist())
            # Labels are only compared on the host, so they are never moved
            # to the device.
            trues.extend(batch["labels"].tolist())

    # zero_division=0 makes the score of a class that was never predicted (or
    # never present) an explicit 0 instead of relying on the warning default.
    acc  = accuracy_score(trues, preds)
    prec = precision_score(trues, preds, average="macro", zero_division=0)
    rec  = recall_score(trues, preds, average="macro", zero_division=0)
    f1   = f1_score(trues, preds, average="macro", zero_division=0)

    return acc, prec, rec, f1


# =========================================================
# 8. Run Training & Evaluation
# =========================================================
# Track the epoch with the best validation macro-F1. The score can drop again
# in later epochs, and the prediction cells below use whatever weights `model`
# holds when this loop finishes.
best_f1 = float("-inf")
best_state = None

for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch+1}/{EPOCHS} =====")

    train_loss = train_epoch()
    acc, prec, rec, f1 = eval_epoch()

    print(f"\nTrain Loss: {train_loss:.4f}")
    print(f"Val Acc   : {acc:.4f}")
    print(f"Val Prec  : {prec:.4f}")
    print(f"Val Rec   : {rec:.4f}")
    print(f"Val F1    : {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        # Clone onto the CPU so later optimizer steps cannot modify the snapshot.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Put the best-scoring weights back into the model before it is used for prediction.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"\nRestored best weights (Val F1 = {best_f1:.4f})")

# =========================================================
# 9. Contoh Prediksi
# =========================================================
def predict(text):
    """Predict the sentiment class id for a single text.

    Args:
        text: One input string.

    Returns:
        int class id: the index of the largest logit.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LEN,  # same limit the training data is tokenized with
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    ).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        # Reduce over the class axis explicitly. A bare argmax flattens the
        # logits and is only correct for a batch of one; with dim=1 a batch of
        # several texts makes .item() raise instead of returning a wrong id.
        pred = torch.argmax(logits, dim=1).item()
    return pred

contoh = "Layanan ini sering mengalami kendala dalam pemakaiannya."
print("\nPrediksi contoh:", predict(contoh))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Epoch 1/5 =====


Training: 100%|██████████| 31/31 [13:35<00:00, 26.30s/it]
Evaluating: 100%|██████████| 8/8 [00:58<00:00,  7.36s/it]



Train Loss: 1.0616
Val Acc   : 0.6230
Val Prec  : 0.7281
Val Rec   : 0.5853
Val F1    : 0.5797

===== Epoch 2/5 =====


Training: 100%|██████████| 31/31 [13:26<00:00, 26.02s/it]
Evaluating: 100%|██████████| 8/8 [00:57<00:00,  7.18s/it]



Train Loss: 0.8663
Val Acc   : 0.7049
Val Prec  : 0.7628
Val Rec   : 0.6865
Val F1    : 0.6915

===== Epoch 3/5 =====


Training: 100%|██████████| 31/31 [13:46<00:00, 26.66s/it]
Evaluating: 100%|██████████| 8/8 [00:57<00:00,  7.20s/it]



Train Loss: 0.7696
Val Acc   : 0.6230
Val Prec  : 0.6454
Val Rec   : 0.6458
Val F1    : 0.6301

===== Epoch 4/5 =====


Training: 100%|██████████| 31/31 [13:29<00:00, 26.10s/it]
Evaluating: 100%|██████████| 8/8 [00:57<00:00,  7.20s/it]



Train Loss: 0.6904
Val Acc   : 0.7377
Val Prec  : 0.8073
Val Rec   : 0.7331
Val F1    : 0.7225

===== Epoch 5/5 =====


Training: 100%|██████████| 31/31 [13:32<00:00, 26.20s/it]
Evaluating: 100%|██████████| 8/8 [00:58<00:00,  7.35s/it]



Train Loss: 0.6404
Val Acc   : 0.7213
Val Prec  : 0.8058
Val Rec   : 0.7054
Val F1    : 0.7025

Prediksi contoh: 1


In [None]:
# =========================================================
# 9. Sentiment Prediction (multiple examples)
# =========================================================

# Class id -> label name (optional, only for readable output)
LABEL_MAP = {
    0: "negatif",
    1: "netral",
    2: "positif"
}

def predict_sentiment(texts):
    """
    Predict a sentiment class id for every input text.

    texts : list of string; a single string is treated as a one-item list
    return: list of int (0=negatif, 1=netral, 2=positif)
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    model.eval()
    predictions = []

    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(
                text,
                max_length=MAX_LEN,  # same limit the training data is tokenized with
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(device)

            outputs = model(**encoded)
            pred = torch.argmax(outputs.logits, dim=1).item()
            predictions.append(pred)

    return predictions


# =========================================================
# Usage example
# =========================================================

# Three short sentences to classify.
contoh_teks = [
    "Layanan ini sering mengalami kendala dalam pemakaiannya.",
    "Pelayanannya cukup baik dan petugasnya ramah",
    "Aplikasi ini biasa saja, tidak terlalu membantu."
]

hasil_prediksi = predict_sentiment(contoh_teks)

# Print each text next to its predicted class id and label name.
for teks, label in zip(contoh_teks, hasil_prediksi):
    print(f"Teks     : {teks}")
    print(f"Prediksi : {label} ({LABEL_MAP[label]})")
    print("-" * 50)


Teks     : Layanan ini sering mengalami kendala dalam pemakaiannya.
Prediksi : 1 (netral)
--------------------------------------------------
Teks     : Pelayanannya cukup baik dan petugasnya ramah
Prediksi : 1 (netral)
--------------------------------------------------
Teks     : Aplikasi ini biasa saja, tidak terlalu membantu.
Prediksi : 2 (positif)
--------------------------------------------------


In [None]:
# =========================================================
# 9. Sentiment Prediction (multiple examples)
# =========================================================

# Class id -> label name (optional, only for readable output)
LABEL_MAP = {
    0: "negatif",
    1: "netral",
    2: "positif"
}

def predict_sentiment(texts):
    """
    Predict a sentiment class id for every input text.

    texts : list of string; a single string is treated as a one-item list
    return: list of int (0=negatif, 1=netral, 2=positif)
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    model.eval()
    predictions = []

    with torch.no_grad():
        for text in texts:
            encoded = tokenizer(
                text,
                max_length=MAX_LEN,  # same limit the training data is tokenized with
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            ).to(device)

            outputs = model(**encoded)
            pred = torch.argmax(outputs.logits, dim=1).item()
            predictions.append(pred)

    return predictions


# =========================================================
# Usage example
# =========================================================

# Hand-written probe sentences, grouped by the sentiment the author intended.
contoh_teks = [
    # Negative
    "Layanan ini sering mengalami gangguan dan membuat pengguna merasa dirugikan.",
    "Aplikasi kerap bermasalah dan sulit digunakan pada jam sibuk.",

    # Neutral
    "Layanan ini digunakan oleh masyarakat untuk mendukung aktivitas transportasi harian.",
    "Aplikasi tersebut mulai diterapkan di beberapa wilayah tertentu.",

    # Positive
    "Pelayanan yang diberikan sangat membantu dan memudahkan pengguna.",
    "Petugas bekerja dengan profesional dan memberikan respons yang cepat.",

    # Mixed sentiment (prone to misclassification)
    "Layanan ini cukup membantu, meskipun masih terdapat beberapa kendala teknis.",
    "Aplikasinya mudah digunakan, tetapi performanya belum konsisten.",

    # Ambiguous / implicit
    "Pengguna berharap kualitas layanan dapat terus ditingkatkan ke depannya.",
    "Beberapa fitur masih dalam tahap pengembangan."
]


hasil_prediksi = predict_sentiment(contoh_teks)

# Print each text next to its predicted class id and label name.
for teks, label in zip(contoh_teks, hasil_prediksi):
    print(f"Teks     : {teks}")
    print(f"Prediksi : {label} ({LABEL_MAP[label]})")
    print("-" * 50)


Teks     : Layanan ini sering mengalami gangguan dan membuat pengguna merasa dirugikan.
Prediksi : 0 (negatif)
--------------------------------------------------
Teks     : Aplikasi kerap bermasalah dan sulit digunakan pada jam sibuk.
Prediksi : 2 (positif)
--------------------------------------------------
Teks     : Layanan ini digunakan oleh masyarakat untuk mendukung aktivitas transportasi harian.
Prediksi : 2 (positif)
--------------------------------------------------
Teks     : Aplikasi tersebut mulai diterapkan di beberapa wilayah tertentu.
Prediksi : 2 (positif)
--------------------------------------------------
Teks     : Pelayanan yang diberikan sangat membantu dan memudahkan pengguna.
Prediksi : 2 (positif)
--------------------------------------------------
Teks     : Petugas bekerja dengan profesional dan memberikan respons yang cepat.
Prediksi : 0 (negatif)
--------------------------------------------------
Teks     : Layanan ini cukup membantu, meskipun masih terdapat be