## 1. Setup Environment

In [None]:
#@title Twitter Auth Token
# Credentials must never be stored in the notebook source. The token is read
# from the TWITTER_AUTH_TOKEN environment variable when set; otherwise it is
# requested interactively (getpass hides the input, so it is not echoed into
# the cell output either).
# NOTE: any token that was ever committed to this notebook should be treated
# as leaked and revoked.
import os
from getpass import getpass

twitter_auth_token = os.environ.get('TWITTER_AUTH_TOKEN') or getpass('Twitter auth_token: ')

In [None]:
# Enable Colab's custom widget manager.
# NOTE(review): the previous comment said this keeps Colab from disconnecting;
# nothing in this call indicates that - confirm whether the cell is still needed.
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
# Install the required Python package
!pip install pandas

# Install Node.js (tweet-harvest is built with Node.js)
!sudo apt-get update
!sudo apt-get install -y ca-certificates curl gnupg
!sudo mkdir -p /etc/apt/keyrings
# Download the NodeSource signing key and store it de-armored as an apt keyring
!curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg
# Register the NodeSource apt repository for Node.js major version 20
!NODE_MAJOR=20 && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list
!sudo apt-get update
!sudo apt-get install nodejs -y
# Print the installed Node.js version as a sanity check
!node -v

In [None]:
# Install the Playwright Python package, the OS-level dependencies of its
# browsers, and the Chromium browser itself.
# NOTE(review): presumably needed by tweet-harvest for browser automation - confirm.
!pip install playwright
!playwright install-deps
!playwright install chromium

## 2. Scraping Data Twitter

In [None]:
filename = 'penggundulanhutan.csv'
search_keyword = '"Deforestasi", "Hutan Gundul", " since:2025-11-20 until:2025-12-20 lang:id'
limit = 1000

!npx -y tweet-harvest@2.6.1 -o "{filename}" -s "{search_keyword}" --tab "LATEST" -l {limit} --token {twitter_auth_token}

In [None]:
# Twitter scraping - path fix.
# Runs tweet-harvest again, this time unpinned (@latest) and with the plain
# query "deforestasi lang:id" (no date range), writing to the same output file
# name as the previous cell.
# NOTE(review): the dataset description printed in section 3 states a
# 20 Nov - 20 Dec 2025 period, which this query does not enforce - confirm
# which of the two scraping cells produced the data that is analysed.
filename = 'penggundulanhutan.csv'
!npx --yes tweet-harvest@latest \
    -o "penggundulanhutan.csv" \
    -s "deforestasi lang:id" \
    --tab "LATEST" \
    -l 1000 \
    --token "{twitter_auth_token}"

In [None]:
import pandas as pd

# Read the harvested tweets back from the tweets-data/ output folder
file_path = "tweets-data/" + filename
df = pd.read_csv(file_path, sep=",")
tweet_count = len(df)
print(f"Jumlah tweet: {tweet_count}")
display(df)

## 3. Preprocessing Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

# Load the dataset from the Colab working directory
file_path = '/content/tweets-data/penggundulanhutan.csv'
df = pd.read_csv(file_path)

print(f"Data berhasil dimuat: {len(df)} baris.")
# Only the last expression of a cell is rendered, so a bare `df.head()` in the
# middle of the cell was evaluated and thrown away; display it explicitly.
display(df.head())

# Dataset description
separator = "=" * 50
print(separator)
print("DESKRIPSI DATASET")
print(separator)
print("Sumber Data      : Twitter (X)")
print("Topik            : Deforestasi & Pengundulan Hutan")
print("Periode          : 20 Nov - 20 Des 2025")
print(f"Ukuran Dataset   : {len(df)} tweets")
print(f"Jumlah Kolom     : {len(df.columns)}")
print(f"Kolom yang ada   : {list(df.columns)}")
print(separator)

In [None]:
# Fungsi preprocessing
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; replace
    every remaining non-letter character with a space; collapse whitespace.

    Args:
        text: Raw tweet text. Missing values (``None`` / ``NaN``) are accepted.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # "http\S+" already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)  # mentions
    text = re.sub(r'#\w+', ' ', text)  # hashtags (the tag word itself is dropped)
    # Substitute a space, not the empty string: words separated only by
    # punctuation ("banjir,longsor", "sia-sia") must not fuse into one token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

print(f"Data awal: {len(df)} tweets")

# Apply the preprocessing to every tweet
df['cleaned_text'] = df['full_text'].apply(clean_text)
print("Pembersihan teks selesai.")

# Keep only tweets with at least MIN_WORDS words after cleaning.
# `.copy()` makes the filtered frame independent of the original, so columns
# added to it later do not trigger pandas' SettingWithCopyWarning.
MIN_WORDS = 5
has_enough_words = df['cleaned_text'].apply(lambda cleaned: len(cleaned.split()) >= MIN_WORDS)
df = df[has_enough_words].copy()
print(f"Data setelah filter (min {MIN_WORDS} kata): {len(df)} tweets")

# Indonesian stopwords (used by the word clouds)
indo_stopwords = {
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'dengan', 'untuk',
    'pada', 'adalah', 'dalam', 'tidak', 'akan', 'atau', 'juga', 'ada',
    'bisa', 'lebih', 'sudah', 'saja', 'karena', 'seperti', 'oleh', 'mereka',
    'kita', 'saya', 'anda', 'dia', 'kalau', 'jadi', 'harus', 'lagi', 'pun',
}

## 4. Labeling Sentimen (2 Kelas: Positif/Negatif)

**REVISI:** Labeling langsung 2 kelas tanpa kelas Netral dan tanpa confidence score

In [None]:
# LABELING 2 KELAS (TANPA NETRAL, TANPA CONFIDENCE SCORE)

import re

# Patterns signalling a POSITIVE stance: solutions, successes, support.
# Compiled once at definition time instead of on every call.
_POSITIVE_PATTERNS = tuple(re.compile(p) for p in (
    # Deforestation successfully lowered / suppressed
    r'deforestasi.{0,30}(turun|berkurang|ditekan|menurun|rendah|nol|berhenti)',
    r'(turun|berkurang|ditekan|menurun).{0,30}deforestasi',
    r'(nol|zero|tanpa|stop|anti).{0,10}deforestasi',
    r'(berhasil|sukses|mampu|bisa).{0,30}(tekan|kurangi|hentikan|cegah).{0,20}deforestasi',

    # Reforestation and restoration
    r'(reboisasi|restorasi|penghijauan|penanaman pohon)',
    r'(tanam|menanam).{0,20}(pohon|bibit|hutan)',
    r'(pohon|hutan).{0,20}(ditanam|bertambah|pulih)',
    r'(juta|ribu|ribuan).{0,10}(pohon|bibit).{0,10}(ditanam|tanam)',

    # Conservation and protection
    r'(konservasi|pelestarian|perlindungan).{0,20}hutan',
    r'(melindungi|menjaga|melestarikan|merawat).{0,20}hutan',
    r'(lindungi|jaga|selamatkan|lestarikan).{0,20}hutan',
    r'hutan.{0,20}(dilindungi|dijaga|dilestarikan|terjaga)',

    # Positive environmental outcomes
    r'(emisi|karbon).{0,20}(turun|berkurang|menurun)',
    r'(hijau|lestari|asri).{0,20}(kembali|lagi|tetap)',
    r'hutan.{0,20}(pulih|membaik|hijau kembali)',
    r'(net gain|forest gain|penambahan hutan)',

    # Positive policy
    r'(moratorium|larangan).{0,20}(deforestasi|penebangan|pembukaan lahan)',
    r'(kebijakan|program|komitmen).{0,20}(anti deforestasi|pro lingkungan|hijau)',

    # Expressions of support for the environment
    r'(setuju|dukung|mendukung).{0,30}(lindungi|jaga|lestarikan|konservasi).{0,20}hutan',
    r'(prioritas|utama|penting).{0,20}(melindungi|menjaga|konservasi).{0,20}hutan',

    # Sustainability
    r'(sawit|pertanian|industri).{0,20}(berkelanjutan|sustainable|ramah lingkungan)',
    r'(tanpa|bebas).{0,10}deforestasi',

    # Appreciation and achievements
    r'(bukti|contoh|teladan).{0,20}(nyata|baik|positif)',
    r'(apresiasi|bangga|senang).{0,30}(lingkungan|hutan|konservasi)',
))

# Patterns signalling a NEGATIVE stance: problems, damage, criticism.
_NEGATIVE_PATTERNS = tuple(re.compile(p) for p in (
    # Ongoing forest damage
    r'hutan.{0,20}(rusak|hancur|hilang|habis|gundul|musnah|terbakar)',
    r'(rusak|hancur|hilang|habis|gundul).{0,20}hutan',
    r'(kerusakan|kehancuran|kehilangan).{0,20}hutan',
    r'deforestasi.{0,20}(parah|masif|besar|meningkat|meluas)',
    r'(laju|tingkat|angka).{0,10}deforestasi.{0,10}(tinggi|naik|meningkat)',

    # Illegal logging
    r'(ilegal|liar|illegal).{0,20}(logging|penebangan|pembalakan)',
    r'(pembalakan|penebangan|pembabatan).{0,20}(liar|ilegal|masif)',
    r'(mafia|kartel|sindikat).{0,20}(hutan|kayu|logging)',

    # Disasters attributed to deforestation
    r'(banjir|longsor|kekeringan|bencana).{0,30}(akibat|karena|dampak|efek).{0,20}(deforestasi|penebangan|gundul)',
    r'(deforestasi|penebangan|hutan hilang).{0,30}(sebab|penyebab|akibatkan).{0,20}(banjir|longsor|bencana)',
    r'(korban|mengungsi|evakuasi|meninggal).{0,30}(banjir|longsor|bencana)',

    # Forest fires
    r'(kebakaran|terbakar|api).{0,20}(hutan|lahan)',
    r'(hutan|lahan).{0,20}(terbakar|kebakaran|dilahap api)',
    r'(asap|kabut asap|polusi).{0,20}(kebakaran|hutan)',

    # Exploitation and destruction
    r'(eksploitasi|mengeruk|menguras|membabat).{0,20}(hutan|alam|sumber daya)',
    r'(sawit|tambang|perkebunan).{0,30}(rusak|hancur|habiskan).{0,20}hutan',
    r'hutan.{0,20}(dikorbankan|dihabiskan|dibabat).{0,20}(sawit|tambang|bisnis)',

    # Criticism of policy / government
    r'(pemerintah|rezim|penguasa).{0,30}(gagal|bobrok|korup).{0,20}(lingkungan|hutan)',
    r'(izin|konsesi|hgu).{0,30}(bermasalah|ilegal|kontroversial)',
    r'(korupsi|suap|kolusi).{0,30}(hutan|lingkungan|kehutanan)',

    # Negative impact
    r'(habitat|ekosistem|satwa).{0,20}(hilang|rusak|hancur|terancam)',
    r'(spesies|flora|fauna).{0,20}(punah|terancam|hilang)',
    r'(krisis|darurat|ancaman).{0,20}(lingkungan|iklim|ekologi)',

    # Expressions of anger / criticism
    r'(miris|tragis|memprihatinkan|mengerikan|parah).{0,30}(hutan|deforestasi|lingkungan)',
    r'(stop|hentikan|tolak).{0,20}(deforestasi|penebangan|perusakan)',
    r'(jangan|dilarang).{0,20}(rusak|tebang|bakar).{0,20}hutan',

    # Failures and problems.
    # "sia-sia" is matched with an optional hyphen/space: the labeller is fed
    # cleaned text in which hyphens no longer exist, so the literal "sia-sia"
    # could never match.
    r'(gagal|tidak berhasil|sia[\s-]?sia).{0,30}(lindungi|jaga|konservasi)',
    r'(masalah|problem|isu).{0,20}(deforestasi|lingkungan|hutan)',
))

# Plain keywords, consulted only when no pattern at all matched.
_FALLBACK_NEGATIVE = ('rusak', 'hancur', 'banjir', 'longsor', 'ilegal', 'gagal', 'parah',
                      'korban', 'bencana', 'terbakar', 'gundul', 'habis', 'musnah')
_FALLBACK_POSITIVE = ('reboisasi', 'konservasi', 'lindungi', 'tanam pohon', 'lestari',
                      'berkelanjutan', 'berhasil', 'pulih', 'hijau')


def _count_pattern_matches(patterns, text):
    """Return the total number of non-overlapping matches of `patterns` in `text`."""
    return sum(len(pattern.findall(text)) for pattern in patterns)


def label_sentiment(text):
    """Label a tweet 'Positif' or 'Negatif' by regex pattern matching.

    There is no neutral class and no confidence score. Every pattern match
    adds 1 to its side. If no pattern matched on either side, each fallback
    keyword contained in the text (substring test) adds 0.5 to its side.

    NOTE(review): "stop ... deforestasi" is matched by both a positive and a
    negative pattern, so such tweets tie and are therefore labelled 'Negatif'.

    Args:
        text: Tweet text; it is converted with ``str()`` and lowercased.

    Returns:
        'Positif' when the positive score is strictly greater than the
        negative score; 'Negatif' otherwise (ties and tweets without any
        signal included).
    """
    text = str(text).lower()

    pos_score = _count_pattern_matches(_POSITIVE_PATTERNS, text)
    neg_score = _count_pattern_matches(_NEGATIVE_PATTERNS, text)

    # Keyword fallback, only when no pattern matched at all
    if pos_score == 0 and neg_score == 0:
        neg_score += sum(0.5 for word in _FALLBACK_NEGATIVE if word in text)
        pos_score += sum(0.5 for word in _FALLBACK_POSITIVE if word in text)

    return 'Positif' if pos_score > neg_score else 'Negatif'


# Apply the rule-based labeller to every cleaned tweet
df['label'] = df['cleaned_text'].apply(label_sentiment)

# Report the resulting label distribution (counts, then percentages)
banner = "=" * 60
print(banner)
print("HASIL LABELING SENTIMEN (2 KELAS)")
print(banner)

label_share = df['label'].value_counts(normalize=True)
print("\n DISTRIBUSI LABEL:")
print(df['label'].value_counts())
print("\nPersentase:")
print(label_share.mul(100).round(1))

In [None]:
# Sample of the labelling result: first five tweets of each class
print("\n" + "=" * 60)
print("SAMPLE HASIL LABELING")
print("=" * 60)

# The bullet is written as a unicode escape so the source stays pure ASCII;
# the literal character had been corrupted into mojibake.
for sentiment in ('Positif', 'Negatif'):
    print(f"\n SAMPLE {sentiment.upper()}:")
    for snippet in df.loc[df['label'] == sentiment, 'cleaned_text'].head(5):
        print(f"\u2022 {snippet[:100]}...")
        print()

In [None]:
# Persist the labelled dataset (index column omitted)
labeled_csv_path = 'tweets_labeled.csv'
df.to_csv(labeled_csv_path, index=False)
n_saved = len(df)
print(f"Dataset tersimpan: {n_saved} tweets dengan label Positif/Negatif")

## 5. Visualisasi Distribusi Label

In [None]:
import matplotlib.pyplot as plt

label_counts = df['label'].value_counts()
n_labels = len(label_counts)

# One fixed colour per label, looked up in the order value_counts() returned them
colors_dict = {'Negatif': '#e74c3c', 'Positif': '#2ecc71'}
color_list = [colors_dict[name] for name in label_counts.index]
explode_tuple = (0.05,) * n_labels

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
pie_ax, bar_ax = axes

# Left panel: share of each label
pie_ax.pie(
    label_counts,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=color_list,
    explode=explode_tuple,
)
pie_ax.set_title('Distribusi Sentiment\nTweet Deforestasi (2 Kelas)', fontweight='bold', fontsize=12)

# Right panel: absolute counts, with the count written above each bar
bars = bar_ax.bar(label_counts.index, label_counts.values, color=color_list)
bar_ax.set_title('Jumlah Tweet per Label', fontweight='bold', fontsize=12)
bar_ax.set_xlabel('Label')
bar_ax.set_ylabel('Jumlah')

for bar in bars:
    count = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2.
    bar_ax.text(x_center, count, f'{int(count)}',
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Spot-check: the first ten cleaned tweets of each label, truncated to 100 chars
print("=== SAMPLE TWEET YANG DILABEL POSITIF ===\n")
samples_pos = df.loc[df['label'] == 'Positif', 'cleaned_text'].head(10).tolist()
for rank, tweet in enumerate(samples_pos, start=1):
    print(f"{rank}. {tweet[:100]}...")

print("\n=== SAMPLE TWEET YANG DILABEL NEGATIF ===\n")
samples_neg = df.loc[df['label'] == 'Negatif', 'cleaned_text'].head(10).tolist()
for rank, tweet in enumerate(samples_neg, start=1):
    print(f"{rank}. {tweet[:100]}...")

In [None]:
# WordCloud per sentiment
from wordcloud import WordCloud

# One word cloud per label, built from all cleaned tweets carrying that label
for sentiment in df['label'].unique():
    corpus = ' '.join(df.loc[df['label'] == sentiment, 'cleaned_text'])
    cloud = WordCloud(stopwords=indo_stopwords, width=800, height=400)
    cloud.generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(f'WordCloud - {sentiment}')
    plt.axis('off')
    plt.show()

## 6. Split Data Train/Validation/Test

In [None]:
from sklearn.model_selection import train_test_split

# 70 % train, then the remaining 30 % halved into validation and test.
# Both splits are stratified on the label and use the same random seed.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"Train: {n_train}, Validation: {n_val}, Test: {n_test}")

In [None]:
# Class-imbalance analysis of the three splits

print("Distribusi Label pada Data Training:")
print(train_df['label'].value_counts())
print("\nPersentase:")
print((train_df['label'].value_counts(normalize=True) * 100).round(1))

# Visualise the distribution
import matplotlib.pyplot as plt

# Colours are looked up per label. A positional colour list follows the
# value_counts() order (most frequent first), which painted whichever class
# was the majority green - even when that class was 'Negatif'.
split_palette = {'Negatif': '#e74c3c', 'Positif': '#2ecc71'}

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, data) in zip(axes, [('Train', train_df), ('Validation', val_df), ('Test', test_df)]):
    counts = data['label'].value_counts()
    counts.plot(kind='bar', ax=ax, color=[split_palette[lbl] for lbl in counts.index])
    ax.set_title(f'Distribusi {name} Set')
    ax.set_xlabel('Label')
    ax.set_ylabel('Jumlah')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 7. Setup Model IndoBERT



In [None]:
!pip install transformers datasets accelerate evaluate scikit-learn

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import numpy as np

# Revision: IndoBERT is used for hyperparameter tuning (same model as in the
# model comparison)
model_name = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label <-> integer id mapping (2 classes); id2label is the inverse of label2id
label2id = {'Negatif': 0, 'Positif': 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# HANDLING IMBALANCE DENGAN CLASS WEIGHTS
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch import nn
import numpy as np

train_labels = train_df['label'].map(label2id).tolist()
existing_classes = np.unique(train_labels)

# 'balanced' weights, computed only for the classes present in the training data
weights = compute_class_weight(
    class_weight='balanced',
    classes=existing_classes,
    y=train_labels,
)

# Classes absent from the training data keep a weight of 0
class_weights = torch.zeros(2, dtype=torch.float)
for cls_idx, cls_weight in zip(existing_classes, weights):
    class_weights[cls_idx] = float(cls_weight)

print("--- Hasil Perhitungan Class Weights ---")
for class_id, class_name in id2label.items():
    status = "KOSONG" if class_id not in existing_classes else "Tersedia"
    print(f"  {class_name} ({class_id}): {class_weights[class_id]:.3f} - {status}")

# CUSTOM TRAINER
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    Args:
        class_weights: Tensor handed to ``nn.CrossEntropyLoss`` as ``weight``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy between the model logits and labels.

        ``labels`` is popped from ``inputs`` before the forward pass, so the
        model is called without it. Extra keyword arguments are accepted and
        ignored.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        # The weights are moved to wherever the logits live
        criterion = nn.CrossEntropyLoss(weight=self.class_weights.to(outputs.logits.device))
        loss = criterion(outputs.logits, targets)

        if return_outputs:
            return loss, outputs
        return loss

In [None]:
def tokenize_function(examples):
    """Tokenise the ``text`` field of a batch, padded/truncated to 128 tokens."""
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)


def _build_split_dataset(split_df):
    """Turn one pandas split into a label-encoded, tokenised HuggingFace Dataset."""
    frame = (
        split_df[['cleaned_text', 'label']]
        .rename(columns={'cleaned_text': 'text'})
        # Same as the later cells: drop the pandas index so it is not carried
        # into the Dataset as an extra column.
        .reset_index(drop=True)
    )
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.map(lambda x: {'label': label2id[x['label']]})  # encode labels
    return dataset.map(tokenize_function, batched=True)  # tokenise


# The same pipeline for each split (previously three copy-pasted sequences)
train_dataset = _build_split_dataset(train_df)
val_dataset = _build_split_dataset(val_df)
test_dataset = _build_split_dataset(test_df)

## 8. Hyperparameter Tuning dengan IndoBERT



In [None]:
import torch
import gc
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# Evaluation metric: accuracy
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class for a ``(logits, labels)`` pair."""
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

# Hyperparameters to try
learning_rates = [1e-5, 2e-5, 5e-5]
batch_sizes = [8, 16]
epochs_list = [3, 5]

# Revision: the model used for tuning is IndoBERT (same as in the model comparison)
hp_model_name = "indobenchmark/indobert-base-p1"

experiment_results = []
experiment_id = 1

rule = "=" * 60
print(rule)
print(f"HYPERPARAMETER TUNING DENGAN MODEL: {hp_model_name}")
print(rule)

# Full grid, in the order: learning rate (outermost), batch size, epochs
search_grid = [
    (grid_lr, grid_batch, grid_epochs)
    for grid_lr in learning_rates
    for grid_batch in batch_sizes
    for grid_epochs in epochs_list
]

for lr, batch, epochs in search_grid:

    print(f"\n{rule}")
    print(f"Eksperimen {experiment_id}: LR={lr}, Batch={batch}, Epochs={epochs}")
    print(f"{rule}")

    # Important: reload the pretrained model for every experiment so that no
    # run starts from weights fine-tuned by a previous one
    model = AutoModelForSequenceClassification.from_pretrained(
        hp_model_name,
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    )

    training_args = TrainingArguments(
        output_dir=f"./results/exp_{experiment_id}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        save_total_limit=1,
    )

    trainer = WeightedTrainer(
        class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train, then evaluate on the validation set
    train_result = trainer.train()
    eval_result = trainer.evaluate()

    # Record the outcome of this configuration
    experiment_results.append({
        'experiment_id': experiment_id,
        'learning_rate': lr,
        'batch_size': batch,
        'epochs': epochs,
        'train_loss': train_result.training_loss,
        'val_accuracy': eval_result['eval_accuracy'],
        'val_loss': eval_result['eval_loss'],
    })

    print(f" Val Accuracy: {eval_result['eval_accuracy']:.4f}")

    # Free memory before the next experiment
    del model, trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    experiment_id += 1

print("\n" + rule)
print("SEMUA EKSPERIMEN SELESAI!")
print(rule)

In [None]:
# Hasil semua eksperimen
import pandas as pd

results_df = pd.DataFrame(experiment_results)
results_df = results_df.sort_values('val_accuracy', ascending=False)

print("=== HASIL HYPERPARAMETER TUNING ===\n")
display(results_df)

# Best configuration = first row after sorting by validation accuracy.
# Selecting a single row mixes int and float columns into one float Series,
# so the integer hyperparameters are cast back before printing (they were
# shown as e.g. "16.0"). The trophy is written as an escape because the
# literal emoji had been corrupted into mojibake.
best = results_df.iloc[0]
print("\n\U0001F3C6 BEST CONFIGURATION:")
print(f"   Learning Rate: {best['learning_rate']}")
print(f"   Batch Size: {int(best['batch_size'])}")
print(f"   Epochs: {int(best['epochs'])}")
print(f"   Val Accuracy: {best['val_accuracy']:.4f}")

import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle('Hasil Hyperparameter Tuning (IndoBERT)', fontsize=14, fontweight='bold')
ax1, ax2, ax3 = axes

# Panel 1: mean validation accuracy per learning rate, one line per batch size
for batch in batch_sizes:
    per_lr = (
        results_df.loc[results_df['batch_size'] == batch]
        .groupby('learning_rate')['val_accuracy']
        .mean()
    )
    ax1.plot(per_lr.index, per_lr.values, marker='o', label=f'Batch {batch}')
ax1.set_xlabel('Learning Rate')
ax1.set_ylabel('Val Accuracy')
ax1.set_xscale('log')
ax1.legend()
ax1.set_title('Accuracy vs Learning Rate')
ax1.grid(True, alpha=0.3)

# Panel 2: mean validation accuracy per batch size
batch_acc = results_df.groupby('batch_size')['val_accuracy'].mean()
ax2.bar(batch_acc.index.astype(str), batch_acc.values, color=['#2E86AB', '#A23B72'])
ax2.set_xlabel('Batch Size')
ax2.set_ylabel('Val Accuracy')
ax2.set_title('Accuracy vs Batch Size')

# Panel 3: mean validation accuracy per number of epochs
epoch_acc = results_df.groupby('epochs')['val_accuracy'].mean()
ax3.bar(epoch_acc.index.astype(str), epoch_acc.values, color=['#06A77D', '#F18F01'])
ax3.set_xlabel('Epochs')
ax3.set_ylabel('Val Accuracy')
ax3.set_title('Accuracy vs Epochs')

plt.tight_layout()
plt.savefig('hyperparameter_results.png', dpi=300, bbox_inches='tight')
plt.show()

# Generate a markdown table of the tuning results
print("\n### Tabel Hasil Hyperparameter Tuning\n")
print("| No | Learning Rate | Batch Size | Epochs | Val Accuracy | Val Loss |")
print("|:--:|:-------------:|:----------:|:------:|:------------:|:--------:|")
for _, row in results_df.iterrows():
    # iterrows() yields each row as one float Series, so every integer column
    # has to be cast back - experiment_id was printed as e.g. "3.0".
    print(f"| {int(row['experiment_id'])} | {row['learning_rate']} | {int(row['batch_size'])} | "
          f"{int(row['epochs'])} | {row['val_accuracy']:.4f} | {row['val_loss']:.4f} |")

## 9. Perbandingan Model

Menggunakan hyperparameter terbaik dari hasil tuning

In [None]:
# EKSPERIMEN: PERBANDINGAN MODEL

import torch
import gc
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from datasets import Dataset
import pandas as pd

# Take the best hyperparameters from the tuning experiment (top row of results_df)
best = results_df.iloc[0]
BEST_LR = best['learning_rate']
BEST_BATCH = int(best['batch_size'])
BEST_EPOCHS = int(best['epochs'])

rule = "=" * 60
print(rule)
print("EKSPERIMEN PERBANDINGAN MODEL")
print(f"Menggunakan: LR={BEST_LR}, Batch={BEST_BATCH}, Epochs={BEST_EPOCHS}")
print(rule)

# Models to compare: display name -> Hugging Face model id
MODELS = [
    {'name': display_name, 'model_id': hub_id}
    for display_name, hub_id in (
        ('IndoLEM-IndoBERT', 'indolem/indobert-base-uncased'),
        ('IndoBenchmark-IndoBERT', 'indobenchmark/indobert-base-p1'),
        ('mBERT-Multilingual', 'bert-base-multilingual-uncased'),
    )
]

# Collected results, one dict per model
model_results = []

for model_info in MODELS:
    print(f"\n{'='*60}")
    print(f"Training: {model_info['name']}")
    print(f"{'='*60}")

    # Clean up memory before loading a new model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Each model needs its own tokenizer. It is kept under a loop-local name:
    # the loop used to overwrite and then `del` the notebook-level `tokenizer`
    # (and redefine `tokenize_function`), which broke re-running the earlier
    # tokenisation cell afterwards.
    cmp_tokenizer = AutoTokenizer.from_pretrained(model_info['model_id'])

    def _tokenize_batch(examples):
        """Tokenise a batch with the tokenizer of the model currently compared."""
        return cmp_tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=128
        )

    train_ds = Dataset.from_pandas(
        train_df[['cleaned_text', 'label']].rename(columns={'cleaned_text': 'text'}).reset_index(drop=True)
    )
    val_ds = Dataset.from_pandas(
        val_df[['cleaned_text', 'label']].rename(columns={'cleaned_text': 'text'}).reset_index(drop=True)
    )

    # Encode labels as integer ids
    train_ds = train_ds.map(lambda x: {'label': label2id[x['label']]})
    val_ds = val_ds.map(lambda x: {'label': label2id[x['label']]})

    train_ds = train_ds.map(_tokenize_batch, batched=True, remove_columns=['text'])
    val_ds = val_ds.map(_tokenize_batch, batched=True, remove_columns=['text'])

    train_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    val_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    model = AutoModelForSequenceClassification.from_pretrained(
        model_info['model_id'],
        num_labels=2,
        id2label=id2label,
        label2id=label2id
    )

    # Training arguments: the best hyperparameters found during tuning
    training_args = TrainingArguments(
        output_dir=f"./results/{model_info['name'].replace(' ', '_')}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=BEST_LR,
        per_device_train_batch_size=BEST_BATCH,
        per_device_eval_batch_size=BEST_BATCH,
        num_train_epochs=BEST_EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="none",
        save_total_limit=1,
    )

    # Trainer on the already tokenised datasets
    trainer = WeightedTrainer(
        class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
    )

    # Train, then evaluate on the validation set
    train_result = trainer.train()
    eval_result = trainer.evaluate()

    # Record the outcome for this model
    model_results.append({
        'model_name': model_info['name'],
        'model_id': model_info['model_id'],
        'val_accuracy': eval_result['eval_accuracy'],
        'val_loss': eval_result['eval_loss'],
        'train_loss': train_result.training_loss,
    })

    print(f" {model_info['name']} - Val Accuracy: {eval_result['eval_accuracy']:.4f}")

    # Clean up once this model is done
    del model, trainer, cmp_tokenizer, train_ds, val_ds
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n" + "="*60)
print("PERBANDINGAN MODEL SELESAI!")
print("="*60)

model_results_df = pd.DataFrame(model_results).sort_values('val_accuracy', ascending=False)
display(model_results_df)

In [None]:
# Visualisasi perbandingan model
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
colors = ['#2E86AB', '#A23B72', '#F18F01']

# Both panels are the same annotated bar chart over a different column
panels = [
    (axes[0], 'val_accuracy', 'Validation Accuracy', 'Perbandingan Accuracy antar Model'),
    (axes[1], 'val_loss', 'Validation Loss', 'Perbandingan Loss antar Model'),
]

for ax, column, y_label, title in panels:
    bars = ax.bar(model_results_df['model_name'], model_results_df[column], color=colors)
    ax.set_xlabel('Model')
    ax.set_ylabel(y_label)
    ax.set_title(title, fontweight='bold')
    if column == 'val_accuracy':
        ax.set_ylim(0.5, 1.0)  # only the accuracy panel uses a fixed range
    # Write the value just above each bar
    for bar, value in zip(bars, model_results_df[column]):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,
                f'{value:.4f}', ha='center', va='bottom', fontweight='bold')
    ax.tick_params(axis='x', rotation=15)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Best model = first row (the frame is sorted by validation accuracy).
# The trophy is written as an escape because the literal emoji had been
# corrupted into mojibake.
best_model = model_results_df.iloc[0]
print(f"\n\U0001F3C6 MODEL TERBAIK: {best_model['model_name']}")
print(f"   Accuracy: {best_model['val_accuracy']:.4f}")
print(f"   Loss: {best_model['val_loss']:.4f}")

## 10. Training Model Terbaik & Evaluasi Final

In [None]:
# Training model terbaik untuk evaluasi final
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Use the best model from the comparison
best_model_id = model_results_df.iloc[0]['model_id']
best_model_name = model_results_df.iloc[0]['model_name']

print(f"Training final model: {best_model_name}")

# Load the tokenizer of the selected model
final_tokenizer = AutoTokenizer.from_pretrained(best_model_id)


def tokenize_function(examples):
    """Tokenise a batch with the final model's tokenizer (128 tokens, padded)."""
    return final_tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )


def _prepare_final_split(split_df):
    """Build the label-encoded, tokenised torch-format Dataset for one split."""
    dataset = Dataset.from_pandas(
        split_df[['cleaned_text', 'label']].rename(columns={'cleaned_text': 'text'}).reset_index(drop=True)
    )
    dataset = dataset.map(lambda x: {'label': label2id[x['label']]})
    dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


# Prepare datasets
train_ds = _prepare_final_split(train_df)
val_ds = _prepare_final_split(val_df)
test_ds = _prepare_final_split(test_df)

# Load model
final_model = AutoModelForSequenceClassification.from_pretrained(
    best_model_id,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Training
training_args = TrainingArguments(
    output_dir="./results/final_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=BEST_LR,
    per_device_train_batch_size=BEST_BATCH,
    per_device_eval_batch_size=BEST_BATCH,
    num_train_epochs=BEST_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    save_total_limit=1,
)

# With load_best_model_at_end=True the checkpoint that is kept is the one
# scoring best on eval_dataset. That must be the VALIDATION split: selecting
# the checkpoint on the test split leaks test data into model selection and
# inflates the reported test accuracy. test_ds is built here only so that it
# can be passed to predict() for the final evaluation.
final_trainer = WeightedTrainer(
    class_weights=class_weights,
    model=final_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

final_trainer.train()

In [None]:
# Evaluate on the test set
predictions = final_trainer.predict(test_ds)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Class ids and names are taken from id2label and passed explicitly as
# `labels`: without it scikit-learn infers the classes from the data, so a
# class missing from both predictions and references would make
# `target_names` mismatch and shrink the confusion matrix below 2x2.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

# Classification report
print("=" * 60)
print("EVALUASI FINAL PADA TEST SET")
print("=" * 60)
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, labels=class_ids, target_names=class_names))

# Confusion matrix (rows = actual, columns = predicted)
cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model_name}', fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Test accuracy
test_accuracy = (pred_labels == true_labels).mean()
print(f"\n TEST ACCURACY: {test_accuracy:.4f}")

In [None]:
# Save the fine-tuned model and its tokenizer side by side
final_model_dir = './final_sentiment_model'
for artifact in (final_model, final_tokenizer):
    artifact.save_pretrained(final_model_dir)
print(f"Model tersimpan di {final_model_dir}")

## 11. Ringkasan Hasil


In [None]:
# Final summary: dataset sizes, best hyperparameters, best model and its scores
rule = "=" * 60
summary_lines = [
    rule,
    "RINGKASAN HASIL ANALISIS SENTIMEN",
    rule,
    "\n DATASET:",
    f"   - Total data setelah preprocessing: {len(df)} tweets",
    "   - Kelas: 2 (Positif, Negatif)",
    f"   - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}",
    "\n HYPERPARAMETER TERBAIK:",
    f"   - Learning Rate: {BEST_LR}",
    f"   - Batch Size: {BEST_BATCH}",
    f"   - Epochs: {BEST_EPOCHS}",
    f"\n MODEL TERBAIK: {best_model_name}",
    f"   - Validation Accuracy: {model_results_df.iloc[0]['val_accuracy']:.4f}",
    f"   - Test Accuracy: {test_accuracy:.4f}",
]
print("\n".join(summary_lines))
