In [12]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import Counter
import numpy as np

# Unduh stopwords bahasa Indonesia (jika belum)
nltk.download('stopwords')

# Load dataset
df = pd.read_csv('dataset/ikn.csv')

# Filter berdasarkan kata kunci tertentu
keywords = ['ikn', 'nusantara', 'ibu kota', 'ibukota', 'pemindahan', 'perpindahan']
filtered_df = df[df['full_text'].str.contains('|'.join(keywords), case=False, na=False)].copy()
filtered_df = filtered_df[filtered_df['full_text'].str.strip() != '']
filtered_df = filtered_df.drop_duplicates(subset='full_text')

# Inisialisasi stemmer Sastrawi
stemmer = StemmerFactory().create_stemmer()

# Muat kamus kata dasar dari file
with open('sastrawi/kata-dasar.txt', 'r', encoding='utf-8') as f:
    kamus_sastrawi = set(word.strip() for word in f.readlines())

# Kata penting yang harus dipertahankan
kata_penting = {'ikn', 'nusantara', 'ibu', 'kota', 'ibukota', 'pemindahan', 'perpindahan'}

# Stopwords bahasa Indonesia + tambahan slang
stop_words = set(stopwords.words('indonesian'))
tambahan_stop = {
    'gw', 'gue', 'gua', 'lu', 'loe', 'lo', 'elu', 'nya', 'ya', 'aja', 'sih', 'lah', 'deh', 'dong',
    'kok', 'nih', 'tuh', 'lagi', 'kayak', 'gak', 'ga', 'nggak', 'ngga', 'yg', 'yang', 'saya', 'kamu'
}
stop_words.update(tambahan_stop)

# Kata yang berkonotasi positif dan negatif
kata_positif = {'baik', 'bagus', 'maju', 'dukung', 'setuju', 'positif', 'indah', 'hebat'}
kata_negatif = {'tidak', 'buruk', 'tolak', 'negatif', 'korup', 'jelek', 'hancur', 'bencana', 'rusak'}

def preprocess_steps(text):
    # Bersihkan teks dari url, tag, angka
    data_clean = re.sub(r'http\S+|#\w+|@\w+|\d+', '', text)
    lower = data_clean.lower()
    # Hilangkan tanda baca
    no_punct = lower.translate(str.maketrans('', '', string.punctuation))
    # Ganti kata slang
    replaced = re.sub(r'\bgw\b|\bgue\b|\bgua\b', 'saya', no_punct)
    replaced = re.sub(r'\blu\b|\bloe\b|\belo\b|\belu\b', 'kamu', replaced)
    replaced = re.sub(r'\bnggak\b|\bngga\b|\bga\b|\bgak\b', 'tidak', replaced)
    tokens = replaced.split()
    # Stopword removal
    tokens_stop_removed = [w for w in tokens if w not in stop_words]
    joined_for_stem = ' '.join(tokens_stop_removed)
    # Stemming
    stemmed_text = stemmer.stem(joined_for_stem)
    stemmed_tokens = stemmed_text.split()
    # Filter hanya kata dari kamus atau kata penting
    tokens_filtered = [w for w in stemmed_tokens if w in kamus_sastrawi or w in kata_penting]
    tokens_sorted = sorted(tokens_filtered)
    final_cleaned = ' '.join(tokens_sorted)
    # Label sentimen berdasarkan kemunculan kata positif atau negatif
    sentimen = 'positif' if any(k in tokens_sorted for k in kata_positif) else 'negatif'
    return pd.Series([final_cleaned, sentimen])

# Preprocessing seluruh data
processed = filtered_df['full_text'].apply(preprocess_steps)
processed.columns = ['clean_text', 'sentimen']

# Filter data dengan minimal 3 kata
processed['jumlah_kata'] = processed['clean_text'].str.split().str.len()
processed = processed[processed['jumlah_kata'] >= 3].drop(columns='jumlah_kata').reset_index(drop=True)
processed.insert(0, 'id', processed.index + 1)

# Tampilkan jumlah sentimen setelah preprocessing dan penentuan label
print("📌 Jumlah data dan distribusi sentimen setelah preprocessing:")
print(processed['sentimen'].value_counts())

# Simpan hasil preprocessing jika perlu
processed.to_csv('dataset/processed.csv', index=False)

# Split data latih dan uji (train 75% test 25%) dengan stratify agar distribusi label sama
X = processed['clean_text']
y = processed['sentimen']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

# Debug print data train
print(f"Total data latih: {len(X_train)}")
print(f"Distribusi label train: {Counter(y_train)}")

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Latih Multinomial Naive Bayes dengan alpha=1
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train_tfidf, y_train)

# Tampilkan probabilitas prior dari model sklearn
print("\n🔢 Probabilitas Prior (model sklearn):")
for label, prior_log in zip(nb.classes_, nb.class_log_prior_):
    print(f"P({label}) = {np.exp(prior_log):.6f}")

# Prediksi data test dan evaluasi
y_pred = nb.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Tampilkan confusion matrix dengan label yang jelas
cm = confusion_matrix(y_test, y_pred, labels=['positif', 'negatif'])
cm_df = pd.DataFrame(cm, index=['Aktual Positif', 'Aktual Negatif'], columns=['Prediksi Positif', 'Prediksi Negatif'])

print(f"\n✅ Akurasi: {acc * 100:.2f}%")
print("\n📋 Classification Report:")
print(report)

print("\n📊 Confusion Matrix:")
print(cm_df)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


📌 Jumlah data dan distribusi sentimen setelah preprocessing:
sentimen
negatif    695
positif     80
Name: count, dtype: int64
Total data latih: 581
Distribusi label train: Counter({'negatif': 521, 'positif': 60})

🔢 Probabilitas Prior (model sklearn):
P(negatif) = 0.896730
P(positif) = 0.103270

✅ Akurasi: 91.75%

📋 Classification Report:
              precision    recall  f1-score   support

     negatif       0.92      1.00      0.96       174
     positif       1.00      0.20      0.33        20

    accuracy                           0.92       194
   macro avg       0.96      0.60      0.64       194
weighted avg       0.92      0.92      0.89       194


📊 Confusion Matrix:
                Prediksi Positif  Prediksi Negatif
Aktual Positif                 4                16
Aktual Negatif                 0               174
