# In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import json

# Make sure the NLTK stopword corpus is available
nltk.download('stopwords')

# Load the raw CSV
df = pd.read_csv('dataset/ikn.csv')

# Keep only rows whose text mentions one of the topic keywords.
# Keywords are escaped so that a future entry containing regex
# metacharacters is still matched literally.
# NOTE(review): this is a case-insensitive *substring* match, so 'ikn' also
# hits unrelated words that merely contain it (e.g. 'piknik').
keywords = ['ikn', 'nusantara', 'ibu kota', 'ibukota', 'pemindahan', 'perpindahan']
keyword_pattern = '|'.join(re.escape(k) for k in keywords)
filtered_df = df[df['full_text'].str.contains(keyword_pattern, case=False, na=False)].copy()
filtered_df = filtered_df[filtered_df['full_text'].str.strip() != '']
filtered_df = filtered_df.drop_duplicates(subset='full_text')

# Initialise the Sastrawi stemmer
stemmer = StemmerFactory().create_stemmer()

# Load the root-word dictionary (one word per line)
with open('sastrawi/kata-dasar.txt', 'r', encoding='utf-8') as f:
    kamus_sastrawi = {line.strip() for line in f}

# Topic words that are kept even when they are not in the dictionary
kata_penting = {'ikn', 'nusantara', 'ibu', 'kota', 'ibukota', 'pemindahan', 'perpindahan'}

# Sentiment lexicon
kata_positif = {'baik', 'bagus', 'maju', 'dukung', 'setuju', 'positif', 'indah', 'hebat'}
kata_negatif = {'tidak', 'buruk', 'tolak', 'negatif', 'korup', 'jelek', 'hancur', 'bencana', 'rusak'}

# Stopwords: NLTK's Indonesian list plus informal/slang additions
stop_words = set(stopwords.words('indonesian'))
tambahan_stop = {
    'gw', 'gue', 'gua', 'lu', 'loe', 'lo', 'elu', 'nya', 'ya', 'aja', 'sih', 'lah', 'deh', 'dong',
    'kok', 'nih', 'tuh', 'lagi', 'kayak', 'gak', 'ga', 'nggak', 'ngga', 'yg', 'yang', 'saya', 'kamu'
}
stop_words.update(tambahan_stop)

# Lexicon and topic words must survive stopword removal, otherwise the
# sentiment check that runs after it can never see them. The stock NLTK list
# is made of common function words and presumably includes lexicon entries
# such as 'tidak' and 'baik' -- verify against the installed corpus.
stop_words -= kata_positif | kata_negatif | kata_penting

# Fungsi preprocessing
# Patterns and the punctuation table are built once instead of on every row.
_NOISE_RE = re.compile(r'http\S+|#\w+|@\w+|\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Applied in this order: slang pronouns / negations -> standard form.
_SLANG_RULES = (
    (re.compile(r'\b(?:gw|gue|gua)\b'), 'saya'),
    (re.compile(r'\b(?:lu|loe|elo|elu)\b'), 'kamu'),
    (re.compile(r'\b(?:nggak|ngga|ga|gak)\b'), 'tidak'),
)


def preprocess_steps(text):
    """Clean one raw text and attach a rule-based sentiment label.

    Pipeline: remove URLs, hashtags, mentions and digits; lowercase; delete
    every character in ``string.punctuation``; normalise a few slang words;
    drop tokens found in the module-level ``stop_words``; stem with the
    module-level ``stemmer``; keep only stems present in ``kamus_sastrawi``
    or ``kata_penting``; sort the surviving tokens alphabetically.

    Args:
        text: Raw text. A non-string value (e.g. NaN) is treated as an
            empty string instead of raising inside the regex calls.

    Returns:
        pd.Series with two positional elements: the space-joined, sorted
        tokens and the label, which is ``'positif'`` when at least one token
        is in ``kata_positif`` and ``'negatif'`` otherwise.
    """
    if not isinstance(text, str):
        text = ''

    # Noise is stripped before lowercasing, so the URL pattern only matches
    # a lowercase 'http' prefix.
    cleaned = _NOISE_RE.sub('', text).lower().translate(_PUNCT_TABLE)
    for pattern, replacement in _SLANG_RULES:
        cleaned = pattern.sub(replacement, cleaned)

    # Stopword removal happens before stemming, on the surface forms.
    kept = [w for w in cleaned.split() if w not in stop_words]

    # The stemmer works on a whole string, so join, stem, and split again.
    stems = stemmer.stem(' '.join(kept)).split()

    tokens = sorted(w for w in stems if w in kamus_sastrawi or w in kata_penting)
    final_cleaned = ' '.join(tokens)

    # The previous expression also scanned the negative lexicon, but both of
    # its remaining outcomes were 'negatif', so that scan never influenced
    # the label and is dropped here.
    # NOTE(review): the fallback was possibly meant to be a neutral class;
    # kept binary because the evaluation code expects exactly two labels.
    sentimen = 'negatif' if kata_positif.isdisjoint(tokens) else 'positif'

    return pd.Series([final_cleaned, sentimen])

# Apply preprocessing; every row yields a (clean_text, sentimen) pair
processed = filtered_df['full_text'].apply(preprocess_steps)
processed.columns = ['clean_text', 'sentimen']

# Drop texts with fewer than three remaining tokens
word_counts = processed['clean_text'].str.split().str.len()
processed = processed[word_counts >= 3]

# Add a 1-based sequential ID as the first column
processed = processed.reset_index(drop=True)
processed.insert(0, 'id', processed.index + 1)

# Save the preprocessing result to CSV
processed.to_csv('dataset/processed.csv', index=False)

# Split the raw texts first so that the TF-IDF vocabulary and IDF weights are
# learned from the training portion only. Fitting the vectorizer on the full
# corpus before splitting leaks test-set statistics into the features.
# The row partition depends only on the row count and random_state, so it is
# the same as when the already-vectorized matrix was split.
y = processed['sentimen']
text_train, text_test, y_train, y_test = train_test_split(
    processed['clean_text'], y, test_size=0.25, random_state=42
)

# TF-IDF: fit on train, reuse the fitted vocabulary for test
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full-corpus matrix kept under its original name for any later code that
# expects `X`; it is not used for training or evaluation below.
X = vectorizer.transform(processed['clean_text'])

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Evaluation (confusion-matrix rows/columns ordered positif, negatif)
cm = confusion_matrix(y_test, y_pred, labels=['positif', 'negatif'])
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Show the results
print("✅ TF-IDF + Naive Bayes selesai.")
print(f"📌 Akurasi: {acc * 100:.2f}%")
print("\n📊 Confusion Matrix:")
print(pd.DataFrame(cm, index=['Actual_Pos', 'Actual_Neg'], columns=['Pred_Pos', 'Pred_Neg']))
print("\n📋 Classification Report:")
print(report)
