# In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

nltk.download('stopwords')

# === 1. Load and filter data ===
df = pd.read_csv('dataset/ikn.csv')
keywords = ['ikn', 'nusantara', 'ibu kota', 'ibukota', 'pemindahan', 'perpindahan']

# Keep only rows whose text mentions at least one keyword (case-insensitive).
# na=False treats a missing full_text as "no match"; without it the mask can
# contain NaN and boolean indexing with such a mask fails.
keyword_mask = df['full_text'].str.contains('|'.join(keywords), case=False, na=False)
filtered_df = df[keyword_mask].copy()

# Discard whitespace-only texts and exact duplicates, then strip URLs.
filtered_df = filtered_df[filtered_df['full_text'].str.strip() != '']
filtered_df = filtered_df.drop_duplicates(subset='full_text')
filtered_df['full_text'] = filtered_df['full_text'].str.replace(r'http\S+', '', regex=True)

# Drop texts with fewer than 5 whitespace-separated tokens (counted after URL
# removal). The count lives in a standalone Series instead of a temporary
# column, and the final frame is an explicit copy so later column assignments
# work on an independent DataFrame.
word_count = filtered_df['full_text'].apply(lambda x: len(x.split()))
filtered_df = filtered_df[word_count >= 5].copy()

# === 2. Preprocessing ===
# Extra informal/slang tokens to be treated as stop words in addition to the
# NLTK Indonesian list.
tambahan_stop = {
    'gw', 'gue', 'gua', 'lu', 'loe', 'lo', 'elu',
    'nya', 'ya', 'aja', 'sih', 'lah', 'deh', 'dong',
    'kok', 'nih', 'tuh', 'lagi', 'kayak',
    'gak', 'ga', 'nggak', 'ngga',
    'yg', 'yang', 'saya', 'kamu',
}

# Stemmer instance shared by the rest of the script.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Final stop-word set: NLTK's 'indonesian' list merged with the extras above.
stop_words = set(stopwords.words('indonesian')) | tambahan_stop

def clean_text(text):
    """Normalise one raw text into a stemmed, stop-word-free string.

    Steps, in order: lowercase; remove URLs, hashtags, mentions and digit
    runs; delete ASCII punctuation; trim; rewrite a few slang pronouns and
    negations to their standard forms; stem with the module-level
    ``stemmer``; drop every token found in the module-level ``stop_words``.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    normalized = text.lower()

    # Noise patterns are removed outright (replaced by the empty string).
    for noise_pattern in (r'http\S+', r'#\w+', r'@\w+', r'\d+'):
        normalized = re.sub(noise_pattern, '', normalized)

    # Punctuation is deleted, not replaced by a space.
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = normalized.translate(punctuation_table).strip()

    # Slang -> standard form, applied in this fixed order.
    slang_rules = (
        (r'\bgw\b|\bgue\b|\bgua\b', 'saya'),
        (r'\blu\b|\bloe\b|\belo\b|\belu\b', 'kamu'),
        (r'\bnggak\b|\bngga\b|\bga\b|\bgak\b', 'tidak'),
    )
    for slang_pattern, standard_form in slang_rules:
        normalized = re.sub(slang_pattern, standard_form, normalized)

    stemmed = stemmer.stem(normalized)
    kept_tokens = [token for token in stemmed.split() if token not in stop_words]
    return ' '.join(kept_tokens)

filtered_df['cleaned'] = filtered_df['full_text'].apply(clean_text)

# === 2.1 Enrich and merge the root-word dictionary ===
# One dictionary entry per line. Surrounding whitespace is stripped and blank
# lines are skipped so that '' or padded entries never end up in the set.
with open('sastrawi/kata-dasar.txt', 'r', encoding='utf-8') as f:
    kamus_sastrawi = {line.strip() for line in f if line.strip()}

# Hand-written sentiment lexicon (surface forms; stemmed below).
pos_words = [
    'baik', 'bagus', 'mendukung', 'hebat', 'mantap', 'setuju',
    'top', 'luar biasa', 'sip', 'apresiasi', 'bangga',
    'kemajuan', 'ramah', 'berkelanjutan', 'peluang', 'strategis',
    'pertumbuhan', 'investasi', 'unggulan', 'mendorong', 'semangat',
    'kondusif', 'cocok', 'positif', 'cerah', 'maju', 'bermanfaat',
    'berkembang', 'terdepan', 'mendorong', 'sukses', 'membangun'
]
neg_words = [
    'buruk', 'jelek', 'korupsi', 'parah', 'gagal', 'tolak',
    'tidak setuju', 'menolak', 'ancur', 'bubar', 'kritik', 'macet',
    'tidak ramah', 'tidak layak', 'tidak manusiawi', 'tidak cocok',
    'bohong', 'salah', 'mundur', 'krisis', 'masalah', 'keluhan',
    'cacat', 'curang', 'konflik', 'resah'
]

# Stem the lexicon with the same stemmer used on the texts so both sides are
# compared in the same form.
pos_words_stemmed = [stemmer.stem(w.lower()) for w in pos_words]
neg_words_stemmed = [stemmer.stem(w.lower()) for w in neg_words]

# The dictionary filter looks words up one token at a time, so a multi-word
# lexicon entry stored only as a whole phrase can never match. Register each
# entry's individual tokens as well; the whole entries are kept too, so the
# resulting set is a superset of the previous one.
lexicon_tokens = {
    token
    for entry in pos_words_stemmed + neg_words_stemmed
    for token in entry.split()
}
kamus_kata = kamus_sastrawi.union(pos_words_stemmed, neg_words_stemmed, lexicon_tokens)

def hapus_kata_non_kamus(text):
    """Drop every token of *text* that is not in ``kamus_kata``.

    Args:
        text: Whitespace-separated tokens as a single string.

    Returns:
        The dictionary tokens joined by single spaces, or *text* itself,
        unchanged, when none of its tokens is in the dictionary.
    """
    known_tokens = ' '.join(token for token in text.split() if token in kamus_kata)
    # An empty join means nothing survived the filter: fall back to the input.
    return known_tokens if known_tokens else text

# Element-wise dictionary filter over the cleaned texts.
filtered_df['final_cleaned'] = filtered_df['cleaned'].map(hapus_kata_non_kamus)

# === 3. Sentiment label from the simple_sentiment function ===
def simple_sentiment(text):
    """Label *text* by counting lexicon hits.

    Each match against ``pos_words_stemmed`` adds 1 to the score and each
    match against ``neg_words_stemmed`` subtracts 1. Lexicon entries may span
    several tokens; at every position the longest matching entry wins and its
    tokens are consumed, so a phrase such as 'tidak setuju' counts once as
    negative instead of matching 'setuju' on its own.

    Args:
        text: Whitespace-separated tokens as a single string.

    Returns:
        'positif' when the score is strictly positive, otherwise 'negatif'
        (a score of zero is therefore labelled 'negatif').
    """
    # Normalise entry whitespace so phrases compare equal to re-joined tokens.
    positive = {' '.join(entry.split()) for entry in pos_words_stemmed}
    negative = {' '.join(entry.split()) for entry in neg_words_stemmed}

    # Longest lexicon entry in tokens; bounds the n-gram window below.
    longest = max((len(entry.split()) for entry in positive | negative), default=1)
    longest = max(longest, 1)

    # NOTE(review): a phrase entry can only match if upstream cleaning leaves
    # all of its tokens in the text -- confirm the stop-word list does not
    # remove words such as 'tidak'.
    tokens = text.split()
    score = 0
    position = 0
    while position < len(tokens):
        step = 1
        # Try the longest candidate first; a positive hit is checked before a
        # negative one, matching the single-token precedence.
        for size in range(min(longest, len(tokens) - position), 0, -1):
            candidate = ' '.join(tokens[position:position + size])
            if candidate in positive:
                score += 1
                step = size
                break
            if candidate in negative:
                score -= 1
                step = size
                break
        position += step

    return 'positif' if score > 0 else 'negatif'

# Lexicon-based label for every filtered text.
filtered_df['sentiment'] = filtered_df['final_cleaned'].apply(simple_sentiment)

# === 4. Save the cleaned data ===
# Only the raw text, the final cleaned text and the label are exported.
export_columns = ['full_text', 'final_cleaned', 'sentiment']
filtered_df.loc[:, export_columns].to_csv('dataset/data_bersih_sentimen.csv', index=False)
print("✅ Data berhasil disimpan ke dataset/data_bersih_sentimen.csv")

# === 5. Label encoding ===
# Integer-encode the string sentiment labels.
le = LabelEncoder()
filtered_df['label'] = le.fit_transform(filtered_df['sentiment'])

# === 6. TF-IDF + Naive Bayes ===
X, y = filtered_df['final_cleaned'], filtered_df['label']

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

model = MultinomialNB().fit(X_train_tfidf, y_train)

# === 7. Model evaluation ===
y_pred = model.predict(X_test_tfidf)

print("Akurasi:", accuracy_score(y_test, y_pred))

# Take the class names from the fitted encoder instead of hard-coding
# ['negatif', 'positif']: transforming a label the encoder never saw raises
# ValueError, which happens as soon as one sentiment class is absent from the
# data. When both classes are present this yields the same names and codes.
class_names = [str(name) for name in le.classes_]
labels = le.transform(class_names)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=labels))
print("Classification Report:\n", classification_report(y_test, y_pred, labels=labels, target_names=class_names, zero_division=1))

# === 8. Sentiment distribution plot ===
sentiment_counts = filtered_df['sentiment'].value_counts()

# value_counts() orders the bars by frequency, so a fixed ['red', 'green'] list
# would colour whichever class is most frequent red. Pick each bar's colour
# from its label instead; any unexpected label falls back to gray.
sentiment_colors = {'negatif': 'red', 'positif': 'green'}
bar_colors = [sentiment_colors.get(label, 'gray') for label in sentiment_counts.index]

sentiment_counts.plot(kind='bar', color=bar_colors)
plt.title('Distribusi Sentimen')
plt.show()
