In [8]:
import pandas as pd
import re
import string
import nltk
import json
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import numpy as np
import random

nltk.download('stopwords')

# Load dataset
df = pd.read_csv('../dataset/ikn.csv')

# Filter kata kunci
keywords = ['ikn', 'nusantara', 'ibu kota', 'ibukota', 'pemindahan', 'perpindahan']
filtered_df = df[df['full_text'].str.contains('|'.join(keywords), case=False, na=False)].copy()
filtered_df = filtered_df[filtered_df['full_text'].str.strip() != '']
filtered_df = filtered_df.drop_duplicates(subset='full_text')

print(f"📌 Jumlah data setelah filter kata kunci: {len(filtered_df)}")

# Inisialisasi stemmer
stemmer = StemmerFactory().create_stemmer()

# Muat kamus kata dasar
with open('../sastrawi/kata-dasar.txt', 'r', encoding='utf-8') as f:
    kamus_sastrawi = set(word.strip() for word in f.readlines())

# Kata penting
kata_penting = {'ikn', 'nusantara', 'ibu', 'kota', 'ibukota', 'pemindahan', 'perpindahan'}

# Stopwords
stop_words = set(stopwords.words('indonesian'))
tambahan_stop = {
    'gw', 'gue', 'gua', 'lu', 'loe', 'lo', 'elu', 'nya', 'ya', 'aja', 'sih', 'lah', 'deh', 'dong',
    'kok', 'nih', 'tuh', 'lagi', 'kayak', 'gak', 'ga', 'nggak', 'ngga', 'yg', 'yang', 'saya', 'kamu'
}
stop_words.update(tambahan_stop)

# Kata sentimen
kata_positif = {
    'baik', 'bagus', 'maju', 'dukung', 'setuju', 'positif', 'indah', 'hebat',
    'sejahtera', 'aman', 'nyaman', 'modern', 'teratur', 'subur', 'makmur',
    'berhasil', 'mantap', 'sukses', 'optimal', 'unggul',
    'ceria', 'produktif', 'stabil', 'harmonis', 'adil', 'bersih',
    'ramah', 'berkah', 'amanah', 'visioner', 'cerdas',
    'terdepan', 'efisien', 'ekonomis', 'peduli', 'inovatif',
    'terpercaya', 'terkendali', 'berdaya', 'kompeten'
}

kata_negatif = {
    'tidak', 'buruk', 'tolak', 'negatif', 'korup', 'jelek', 'hancur',
    'bencana', 'rusak', 'gagal', 'macet', 'rawan', 'ancam', 'bahaya',
    'rugi', 'protes', 'kritik', 'sesat', 'merugikan', 'sengketa',
    'sulit', 'gelap', 'curang', 'cacat', 'terbelakang', 'parah',
    'lemah', 'krisis', 'konflik', 'tidak adil', 'semrawut',
    'terbengkalai', 'merosot', 'miskin', 'terancam', 'tercela',
    'tidak layak', 'bising', 'polusi', 'biaya tinggi'
}

# Preprocessing
def preprocess_steps(text):
    data_clean = re.sub(r'http\S+|#\w+|@\w+|\d+', '', text)
    lower = data_clean.lower()
    no_punct = lower.translate(str.maketrans('', '', string.punctuation))
    replaced = re.sub(r'\bgw\b|\bgue\b|\bgua\b', 'saya', no_punct)
    replaced = re.sub(r'\blu\b|\bloe\b|\belo\b|\belu\b', 'kamu', replaced)
    replaced = re.sub(r'\bnggak\b|\bngga\b|\bga\b|\bgak\b', 'tidak', replaced)
    tokens = replaced.split()
    tokens_stop_removed = [w for w in tokens if w not in stop_words]
    joined_for_stem = ' '.join(tokens_stop_removed)
    stemmed_text = stemmer.stem(joined_for_stem)
    stemmed_tokens = stemmed_text.split()
    tokens_filtered = [w for w in stemmed_tokens if w in kamus_sastrawi or w in kata_penting]
    final_cleaned = ' '.join(tokens_filtered)
    sentimen = 'positif' if any(k in tokens_filtered for k in kata_positif) else 'negatif'
    return pd.Series([final_cleaned, sentimen, tokens_filtered])

processed = filtered_df['full_text'].apply(preprocess_steps)
processed.columns = ['clean_text', 'sentimen', 'tokens']
processed['jumlah_kata'] = processed['tokens'].apply(len)
processed = processed[processed['jumlah_kata'] >= 3].reset_index(drop=True)
processed.insert(0, 'id', processed.index + 1)
processed.drop(columns=['jumlah_kata'], inplace=True)

print("\n📈 Jumlah sentimen setelah pre-processing:")
print(processed['sentimen'].value_counts().to_dict())

# Augmentasi Data: Oversampling dan sinonim sederhana
def augment_text(text):
    words = text.split()
    if len(words) < 3:
        return text
    # Randomly duplicate a word (contoh augmentasi sederhana)
    idx = random.randint(0, len(words)-1)
    words.insert(idx, words[idx])
    return ' '.join(words)

def augment_data(df):
    pos_df = df[df['sentimen'] == 'positif']
    neg_df = df[df['sentimen'] == 'negatif']
    if len(pos_df) > len(neg_df):
        target = len(pos_df)
        aug_needed = target - len(neg_df)
        aug_samples = neg_df.sample(aug_needed, replace=True).copy()
        aug_samples['clean_text'] = aug_samples['clean_text'].apply(augment_text)
        result = pd.concat([df, aug_samples], ignore_index=True)
    elif len(neg_df) > len(pos_df):
        target = len(neg_df)
        aug_needed = target - len(pos_df)
        aug_samples = pos_df.sample(aug_needed, replace=True).copy()
        aug_samples['clean_text'] = aug_samples['clean_text'].apply(augment_text)
        result = pd.concat([df, aug_samples], ignore_index=True)
    else:
        result = df
    return result

print("\n🚀 Melakukan augmentasi untuk menyeimbangkan data...")
balanced_df = augment_data(processed)
print(balanced_df['sentimen'].value_counts())

# Split
X = balanced_df['clean_text']
y = balanced_df['sentimen']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train_tfidf, y_train)

# Evaluasi
y_pred = nb.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)
print(f"\n✅ Akurasi: {acc * 100:.2f}%")
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


📌 Jumlah data setelah filter kata kunci: 894

📈 Jumlah sentimen setelah pre-processing:
{'negatif': 662, 'positif': 113}

🚀 Melakukan augmentasi untuk menyeimbangkan data...
sentimen
negatif    662
positif    662
Name: count, dtype: int64

✅ Akurasi: 89.43%

📋 Classification Report:
               precision    recall  f1-score   support

     negatif       0.99      0.80      0.88       133
     positif       0.83      0.99      0.90       132

    accuracy                           0.89       265
   macro avg       0.91      0.89      0.89       265
weighted avg       0.91      0.89      0.89       265


📊 Confusion Matrix:
 [[106  27]
 [  1 131]]
