In [18]:
# 1️⃣ Import library
import pandas as pd 
import re
import string
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import random
import json

# 2️⃣ Download stopwords
nltk.download('stopwords')

# 3️⃣ Ambil data dari database
from sqlalchemy import create_engine
engine = create_engine("mysql+mysqlconnector://root:@localhost/ikn-app")
query = "SELECT id, full_text FROM dataset ORDER BY id"
df = pd.read_sql(query, con=engine)

# 4️⃣ Filter data yang mengandung kata kunci
keywords = ['ikn', 'nusantara', 'ibu kota', 'ibukota', 'pemindahan', 'perpindahan']
filtered_df = df[df['full_text'].str.contains('|'.join(keywords), case=False, na=False)].copy()
filtered_df = filtered_df[filtered_df['full_text'].str.strip() != '']
filtered_df = filtered_df.drop_duplicates(subset='full_text')

# 5️⃣ Persiapan stemming dan stopwords
factory = StemmerFactory()
stemmer = factory.create_stemmer()

kamus_path = r'C:\xampp\htdocs\ikn-preprocessing\sastrawi\kata-dasar.txt'
with open(kamus_path, 'r', encoding='utf-8') as f:
    kamus_sastrawi = set(word.strip() for word in f.readlines())

kata_penting = {'ikn', 'nusantara', 'ibu', 'kota', 'ibukota', 'pemindahan', 'perpindahan'}
stop_words = set(stopwords.words('indonesian'))
tambahan_stop = {
    'gw', 'gue', 'gua', 'lu', 'loe', 'lo', 'elu', 'nya', 'ya', 'aja', 'sih', 'lah', 'deh', 'dong',
    'kok', 'nih', 'tuh', 'lagi', 'kayak', 'gak', 'ga', 'nggak', 'ngga', 'yg', 'yang', 'saya', 'kamu'
}
stop_words.update(tambahan_stop)

kata_positif = {
        'baik', 'bagus', 'maju', 'dukung', 'setuju', 'positif', 'indah', 'hebat',
        'sejahtera', 'aman', 'nyaman', 'modern', 'teratur', 'subur', 'makmur',
        'berhasil', 'mantap', 'sukses', 'optimal', 'unggul',
        'ceria', 'produktif', 'stabil', 'harmonis', 'adil', 'bersih',
        'ramah', 'berkah', 'amanah', 'visioner', 'cerdas',
        'terdepan', 'efisien', 'ekonomis', 'peduli', 'inovatif',
        'terpercaya', 'terkendali', 'berdaya', 'kompeten'
    }
kata_negatif = {
        'tidak', 'buruk', 'tolak', 'negatif', 'korup', 'jelek', 'hancur',
        'bencana', 'rusak', 'gagal', 'macet', 'rawan', 'ancam', 'bahaya',
        'rugi', 'protes', 'kritik', 'sesat', 'merugikan', 'sengketa',
        'sulit', 'gelap', 'curang', 'cacat', 'terbelakang', 'parah',
        'lemah', 'krisis', 'konflik', 'tidak adil', 'semrawut',
        'terbengkalai', 'merosot', 'miskin', 'terancam', 'tercela',
        'tidak layak', 'bising', 'polusi', 'biaya tinggi'
    }

# 6️⃣ Preprocessing
def preprocess_steps(text):
    data_clean = re.sub(r'http\S+|#\w+|@\w+|\d+', '', text)
    lower = data_clean.lower()
    no_punct = lower.translate(str.maketrans('', '', string.punctuation))
    replaced = re.sub(r'\bgw\b|\bgue\b|\bgua\b', 'saya', no_punct)
    replaced = re.sub(r'\blu\b|\bloe\b|\belo\b|\belu\b', 'kamu', replaced)
    replaced = re.sub(r'\bnggak\b|\bngga\b|\bga\b|\bgak\b', 'tidak', replaced)

    tokens = replaced.split()
    tokens_stop_removed = [w for w in tokens if w not in stop_words]
    joined_for_stem = ' '.join(tokens_stop_removed)
    stemmed_text = stemmer.stem(joined_for_stem)
    stemmed_tokens = stemmed_text.split()

    tokens_filtered = [w for w in stemmed_tokens if w in kamus_sastrawi or w in kata_penting]
    tokens_sorted = sorted(tokens_filtered)
    final_cleaned = ' '.join(tokens_sorted)

    sentimen = 'positif' if any(k in tokens_sorted for k in kata_positif) else (
        'negatif' if any(k in tokens_sorted for k in kata_negatif) else 'negatif'
    )

    return [final_cleaned, sentimen]

filtered_df = filtered_df.reset_index()
processed = filtered_df['full_text'].apply(preprocess_steps)
processed = pd.DataFrame(processed.tolist(), columns=['data_clean', 'sentiment'])

# Hapus data dengan < 3 kata
processed['jumlah_kata'] = processed['data_clean'].apply(lambda x: len(x.split()))
processed = processed[processed['jumlah_kata'] >= 3].drop(columns=['jumlah_kata'])

# 7️⃣ Augmentasi sederhana agar balanced (optional, atau bisa skip jika sudah balance)
def augment_text(text):
    words = text.split()
    if len(words) < 3:
        return text
    idx = random.randint(0, len(words) - 1)
    words.insert(idx, words[idx])
    return ' '.join(words)

def augment_swap_text(text):
    words = text.split()
    if len(words) < 2:
        return text
    idx1, idx2 = random.sample(range(len(words)), 2)
    words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def augment_data(df):
    pos_df = df[df['sentiment'] == 'positif']
    neg_df = df[df['sentiment'] == 'negatif']
    if len(pos_df) > len(neg_df):
        target = len(pos_df)
        aug_needed = target - len(neg_df)
        aug_samples = neg_df.sample(aug_needed, replace=True).copy()
        aug_samples['data_clean'] = aug_samples['data_clean'].apply(
            lambda x: augment_swap_text(augment_text(x))
        )
        aug_samples['sentiment'] = 'negatif'
        result = pd.concat([df, aug_samples], ignore_index=True)
    elif len(neg_df) > len(pos_df):
        target = len(neg_df)
        aug_needed = target - len(pos_df)
        aug_samples = pos_df.sample(aug_needed, replace=True).copy()
        aug_samples['data_clean'] = aug_samples['data_clean'].apply(
            lambda x: augment_swap_text(augment_text(x))
        )
        aug_samples['sentiment'] = 'positif'
        result = pd.concat([df, aug_samples], ignore_index=True)
    else:
        result = df
    return result

processed = augment_data(processed)

# 8️⃣ Split data
X_train, X_test, y_train, y_test = train_test_split(
    processed['data_clean'], processed['sentiment'], test_size=0.25, random_state=42
)

# 9️⃣ TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 🔟 Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# 🔍 Prediksi dan evaluasi
y_pred = clf.predict(X_test_tfidf)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Confusion Matrix:
[[125  36]
 [  1 169]]

Classification Report:
              precision    recall  f1-score   support

     negatif       0.99      0.78      0.87       161
     positif       0.82      0.99      0.90       170

    accuracy                           0.89       331
   macro avg       0.91      0.89      0.89       331
weighted avg       0.91      0.89      0.89       331


Accuracy Score:
0.8882175226586103
