In [7]:
import pandas as pd
import re
import string
import nltk
import json
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import numpy as np
import random

nltk.download('stopwords')

# Load dataset
df = pd.read_csv('../dataset/ikn.csv')

# Filter kata kunci
keywords = ['ikn', 'nusantara', 'ibu kota', 'ibukota', 'pemindahan', 'perpindahan']
filtered_df = df[df['full_text'].str.contains('|'.join(keywords), case=False, na=False)].copy()
filtered_df = filtered_df[filtered_df['full_text'].str.strip() != '']
filtered_df = filtered_df.drop_duplicates(subset='full_text')

# Inisialisasi stemmer
stemmer = StemmerFactory().create_stemmer()

# Muat kamus kata dasar
with open('../sastrawi/kata-dasar.txt', 'r', encoding='utf-8') as f:
    kamus_sastrawi = set(word.strip() for word in f.readlines())

# Kata penting
kata_penting = {'ikn', 'nusantara', 'ibu', 'kota', 'ibukota', 'pemindahan', 'perpindahan'}

# Stopwords
stop_words = set(stopwords.words('indonesian'))
stop_words.update({
    'gw', 'gue', 'gua', 'lu', 'loe', 'lo', 'elu', 'nya', 'ya', 'aja', 'sih', 'lah', 'deh', 'dong',
    'kok', 'nih', 'tuh', 'lagi', 'kayak', 'gak', 'ga', 'nggak', 'ngga', 'yg', 'yang', 'saya', 'kamu'
})

# Kata sentimen
kata_positif = {
        'baik', 'bagus', 'maju', 'dukung', 'setuju', 'positif', 'indah', 'hebat',
        'sejahtera', 'aman', 'nyaman', 'modern', 'teratur', 'subur', 'makmur',
        'berhasil', 'mantap', 'sukses', 'optimal', 'unggul',
        'ceria', 'produktif', 'stabil', 'harmonis', 'adil', 'bersih',
        'ramah', 'berkah', 'amanah', 'visioner', 'cerdas',
        'terdepan', 'efisien', 'ekonomis', 'peduli', 'inovatif',
        'terpercaya', 'terkendali', 'berdaya', 'kompeten'
}
kata_negatif = {
        'tidak', 'buruk', 'tolak', 'negatif', 'korup', 'jelek', 'hancur',
        'bencana', 'rusak', 'gagal', 'macet', 'rawan', 'ancam', 'bahaya',
        'rugi', 'protes', 'kritik', 'sesat', 'merugikan', 'sengketa',
        'sulit', 'gelap', 'curang', 'cacat', 'terbelakang', 'parah',
        'lemah', 'krisis', 'konflik', 'tidak adil', 'semrawut',
        'terbengkalai', 'merosot', 'miskin', 'terancam', 'tercela',
        'tidak layak', 'bising', 'polusi', 'biaya tinggi'
}

def preprocess_steps(text):
    data_clean = re.sub(r'http\S+|#\w+|@\w+|\d+', '', text)
    lower = data_clean.lower()
    no_punct = lower.translate(str.maketrans('', '', string.punctuation))
    replaced = re.sub(r'\bgw\b|\bgue\b|\bgua\b', 'saya', no_punct)
    replaced = re.sub(r'\blu\b|\bloe\b|\belo\b|\belu\b', 'kamu', replaced)
    replaced = re.sub(r'\bnggak\b|\bngga\b|\bga\b|\bgak\b', 'tidak', replaced)
    tokens = replaced.split()
    tokens_stop_removed = [w for w in tokens if w not in stop_words]
    joined_for_stem = ' '.join(tokens_stop_removed)
    stemmed_text = stemmer.stem(joined_for_stem)
    stemmed_tokens = stemmed_text.split()
    tokens_filtered = [w for w in stemmed_tokens if w in kamus_sastrawi or w in kata_penting]
    final_cleaned = ' '.join(tokens_filtered)
    sentimen = 'positif' if any(k in tokens_filtered for k in kata_positif) else 'negatif'
    return pd.Series([final_cleaned, sentimen, tokens_filtered])

processed = filtered_df['full_text'].apply(preprocess_steps)
processed.columns = ['clean_text', 'sentimen', 'tokens']
processed['jumlah_kata'] = processed['tokens'].apply(len)
processed = processed[processed['jumlah_kata'] >= 3].reset_index(drop=True)
processed.insert(0, 'id', processed.index + 1)
processed.drop(columns=['jumlah_kata', 'tokens'], inplace=True)

print(f"Jumlah data sebelum augmentasi: {len(processed)}")
print(f"Jumlah data positif: {len(processed[processed['sentimen']=='positif'])}")
print(f"Jumlah data negatif: {len(processed[processed['sentimen']=='negatif'])}")

def augment_text(text):
    words = text.split()
    if len(words) < 3:
        return text
    idx = random.randint(0, len(words)-1)
    words.insert(idx, words[idx])
    return ' '.join(words)

def augment_swap_text(text):
    words = text.split()
    if len(words) < 2:
        return text
    # Ambil dua index berbeda
    idx1, idx2 = random.sample(range(len(words)), 2)
    # Tukar posisi kata
    words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def augment_data(df):
    pos_df = df[df['sentimen'] == 'positif']
    neg_df = df[df['sentimen'] == 'negatif']
    if len(pos_df) > len(neg_df):
        aug_needed = len(pos_df) - len(neg_df)
        aug_samples = neg_df.sample(aug_needed, replace=True).copy()
    elif len(neg_df) > len(pos_df):
        aug_needed = len(neg_df) - len(pos_df)
        aug_samples = pos_df.sample(aug_needed, replace=True).copy()
    else:
        return df  # sudah seimbang

    # Terapkan augmentasi
    def apply_random_augment(text):
        if random.random() < 0.5:
            return augment_text(text)  # augmentasi duplikasi kata
        else:
            return augment_swap_text(text)  # augmentasi swap kata

    aug_samples['clean_text'] = aug_samples['clean_text'].apply(apply_random_augment)
    df = pd.concat([df, aug_samples], ignore_index=True)
    return df

balanced_df = augment_data(processed)

# Simpan hasil augmentasi ke CSV
balanced_df.to_csv('hasil_augmentasi.csv', index=False, encoding='utf-8')

print(f"Jumlah data setelah augmentasi: {len(balanced_df)}")
print(f"Jumlah data positif: {len(balanced_df[balanced_df['sentimen']=='positif'])}")
print(f"Jumlah data negatif: {len(balanced_df[balanced_df['sentimen']=='negatif'])}")

X = balanced_df['clean_text']
y = balanced_df['sentimen']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
feature_names = vectorizer.get_feature_names_out()

# === Simpan TF-IDF ke JSON ===
tfidf_json_data = {}

for idx, (doc_vector, doc_text) in enumerate(zip(X_train_tfidf, X_train)):
    row = doc_vector.toarray().flatten()
    term_idx = np.where(row > 0)[0]
    tfidf_terms = {feature_names[i]: float(row[i]) for i in term_idx}
    tfidf_json_data[f'dokumen_{idx+1}'] = tfidf_terms

with open('tfidf.json', 'w', encoding='utf-8') as f:
    json.dump(tfidf_json_data, f, indent=2, ensure_ascii=False)

# === Train Naive Bayes ===
nb = MultinomialNB(alpha=1.0)
nb.fit(X_train_tfidf, y_train)

# === Likelihood ===
likelihood_data = []

for idx, (doc_vector, doc_text) in enumerate(zip(X_train_tfidf, X_train)):
    row = doc_vector.toarray().flatten()
    term_idx = np.where(row > 0)[0]
    for i in term_idx:
        term = feature_names[i]
        p_term_negatif = np.exp(nb.feature_log_prob_[nb.classes_ == 'negatif'][0][i])
        p_term_positif = np.exp(nb.feature_log_prob_[nb.classes_ == 'positif'][0][i])
        likelihood_data.append({
            'dokumen': f'dokumen_{idx+1}',
            'term': term,
            'P(term|negatif)': p_term_negatif,
            'P(term|positif)': p_term_positif
        })

likelihood_df = pd.DataFrame(likelihood_data)
likelihood_df.to_csv('likelihood.csv', index=False)

# === Posterior ===
posterior_data = []
preds = nb.predict(X_test_tfidf)
probs = nb.predict_proba(X_test_tfidf)

for i in range(len(X_test)):
    posterior_data.append({
        'P(negatif|x)': probs[i][nb.classes_ == 'negatif'][0],
        'P(positif|x)': probs[i][nb.classes_ == 'positif'][0],
        'prediksi': preds[i],
        'aktual': y_test.iloc[i],
        'clean_text': X_test.iloc[i]
    })

posterior_df = pd.DataFrame(posterior_data)
posterior_df.to_csv('posterior.csv', index=False)

# === Evaluasi ===
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))
print(f"Akurasi: {accuracy_score(y_test, preds) * 100:.2f}%")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Jumlah data sebelum augmentasi: 775
Jumlah data positif: 113
Jumlah data negatif: 662
Jumlah data setelah augmentasi: 1324
Jumlah data positif: 662
Jumlah data negatif: 662
[[104  29]
 [  0 132]]
              precision    recall  f1-score   support

     negatif       1.00      0.78      0.88       133
     positif       0.82      1.00      0.90       132

    accuracy                           0.89       265
   macro avg       0.91      0.89      0.89       265
weighted avg       0.91      0.89      0.89       265

Akurasi: 89.06%
