In [97]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [98]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ghari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [99]:
data = pd.read_csv("data.csv")

In [100]:
data = data[['content', 'summary']] # Pilih kolom yang akan digunakan
data = data.dropna() # Hapus baris dengan nilai null

In [101]:
stop_words = set(stopwords.words('indonesian'))
stemmer = PorterStemmer()

In [102]:
# Tambahkan kata-kata yang merepresentasikan sentimen positif dan negatif
positive_words = ["kemenangan", "keberhasilan", "berhasil", "senang", "gemilang", "terpuji", "pembebasan", "kebahagiaan", "mudah", "mencapai", "juara", "sukses", "cemerlang", "menyenangkan", "optimis", "bergairah", "menang", "puas", "mengagumkan", "beruntung", "menonjol", "positif", "bangga", "berjaya", "maju", "terhormat", "menyenangkan", "bersemangat", "penuh harapan", "berkembang"]
negative_words = ["krisis", "korupsi", "pembunuhan", "pemerkosaan", "pencurian", "pemerkosaan", "kegagalan", "masalah", "merugikan", "bencana", "kerugian", "mengerikan", "teror", "dosa", "penyakit", "kesedihan", "kemiskinan", "konflik", "ketakutan", "marah", "kejahatan", "negatif", "malapetaka", "menakutkan", "merusak", "menyedihkan", "frustrasi", "menghancurkan", "putus asa", "kecewa", "korban","tersangka","kasus","kecelakaan","kebakaran"]

In [103]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenisasi
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and re.match("^[a-zA-Z0-9]+$", token)]  # Penghapusan stopword & stemming
    return " ".join(filtered_tokens)

In [104]:
def categorize_sentiment(summary):
    for word in positive_words:
        if word in summary:
            return 'positif'
    for word in negative_words:
        if word in summary:
            return 'negatif'
    return 'netral'

In [105]:
data['processed_text'] = data['content'].apply(preprocess_text)

In [106]:
# Label data berdasarkan kata-kata sentimen
data['label'] = data['summary'].apply(categorize_sentiment)

In [107]:
# Pembagian data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(data['processed_text'], data['label'], test_size=0.2, random_state=42)

In [108]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [109]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

MultinomialNB()

In [110]:
y_pred = model.predict(X_test_tfidf)

In [111]:
print("Akurasi:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Akurasi: 0.6027566981570389
              precision    recall  f1-score   support

     negatif       0.66      0.37      0.48      1416
      netral       0.59      0.96      0.73      3349
     positif       0.82      0.09      0.15      1692

    accuracy                           0.60      6457
   macro avg       0.69      0.47      0.45      6457
weighted avg       0.67      0.60      0.52      6457

