In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [2]:
# Load the dataset (semicolon-separated file, decoded as Latin-1) and preview it.
# NOTE(review): the preview shows garbled characters in the Author column, so
# the file may not actually be Latin-1 encoded -- confirm against the source file.
df = pd.read_csv(
    'Dataset_Manual_stemming.csv',
    sep=';',
    encoding='latin1',
)
df.head()

Unnamed: 0,No,Type,Mentions,Date,Media,Sentiment,Author,Followers,Retweeted,Favourited,Mentions1,Sentiment1,stemmed_text
0,1,rt,RT @LANGKAHANIES: Jangan ada intervensi politi...,31/05/2023 23:59,Twitter,Positive,@YTrigusmintara (ManusiaBebas),19.0,64.0,0.0,jangan ada intervensi politik penjegalan pilpr...,negatif,jangan ada intervensi politik jegal pilpres 20...
1,2,rt,RT @triwul82: Sejumlah perwakilan Koalisi Peru...,31/05/2023 23:59,Twitter,Negative,@INA_NKRI (100% Indonesia ÃÂÃÂÃÂÃÂ°ÃÂ...,1250.0,83.0,0.0,sejumlah perwakilan koalisi perubahan yang men...,positif,jumlah wakil koalisi ubah yang usung anies bag...
2,3,rt,RT @ajengcute16__: Merupakan Open Legal Policy...,31/05/2023 23:59,Twitter,Positive,@sri08054 (Sri anies),356.0,50.0,0.0,merupakan open legal policy perludem sangat be...,negatif,rupa open legal policy perludem sangat bahaya ...
3,4,rt,RT @Jatayu_45: JOKOWI HARUS MUNDUR DARI JABATA...,31/05/2023 23:59,Twitter,Neutral,@wongedan1708 (BAGong Modern),16.0,108.0,0.0,jokowi harus mundur dari jabatan presiden kala...,negatif,jokowi harus mundur dari jabat presiden kalau ...
4,5,mention,"Langkahi Presiden dan DPR, Demokrat: Bukan Wew...",31/05/2023 23:59,Twitter,Negative,@Simanjunta9Nico (Nico Simanjuntak),650.0,0.0,0.0,langkahi presiden dan dpr demokrat bukan wewen...,negatif,langkah presiden dan dpr demokrat bukan wewena...


In [3]:
# Make sure the text column holds strings only: missing values become ''
# before everything is cast to str.
stemmed_filled = df['stemmed_text'].fillna('')
df['stemmed_text'] = stemmed_filled.astype(str)

In [4]:
# Encode the sentiment labels as integer class ids.
label_encoder = LabelEncoder()
# Keep the original labels in a side column and always fit from it. Fitting
# directly on df['Sentiment1'] and overwriting it made this cell non-idempotent:
# a second run would fit the encoder on the integer codes, so
# label_encoder.classes_ would no longer contain the original label names.
if 'Sentiment1_raw' not in df.columns:
    df['Sentiment1_raw'] = df['Sentiment1']
df['Sentiment1'] = label_encoder.fit_transform(df['Sentiment1_raw'])

In [5]:
# Tokenization and padding settings
vocab_size, embedding_dim, max_length = 10000, 128, 200

# Fit the tokenizer on the stemmed text; '<OOV>' is the out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(df['stemmed_text'])

# Convert each text to a sequence of token ids, then pad/truncate at the end
# so every sequence has exactly max_length entries.
sequences = tokenizer.texts_to_sequences(df['stemmed_text'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating='post',
    padding='post',
)

In [6]:
# Prepare the labels: one-hot encode the integer sentiment codes.
encoded_sentiment = df['Sentiment1']
labels = to_categorical(encoded_sentiment)

In [7]:
# Define the CNN model
def create_model(vocab_size, embedding_dim, max_length, num_classes=3):
    """Build an uncompiled 1D-CNN text classifier.

    Architecture: embedding -> three Conv1D(128, kernel 5, ReLU) layers, the
    first two followed by MaxPooling1D(2) and the last by GlobalMaxPooling1D
    -> Dense(128, ReLU) -> Dropout(0.5) -> softmax output.

    Args:
        vocab_size: Vocabulary size, used as ``input_dim`` of the embedding.
        embedding_dim: Dimensionality of each token embedding vector.
        max_length: Length of the input token-id sequences.
        num_classes: Number of units in the softmax output layer. Defaults to
            3, the value that was previously hard-coded.

    Returns:
        A ``Sequential`` model. The caller is responsible for compiling it.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer rather than
        # Embedding's `input_length` argument, which Keras 3 deprecates.
        Input(shape=(max_length,)),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    return model

In [11]:
# K-Fold Cross Validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
cvscores = []    # validation accuracy (in percent) of each fold
histories = []   # Keras History object of each fold
fold_true = []   # true class ids of each fold's validation split
fold_pred = []   # predicted class ids of each fold's validation split

for train_index, test_index in kf.split(padded_sequences):
    X_train, X_val = padded_sequences[train_index], padded_sequences[test_index]
    y_train, y_val = labels[train_index], labels[test_index]

    # Train a freshly created model for every fold.
    model = create_model(vocab_size, embedding_dim, max_length)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

    history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val), verbose=1)
    # Keep every fold's history; `history` alone is overwritten each iteration.
    histories.append(history)

    # Evaluate the model on the validation split
    loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
    cvscores.append(accuracy * 100)

    # Predict the validation split and convert probabilities / one-hot rows to class ids
    y_pred_prob = model.predict(X_val)
    y_pred = np.argmax(y_pred_prob, axis=1)
    y_true = np.argmax(y_val, axis=1)

    # y_pred / y_true are overwritten on the next iteration, so store this
    # fold's predictions instead of losing everything but the last fold.
    fold_true.append(y_true)
    fold_pred.append(y_pred)

# Pooled out-of-fold predictions: each sample appears exactly once, predicted
# by the model of the fold in which it was held out. After the loop, y_true /
# y_pred (and model, X_val, y_val, history) still refer to the last fold only.
oof_true = np.concatenate(fold_true)
oof_pred = np.concatenate(fold_pred)

Epoch 1/20




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.5597 - loss: 0.8842 - val_accuracy: 0.8560 - val_loss: 0.3843
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.9073 - loss: 0.2670 - val_accuracy: 0.8840 - val_loss: 0.3094
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.9685 - loss: 0.1095 - val_accuracy: 0.8960 - val_loss: 0.3887
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.9864 - loss: 0.0613 - val_accuracy: 0.8880 - val_loss: 0.4068
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.9859 - loss: 0.0614 - val_accuracy: 0.8960 - val_loss: 0.4481
Epoch 6/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.9882 - loss: 0.0533 - val_accuracy: 0.8970 - val_loss: 0.3787
Epoch 7/20
[1m125/125[0m [32m━

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.9891 - loss: 0.0383 - val_accuracy: 0.8909 - val_loss: 0.6579
Epoch 20/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.9872 - loss: 0.0380 - val_accuracy: 0.8709 - val_loss: 0.6802
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [12]:
# Predict the validation split using the `model`, `X_val` and `y_val` that
# are currently in scope.
y_true = y_val.argmax(axis=1)          # one-hot rows -> class ids
y_pred_prob = model.predict(X_val)     # per-class probabilities
y_pred = y_pred_prob.argmax(axis=1)    # most probable class id per sample

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [13]:
# Mean accuracy over all folds
print(f"Mean Accuracy: {np.mean(cvscores):.2f}%\n")
# y_true / y_pred come from the model and validation split left over after the
# cross-validation loop, i.e. a single fold. Say so in the output so the report
# is not mistaken for an all-folds result next to the mean accuracy above.
print("Classification report (last fold only):")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

Mean Accuracy: 88.19%

              precision    recall  f1-score   support

     negatif       0.81      0.93      0.86       306
      netral       0.88      0.87      0.87       293
     positif       0.93      0.82      0.87       400

    accuracy                           0.87       999
   macro avg       0.87      0.88      0.87       999
weighted avg       0.88      0.87      0.87       999

