In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

In [2]:
# 1. Membaca dataset
data = pd.read_excel("Indonlu_Sentiment.xlsx")  # Ganti dengan lokasi dataset Anda
print("Sample Data Sebelum Preprocessing:")
print(data.head())
# 2. Preprocessing
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Menghapus angka dan tanda baca
    text = text.lower()  # Mengubah semua huruf menjadi huruf kecil
    return text
# Preprocessing data
data['cleaned_review'] = data['Tweet'].apply(preprocess_text)
# Menampilkan sample setelah preprocessing
print("\nSample Data Setelah Preprocessing:")
print(data['cleaned_review'].head())
# 3. Mengonversi label menjadi numerik
le = LabelEncoder()
data['label'] = le.fit_transform(data['Label'])
print(data['label'].head())

Sample Data Sebelum Preprocessing:
                                               Tweet     Label
0  warung ini dimiliki oleh pengusaha pabrik tahu...  positive
1  mohon ulama lurus dan k212 mmbri hujjah partai...   neutral
2  lokasi strategis di jalan sumatera bandung . t...  positive
3  betapa bahagia nya diri ini saat unboxing pake...  positive
4  duh . jadi mahasiswa jangan sombong dong . kas...  negative

Sample Data Setelah Preprocessing:
0    warung ini dimiliki oleh pengusaha pabrik tahu...
1    mohon ulama lurus dan k mmbri hujjah partai ap...
2    lokasi strategis di jalan sumatera bandung  te...
3    betapa bahagia nya diri ini saat unboxing pake...
4    duh  jadi mahasiswa jangan sombong dong  kasih...
Name: cleaned_review, dtype: object
0    2
1    1
2    2
3    2
4    0
Name: label, dtype: int32


In [3]:
# 4. Memisahkan data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], data['label'], 
                                                    test_size=0.2, random_state=42)
# Tokenisasi teks
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [4]:
# 5. Membangun model LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')  # Menggunakan 3 kelas
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # Menggunakan sparse_categorical_crossentropy
model.summary()
# 6. Melatih model
model.fit(X_train_padded, y_train, epochs=5, validation_data=(X_test_padded, y_test), batch_size=32)



Epoch 1/5
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 235ms/step - accuracy: 0.5641 - loss: 0.9430 - val_accuracy: 0.5666 - val_loss: 0.9195
Epoch 2/5
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 210ms/step - accuracy: 0.5760 - loss: 0.9287 - val_accuracy: 0.5666 - val_loss: 0.9216
Epoch 3/5
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 101ms/step - accuracy: 0.5739 - loss: 0.9242 - val_accuracy: 0.5666 - val_loss: 0.9206
Epoch 4/5
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 83ms/step - accuracy: 0.5792 - loss: 0.9192 - val_accuracy: 0.5666 - val_loss: 0.9251
Epoch 5/5
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 184ms/step - accuracy: 0.5866 - loss: 0.9144 - val_accuracy: 0.5666 - val_loss: 0.9196


<keras.src.callbacks.history.History at 0x187cb191070>

In [5]:
# 7. Uji Sentimen dengan Teks Baru
new_text = ["Film ini sangat bagus, saya menyukainya"]
new_text_cleaned = [preprocess_text(text) for text in new_text]
new_text_seq = tokenizer.texts_to_sequences(new_text_cleaned)
new_text_padded = pad_sequences(new_text_seq, maxlen=max_length, padding='post', truncating='post')

# Prediksi
prediction = model.predict(new_text_padded)
predicted_label = np.argmax(prediction, axis=1)  # Mengambil kelas dengan probabilitas tertinggi

# Mengambil label dari encoded predictions
sentiment_map = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}
predicted_sentiment = sentiment_map[predicted_label[0]]

print("\nHasil Uji Sentimen:")
print(f"Teks: {new_text[0]}")
print(f"Sentimen: {predicted_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step

Hasil Uji Sentimen:
Teks: Film ini sangat bagus, saya menyukainya
Sentimen: Positif
