## Import Library

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADVAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Eksplorasi Dataset

In [3]:
# Baca data
df = pd.read_csv('data_muslimpro_reviews.csv')

print(df.head())

                                              review  rating
0                                       sangat bagus       5
1                                                bes       5
2                                        terimakasih       5
3                                        terimakasih       5
4  iklan gak bisa keluar, stak di shopee terus ik...       1


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  10000 non-null  object
 1   rating  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None


In [5]:
# Cek jumlah missing value per kolom
print("\nJumlah Missing Value per Kolom:")
print(df.isnull().sum())



Jumlah Missing Value per Kolom:
review    0
rating    0
dtype: int64


In [6]:
# Jumlah data total
print(f"\nJumlah total data: {len(df)}")


Jumlah total data: 10000


In [7]:
# Cek apakah ada kolom label
print("\nKolom yang tersedia:")
print(df.columns)


Kolom yang tersedia:
Index(['review', 'rating'], dtype='object')


## Labeling Sentimen Otomatis

In [8]:
# Mapping rating ke sentimen
def rating_to_sentiment(rating):
    if rating <= 2:
        return 'negatif'
    elif rating == 3:
        return 'netral'
    else:
        return 'positif'

# Buat kolom label
df['sentiment'] = df['rating'].apply(rating_to_sentiment)

# Cek distribusi label
print("Distribusi Sentimen:")
print(df['sentiment'].value_counts())


Distribusi Sentimen:
sentiment
positif    5820
negatif    3380
netral      800
Name: count, dtype: int64


## Preprocessing Teks

In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Hapus URL
    text = re.sub(r'http\S+|www\S+', '', text)
    # Hapus angka dan tanda baca
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenisasi dan hapus stopword
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

# Terapkan pembersihan teks
df['clean_review'] = df['review'].astype(str).apply(clean_text)

# Cek hasil
print(df[['review', 'clean_review']].head())


                                              review  \
0                                       sangat bagus   
1                                                bes   
2                                        terimakasih   
3                                        terimakasih   
4  iklan gak bisa keluar, stak di shopee terus ik...   

                                  clean_review  
0                                        bagus  
1                                          bes  
2                                  terimakasih  
3                                  terimakasih  
4  iklan gak stak shopee iklannya lihat sholat  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADVAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LSTM (Deep Learning)
Menggunakan arsitektur LSTM dengan layer embedding untuk klasifikasi sentimen.


### Preprocessing Teks untuk LSTM

In [10]:
# Tokenisasi untuk LSTM
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)  # 5000 kata paling sering

# Fit tokenizer dengan teks yang sudah dibersihkan
tokenizer.fit_on_texts(df['clean_review'])

# Ubah teks menjadi urutan angka
X_seq = tokenizer.texts_to_sequences(df['clean_review'])

# Padding agar panjang setiap sequence sama
X_pad = pad_sequences(X_seq, padding='post', maxlen=100)  # maxlen bisa disesuaikan

# Labeling dengan LabelEncoder untuk konversi ke numerik
le = LabelEncoder()
y_encoded = le.fit_transform(df['sentiment'])


### Split Data

In [11]:
# Split data menjadi train dan test (80:20)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_pad, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

### Membangun Model LSTM

In [12]:
# Model LSTM
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100))  # +1 untuk memasukkan indeks 0
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 kelas sentimen: positif, netral, negatif

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


### Training Model LSTM

In [13]:
# EarlyStopping untuk menghindari overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [14]:
history = model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, 
                    validation_data=(X_test_lstm, y_test_lstm), callbacks=[early_stopping])


Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 88ms/step - accuracy: 0.6725 - loss: 0.7306 - val_accuracy: 0.8955 - val_loss: 0.2847
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 90ms/step - accuracy: 0.9195 - loss: 0.2370 - val_accuracy: 0.9610 - val_loss: 0.1231
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 100ms/step - accuracy: 0.9674 - loss: 0.1131 - val_accuracy: 0.9735 - val_loss: 0.0799
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 69ms/step - accuracy: 0.9779 - loss: 0.0752 - val_accuracy: 0.9795 - val_loss: 0.0616
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 69ms/step - accuracy: 0.9806 - loss: 0.0639 - val_accuracy: 0.9835 - val_loss: 0.0552
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 69ms/step - accuracy: 0.9841 - loss: 0.0523 - val_accuracy: 0.9840 - val_loss: 0.0499
Epoch 7/10
[1m125

### Evaluasi Model LSTM

In [15]:
# Evaluasi model di data test
y_pred_lstm = model.predict(X_test_lstm)
y_pred_lstm = np.argmax(y_pred_lstm, axis=1)  # Ubah probabilitas jadi kelas

print("Akurasi LSTM:", accuracy_score(y_test_lstm, y_pred_lstm))
print("\nClassification Report:")
print(classification_report(y_test_lstm, y_pred_lstm))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step
Akurasi LSTM: 0.987

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       676
           1       0.97      0.93      0.95       160
           2       1.00      0.99      1.00      1164

    accuracy                           0.99      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



### Inference dengan Model LSTM

In [16]:
# Contoh inferensi
sample = ["aplikasi ini sangat membantu dalam mendalami Islam"]

# Preprocessing
sample_cleaned = [clean_text(sample[0])]
sample_seq = tokenizer.texts_to_sequences(sample_cleaned)
sample_pad = pad_sequences(sample_seq, padding='post', maxlen=100)

# Prediksi
prediction_lstm = model.predict(sample_pad)
predicted_class = le.inverse_transform([np.argmax(prediction_lstm)])

print("Prediksi sentimen dengan LSTM:", predicted_class[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Prediksi sentimen dengan LSTM: positif


In [18]:
import os
import pickle
import tensorflow as tf

# Pastikan folder model ada
os.makedirs('model', exist_ok=True)

# Simpan model LSTM
model.save('model/lstm_model.h5')

# Simpan tokenizer
with open('model/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)

# Simpan label encoder
with open('model/label_encoder.pkl', 'wb') as handle:
    pickle.dump(le, handle)

