In [5]:
import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk

# Sastrawi stop-word remover; pre_process_text calls its .remove(text).
stop_word = StopWordRemoverFactory().create_stop_word_remover()
# Sastrawi stemmer; pre_process_text calls its .stem(text).
stemmer = StemmerFactory().create_stemmer()

# Tokenizer that keeps only runs of word characters (\w+), which drops
# punctuation. pre_process_text re-joins the tokens with single spaces.
remove_tanda_baca = nltk.RegexpTokenizer(r"\w+")

def pre_process_text(text):
    """Clean one review: strip punctuation, stem, then remove stop words.

    Args:
        text: Raw review text. Missing values (``None`` or a float NaN, which
            is what pandas produces for an empty CSV cell) are treated as an
            empty review. Any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Keep only runs of word characters, re-joined with single spaces.
    text = ' '.join(remove_tanda_baca.tokenize(str(text)))
    # Stem first, then drop stop words from the stemmed text.
    return stop_word.remove(stemmer.stem(text))

def convert_rating(rating):
    """Map a numeric rating to a binary sentiment label.

    Args:
        rating: Numeric rating value.

    Returns:
        int: ``1`` when ``rating >= 3``; ``0`` otherwise. NaN also yields
        ``0`` because every ordered comparison with NaN is false.
    """
    return 1 if rating >= 3 else 0


# Fixed seed so the sampled subset is identical on every Restart & Run All.
SAMPLE_SEED = 42
# Upper bound on the number of reviews drawn from each sentiment class.
SAMPLES_PER_CLASS = 10000

df = pd.read_csv('./reviews_mandiri.csv')

# Rows without review text cannot be tokenized, so exclude them before
# sampling. `df` itself is left untouched as the raw frame.
reviews_with_text = df.dropna(subset=['review'])
positive_pool = reviews_with_text[reviews_with_text['rating'] >= 3]
negative_pool = reviews_with_text[reviews_with_text['rating'] < 3]

# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap
# n by the smaller pool. Using the same n for both keeps the classes balanced.
n_per_class = min(SAMPLES_PER_CLASS, len(positive_pool), len(negative_pool))

rating_positif = positive_pool.sample(n=n_per_class, axis=0, random_state=SAMPLE_SEED)
rating_negatif = negative_pool.sample(n=n_per_class, axis=0, random_state=SAMPLE_SEED)

# Balanced working set: binary label from the rating, cleaned review text.
slice_df = pd.concat([rating_positif, rating_negatif])
slice_df['is_positive'] = slice_df['rating'].apply(convert_rating)
slice_df['review'] = slice_df['review'].apply(pre_process_text)

In [6]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed seed so the train/test partition is reproducible across runs.
SPLIT_SEED = 42
# Tokenizer keeps only the most frequent words; emitted indices stay below this.
MAX_VOCAB = 35000
# Every sequence is padded or truncated to this many tokens.
MAX_LEN = 87

attribute = slice_df['review'].values
label = slice_df['is_positive'].values

# Stratify on the label so both partitions keep the same class ratio.
data_latih, data_test, label_latih, label_test = train_test_split(
    attribute,
    label,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=label,
)

# Fit the vocabulary on the training texts only; words unseen at fit time
# (or beyond MAX_VOCAB) map to the out-of-vocabulary token 'x'.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token='x')
tokenizer.fit_on_texts(data_latih)

sekuens_latih = tokenizer.texts_to_sequences(data_latih)
sekuens_test = tokenizer.texts_to_sequences(data_test)

# Post-padding: zeros are appended after the real tokens.
pad_latih = pad_sequences(sekuens_latih, padding='post', maxlen=MAX_LEN)
pad_test = pad_sequences(sekuens_test, padding='post', maxlen=MAX_LEN)

In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

# Baseline model: embedding -> LSTM -> single sigmoid unit (binary output).
model_base = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        # Must cover every index the tokenizer can emit. The tokenizer was
        # built with num_words=35000, so a hard-coded 15000 here allowed
        # out-of-range lookups once the vocabulary grew past 15000 words.
        input_dim=tokenizer.num_words,
        # NOTE(review): a 1-dimensional embedding is unusually small; confirm
        # this is intentional.
        output_dim=1,
        # Match the padded sequence length produced by pad_sequences.
        input_length=pad_latih.shape[1],
    ),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the baseline model.
# NOTE(review): learning_rate=0.1 is far above Adam's default of 0.001 and may
# prevent convergence; left unchanged here, confirm before relying on results.
model_base.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.1),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stop_callback = EarlyStopping(
    monitor='val_accuracy',  # Metric to watch (accuracy on the validation data)
    patience=10,  # Epochs to wait without improvement before stopping
    restore_best_weights=True  # Roll back to the best weights seen during training
)
# NOTE(review): the test split doubles as validation data, so early stopping
# selects weights using the same data later used to judge the model.
training = model_base.fit(
    pad_latih,
    label_latih,
    epochs=50,
    validation_data=(pad_test, label_test),
    callbacks=[early_stop_callback]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


In [124]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The model was trained on text cleaned by pre_process_text, so the sentence
# to classify must go through the same cleaning before tokenization.
cleaned_sentence = pre_process_text("lemottt")
test_sequence = tokenizer.texts_to_sequences([cleaned_sentence])
# Pad to the same length as the training sequences.
padded_sentence = pad_sequences(test_sequence, padding='post', maxlen=pad_latih.shape[1])

# Predict with the model.
predictions = model_base.predict(padded_sentence)

# Decision thresholds: at or above the upper bound is positive, at or below
# the lower bound is negative, anything in between is neutral.
treshold_positif = 0.7
treshold_negatif = 0.3

print('prob : ',predictions[0])

# predictions[0] is a 1-element array (one sigmoid unit); take the scalar so
# the comparisons below operate on a plain float.
probability = float(predictions[0][0])

# Classify the result against the thresholds.
if probability >= treshold_positif:
    hasil_klasifikasi = "positif"
elif probability <= treshold_negatif:
    hasil_klasifikasi = "negatif"
else:
    hasil_klasifikasi = "netral"

print('Kalimat Anda diprediksi sebagai ', hasil_klasifikasi)


prob :  [0.17986178]
Kalimat Anda diprediksi sebagai  negatif


In [38]:
# Persist the trained model to 'model_sigmoid.keras'.
# NOTE(review): no visible cell saves the fitted `tokenizer`; reusing this
# model elsewhere presumably needs the same tokenizer — confirm it is stored.
model_base.save('model_sigmoid.keras')