# **lib**

In [26]:
!pip install nltk



In [27]:
import pandas as pd
import re
import numpy as np
import nltk

from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [29]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.regularizers import l2

In [9]:
from google.colab import files
uploaded = files.upload()

Saving playstore_reviews.csv to playstore_reviews.csv


In [30]:
df = pd.read_csv('playstore_reviews.csv')

# **# Preprocessing: Membersihkan teks**

In [31]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)  # Hapus karakter khusus
    return text

df['clean_review'] = df['review'].astype(str).apply(clean_text)

## **Labeling: Mengelompokkan rating menjadi Sentimen (Negatif, Netral, Positif)**

In [32]:
def label_sentiment_vader(text):
    score = sia.polarity_scores(text)['compound']
    if score <= -0.05:
        return "Negatif"
    elif score >= 0.05:
        return "Positif"
    else:
        return "Netral"

df["sentiment"] = df["clean_review"].apply(label_sentiment_vader)

In [33]:
# Ekstraksi fitur menggunakan TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_review'])
y = df['sentiment']

# Split data untuk training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Simpan hasil preprocessing
df.to_csv('processed_reviews.csv', index=False)

print("Data berhasil diproses dan disimpan.")

Data berhasil diproses dan disimpan.


# **Membangun Model**

In [34]:
df = pd.read_csv('processed_reviews.csv')

# Encoding label sentimen
label_encoder = LabelEncoder()
df['sentiment_label'] = label_encoder.fit_transform(df['sentiment'])

# Tokenisasi teks
max_words = 5000
max_len = 200
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_review'].astype(str))
X = tokenizer.texts_to_sequences(df['clean_review'].astype(str))
X = pad_sequences(X, maxlen=max_len, padding='post')
y = df['sentiment_label']

# Split data untuk training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = keras.Sequential([
    keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Dropout(0.3),
    Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(keras.layers.LSTM(32, kernel_regularizer=l2(0.02))),
    Dropout(0.3),
    keras.layers.Dense(16, activation='tanh', kernel_regularizer=l2(0.02)),
    keras.layers.Dense(3, activation='softmax')
])

# Optimizer
optimizer = keras.optimizers.Adam(learning_rate=0.0005)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Callbacks
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6)

# Pelatihan model
epochs = 15
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=epochs, batch_size=64, callbacks=[early_stopping, reduce_lr])

# Simpan model
model.save('sentiment_model.keras')
print("Pelatihan selesai dan model disimpan.")

Epoch 1/15




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 715ms/step - accuracy: 0.7729 - loss: 5.5888 - val_accuracy: 0.8000 - val_loss: 3.6409 - learning_rate: 5.0000e-04
Epoch 2/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 713ms/step - accuracy: 0.8232 - loss: 3.1879 - val_accuracy: 0.8000 - val_loss: 2.2026 - learning_rate: 5.0000e-04
Epoch 3/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 720ms/step - accuracy: 0.8144 - loss: 1.9607 - val_accuracy: 0.8000 - val_loss: 1.4221 - learning_rate: 5.0000e-04
Epoch 4/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 692ms/step - accuracy: 0.8059 - loss: 1.2934 - val_accuracy: 0.8010 - val_loss: 0.9681 - learning_rate: 5.0000e-04
Epoch 5/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 740ms/step - accuracy: 0.8487 - loss: 0.8255 - val_accuracy: 0.9190 - val_loss: 0.5898 - learning_rate: 5.0000e-04
Epoch 6/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

# **testing model (inference)**

In [35]:
from keras.models import load_model
model = load_model('sentiment_model.keras')

In [36]:

# Contoh review baru untuk diuji
new_review = ["Aplikasinya sangat bagus dan membantu!"]

new_seq = tokenizer.texts_to_sequences(new_review)
new_padded = pad_sequences(new_seq, maxlen=200, padding='post')

# Lakukan prediksi
prediction = model.predict(new_padded)

# Tampilkan hasil prediksi
predicted_class = np.argmax(prediction)
print(f"Prediksi Sentimen: {predicted_class}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 651ms/step
Prediksi Sentimen: 1


In [37]:
# Mapping indeks ke label sentimen
label_map = {0: "Negatif", 1: "Netral", 2: "Positif"}

# Prediksi
predicted_class = np.argmax(prediction)
predicted_label = label_map[predicted_class]

print(f"Prediksi Sentimen: {predicted_label}")

Prediksi Sentimen: Netral


In [18]:
!pip freeze > requirements.txt