## **Proses Instalasi**


In [19]:

!pip install numpy
!pip install pandas
!pip install gensim tensorflow
!pip install scikit-learn nltk imblearn



In [20]:
!pip install emoji




In [21]:
!pip install PySastrawi



## **Import Library**


In [22]:
import pandas as pd
import re
import emoji
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from gensim.models import Word2Vec
from Sastrawi.Stemmer import StemmerFactory
from imblearn.over_sampling import SMOTE
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Inisialisasi Stemmer dan Kamus Gen Z**


In [23]:
# Initialise the Sastrawi stemmer.
# The factory class is imported here from the Sastrawi.Stemmer.StemmerFactory
# module. NOTE(review): the earlier `from Sastrawi.Stemmer import StemmerFactory`
# in the import cell presumably binds the module rather than the class, which
# would make this re-import necessary — confirm before removing either line.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Load the Gen Z slang dictionary: each value of the 'slang' column becomes a
# key and the matching 'formal' value becomes its replacement.
slang_df = pd.read_csv('kamus-gen-z.csv')
slang_dict = dict(zip(slang_df['slang'], slang_df['formal']))

## **Membaca Dataset Ulasan APK**

In [24]:
# Read the app-review dataset, then print its row count and first rows as a
# quick sanity check.
df = pd.read_csv('dataset_apk.csv')
print("Jumlah data:", len(df))
print(df.head())

Jumlah data: 10000
                               reviewId         userName  \
0  130b89b8-2a69-4cc8-ad48-ea73d44f265f  Pengguna Google   
1  83672bfd-f4d3-44c9-a13a-09436478e108  Pengguna Google   
2  e6f584a1-ba88-4609-8243-d62e7dce5982  Pengguna Google   
3  079c5082-b38a-4c82-a78f-5e0ab7496444  Pengguna Google   
4  820caf85-f371-445f-80db-6661376e9ce4  Pengguna Google   

                                           userImage  \
0  https://play-lh.googleusercontent.com/EGemoI2N...   
1  https://play-lh.googleusercontent.com/EGemoI2N...   
2  https://play-lh.googleusercontent.com/EGemoI2N...   
3  https://play-lh.googleusercontent.com/EGemoI2N...   
4  https://play-lh.googleusercontent.com/EGemoI2N...   

                                             content  score  thumbsUpCount  \
0                                 bagus buanget cokk      5              0   
1  Kalau miliastra wonderland dipisahkan dari gen...      1              0   
2  Gamenya bagus sekali, grafik memanjakan sekali

## **Pelabelan dan Pembersihan Data Ulasan**

In [25]:
# Sentiment labelling
def label_sentiment(score):
    """Map a numeric review score to a sentiment label.

    Scores of 2 or lower give 'negatif', a score of exactly 3 gives 'netral',
    and every other value gives 'positif'.
    """
    if score <= 2:
        return 'negatif'
    return 'netral' if score == 3 else 'positif'

# Derive a sentiment label for every review from its score and print the
# resulting class distribution.
df['sentiment'] = df['score'].apply(label_sentiment)
print("Distribusi awal:\n", df['sentiment'].value_counts().to_string())

# NLTK's Indonesian stopword list, extended with a few extra function words
stop_words = set(stopwords.words('indonesian')) | {'dan', 'yang', 'di', 'ke', 'nya', 'ini', 'itu'}

# Text-cleaning function
def clean_text(text):
    """Normalise one raw review into a stemmed, stopword-free string.

    Steps: lowercase, strip emoji, turn punctuation into spaces, drop digits,
    remove stopwords (except a small keep-list), map slang to its formal form
    through ``slang_dict`` and finally stem with the Sastrawi ``stemmer``.

    Args:
        text: Raw review text. ``None`` and NaN (a missing value) are treated
            as an empty review.

    Returns:
        The cleaned, stemmed text; an empty string for missing input.
    """
    # A missing review would otherwise be stringified into the literal
    # token "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = emoji.replace_emoji(text, replace='')
    # Replace punctuation with a space instead of deleting it, so that
    # "bagus,mantap" or "lihat-lihat" remain two tokens rather than being
    # fused into a single unknown word.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)
    # split() already collapses runs of whitespace and trims both ends.
    words = text.split()
    # Short sentiment-bearing words that are kept even when listed as stopwords
    keep_words = {'oke', 'bagus', 'top'}
    tokens = []
    for word in words:
        if word in stop_words and word not in keep_words:
            continue
        formal = slang_dict.get(word, word)
        # A blank 'formal' cell may be loaded as a non-string (e.g. NaN);
        # fall back to the original word so the join below cannot fail.
        tokens.append(formal if isinstance(formal, str) else word)
    return stemmer.stem(' '.join(tokens))

# Clean every review once; the cleaned text feeds the TF-IDF features below.
df['cleaned_content'] = df['content'].apply(clean_text)

# Oversampling with SMOTE on top of TF-IDF features.
# The vectorizer keeps at most 10000 unigram/bigram features and the sparse
# result is converted to a dense array before resampling.
# NOTE(review): both the TF-IDF fit and the SMOTE resampling run on the full
# dataset, while train_test_split is only called afterwards in the training
# cell. The test partitions therefore contain synthetic rows and rows that
# influenced the synthetic training rows — reported test scores are likely
# optimistic. Splitting first and resampling only the training part would
# avoid this, but changes the variables consumed by later cells.
tfidf = TfidfVectorizer(max_features=10000, stop_words=list(stop_words), ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['cleaned_content']).toarray()
# One-hot labels; np.argmax below turns them back into integer class ids.
y = pd.get_dummies(df['sentiment']).values
smote = SMOTE(random_state=42)
X_tfidf_smote, y_smote = smote.fit_resample(X_tfidf, np.argmax(y, axis=1))
y_smote = pd.get_dummies(y_smote).values
# Rebuild a text column for the resampled rows from their TF-IDF vectors.
# NOTE(review): inverse_transform presumably returns only the set of terms
# with non-zero weight, so original word order and repetition are not
# recovered — confirm this is acceptable for the sequence model trained later.
df_balanced = pd.DataFrame({'cleaned_content': [' '.join(doc) for doc in tfidf.inverse_transform(X_tfidf_smote)], 'sentiment': np.argmax(y_smote, axis=1)})
# Integer id -> label name; assumes get_dummies ordered the columns as
# negatif, netral, positif — TODO confirm.
df_balanced['sentiment'] = df_balanced['sentiment'].map({0: 'negatif', 1: 'netral', 2: 'positif'})
print(f"Jumlah data setelah SMOTE: {len(df_balanced)}")
print("Distribusi setelah SMOTE:\n", df_balanced['sentiment'].value_counts().to_string())

# Model evaluation helper
def evaluate_model(y_true, y_pred, set_name=""):
    """Print the accuracy and a per-class report, and return the accuracy.

    Args:
        y_true: Ground-truth integer class ids (0=negatif, 1=netral, 2=positif).
        y_pred: Predicted integer class ids using the same encoding.
        set_name: Name of the evaluated split shown in the printed header,
            e.g. "Training" or "Testing".

    Returns:
        The accuracy as a fraction between 0 and 1.
    """
    class_names = ['negatif', 'netral', 'positif']
    accuracy = accuracy_score(y_true, y_pred)
    print(f"\nAkurasi {set_name}: {accuracy * 100:.2f}%")
    # Pin the label set explicitly: without `labels`, the report raises when a
    # class is missing from both arrays because the number of observed classes
    # no longer matches the three target names. zero_division=0 keeps the
    # default score of 0 for such a class but without the warning.
    print(classification_report(
        y_true,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    ))
    return accuracy

Distribusi awal:
 sentiment
positif    6345
negatif    3028
netral      627




Jumlah data setelah SMOTE: 19035
Distribusi setelah SMOTE:
 sentiment
positif    6345
negatif    6345
netral     6345


## **Pelatihan Model**

In [26]:
# --- Shared helpers for the three training schemes ---
def _build_dense_classifier(input_dim):
    """Build and compile the feed-forward TF-IDF classifier (schemes 1 and 3).

    Args:
        input_dim: Number of TF-IDF features per sample.

    Returns:
        A compiled Sequential model with a 3-way softmax output.
    """
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(512, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.4),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.0005), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def _fit_and_evaluate(model, X_train, y_train, X_test, y_test, callbacks):
    """Train `model`, then report its accuracy on the train and test splits.

    Args:
        model: A compiled Keras model.
        X_train, y_train: Training features and one-hot labels.
        X_test, y_test: Held-out features and one-hot labels.
        callbacks: Keras callbacks passed to `fit`.

    Returns:
        Tuple (y_pred_train, y_pred_test, y_train_cat, y_test_cat,
        train_acc, test_acc) where the first four are integer class ids.
    """
    model.fit(X_train, y_train, epochs=30, batch_size=64, validation_split=0.1, callbacks=callbacks, verbose=1)
    y_pred_train = np.argmax(model.predict(X_train), axis=1)
    y_pred_test = np.argmax(model.predict(X_test), axis=1)
    y_train_cat = np.argmax(y_train, axis=1)
    y_test_cat = np.argmax(y_test, axis=1)
    train_acc = evaluate_model(y_train_cat, y_pred_train, "Training")
    test_acc = evaluate_model(y_test_cat, y_pred_test, "Testing")
    return y_pred_train, y_pred_test, y_train_cat, y_test_cat, train_acc, test_acc


# Callbacks shared by all three training runs
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.00001)
training_callbacks = [early_stopping, lr_scheduler]

# NOTE(review): X_tfidf_smote / y_smote were oversampled before any of the
# splits below, so every test partition contains synthetic SMOTE rows; treat
# the reported test accuracy as optimistic.

# --- Model 1: Dense neural network on TF-IDF features (80/20 split) ---
print("\n=== Neural Network (Dense Layers) dengan Fitur TF-IDF (Data Split 80/20) ===")
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_tfidf_smote, y_smote, test_size=0.2, random_state=42)

# The input width follows the actual feature matrix instead of a hard-coded
# 10000, so the model still builds when the TF-IDF vocabulary is smaller.
model1 = _build_dense_classifier(X_train1.shape[1])
(y_pred_train1, y_pred_test1, y_train1_cat, y_test1_cat,
 train_acc1, test_acc1) = _fit_and_evaluate(model1, X_train1, y_train1, X_test1, y_test1, training_callbacks)

# --- Model 2: LSTM neural network with Word2Vec embeddings (80/20 split) ---
print("\n=== LSTM Neural Network dengan Embedding Word2Vec (Data Split 80/20) ===")
vocab_size = 10000      # tokenizer vocabulary cap == number of embedding rows
embedding_dim = 200     # Word2Vec vector size == embedding width

sentences = [text.split() for text in df_balanced['cleaned_content']]
w2v_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4, epochs=20)

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_balanced['cleaned_content'])
X_seq = tokenizer.texts_to_sequences(df_balanced['cleaned_content'])
max_len = 100
X_pad = pad_sequences(X_seq, maxlen=max_len)

# Copy the Word2Vec vector of every in-vocabulary word into its tokenizer row;
# rows without a Word2Vec vector stay zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size and word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_pad, y_smote, test_size=0.2, random_state=42)

model2 = Sequential([
    # An explicit Input layer replaces the legacy `input_length=` argument, and
    # a Constant initializer replaces the legacy `weights=[...]` argument; both
    # load the same pretrained matrix without relying on Keras 2-only kwargs.
    Input(shape=(max_len,), dtype='int32'),
    Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=True,
    ),
    Bidirectional(LSTM(256, return_sequences=True, kernel_regularizer=l2(0.005))),
    LSTM(128),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=l2(0.005)),
    Dropout(0.5),
    Dense(3, activation='softmax')
])
model2.compile(optimizer=Adam(learning_rate=0.0005), loss='categorical_crossentropy', metrics=['accuracy'])

(y_pred_train2, y_pred_test2, y_train2_cat, y_test2_cat,
 train_acc2, test_acc2) = _fit_and_evaluate(model2, X_train2, y_train2, X_test2, y_test2, training_callbacks)

# --- Model 3: Dense neural network on TF-IDF features (70/30 split) ---
print("\n=== Neural Network (Dense Layers) dengan Fitur TF-IDF (Data Split 70/30) ===")
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_tfidf_smote, y_smote, test_size=0.3, random_state=42)

model3 = _build_dense_classifier(X_train3.shape[1])
(y_pred_train3, y_pred_test3, y_train3_cat, y_test3_cat,
 train_acc3, test_acc3) = _fit_and_evaluate(model3, X_train3, y_train3, X_test3, y_test3, training_callbacks)


=== Neural Network (Dense Layers) dengan Fitur TF-IDF (Data Split 80/20) ===
Epoch 1/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 94ms/step - accuracy: 0.5415 - loss: 1.4403 - val_accuracy: 0.7630 - val_loss: 0.8181 - learning_rate: 5.0000e-04
Epoch 2/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 95ms/step - accuracy: 0.8095 - loss: 0.7226 - val_accuracy: 0.8253 - val_loss: 0.7459 - learning_rate: 5.0000e-04
Epoch 3/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 92ms/step - accuracy: 0.8568 - loss: 0.6403 - val_accuracy: 0.8313 - val_loss: 0.7208 - learning_rate: 5.0000e-04
Epoch 4/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 101ms/step - accuracy: 0.8830 - loss: 0.5846 - val_accuracy: 0.8293 - val_loss: 0.7152 - learning_rate: 5.0000e-04
Epoch 5/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 92ms/step - accuracy: 0.8963 - loss: 0.5666 - val_accuracy: 0.8306 - val_lo



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 2s/step - accuracy: 0.5274 - loss: 3.3191 - val_accuracy: 0.6461 - val_loss: 1.3671 - learning_rate: 5.0000e-04
Epoch 2/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 2s/step - accuracy: 0.6311 - loss: 1.2903 - val_accuracy: 0.6881 - val_loss: 1.0385 - learning_rate: 5.0000e-04
Epoch 3/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 2s/step - accuracy: 0.6919 - loss: 0.9889 - val_accuracy: 0.7104 - val_loss: 0.8592 - learning_rate: 5.0000e-04
Epoch 4/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 2s/step - accuracy: 0.7293 - loss: 0.8409 - val_accuracy: 0.7649 - val_loss: 0.7645 - learning_rate: 5.0000e-04
Epoch 5/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 2s/step - accuracy: 0.7750 - loss: 0.7205 - val_accuracy: 0.7932 - val_loss: 0.6752 - learning_rate: 5.0000e-04
Epoch 6/30
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

## **Prediksi Teks Baru / Belum ada di training**

In [27]:
# Inference helper
def predict_sentiment(text, model, vectorizer, is_word2vec=False, tokenizer=None, max_len=100):
    """Predict the sentiment label of a single raw text.

    The text is cleaned with ``clean_text`` and then encoded in one of two
    ways before ``model.predict`` is called:

    * ``is_word2vec=True``: token ids from ``tokenizer``, padded to ``max_len``.
    * otherwise: a dense TF-IDF vector produced by ``vectorizer``.

    Returns:
        'negatif', 'netral' or 'positif', chosen by the arg-max of the
        model output.
    """
    labels = ['negatif', 'netral', 'positif']
    cleaned = clean_text(text)
    if not is_word2vec:
        features = vectorizer.transform([cleaned]).toarray()
    else:
        token_ids = tokenizer.texts_to_sequences([cleaned])
        features = pad_sequences(token_ids, maxlen=max_len)
    scores = model.predict(features)
    best_class = np.argmax(scores, axis=1)[0]
    return labels[best_class]

# Example: run one unseen sentence through each of the three trained models
sample_text = "Gimana ya, dilihat lihat dari komentarnya sepertinya game ini menarik sekali"
print("\nContoh Inference:")
print(f"Skema 1 (Dense+TF-IDF): {predict_sentiment(sample_text, model1, tfidf)}")
print(f"Skema 2 (LSTM+Word2Vec): {predict_sentiment(sample_text, model2, None, True, tokenizer, max_len)}")
print(f"Skema 3 (Dense+TF-IDF): {predict_sentiment(sample_text, model3, tfidf)}")


Contoh Inference:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
Skema 1 (Dense+TF-IDF): positif
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
Skema 2 (LSTM+Word2Vec): positif
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Skema 3 (Dense+TF-IDF): positif
