In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for preprocessing text
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(cleaned_tokens)

# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')  # Load your CSV file

# Preprocess text data
data['content'] = data['content'].apply(preprocess_text)

# Remove empty content
data = data[data['content'].str.strip() != '']  # Remove empty reviews

# Features and labels
X = data['content']
y = data['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Step 1: Vektorisasi data teks
vectorizer = TfidfVectorizer(max_features=10)  # Mengurangi jumlah fitur
X_train_vectorized = vectorizer.fit_transform(X_train)

# Mengonversi ke dense matrix dengan hati-hati
X_train_dense = X_train_vectorized.toarray()  # Memastikan data kecil

# Resampling menggunakan SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_dense, y_train)

# Hyperparameters
max_words = 5000  # Untuk LSTM dan Random Forest
max_len = 150     # Panjang maksimum input untuk LSTM
embedding_dim = 128  # Dimensionality of embedding untuk LSTM

# Label Binarization untuk label yang sudah di-resample
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train_resampled)

# Flattening untuk kompatibilitas dengan model
y_train_flat = y_train_onehot.argmax(axis=1)

# Cek ukuran matriks resampled
print(f"Shape of resampled matrix: {X_train_resampled.shape}")

In [30]:
# --- Define and Train the LSTM Model ---
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.01))),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train the LSTM model
lstm_model.fit(X_train_resampled, y_train_onehot_resampled, epochs=100, batch_size=64,
               validation_data=(X_test_padded, y_test_onehot), callbacks=[early_stopping])

# Evaluate the LSTM model
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(X_test_padded, y_test_onehot)

# Predict with the LSTM model
y_pred_lstm = lstm_model.predict(X_test_padded).argmax(axis=1)

# LSTM metrics
lstm_precision = precision_score(y_test_flat, y_pred_lstm, average='weighted')
lstm_recall = recall_score(y_test_flat, y_pred_lstm, average='weighted')
lstm_f1 = f1_score(y_test_flat, y_pred_lstm, average='weighted')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


In [31]:
# --- Random Forest Model ---
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_flat)

# Predict with Random Forest
y_pred_rf = rf_model.predict(X_test_padded)

# Random Forest metrics
rf_accuracy = accuracy_score(y_test_flat, y_pred_rf)
rf_precision = precision_score(y_test_flat, y_pred_rf, average='weighted')
rf_recall = recall_score(y_test_flat, y_pred_rf, average='weighted')
rf_f1 = f1_score(y_test_flat, y_pred_rf, average='weighted')


In [36]:
# --- XGBoost Model ---
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train_resampled, y_train_flat)

# Predict with XGBoost
y_pred_xgb = xgb_model.predict(X_test_padded)

# XGBoost metrics
xgb_accuracy = accuracy_score(y_test_flat, y_pred_xgb)
xgb_precision = precision_score(y_test_flat, y_pred_xgb, average='weighted')
xgb_recall = recall_score(y_test_flat, y_pred_xgb, average='weighted')
xgb_f1 = f1_score(y_test_flat, y_pred_xgb, average='weighted')

In [13]:
# Train Naive Bayes model
nb_model = MultinomialNB(alpha=0.001)  # Adjust alpha for smoothing
nb_model.fit(X_train_resampled, y_train_flat)

# Make predictions
y_pred = nb_model.predict(X_test_padded)

# Evaluate the model
accuracy = accuracy_score(y_test_flat, y_pred)
precision = precision_score(y_test_flat, y_pred, average='weighted')
recall = recall_score(y_test_flat, y_pred, average='weighted')
f1 = f1_score(y_test_flat, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [47]:
# --- Model Results ---
results = {
    "Model": ["LSTM", "Random Forest", "XGBoost", "Naive Bayes"],
    "Accuracy": [lstm_test_accuracy, rf_accuracy, xgb_accuracy, accuracy],
    "Precision": [lstm_precision, rf_precision, xgb_precision, precision],
    "Recall": [lstm_recall, rf_recall, xgb_recall, recall],
    "F1-Score": [lstm_f1, rf_f1, xgb_f1, f1]
}

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the DataFrame as a table
print("\nComparison of Model Results With SMOTE:")
print(results_df)


Comparison of Model Results With SMOTE:
           Model  Accuracy  Precision    Recall  F1-Score
0           LSTM  0.801792   0.772197  0.801792  0.783995
1  Random Forest  0.701549   0.717198  0.701549  0.702007
2        XGBoost  0.705954   0.746207  0.705954  0.724851
3    Naive Bayes  0.609508   0.650668  0.609508  0.569608
