In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter

In [24]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for preprocessing text
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(cleaned_tokens)

# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')  # Load your CSV file

# Preprocess text data
data['content'] = data['content'].apply(preprocess_text)

# Remove empty content
data = data[data['content'].str.strip() != '']  # Remove empty reviews

# Features and labels
X = data['content']
y = data['score']

# Count occurrences of each class
class_counts = Counter(y)

# Define target number of samples per class
target_samples = 10000

# List to store undersampled data
undersampled_data = []

# Loop through each class and adjust samples
for label in class_counts.keys():
    if class_counts[label] >= target_samples:
        # Take a random sample of size target_samples
        undersampled_data.append(data[data['score'] == label].sample(target_samples, random_state=42))
    else:
        # If the class has fewer samples than target_samples, replicate the data
        class_data = data[data['score'] == label]
        # Calculate how many times to replicate and get the remainder
        times_to_replicate = target_samples // class_counts[label]
        remainder = target_samples % class_counts[label]
        
        # Create the replicated dataset
        replicated_data = pd.concat([class_data] * times_to_replicate + [class_data.sample(remainder, random_state=42)])
        undersampled_data.append(replicated_data)

# Concatenate all undersampled data
undersampled_data = pd.concat(undersampled_data)

# Features and labels after undersampling
X = undersampled_data['content']
y = undersampled_data['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Hyperparameters for text data
max_words = 10000
max_len = 150
embedding_dim = 128

# Tokenization and Padding
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Label Binarization
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Check distribution after undersampling
print(f"Undersampled y_train distribution: {Counter(y_train)}")

# Ensure y_train_onehot is compatible with SMOTE
if len(set(y_train_onehot.flatten())) < 2:
    print("Not enough classes for SMOTE.")
else:
    # Resampling using SMOTE
    smote = SMOTE(sampling_strategy='auto', random_state=42)  # Generates a balance
    X_train_resampled, y_train_onehot_resampled = smote.fit_resample(X_train_padded, y_train_onehot)

# Flattening for compatibility with models
y_train_flat = y_train_onehot_resampled.argmax(axis=1)
y_test_flat = y_test_onehot.argmax(axis=1)

# Debugging: Check class distribution
print(f"Resampled y_train distribution: {Counter(y_train_flat)}")

Undersampled y_train distribution: Counter({5: 7524, 4: 7523, 3: 7510, 2: 7494, 1: 7449})
Resampled y_train distribution: Counter({3: 7524, 4: 7524, 0: 7524, 2: 7524, 1: 7524})


In [26]:
# --- Define and Train the LSTM Model ---
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.01))),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train the LSTM model
lstm_model.fit(X_train_resampled, y_train_onehot_resampled, epochs=100, batch_size=64,
               validation_data=(X_test_padded, y_test_onehot), callbacks=[early_stopping])

# Evaluate the LSTM model
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(X_test_padded, y_test_onehot)

# Predict with the LSTM model
y_pred_lstm = lstm_model.predict(X_test_padded).argmax(axis=1)

# LSTM metrics
lstm_precision = precision_score(y_test_flat, y_pred_lstm, average='weighted')
lstm_recall = recall_score(y_test_flat, y_pred_lstm, average='weighted')
lstm_f1 = f1_score(y_test_flat, y_pred_lstm, average='weighted')

print(f"LSTM Test Accuracy: {lstm_test_accuracy}")
print(f"LSTM Precision: {lstm_precision}")
print(f"LSTM Recall: {lstm_recall}")
print(f"LSTM F1-Score: {lstm_f1}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [53]:
# --- Random Forest Model ---
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_resampled, y_resampled)

# Predict with Random Forest
y_pred_rf = rf_model.predict(X_test_tfidf)

# Random Forest metrics
rf_accuracy = accuracy_score(y_test_flat, y_pred_rf)
rf_precision = precision_score(y_test_flat, y_pred_rf, average='weighted')
rf_recall = recall_score(y_test_flat, y_pred_rf, average='weighted')
rf_f1 = f1_score(y_test_flat, y_pred_rf, average='weighted')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Reshape y_train agar sesuai dengan dimensi X_train_tfidf
y_train_flat = y_train.values.flatten()  # pastikan y_train_flat adalah array 1D

# Terapkan SMOTE
smote = SMOTE(random_state=42)

# Lakukan resampling pada X_train_tfidf dan y_train_flat
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train_flat)

# Lihat hasil setelah SMOTE
print(f"Shape of X_resampled: {X_resampled.shape}")
print(f"Shape of y_resampled: {y_resampled.shape}")

# Proses pelatihan model
# Misalnya menggunakan SVM atau model lain yang Anda pilih
from sklearn.svm import SVC

model = SVC()
model.fit(X_resampled, y_resampled)

# Prediksi dan evaluasi
y_pred = model.predict(X_test_tfidf)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Shape of X_resampled: (37620, 1000)
Shape of y_resampled: (37620,)
Accuracy: 0.6904
Precision: 0.6951216888227025
Recall: 0.6904
F1 Score: 0.682103786261003


In [46]:
# Normalisasi label agar mulai dari 0
y_resampled_normalized = y_resampled - 1  # Jika kelas mulai dari 1 hingga 5

# --------------------- XGBoost Model ---------------------
xgb_model = XGBClassifier(eval_metric='mlogloss')  # Fine-tune hyperparameters di sini
xgb_model.fit(X_resampled, y_resampled_normalized)  # Menggunakan label yang sudah dinormalisasi

# Prediksi dengan XGBoost
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# XGBoost Metrics
xgb_accuracy = accuracy_score(y_test - 1, y_pred_xgb)  # Normalisasi y_test jika perlu
xgb_precision = precision_score(y_test - 1, y_pred_xgb, average='weighted')
xgb_recall = recall_score(y_test - 1, y_pred_xgb, average='weighted')
xgb_f1 = f1_score(y_test - 1, y_pred_xgb, average='weighted')

print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Precision:", xgb_precision)
print("XGBoost Recall:", xgb_recall)
print("XGBoost F1 Score:", xgb_f1)


XGBoost Accuracy: 0.56928
XGBoost Precision: 0.5671231562907131
XGBoost Recall: 0.56928
XGBoost F1 Score: 0.5613871805874229


In [50]:
# --------------------- Naive Bayes Model with SMOTE ---------------------
nb_model = MultinomialNB(alpha=0.001)  # Adjust alpha for smoothing if necessary
nb_model.fit(X_resampled, y_resampled)  # Menggunakan data hasil SMOTE

# Predict with Naive Bayes
y_pred_nb = nb_model.predict(X_test_tfidf)

# Naive Bayes Metrics
nb_accuracy = accuracy_score(y_test_flat, y_pred_nb)
nb_precision = precision_score(y_test_flat, y_pred_nb, average='weighted', zero_division=0)
nb_recall = recall_score(y_test_flat, y_pred_nb, average='weighted', zero_division=0)
nb_f1 = f1_score(y_test_flat, y_pred_nb, average='weighted', zero_division=0)

print("\nNaive Bayes Accuracy with SMOTE:", nb_accuracy)
print("Naive Bayes Precision with SMOTE:", nb_precision)
print("Naive Bayes Recall with SMOTE:", nb_recall)
print("Naive Bayes F1 Score with SMOTE:", nb_f1)



Naive Bayes Accuracy with SMOTE: 0.16816
Naive Bayes Precision with SMOTE: 0.17953746186405525
Naive Bayes Recall with SMOTE: 0.16816
Naive Bayes F1 Score with SMOTE: 0.16810490673694825


In [None]:
# --- Model Results ---
results = {
    "Model": ["LSTM", "Random Forest", "XGBoost", "Naive Bayes"],
    "Accuracy": [lstm_test_accuracy, rf_accuracy, xgb_accuracy, accuracy],
    "Precision": [lstm_precision, rf_precision, xgb_precision, precision],
    "Recall": [lstm_recall, rf_recall, xgb_recall, recall],
    "F1-Score": [lstm_f1, rf_f1, xgb_f1, f1]
}

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the DataFrame as a table
print("\nComparison of Model Results With SMOTE:")
print(results_df)