In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for preprocessing text
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(cleaned_tokens)

# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')  # Load your CSV file

# Preprocess text data
data['content'] = data['content'].apply(preprocess_text)

# Remove empty content
data = data[data['content'].str.strip() != '']  # Remove empty reviews

# Features and labels
X = data['content']
y = data['score']

# Count occurrences of each class
class_counts = Counter(y)

# Define target number of samples per class
target_samples = 10000

# List to store undersampled data
undersampled_data = []

# Loop through each class and adjust samples
for label in class_counts.keys():
    if class_counts[label] >= target_samples:
        # Take a random sample of size target_samples
        undersampled_data.append(data[data['score'] == label].sample(target_samples, random_state=42))
    else:
        # If the class has fewer samples than target_samples, replicate the data
        class_data = data[data['score'] == label]
        # Calculate how many times to replicate and get the remainder
        times_to_replicate = target_samples // class_counts[label]
        remainder = target_samples % class_counts[label]
        
        # Create the replicated dataset
        replicated_data = pd.concat([class_data] * times_to_replicate + [class_data.sample(remainder, random_state=42)])
        undersampled_data.append(replicated_data)

# Concatenate all undersampled data
undersampled_data = pd.concat(undersampled_data)

# Features and labels after undersampling
X = undersampled_data['content']
y = undersampled_data['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Hyperparameters for text data
max_words = 10000
max_len = 150
embedding_dim = 128

# Tokenization and Padding
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Label Binarization
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Check distribution after undersampling
print(f"Undersampled y_train distribution: {Counter(y_train)}")

# Ensure y_train_onehot is compatible with SMOTE
if len(set(y_train_onehot.flatten())) < 2:
    print("Not enough classes for SMOTE.")
else:
    # Resampling using SMOTE
    smote = SMOTE(sampling_strategy='auto', random_state=42)  # Generates a balance
    X_train_resampled, y_train_onehot_resampled = smote.fit_resample(X_train_padded, y_train_onehot)

# Flattening for compatibility with models
y_train_flat = y_train_onehot_resampled.argmax(axis=1)
y_test_flat = y_test_onehot.argmax(axis=1)

# Debugging: Check class distribution
print(f"Resampled y_train distribution: {Counter(y_train_flat)}")

Undersampled y_train distribution: Counter({5: 7524, 4: 7523, 3: 7510, 2: 7494, 1: 7449})
Resampled y_train distribution: Counter({3: 7524, 4: 7524, 0: 7524, 2: 7524, 1: 7524})


In [5]:
# --- Define and Train the LSTM Model ---
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.01))),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train the LSTM model
lstm_model.fit(X_train_resampled, y_train_onehot_resampled, epochs=200, batch_size=128,
               validation_data=(X_test_padded, y_test_onehot), callbacks=[early_stopping])

# Evaluate the LSTM model
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(X_test_padded, y_test_onehot)

# Predict with the LSTM model
y_pred_lstm = lstm_model.predict(X_test_padded).argmax(axis=1)

# LSTM metrics
lstm_precision = precision_score(y_test_flat, y_pred_lstm, average='weighted')
lstm_recall = recall_score(y_test_flat, y_pred_lstm, average='weighted')
lstm_f1 = f1_score(y_test_flat, y_pred_lstm, average='weighted')

print(f"LSTM Test Accuracy: {lstm_test_accuracy}")
print(f"LSTM Precision: {lstm_precision}")
print(f"LSTM Recall: {lstm_recall}")
print(f"LSTM F1-Score: {lstm_f1}")

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [8]:
# Check the length of features and labels
print(f"Length of X_train_padded: {len(X_train_padded)}")
print(f"Length of y_train_flat: {len(y_train_flat)}")

# Ensure they have the same number of samples
if len(X_train_padded) == len(y_train_flat):
    # Apply SMOTE if sizes are consistent
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train_flat)
    print(f"Resampled y_train distribution: {Counter(y_train_resampled)}")
else:
    print("Mismatch in number of samples between X_train and y_train.")

# Check for empty or invalid content in X_train before tokenization
print(f"Number of empty reviews in X_train before padding: {sum(X_train.str.strip() == '')}")
print(f"Number of empty reviews in y_train: {sum(pd.isnull(y_train))}")

# Remove any rows where content is empty or y_train is NaN
X_train_cleaned = X_train[X_train.str.strip() != '']
y_train_cleaned = y_train[X_train.str.strip() != '']

# Tokenization and Padding
X_train_seq = tokenizer.texts_to_sequences(X_train_cleaned)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')

# Binarize labels again after cleaning
y_train_flat = lb.fit_transform(y_train_cleaned).argmax(axis=1)

# Check the new lengths
print(f"Length of X_train_padded after cleaning: {len(X_train_padded)}")
print(f"Length of y_train_flat after cleaning: {len(y_train_flat)}")


Length of X_train_padded: 37500
Length of y_train_flat: 37620
Mismatch in number of samples between X_train and y_train.
Number of empty reviews in X_train before padding: 0
Number of empty reviews in y_train: 0
Length of X_train_padded after cleaning: 37500
Length of y_train_flat after cleaning: 37500


In [9]:
# Import necessary libraries
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

# Apply SMOTE to handle imbalanced data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train_flat)

# Define function to evaluate model
def evaluate_model(model, X_test_padded, y_test_flat):
    y_pred = model.predict(X_test_padded)
    accuracy = accuracy_score(y_test_flat, y_pred)
    precision = precision_score(y_test_flat, y_pred, average='weighted')
    recall = recall_score(y_test_flat, y_pred, average='weighted')
    f1 = f1_score(y_test_flat, y_pred, average='weighted')
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train_resampled, y_train_resampled)
print("Naive Bayes Results:")
evaluate_model(nb_model, X_test_padded, y_test_flat)

# XGBoost Model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)
print("\nXGBoost Results:")
evaluate_model(xgb_model, X_test_padded, y_test_flat)

# Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)
print("\nRandom Forest Results:")
evaluate_model(rf_model, X_test_padded, y_test_flat)


Naive Bayes Results:
Accuracy: 0.28912
Precision: 0.2863017159168845
Recall: 0.28912
F1 Score: 0.22578833413438085

XGBoost Results:
Accuracy: 0.58704
Precision: 0.5855836347963148
Recall: 0.58704
F1 Score: 0.5797484927850259

Random Forest Results:
Accuracy: 0.74856
Precision: 0.7518853889362966
Recall: 0.74856
F1 Score: 0.746661752159862


In [21]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Reshape y_train agar sesuai dengan dimensi X_train_tfidf
y_train_flat = y_train.values.flatten()  # pastikan y_train_flat adalah array 1D

X_train_resampled, y_train_onehot_resampled = smote.fit_resample(X_train_tfidf, y_train_flat)

In [22]:
# --- Random Forest Model ---
rf_model = RandomForestClassifier(random_state=128)
rf_model.fit(X_train_resampled, y_train_onehot_resampled)

# Predict with Random Forest
y_pred_rf = rf_model.predict(X_test_tfidf)

# Random Forest metrics
rf_accuracy = accuracy_score(y_test_flat, y_pred_rf)
rf_precision = precision_score(y_test_flat, y_pred_rf, average='weighted')
rf_recall = recall_score(y_test_flat, y_pred_rf, average='weighted')
rf_f1 = f1_score(y_test_flat, y_pred_rf, average='weighted')

print(f"RF Test Accuracy: {rf_accuracy}")
print(f"RF Precision: {rf_precision}")
print(f"RF Recall: {rf_recall}")
print(f"RF F1-Score: {rf_f1}")

RF Test Accuracy: 0.05464
RF Precision: 0.07311520675334408
RF Recall: 0.05464
RF F1-Score: 0.06163986004024331


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Reshape y_train agar sesuai dengan dimensi X_train_tfidf
y_train_flat = y_train.values.flatten()  # pastikan y_train_flat adalah array 1D

# Terapkan SMOTE
smote = SMOTE(random_state=42)

# Lakukan resampling pada X_train_tfidf dan y_train_flat
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train_flat)

# Lihat hasil setelah SMOTE
print(f"Shape of X_resampled: {X_resampled.shape}")
print(f"Shape of y_resampled: {y_resampled.shape}")

# Proses pelatihan model
# Misalnya menggunakan SVM atau model lain yang Anda pilih
from sklearn.svm import SVC

model = SVC()
model.fit(X_resampled, y_resampled)

# Prediksi dan evaluasi
y_pred = model.predict(X_test_tfidf)

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Shape of X_resampled: (37620, 1000)
Shape of y_resampled: (37620,)
Accuracy: 0.6904
Precision: 0.6951216888227025
Recall: 0.6904
F1 Score: 0.682103786261003


In [46]:
# Normalisasi label agar mulai dari 0
y_resampled_normalized = y_resampled - 1  # Jika kelas mulai dari 1 hingga 5

# --------------------- XGBoost Model ---------------------
xgb_model = XGBClassifier(eval_metric='mlogloss')  # Fine-tune hyperparameters di sini
xgb_model.fit(X_resampled, y_resampled_normalized)  # Menggunakan label yang sudah dinormalisasi

# Prediksi dengan XGBoost
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# XGBoost Metrics
xgb_accuracy = accuracy_score(y_test - 1, y_pred_xgb)  # Normalisasi y_test jika perlu
xgb_precision = precision_score(y_test - 1, y_pred_xgb, average='weighted')
xgb_recall = recall_score(y_test - 1, y_pred_xgb, average='weighted')
xgb_f1 = f1_score(y_test - 1, y_pred_xgb, average='weighted')

print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Precision:", xgb_precision)
print("XGBoost Recall:", xgb_recall)
print("XGBoost F1 Score:", xgb_f1)


XGBoost Accuracy: 0.56928
XGBoost Precision: 0.5671231562907131
XGBoost Recall: 0.56928
XGBoost F1 Score: 0.5613871805874229


In [50]:
# --------------------- Naive Bayes Model with SMOTE ---------------------
nb_model = MultinomialNB(alpha=0.001)  # Adjust alpha for smoothing if necessary
nb_model.fit(X_resampled, y_resampled)  # Menggunakan data hasil SMOTE

# Predict with Naive Bayes
y_pred_nb = nb_model.predict(X_test_tfidf)

# Naive Bayes Metrics
nb_accuracy = accuracy_score(y_test_flat, y_pred_nb)
nb_precision = precision_score(y_test_flat, y_pred_nb, average='weighted', zero_division=0)
nb_recall = recall_score(y_test_flat, y_pred_nb, average='weighted', zero_division=0)
nb_f1 = f1_score(y_test_flat, y_pred_nb, average='weighted', zero_division=0)

print("\nNaive Bayes Accuracy with SMOTE:", nb_accuracy)
print("Naive Bayes Precision with SMOTE:", nb_precision)
print("Naive Bayes Recall with SMOTE:", nb_recall)
print("Naive Bayes F1 Score with SMOTE:", nb_f1)



Naive Bayes Accuracy with SMOTE: 0.16816
Naive Bayes Precision with SMOTE: 0.17953746186405525
Naive Bayes Recall with SMOTE: 0.16816
Naive Bayes F1 Score with SMOTE: 0.16810490673694825


In [None]:
# --- Model Results ---
results = {
    "Model": ["LSTM", "Random Forest", "XGBoost", "Naive Bayes"],
    "Accuracy": [lstm_test_accuracy, rf_accuracy, xgb_accuracy, accuracy],
    "Precision": [lstm_precision, rf_precision, xgb_precision, precision],
    "Recall": [lstm_recall, rf_recall, xgb_recall, recall],
    "F1-Score": [lstm_f1, rf_f1, xgb_f1, f1]
}

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the DataFrame as a table
print("\nComparison of Model Results With SMOTE:")
print(results_df)