In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for preprocessing text
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(cleaned_tokens)

# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')  # Load your CSV file

# Preprocess text data
data['content'] = data['content'].apply(preprocess_text)

# Remove empty content
data = data[data['content'].str.strip() != '']  # Remove empty reviews

# Features and labels
X = data['content']
y = data['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Hyperparameters for text data
max_words = 10000
max_len = 150
embedding_dim = 128

# Tokenization and Padding
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Label Binarization
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Flattening for compatibility with models
y_train_flat = y_train_onehot.argmax(axis=1)
y_test_flat = y_test_onehot.argmax(axis=1)


In [4]:
# --- Define and Train the LSTM Model ---
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train the LSTM model
lstm_model.fit(X_train_padded, y_train_onehot, epochs=100, batch_size=64,
               validation_data=(X_test_padded, y_test_onehot), callbacks=[early_stopping])

# Evaluate the LSTM model
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(X_test_padded, y_test_onehot)

# Predict with the LSTM model
y_pred_lstm = lstm_model.predict(X_test_padded).argmax(axis=1)

# LSTM metrics
lstm_precision = precision_score(y_test_flat, y_pred_lstm, average='weighted', zero_division=0)
lstm_recall = recall_score(y_test_flat, y_pred_lstm, average='weighted', zero_division=0)
lstm_f1 = f1_score(y_test_flat, y_pred_lstm, average='weighted', zero_division=0)

print(f"LSTM Test Accuracy: {lstm_test_accuracy}")
print(f"LSTM Precision: {lstm_precision}")
print(f"LSTM Recall: {lstm_recall}")
print(f"LSTM F1 Score: {lstm_f1}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
LSTM Test Accuracy: 0.8442183136940002
LSTM Precision: 0.7750007130220489
LSTM Recall: 0.8442183070068854
LSTM F1 Score: 0.7904135813711823


In [5]:
# --- Random Forest Model ---
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_padded, y_train_flat)

# Predict with Random Forest
y_pred_rf = rf_model.predict(X_test_padded)

# Random Forest metrics
rf_accuracy = accuracy_score(y_test_flat, y_pred_rf)
rf_precision = precision_score(y_test_flat, y_pred_rf, average='weighted')
rf_recall = recall_score(y_test_flat, y_pred_rf, average='weighted')
rf_f1 = f1_score(y_test_flat, y_pred_rf, average='weighted')


In [6]:
# --- XGBoost Model ---
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train_padded, y_train_flat)

# Predict with XGBoost
y_pred_xgb = xgb_model.predict(X_test_padded)

# XGBoost metrics
xgb_accuracy = accuracy_score(y_test_flat, y_pred_xgb)
xgb_precision = precision_score(y_test_flat, y_pred_xgb, average='weighted')
xgb_recall = recall_score(y_test_flat, y_pred_xgb, average='weighted')
xgb_f1 = f1_score(y_test_flat, y_pred_xgb, average='weighted')


In [7]:
# Train Naive Bayes model without SMOTE
nb_model = MultinomialNB(alpha=0.001)  # You can adjust alpha for hyperparameter tuning
nb_model.fit(X_train_padded, y_train_flat)

# Predictions
y_pred_flat = nb_model.predict(X_test_padded)

# Evaluation metrics
accuracy = accuracy_score(y_test_flat, y_pred_flat)
precision = precision_score(y_test_flat, y_pred_flat, average='weighted')
recall = recall_score(y_test_flat, y_pred_flat, average='weighted')
f1 = f1_score(y_test_flat, y_pred_flat, average='weighted')

# Display the results
print("Naive Bayes Model Performance:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Naive Bayes Model Performance:
Accuracy: 0.6096597812879708
Precision: 0.6394554663846649
Recall: 0.6096597812879708
F1 Score: 0.5528483909542201


In [8]:
# --- Model Results ---
results = {
    "Model": ["LSTM", "Random Forest", "XGBoost", "Naive Bayes"],
    "Accuracy": [lstm_test_accuracy, rf_accuracy, xgb_accuracy, accuracy],
    "Precision": [lstm_precision, rf_precision, xgb_precision, precision],
    "Recall": [lstm_recall, rf_recall, xgb_recall, recall],
    "F1-Score": [lstm_f1, rf_f1, xgb_f1, f1]
}

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the DataFrame as a table
print("\nComparison of Model Results Without SMOTE:")
print(results_df)


Comparison of Model Results Without SMOTE:
           Model  Accuracy  Precision    Recall  F1-Score
0           LSTM  0.844218   0.775001  0.844218  0.790414
1  Random Forest  0.781946   0.709381  0.781946  0.735544
2        XGBoost  0.812880   0.728318  0.812880  0.760543
3    Naive Bayes  0.609660   0.639455  0.609660  0.552848
