In [1]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Turn on memory growth for every GPU TensorFlow can see.
gpus = tf.config.list_physical_devices('GPU')
try:
    # With no GPU the list is empty and the loop body never runs.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the failure and carry on instead of aborting the notebook.
    print(err)


In [4]:
# Read the review dataset from CSV into a DataFrame.
DATASET_PATH = 'dataset/mobile_jkn.csv'
data = pd.read_csv(DATASET_PATH)

# Preprocessing function to clean text
# One run of non-word characters and/or digits. Whitespace is itself a
# non-word character, so replacing each run with a single space also collapses
# repeated whitespace.
_NON_WORD_OR_DIGIT_RUN = re.compile(r'[\W\d]+')


def preprocess_text(text):
    """Normalise one review into lowercase, space-separated word tokens.

    Every run of non-word characters (punctuation, symbols, whitespace) and
    digits is replaced by a single space, then the result is lowercased.
    Underscores count as word characters and are kept. Leading and trailing
    spaces are not stripped.

    Parameters
    ----------
    text : str or scalar
        The raw review. Missing values (None, NaN, pd.NA) are accepted because
        pandas yields NaN for empty CSV fields; other non-string scalars are
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text, or an empty string for a missing value.
    """
    if not isinstance(text, str):
        # re.sub() raises TypeError on non-strings, so handle them up front.
        if pd.isna(text):
            return ''
        text = str(text)
    return _NON_WORD_OR_DIGIT_RUN.sub(' ', text).lower()

# Drop rows that have no review text or no rating: they cannot be cleaned or
# used as a training target.
data = data.dropna(subset=['content', 'score']).reset_index(drop=True)

# Apply preprocessing to the 'content' column
data['content'] = data['content'].apply(preprocess_text)

# Split data into features (X) and target (y)
X = data['content']
y = data['score']

# Train-test split. Stratifying on the rating keeps the class proportions the
# same in both partitions, so rare ratings are represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
# --- LSTM Model ---

# Tokenization and Padding for LSTM
max_words = 10000  # Vocabulary size kept by the tokenizer
max_len = 150      # Every review is padded/truncated to this many tokens

# Seed TensorFlow so weight initialisation and dropout are repeatable.
tf.random.set_seed(42)

# The vocabulary is learned from the training texts only; words seen only in
# the test set are mapped to the <OOV> token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Convert ratings to one-hot encoding
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Hold out part of the training data for early stopping. Using the test set
# for that would tune the stopping epoch on the data the final score is
# reported on, which makes the reported accuracy optimistic.
X_fit_padded, X_val_padded, y_fit_onehot, y_val_onehot = train_test_split(
    X_train_padded,
    y_train_onehot,
    test_size=0.1,
    random_state=42,
    stratify=y_train_onehot.argmax(axis=1),
)

# Handle class imbalance by duplicating minority-class reviews.
# SMOTE is unsuitable here: it interpolates between feature vectors, and the
# features are integer token ids, so a blend of two sequences is an arbitrary
# list of unrelated words. Oversampling happens after the validation split so
# that no duplicated review lands in the validation set.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_fit_padded, y_fit_onehot)

# Build and compile LSTM model
lstm_model = tf.keras.Sequential([
    # The sequence length is declared on the Input layer; the Embedding
    # argument `input_length` is deprecated in current Keras.
    tf.keras.Input(shape=(max_len,)),
    tf.keras.layers.Embedding(max_words, 128),
    tf.keras.layers.SpatialDropout1D(0.3),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.5),
    # One output unit per column of the one-hot encoded ratings.
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping callback: stop after 10 epochs without a val_accuracy gain
# and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Train the LSTM model
lstm_model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=64,
               validation_data=(X_val_padded, y_val_onehot), callbacks=[early_stopping])

# Evaluate LSTM model on the untouched test set
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(X_test_padded, y_test_onehot)

print(f"LSTM Test Loss: {lstm_test_loss:.4f}, Test Accuracy: {lstm_test_accuracy:.4f}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

KeyboardInterrupt: 

In [None]:
# --- SVM Model (TF-IDF features) ---

# Limit features in TF-IDF and keep the matrices sparse to reduce memory usage.
# The estimators used on these features accept sparse input directly, so no
# dense conversion is needed.
tfidf = TfidfVectorizer(max_features=2000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train a linear SVM on the whole training set in one call.
# Calling SVC.fit() once per mini-batch does not accumulate learning: each
# call refits from scratch, so only the last batch would shape the model.
# LinearSVC is the linear-kernel SVM intended for large sample counts.
# y_train is deliberately left in its original order so it stays aligned with
# X_train and this cell can be re-run safely.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Calculate and print accuracy
svm_test_accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Test Accuracy: {svm_test_accuracy:.4f}")

In [None]:
# --- Naive Bayes Model ---

# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out reviews and measure accuracy against the true ratings.
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_test_accuracy = accuracy_score(y_test, y_pred_nb)

In [None]:
# Report the test accuracy of the three models, one per line.
print(
    f"LSTM Test Accuracy: {lstm_test_accuracy}",
    f"SVM Test Accuracy: {svm_test_accuracy}",
    f"Naive Bayes Test Accuracy: {nb_test_accuracy}",
    sep="\n",
)

In [None]:
# -------------------- Data balancing with SMOTE -------------------- #

# Resample the training data with SMOTE in TF-IDF space.
# SMOTE creates synthetic samples by interpolating between neighbouring
# feature vectors. That is meaningful for TF-IDF weights but not for padded
# token-id sequences, where the numbers are arbitrary vocabulary indices.
# X_train_tfidf / y_train and X_test_tfidf / y_test come from the TF-IDF cell
# above. Only the training data is resampled; the test set keeps its natural
# class distribution.
smote = SMOTE(random_state=42)
X_train_tfidf_resampled, y_train_flat = smote.fit_resample(X_train_tfidf, y_train)
y_test_flat = y_test

# -------------------- SVM Model -------------------- #

# Train a linear SVM on the balanced training data. LinearSVC is used because
# SVC(kernel='linear') scales poorly with the larger, oversampled training set.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf_resampled, y_train_flat)

# Predict on the original (unresampled) test data
y_pred_svm = svm_model.predict(X_test_tfidf)

# SVM Accuracy
svm_accuracy = accuracy_score(y_test_flat, y_pred_svm)
print(f"SVM Test Accuracy: {svm_accuracy}")

# Classification report for SVM
print("SVM Classification Report:")
print(classification_report(y_test_flat, y_pred_svm))

# -------------------- Naive Bayes Model -------------------- #

# Train Naive Bayes on the balanced training data. TF-IDF weights are
# non-negative, as MultinomialNB requires.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_resampled, y_train_flat)

# Predict on the original (unresampled) test data
y_pred_nb = nb_model.predict(X_test_tfidf)

# Naive Bayes Accuracy
nb_accuracy = accuracy_score(y_test_flat, y_pred_nb)
print(f"Naive Bayes Test Accuracy: {nb_accuracy}")

# Classification report for Naive Bayes
print("Naive Bayes Classification Report:")
print(classification_report(y_test_flat, y_pred_nb))
