In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Set memory growth for GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


In [3]:
# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')

# Preprocessing function to clean text
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d', ' ', text)  # Remove digits
    text = text.lower()              # Convert to lowercase
    text = re.sub(r'\s+', ' ', text) # Remove extra spaces
    return text

# Apply preprocessing to the 'Review' column
data['content'] = data['content'].apply(preprocess_text)

# Split data into features (X) and target (y)
X = data['content']
y = data['score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# --- LSTM Model ---

# Tokenization and Padding for LSTM
max_words = 10000  # Vocabulary size
max_len = 150      # Max length for padding

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Convert ratings to one-hot encoding
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train_onehot)

# Build and compile LSTM model
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 128, input_length=max_len),
    tf.keras.layers.SpatialDropout1D(0.3),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),  # Increase LSTM units
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),  # Increase units
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(y_train_onehot.shape[1], activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
lstm_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)  # Monitor val_accuracy

# Train the LSTM model
lstm_model.fit(X_train_resampled, y_train_resampled, epochs=100, batch_size=64,
               validation_data=(X_test_padded, y_test_onehot), callbacks=[early_stopping])

# Evaluate LSTM model
lstm_test_loss, lstm_test_accuracy = lstm_model.evaluate(X_test_padded, y_test_onehot)

print(f"LSTM Test Loss: {lstm_test_loss:.4f}, Test Accuracy: {lstm_test_accuracy:.4f}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
LSTM Test Loss: 0.7435, Test Accuracy: 0.8131


In [4]:
# Reduce TF-IDF features to 1000
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Handle class imbalance using SMOTE for SVM
smote = SMOTE(random_state=42)
X_train_resampled_svm, y_train_resampled_svm = smote.fit_resample(X_train_tfidf, y_train)

# Convert resampled sparse matrix to dense (array)
X_train_resampled_svm = X_train_resampled_svm.toarray()
y_train_resampled_svm = np.array(y_train_resampled_svm)

# Define a mini-batch size
batch_size = 10000

# Shuffle the resampled data
X_train_resampled_svm, y_train_resampled_svm = shuffle(X_train_resampled_svm, y_train_resampled_svm, random_state=42)

# Initialize SVM model
svm_model = SVC(kernel='linear')

# Train SVM using mini-batch approach
for i in range(0, X_train_resampled_svm.shape[0], batch_size):
    X_batch = X_train_resampled_svm[i:i+batch_size]
    y_batch = y_train_resampled_svm[i:i+batch_size]
    svm_model.fit(X_batch, y_batch)  # Train on the current batch
    print(f"Batch {i // batch_size + 1} trained.")

# Predict on test set
y_pred_svm = svm_model.predict(X_test_tfidf.toarray())
svm_test_accuracy = accuracy_score(y_test, y_pred_svm)

print(f"SVM Test Accuracy with mini-batch training: {svm_test_accuracy:.4f}")


Batch 1 trained.
Batch 2 trained.
Batch 3 trained.
Batch 4 trained.
Batch 5 trained.
Batch 6 trained.
Batch 7 trained.
Batch 8 trained.
Batch 9 trained.
Batch 10 trained.
Batch 11 trained.
Batch 12 trained.
Batch 13 trained.
Batch 14 trained.
Batch 15 trained.
Batch 16 trained.
Batch 17 trained.
Batch 18 trained.
Batch 19 trained.
Batch 20 trained.
Batch 21 trained.
Batch 22 trained.
Batch 23 trained.
Batch 24 trained.
Batch 25 trained.
SVM Test Accuracy with mini-batch training: 0.6248


In [5]:
# --- Naive Bayes Model ---

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict and evaluate Naive Bayes model
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_test_accuracy = accuracy_score(y_test, y_pred_nb)

In [6]:
# Print accuracies
print(f"LSTM Test Accuracy: {lstm_test_accuracy}")
print(f"SVM Test Accuracy: {svm_test_accuracy}")
print(f"Naive Bayes Test Accuracy: {nb_test_accuracy}")

NameError: name 'lstm_test_accuracy' is not defined