In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer

# Function for text preprocessing
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(cleaned_tokens)

# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')

# Preprocess text data
data['content'] = data['content'].apply(preprocess_text)

# Remove empty content
data = data[data['content'].str.strip() != '']  # Remove empty reviews

# Features and labels
X = data['content']
y = data['score']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Hyperparameters for text data
max_words = 10000
max_len = 150

# Tokenization and Padding
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Label Binarization
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)

# Flatten y_train and y_test for model compatibility
y_train_flat = y_train_onehot.argmax(axis=1)
y_test_flat = y_test_onehot.argmax(axis=1)

# Apply SMOTE to handle imbalanced data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train_flat)

# Function to evaluate models
def evaluate_model(model, X_test_padded, y_test_flat):
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_padded)
    accuracy = accuracy_score(y_test_flat, y_pred)
    precision = precision_score(y_test_flat, y_pred, average='macro')
    recall = recall_score(y_test_flat, y_pred, average='macro')
    f1 = f1_score(y_test_flat, y_pred, average='macro')
    return accuracy, precision, recall, f1

# Initialize models
nb_model = MultinomialNB()
xgb_model = XGBClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Evaluate Naive Bayes
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(nb_model, X_test_padded, y_test_flat)
print(f"Naive Bayes Results:\nAccuracy: {nb_accuracy}\nPrecision: {nb_precision}\nRecall: {nb_recall}\nF1 Score: {nb_f1}\n")

# Evaluate XGBoost
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = evaluate_model(xgb_model, X_test_padded, y_test_flat)
print(f"XGBoost Results:\nAccuracy: {xgb_accuracy}\nPrecision: {xgb_precision}\nRecall: {xgb_recall}\nF1 Score: {xgb_f1}\n")

# Evaluate Random Forest
rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(rf_model, X_test_padded, y_test_flat)
print(f"Random Forest Results:\nAccuracy: {rf_accuracy}\nPrecision: {rf_precision}\nRecall: {rf_recall}\nF1 Score: {rf_f1}\n")


Naive Bayes Results:
Accuracy: 0.6057108140947752
Precision: 0.3339582280784015
Recall: 0.28098851091420574
F1 Score: 0.2586715881349586

XGBoost Results:
Accuracy: 0.7008505467800729
Precision: 0.38234736127905145
Recall: 0.39185431480754584
F1 Score: 0.38309478135705277

Random Forest Results:
Accuracy: 0.7055893074119076
Precision: 0.3565844040094506
Recall: 0.3696105286848156
F1 Score: 0.3567225814042015



In [4]:
# Check the number of unique classes in the training data
print(f"Number of unique classes in y_train_resampled: {len(np.unique(y_train_resampled))}")


Number of unique classes in y_train_resampled: 5


In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
data = pd.read_csv('dataset/mobile_jkn.csv')

# Data preprocessing
X = data['content']  # Review text
y = data['score']    # Sentiment labels

# Update the encoding to keep original class names
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Text preprocessing
max_length = 100  # Set maximum length of sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)  # Fit on the original training data
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length)

# SMOTE for oversampling
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_padded, y_train)

# LSTM model
def create_lstm_model(input_length, vocab_size, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length))
    model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Use Bidirectional LSTM
    model.add(Dropout(0.3))  # Increase dropout rate
    model.add(LSTM(64))
    model.add(Dense(num_classes, activation='softmax'))  # Use number of unique classes
    return model

input_length = X_train_padded.shape[1]  # Length of padded sequences
vocab_size = 10000  # Update this to your actual vocab size
num_classes = len(np.unique(y_encoded))  # Get the number of unique classes
lstm_model = create_lstm_model(input_length, vocab_size, num_classes)

# Compile model
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the LSTM model
history = lstm_model.fit(X_train_resampled, y_train_resampled,
                         validation_data=(X_test_padded, y_test),
                         epochs=30, 
                         batch_size=64,
                         callbacks=[early_stopping],
                         verbose=2)

# Evaluate LSTM model
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_padded, y_test, verbose=0)
print(f"LSTM Test Loss: {lstm_loss:.4f}")
print(f"LSTM Test Accuracy: {lstm_accuracy:.4f}")

Epoch 1/30
3863/3863 - 134s - loss: 1.1877 - accuracy: 0.4589 - val_loss: 0.7363 - val_accuracy: 0.7889 - 134s/epoch - 35ms/step
Epoch 2/30
3863/3863 - 125s - loss: 1.1032 - accuracy: 0.4973 - val_loss: 0.7031 - val_accuracy: 0.7973 - 125s/epoch - 32ms/step
Epoch 3/30
3863/3863 - 122s - loss: 1.0595 - accuracy: 0.5199 - val_loss: 0.7004 - val_accuracy: 0.8050 - 122s/epoch - 32ms/step
Epoch 4/30
3863/3863 - 128s - loss: 1.0191 - accuracy: 0.5429 - val_loss: 0.7482 - val_accuracy: 0.7864 - 128s/epoch - 33ms/step
Epoch 5/30
3863/3863 - 126s - loss: 0.9769 - accuracy: 0.5653 - val_loss: 0.7368 - val_accuracy: 0.7847 - 126s/epoch - 33ms/step
Epoch 6/30
3863/3863 - 124s - loss: 0.9321 - accuracy: 0.5884 - val_loss: 0.7962 - val_accuracy: 0.7653 - 124s/epoch - 32ms/step
LSTM Test Loss: 0.7004
LSTM Test Accuracy: 0.8050


In [17]:
# Predictions for classification report
y_pred_lstm = lstm_model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred_lstm, axis=1)  # Convert predictions to class labels

# Get original class names from label encoder as strings
class_names = label_encoder.classes_.astype(str)

# Calculate and print classification report
report = classification_report(y_test, y_pred_classes, target_names=class_names)
print(report)


              precision    recall  f1-score   support

           1       0.76      0.89      0.82      5139
           2       0.10      0.05      0.06       767
           3       0.11      0.06      0.08       599
           4       0.16      0.13      0.14      1119
           5       0.92      0.91      0.92     12376

    accuracy                           0.81     20000
   macro avg       0.41      0.41      0.40     20000
weighted avg       0.78      0.81      0.79     20000

