In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# # Load dataset
# df = pd.read_csv('dataset/mobile_jkn.csv')

# # Data preprocessing
# X = df['content']  # Assuming 'review' column has the text
# y = df['score']  # Assuming 'rating' column has the labels

# # Encode labels
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# # Tentukan jumlah kata maksimal dan panjang sekuens maksimal
# max_words = 10000
# max_len = 100

# # Buat tokenizer
# tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(X_train)

# # Ubah teks menjadi sekuens
# X_train_seq = tokenizer.texts_to_sequences(X_train)
# X_test_seq = tokenizer.texts_to_sequences(X_test)

# # Padding sekuens agar semua input memiliki panjang yang sama
# X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
# X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# # Periksa hasil padding
# print(X_train_pad.shape, X_test_pad.shape)

# # Function to evaluate models
# def evaluate_model(y_true, y_pred):
#     accuracy = accuracy_score(y_true, y_pred)
#     precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
#     recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
#     f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
#     return accuracy, precision, recall, f1

# # Results dictionary
# results = {
#     'Model': [],
#     'Accuracy': [],
#     'Precision': [],
#     'Recall': [],
#     'F1 Score': []
# }


In [6]:
# Load dataset
df = pd.read_csv('dataset/mobile_jkn.csv')

# Data preprocessing
X = df['content']  # Assuming 'content' column has the text data
y = df['score']  # Assuming 'score' column has the target labels

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=10000)  # Limit to 10,000 words
X_train_tfidf = tfidf.fit_transform(X_train).toarray()  # Convert to TF-IDF features (dense array)
X_test_tfidf = tfidf.transform(X_test).toarray()  # Apply TF-IDF on test data

# Padding sequences to ensure the input to LSTM has a consistent length (required for RNN models)
max_len = 100  # Maximum length for padding (this can be adjusted based on your data)

# Since TF-IDF already outputs dense features, we don't need padding directly, but can truncate/pad the sequences
X_train_pad = pad_sequences(X_train_tfidf, maxlen=max_len)
X_test_pad = pad_sequences(X_test_tfidf, maxlen=max_len)

# If using SMOTE (Optional)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_pad, y_train)

# Check the shapes of padded data
print(X_train_pad.shape, X_test_pad.shape)

MemoryError: Unable to allocate 5.22 GiB for an array with shape (70000, 10000) and data type float64

In [None]:
# Model LSTM tanpa SMOTE
model_without_smote = Sequential()
model_without_smote.add(Embedding(max_words, 128, input_length=max_len))  # Layer embedding
model_without_smote.add(LSTM(100))  # Layer LSTM
model_without_smote.add(Dense(2, activation='softmax'))  # Output layer untuk klasifikasi

# Kompilasi model
optimizer = Adam(learning_rate=0.001)
model_without_smote.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train model
model_without_smote.fit(X_train_pad, y_train, epochs=5, batch_size=64, verbose=0)

# Prediksi dan evaluasi model
y_pred_lstm_without_smote = np.argmax(model_without_smote.predict(X_test_pad), axis=1)
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_lstm_without_smote)

results['Model'].append('LSTM (Without SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

# Terapkan SMOTE pada data yang sudah di-padding
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_pad, y_train)

# Model LSTM dengan SMOTE
model_with_smote = Sequential()
model_with_smote.add(Embedding(max_words, 128, input_length=max_len))  # Layer embedding
model_with_smote.add(LSTM(100))  # Layer LSTM
model_with_smote.add(Dense(2, activation='softmax'))  # Output layer untuk klasifikasi

# Kompilasi model
model_with_smote.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train model
model_with_smote.fit(X_train_res, y_train_res, epochs=5, batch_size=64, verbose=0)

# Prediksi dan evaluasi model
y_pred_lstm_with_smote = np.argmax(model_with_smote.predict(X_test_pad), axis=1)
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_lstm_with_smote)

results['Model'].append('LSTM (With SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split

# # Definisikan jumlah maksimal fitur yang akan digunakan oleh TF-IDF
# max_words = 10000

# # Buat vectorizer TF-IDF
# tfidf = TfidfVectorizer(max_features=max_words)

# # Transformasikan data teks menjadi vektor TF-IDF tanpa mengkonversi ke array
# X_tfidf = tfidf.fit_transform(df['content'])  # Kembali ke sparse matrix
# y = df['score']

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)


In [None]:
# Naive Bayes tanpa SMOTE
nb_model_without_smote = MultinomialNB()
nb_model_without_smote.fit(X_train, y_train)  # Menggunakan data hasil TF-IDF

# Prediksi
y_pred_nb_without_smote = nb_model_without_smote.predict(X_test)

# Evaluasi model
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_nb_without_smote)

# Simpan hasil ke dalam dictionary
results['Model'].append('Naive Bayes (Without SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

# Terapkan SMOTE pada data hasil TF-IDF
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Naive Bayes dengan SMOTE
nb_model_with_smote = MultinomialNB()
nb_model_with_smote.fit(X_train_res, y_train_res)  # Menggunakan data yang diresample

# Prediksi
y_pred_nb_with_smote = nb_model_with_smote.predict(X_test)

# Evaluasi model
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_nb_with_smote)

# Simpan hasil ke dalam dictionary
results['Model'].append('Naive Bayes (With SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

In [None]:
# # Definisikan jumlah maksimal fitur yang akan digunakan oleh TF-IDF
# max_words = 10000

# # Buat vectorizer TF-IDF
# tfidf = TfidfVectorizer(max_features=max_words)

# # Transformasikan data teks menjadi vektor TF-IDF
# X_tfidf = tfidf.fit_transform(df['content']).toarray()  # Preprocess 'content'
# y = df['score']

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)


In [None]:
# Random Forest tanpa SMOTE
rf_model_without_smote = RandomForestClassifier(random_state=42)
rf_model_without_smote.fit(X_train, y_train)  # Menggunakan data hasil TF-IDF

# Prediksi
y_pred_rf_without_smote = rf_model_without_smote.predict(X_test)

# Evaluasi model
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_rf_without_smote)

# Simpan hasil ke dalam dictionary
results['Model'].append('Random Forest (Without SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

# Terapkan SMOTE pada data hasil TF-IDF
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Random Forest dengan SMOTE
rf_model_with_smote = RandomForestClassifier(random_state=42)
rf_model_with_smote.fit(X_train_res, y_train_res)  # Menggunakan data yang diresample

# Prediksi
y_pred_rf_with_smote = rf_model_with_smote.predict(X_test)

# Evaluasi model
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_rf_with_smote)

# Simpan hasil ke dalam dictionary
results['Model'].append('Random Forest (With SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)


In [None]:
# Adjust labels if necessary
y_train = y_train - 1
y_test = y_test - 1

# XGBoost tanpa SMOTE
xgb_model_without_smote = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model_without_smote.fit(X_train, y_train)  # Menggunakan data hasil TF-IDF

# Prediksi
y_pred_xgb_without_smote = xgb_model_without_smote.predict(X_test)

# Evaluasi model (assumes you have evaluate_model function defined)
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_xgb_without_smote)

# Simpan hasil ke dalam dictionary
results['Model'].append('XGBoost (Without SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

# Terapkan SMOTE pada data hasil TF-IDF
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# XGBoost dengan SMOTE
xgb_model_with_smote = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model_with_smote.fit(X_train_res, y_train_res)  # Menggunakan data yang di-resample

# Prediksi
y_pred_xgb_with_smote = xgb_model_with_smote.predict(X_test)

# Evaluasi model
accuracy, precision, recall, f1 = evaluate_model(y_test, y_pred_xgb_with_smote)

# Simpan hasil ke dalam dictionary
results['Model'].append('XGBoost (With SMOTE)')
results['Accuracy'].append(accuracy)
results['Precision'].append(precision)
results['Recall'].append(recall)
results['F1 Score'].append(f1)

In [None]:
# Create a DataFrame for results
results_df = pd.DataFrame(results)

# Display results
print(results_df)
