In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import SGDClassifier
import nltk

# Make sure the NLTK resources used for tokenization, stop-word filtering and
# lemmatization are available locally before any text is processed.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Lazily-populated cache so the lemmatizer and the stop-word set are built
# once per session instead of once per review.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


# Function that cleans a piece of text
def preprocess_text(text):
    """Lowercase, tokenize, filter and lemmatize one piece of text.

    Tokens are kept only when they are purely alphanumeric and are not
    English stop words; the surviving tokens are lemmatized and joined
    with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when
        the input is missing, blank, or contains no usable tokens.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Blank input tokenizes to nothing, so skip the NLTK work entirely.
    if not text.strip():
        return ''

    lemmatizer, stop_words = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )

# Load the review dataset
data = pd.read_csv('dataset/mobile_jkn.csv')

# Clean the review text. Empty cells are read by pandas as NaN (a float),
# which has no string methods, so they are replaced by empty strings before
# the text cleaner runs.
data['content'] = data['content'].fillna('').astype(str).apply(preprocess_text)

# 'content' holds the review text and 'score' holds the sentiment label
X = data['content']  # Feature: text reviews
y = data['score']    # Label: sentiment

# Hold out 20% of the rows for testing.
# NOTE(review): the split is not stratified; because the classes are
# rebalanced with SMOTE below they are presumably imbalanced, so
# `stratify=y` may be worth adding — confirm every class has enough rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters
max_words = 5000  # Maximum number of words to consider
max_len = 200     # Maximum length of input sequences
embedding_dim = 128  # Embedding size for each token (not used in this cell)

# Build the vocabulary from the training texts only, so no test-set words
# leak into it; unseen words map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert the text data to sequences of integer token ids
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad / truncate at the end so every row has exactly max_len ids
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# One-hot encode the labels (the encoder is fitted on the training labels)
lb = LabelBinarizer()
y_train_onehot = lb.fit_transform(y_train)
y_test_onehot = lb.transform(y_test)


def _onehot_to_class_index(encoded):
    """Convert LabelBinarizer output into integer class indices.

    With two classes LabelBinarizer emits a single 0/1 column rather than
    two one-hot columns; a plain ``argmax(axis=1)`` would then map every
    row to class 0, so that column is used directly as the class index.
    """
    encoded = np.asarray(encoded)
    if encoded.ndim == 1 or encoded.shape[1] == 1:
        return encoded.ravel().astype(int)
    return encoded.argmax(axis=1)


# Integer class labels for the scikit-learn style estimators
y_train_labels = _onehot_to_class_index(y_train_onehot)
y_test_flat = _onehot_to_class_index(y_test_onehot)

# -------------------- Class Balancing with SMOTE -------------------- #
# Only the training split is oversampled; the test split stays untouched.
# NOTE(review): SMOTE interpolates between rows of padded token ids, so the
# synthetic rows contain ids that do not correspond to real word sequences —
# consider resampling a numeric representation (e.g. TF-IDF) instead.
smote = SMOTE(random_state=42)
X_train_padded_resampled, y_train_flat = smote.fit_resample(X_train_padded, y_train_labels)

# One-hot view of the resampled labels, kept under its original name for any
# code that expects the encoded form.
y_train_onehot_resampled = lb.transform(lb.classes_[y_train_flat])

# -------------------- SVM Model with Hyperparameter Tuning -------------------- #
# Linear SVM (SGD with hinge loss) on standardized features, tuned by grid search
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SGDClassifier(loss='hinge', random_state=42)),
])

param_grid = dict(
    svm__alpha=[1e-3, 1e-4, 1e-5],  # Regularization strength
    svm__max_iter=[1000, 2000],     # Maximum number of iterations
)

# 5-fold cross-validation, ranking candidates by weighted F1
grid_search_svm = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    verbose=1,
)
grid_search_svm.fit(X_train_padded_resampled, y_train_flat)

# Best pipeline found by the grid search
best_svm_model = grid_search_svm.best_estimator_
print("Best parameters for SVM:", grid_search_svm.best_params_)

# Score the tuned model on the held-out test set
y_pred_svm_batch = best_svm_model.predict(X_test_padded)

svm_batch_accuracy = accuracy_score(y_test_flat, y_pred_svm_batch)
print(f"SVM Test Accuracy: {svm_batch_accuracy}")

# Class-frequency-weighted precision, recall and F1, computed in that order
svm_batch_precision, svm_batch_recall, svm_batch_f1 = (
    score_fn(y_test_flat, y_pred_svm_batch, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(
    f"SVM Precision: {svm_batch_precision}",
    f"SVM Recall: {svm_batch_recall}",
    f"SVM F1-Score: {svm_batch_f1}",
    sep="\n",
)

# -------------------- Random Forest Model -------------------- #
# Default-parameter forest trained on the SMOTE-balanced training data
rf_model = RandomForestClassifier(random_state=42).fit(
    X_train_padded_resampled, y_train_flat
)

# Score the forest on the held-out test set
y_pred_rf = rf_model.predict(X_test_padded)

rf_accuracy = accuracy_score(y_test_flat, y_pred_rf)
print(f"Random Forest Test Accuracy: {rf_accuracy}")

# Class-frequency-weighted precision, recall and F1, computed in that order
rf_precision, rf_recall, rf_f1 = (
    score_fn(y_test_flat, y_pred_rf, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(
    f"Random Forest Precision: {rf_precision}",
    f"Random Forest Recall: {rf_recall}",
    f"Random Forest F1-Score: {rf_f1}",
    sep="\n",
)

# -------------------- XGBoost Model -------------------- #
# `use_label_encoder` is intentionally not passed: current XGBoost releases
# ignore it and only emit a "Parameters ... are not used" warning, and the
# labels are already integer class indices. The seed is fixed so this model
# is configured reproducibly like the other estimators.
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train_padded_resampled, y_train_flat)

# Predict on the held-out test data
y_pred_xgb = xgb_model.predict(X_test_padded)

# XGBoost accuracy
xgb_accuracy = accuracy_score(y_test_flat, y_pred_xgb)
print(f"XGBoost Test Accuracy: {xgb_accuracy}")

# XGBoost precision, recall and F1, each weighted by class frequency
xgb_precision = precision_score(y_test_flat, y_pred_xgb, average='weighted')
xgb_recall = recall_score(y_test_flat, y_pred_xgb, average='weighted')
xgb_f1 = f1_score(y_test_flat, y_pred_xgb, average='weighted')

print(f"XGBoost Precision: {xgb_precision}")
print(f"XGBoost Recall: {xgb_recall}")
print(f"XGBoost F1-Score: {xgb_f1}")

# -------------------- Model Result Comparison -------------------- #
# One summary line per model, in the order the models were trained
print("\nPerbandingan Hasil Model:")
print("\n".join(
    f"{label} Test Accuracy: {acc}, Precision: {prec}, Recall: {rec}, F1-Score: {f1}"
    for label, acc, prec, rec, f1 in (
        ("SVM", svm_batch_accuracy, svm_batch_precision, svm_batch_recall, svm_batch_f1),
        ("Random Forest", rf_accuracy, rf_precision, rf_recall, rf_f1),
        ("XGBoost", xgb_accuracy, xgb_precision, xgb_recall, xgb_f1),
    )
))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kurotsuki\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for SVM: {'svm__alpha': 0.0001, 'svm__max_iter': 1000}
SVM Test Accuracy: 0.57055
SVM Precision: 0.6465039239395692
SVM Recall: 0.57055
SVM F1-Score: 0.5823968843732784
Random Forest Test Accuracy: 0.71545
Random Forest Precision: 0.7230285722182033
Random Forest Recall: 0.71545
Random Forest F1-Score: 0.7131417241685111


Parameters: { "use_label_encoder" } are not used.



XGBoost Test Accuracy: 0.71135
XGBoost Precision: 0.7534009095092872
XGBoost Recall: 0.71135
XGBoost F1-Score: 0.7309879348846319

Perbandingan Hasil Model:
SVM Test Accuracy: 0.57055, Precision: 0.6465039239395692, Recall: 0.57055, F1-Score: 0.5823968843732784
Random Forest Test Accuracy: 0.71545, Precision: 0.7230285722182033, Recall: 0.71545, F1-Score: 0.7131417241685111
XGBoost Test Accuracy: 0.71135, Precision: 0.7534009095092872, Recall: 0.71135, F1-Score: 0.7309879348846319
