# PROYEK SENTIMENT ANALYSIS - HONKAI IMPACT 3 GOOGLE PLAY REVIEWS

- **Nama:** Faris Munir Mahdi
- **Dataset:** Google Play Store Reviews - Honkai Impact 3 Application
- **Domain Proyek:** Natural Language Processing and Sentiment Analysis
- **Target:** Minimal 3 model dengan akurasi testing ≥85%

# 1. IMPORT LIBRARIES DAN SETUP

In [120]:
# 1.1. Import library machine learning
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [121]:
# 1.2. Import library TensorFlow untuk LSTM
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [122]:
# 1.3. Konfigurasi GPU TensorFlow
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

TensorFlow version: 2.18.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [123]:
# 1.4. Aktifkan GPU memory growth untuk menghindari error OOM
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✅ GPU memory growth enabled for {len(gpus)} GPU(s)")
    except RuntimeError as e:
        print(f"⚠ GPU configuration error: {e}")
else:
    print("⚠ No GPU detected, using CPU")

✅ GPU memory growth enabled for 1 GPU(s)


# 2. LOAD DATASET DAN SETUP PATH

In [124]:
# 2.1. Setup path dataset
current_dir = os.getcwd()
dataset_path = os.path.join(current_dir, 'datasets', 'ulasan_honkai_impact_3_processed.csv')

In [125]:
# 2.2. Load dataset yang sudah diproses
df = pd.read_csv(dataset_path)
print(f"✅ Dataset loaded: {len(df)} samples")

✅ Dataset loaded: 51638 samples


# 3. ANALISIS EKSPLORATORI DATA (EDA)

In [126]:
# 3.1. Informasi dasar dataset
print(f"\n--- Dataset Info ---")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")


--- Dataset Info ---
Shape: (51638, 21)
Columns: ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion', 'content_final', 'text_clean', 'text_casefold', 'text_slang', 'text_tokens', 'text_filtered', 'text_final', 'polarity_score', 'polarity', 'text_length']


In [127]:
# 3.2. Distribusi sentimen
print(f"\n--- Sentiment Distribution ---")
print(df['polarity'].value_counts())


--- Sentiment Distribution ---
polarity
positive    39135
negative    11920
neutral       583
Name: count, dtype: int64


In [128]:
# 3.3. Analisis missing values
print(f"\n--- Missing Values ---")
print(df.isnull().sum())


--- Missing Values ---
reviewId                   0
userName                   0
userImage                  0
content                    0
score                      0
thumbsUpCount              0
reviewCreatedVersion       0
at                         0
replyContent               0
repliedAt                  0
appVersion                 0
content_final              0
text_clean              1066
text_casefold           1066
text_slang              1066
text_tokens                0
text_filtered              0
text_final              1330
polarity_score             0
polarity                   0
text_length                0
dtype: int64


# 4. DATA CLEANING DAN PREPROCESSING

In [129]:
# 4.1. Pembersihan data
print(f"\n--- Data Cleaning ---")
print(f"Before cleaning: {len(df)} samples")

df = df.dropna(subset=['text_final', 'polarity'])
df = df.drop_duplicates(subset=['text_final'])

print(f"After cleaning: {len(df)} samples")


--- Data Cleaning ---
Before cleaning: 51638 samples
After cleaning: 35426 samples


In [130]:
# 4.2. Validasi kriteria dataset
if len(df) >= 3000:
    print(f"✅ Dataset meets criteria (≥3000 samples)")
else:
    print(f"⚠ Dataset below 3000 samples")

✅ Dataset meets criteria (≥3000 samples)


# 5. PERSIAPAN FEATURE DAN TARGET

In [131]:
# 5.1. Definisi feature dan target
X = df['text_final']
y = df['polarity']

In [132]:
# 5.2. Informasi feature dan target
print(f"\n--- Feature & Target Info ---")
print(f"Text features: {len(X)}")
print(f"Target distribution:")
print(y.value_counts())


--- Feature & Target Info ---
Text features: 35426
Target distribution:
polarity
positive    25127
negative     9824
neutral       475
Name: count, dtype: int64


# 6. EKSTRAKSI FEATURE: TF-IDF VECTORIZATION

In [133]:
# 6.1. Setup TF-IDF Vectorizer
print(f"\n--- TF-IDF Vectorization ---")
tfidf_vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.85, ngram_range=(1,2))
tfidf_matrix = tfidf_vectorizer.fit_transform(X)
X_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print(f"✅ TF-IDF shape: {X_tfidf.shape}")


--- TF-IDF Vectorization ---
✅ TF-IDF shape: (35426, 1000)


# 7. EKSTRAKSI FEATURE: BAG OF WORDS VECTORIZATION

In [134]:
# 7.1. Setup Bag of Words Vectorizer
print(f"\n--- Bag of Words Vectorization ---")
vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.85)
bow_matrix = vectorizer.fit_transform(X)
X_bow = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(f"✅ BoW shape: {X_bow.shape}")


--- Bag of Words Vectorization ---
✅ BoW shape: (35426, 1000)


# 8. PEMBAGIAN DATA UNTUK EKSPERIMEN

In [135]:
# 8.1. Setup pembagian data untuk 4 eksperimen
print(f"\n--- Data Splitting ---")
# Experiment 1: Logistic Regression | TF-IDF | 80/20
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Experiment 2: Random Forest | TF-IDF | 70/30
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Experiment 3: Random Forest | BoW | 80/20
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_bow, y, test_size=0.2, random_state=42)


--- Data Splitting ---


In [136]:
# 8.2. Inisialisasi container hasil
all_results = []
print("✅ Data split complete for 4 experiments")

✅ Data split complete for 4 experiments


# 9. PEMODELAN: LOGISTIC REGRESSION (TF-IDF)

In [137]:
# 9.1. Training model Logistic Regression
print(f"\n--- Experiment 1: Logistic Regression + TF-IDF ---")

# Inisialisasi dan latih model
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train1, y_train1)


--- Experiment 1: Logistic Regression + TF-IDF ---


In [138]:
y_train_pred1 = lr.predict(X_train1)
y_test_pred1 = lr.predict(X_test1)

train_acc1 = accuracy_score(y_train1, y_train_pred1)
test_acc1 = accuracy_score(y_test1, y_test_pred1)

# Buat dictionary hasil
result1 = {
    'Model': 'Logistic Regression (TF-IDF)',
    'Train Acc': train_acc1,
    'Test Acc': test_acc1
}

In [139]:
all_results.append(result1)

# Cetak hasil evaluasi dengan format yang lebih rapi dan eksplisit
print(f"Model: {result1['Model']}")
print(f"Akurasi Pelatihan: {result1['Train Acc']:.4f}")
print(f"Akurasi Pengujian: {result1['Test Acc']:.4f}")
print("--- Eksperimen 1 Selesai ---") # Indikator bahwa eksperimen selesai

Model: Logistic Regression (TF-IDF)
Akurasi Pelatihan: 0.9175
Akurasi Pengujian: 0.9074
--- Eksperimen 1 Selesai ---


# 10. PEMODELAN: RANDOM FOREST (TF-IDF)

In [140]:
# 10.1. Training model Random Forest dengan TF-IDF
print(f"\n--- Experiment 2: Random Forest + TF-IDF ---")
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_tfidf.fit(X_train2, y_train2)


--- Experiment 2: Random Forest + TF-IDF ---


In [141]:
# 10.2. Prediksi dan evaluasi
y_train_pred2 = rf_tfidf.predict(X_train2)
y_test_pred2 = rf_tfidf.predict(X_test2)

train_acc2 = accuracy_score(y_train2, y_train_pred2)
test_acc2 = accuracy_score(y_test2, y_test_pred2)

result2 = {'Model': 'Random Forest (TF-IDF)', 'Train Acc': train_acc2, 'Test Acc': test_acc2}

In [142]:
# 10.3. Simpan hasil eksperimen 2
all_results.append(result2)
print(f"✅ Train: {train_acc2:.4f}, Test: {test_acc2:.4f}")

✅ Train: 0.9846, Test: 0.8790


# 11. PEMODELAN: RANDOM FOREST (BAG OF WORDS)

In [143]:
# 11.1. Training model Random Forest dengan BoW
print(f"\n--- Experiment 3: Random Forest + BoW ---")
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train3, y_train3)


--- Experiment 3: Random Forest + BoW ---


In [144]:
# 11.2. Prediksi dan evaluasi
y_train_pred3 = rf_bow.predict(X_train3)
y_test_pred3 = rf_bow.predict(X_test3)

In [145]:
train_acc3 = accuracy_score(y_train3, y_train_pred3)
test_acc3 = accuracy_score(y_test3, y_test_pred3)

In [146]:
# 11.3. Simpan hasil eksperimen 3
result3 = {'Model': 'Random Forest (BoW)', 'Train Acc': train_acc3, 'Test Acc': test_acc3}
all_results.append(result3)
print(f"✅ Train: {train_acc3:.4f}, Test: {test_acc3:.4f}")


✅ Train: 0.9866, Test: 0.8750


# 12. PEMODELAN: LSTM (DEEP LEARNING)

In [147]:
# 12.1. Persiapan data untuk LSTM
print(f"\n--- Experiment 4: LSTM Deep Learning ---")

# Prepare data for LSTM
X_text = df['text_final']
y_text = df['polarity']


--- Experiment 4: LSTM Deep Learning ---


In [148]:
# 12.2. Encode labels ke nilai numerik
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_text)
num_classes = len(label_encoder.classes_)

In [149]:
# 12.3. Tokenisasi dengan 5000 kata paling sering
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_text)

In [150]:
# 12.4. Konversi teks ke sequences
sequences = tokenizer.texts_to_sequences(X_text)

In [151]:
# 12.5. Pad sequences ke panjang uniform 20
X_padded = pad_sequences(sequences, maxlen=20, padding='post')

In [152]:
# 12.6. One-hot encode labels untuk categorical crossentropy
y_categorical = to_categorical(y_encoded, num_classes=num_classes)

In [153]:
# 12.7. Split data untuk LSTM (80/20)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_padded, y_categorical, test_size=0.2, random_state=42
)

In [154]:
# 12.8. Build arsitektur LSTM
with tf.device('/GPU:0' if gpus else '/CPU:0'):
    lstm_model = Sequential()
    lstm_model.add(Embedding(input_dim=5000, output_dim=64))  # Embedding layer
    lstm_model.add(LSTM(64, return_sequences=False))         # LSTM layer
    lstm_model.add(Dropout(0.5))                             # Dropout for regularization
    lstm_model.add(Dense(num_classes, activation='softmax')) # Output layer

In [155]:
# 12.9. Compile model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

print(f"✅ Model built on: {'GPU' if gpus else 'CPU'}")
print("LSTM Model Summary:")
lstm_model.summary()

✅ Model built on: GPU
LSTM Model Summary:


In [156]:
# 12.10. Training model
print("⏳ Training LSTM model...")
print(f"Training on: {'GPU' if gpus else 'CPU'}")

# Use GPU for training if available
with tf.device('/GPU:0' if gpus else '/CPU:0'):
    history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, 
                            validation_split=0.1, verbose=1)

⏳ Training LSTM model...
Training on: GPU
Epoch 1/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 27ms/step - accuracy: 0.7334 - loss: 0.6453 - val_accuracy: 0.8694 - val_loss: 0.3388
Epoch 2/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.9038 - loss: 0.2664 - val_accuracy: 0.8546 - val_loss: 0.3680
Epoch 3/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.9204 - loss: 0.2220 - val_accuracy: 0.8843 - val_loss: 0.3153
Epoch 4/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.9381 - loss: 0.1786 - val_accuracy: 0.8885 - val_loss: 0.3273
Epoch 5/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.9485 - loss: 0.1506 - val_accuracy: 0.8832 - val_loss: 0.3453
Epoch 6/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9529 - loss: 0.1374 - val_accuracy: 0.87

In [157]:
# 12.11. Make predictions
y_train_pred_lstm = lstm_model.predict(X_train_lstm)
y_test_pred_lstm = lstm_model.predict(X_test_lstm)

[1m886/886[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step


In [158]:
# 12.12. Convert predictions back to class labels
y_train_pred_labels = np.argmax(y_train_pred_lstm, axis=1)
y_test_pred_labels = np.argmax(y_test_pred_lstm, axis=1)
y_train_true_labels = np.argmax(y_train_lstm, axis=1)
y_test_true_labels = np.argmax(y_test_lstm, axis=1)

In [159]:
# 12.13. Calculate accuracies
train_acc_lstm = accuracy_score(y_train_true_labels, y_train_pred_labels)
test_acc_lstm = accuracy_score(y_test_true_labels, y_test_pred_labels)

In [160]:
# 12.14. Simpan hasil eksperimen 4
result4 = {'Model': 'LSTM (Deep Learning)', 'Train Acc': train_acc_lstm, 'Test Acc': test_acc_lstm}
all_results.append(result4)
print(f"✅ Train: {train_acc_lstm:.4f}, Test: {test_acc_lstm:.4f}")

✅ Train: 0.9641, Test: 0.8745


# 13. EVALUASI MODEL DAN PERBANDINGAN

In [161]:
# 13.1. Ringkasan performa semua model
print(f"\n--- Model Performance Summary ---")
accuracy_df_final = pd.DataFrame(all_results).sort_values(by='Test Acc', ascending=False)
print(accuracy_df_final.to_string(index=False))


--- Model Performance Summary ---
                       Model  Train Acc  Test Acc
Logistic Regression (TF-IDF)   0.917502  0.907423
      Random Forest (TF-IDF)   0.984596  0.878999
         Random Forest (BoW)   0.986591  0.874965
        LSTM (Deep Learning)   0.964114  0.874541


# 14. VALIDASI KRITERIA SUBMISSION

In [162]:
# 14.1. Pengecekan kriteria submission
print(f"\n--- Submission Criteria Check ---")
models_above_85 = accuracy_df_final[accuracy_df_final['Test Acc'] >= 0.85]
print(f"Models with ≥85% accuracy: {len(models_above_85)}/4")


--- Submission Criteria Check ---
Models with ≥85% accuracy: 4/4


In [163]:
# 14.2. Evaluasi kriteria utama
if len(models_above_85) >= 3:
    print("✅ PASSED: ≥3 models with 85%+ accuracy")
else:
    print("⚠ FAILED: Need ≥3 models with 85%+ accuracy")

✅ PASSED: ≥3 models with 85%+ accuracy


In [164]:
# 14.3. Informasi model terbaik
best_model_info = accuracy_df_final.iloc[0]
print(f"🏆 Best model: {best_model_info['Model']} ({best_model_info['Test Acc']:.4f})")

🏆 Best model: Logistic Regression (TF-IDF) (0.9074)


In [165]:
# 14.4. Pengecekan kriteria bonus
models_above_92 = accuracy_df_final[accuracy_df_final['Test Acc'] >= 0.92]
if len(models_above_92) >= 1:
    print("🎉 BONUS: ≥1 model with 92%+ accuracy")

# 15. FUNGSI INFERENCE UNTUK PREDIKSI

In [166]:
# 15.1. Fungsi prediksi untuk model tradisional
def predict_sentiment(text, model, vectorizer):
    """Prediksi sentimen menggunakan model tradisional (LR, RF)"""
    text_vector = vectorizer.transform([text])
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
        text_df = pd.DataFrame(text_vector.toarray(), columns=feature_names)
        pred_class = model.predict(text_df)[0]
        
        if hasattr(model, 'predict_proba'):
            probs = model.predict_proba(text_df)[0]
            confidence = max(probs)
        else:
            confidence = 1.0
    else:
        text_array = text_vector.toarray()
        pred_class = model.predict(text_array)[0]
        
        if hasattr(model, 'predict_proba'):
            probs = model.predict_proba(text_array)[0]
            confidence = max(probs)
        else:
            confidence = 1.0
    
    return pred_class, confidence

In [167]:
# 15.2. Fungsi prediksi LSTM
def predict_lstm(text):
    """Prediksi sentimen menggunakan LSTM model"""
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=20, padding='post')
    with tf.device('/GPU:0' if gpus else '/CPU:0'):
        probs = lstm_model.predict(padded, verbose=0)  # Add verbose=0 to reduce output
    pred_class = np.argmax(probs, axis=1)[0]
    confidence = max(probs[0])
    return label_encoder.inverse_transform([pred_class])[0], confidence

# 16. TESTING INFERENCE PADA CONTOH TEKS

In [168]:
# 16.1. Contoh teks untuk testing
print(f"\n--- Inference Testing ---")

test_texts = [
    "good game, fun mechanics",
    "Membosankan",
    "garbage company",
    "Honkai Impact 3rd game terbaik!",
    "bring back part",
]


--- Inference Testing ---


In [169]:
# 16.2. Testing semua model pada contoh teks
for i, text in enumerate(test_texts, 1):
    print(f"\nTest {i}: '{text}'")
    
    pred_lr, conf_lr = predict_sentiment(text, lr, tfidf_vectorizer)
    print(f"Logistic Regression: {pred_lr} ({conf_lr:.3f})")
    
    pred_rf_tfidf, conf_rf_tfidf = predict_sentiment(text, rf_tfidf, tfidf_vectorizer)
    print(f"Random Forest (TF-IDF): {pred_rf_tfidf} ({conf_rf_tfidf:.3f})")
    
    pred_rf_bow, conf_rf_bow = predict_sentiment(text, rf_bow, vectorizer)
    print(f"Random Forest (BoW): {pred_rf_bow} ({conf_rf_bow:.3f})")
    
    pred_lstm, conf_lstm = predict_lstm(text)
    print(f"LSTM: {pred_lstm} ({conf_lstm:.3f})")


Test 1: 'good game, fun mechanics'
Logistic Regression: positive (1.000)
Random Forest (TF-IDF): positive (1.000)
Random Forest (BoW): positive (1.000)
LSTM: positive (1.000)

Test 2: 'Membosankan'
Logistic Regression: negative (0.990)
Random Forest (TF-IDF): negative (1.000)
Random Forest (BoW): negative (1.000)
LSTM: negative (1.000)

Test 3: 'garbage company'
Logistic Regression: negative (0.980)
Random Forest (TF-IDF): negative (0.980)
Random Forest (BoW): negative (1.000)
LSTM: negative (1.000)

Test 4: 'Honkai Impact 3rd game terbaik!'
Logistic Regression: positive (0.999)
Random Forest (TF-IDF): positive (0.998)
Random Forest (BoW): positive (1.000)
LSTM: positive (1.000)

Test 5: 'bring back part'
Logistic Regression: negative (0.522)
Random Forest (TF-IDF): neutral (0.620)
Random Forest (BoW): neutral (0.640)
LSTM: neutral (0.873)


# 17. RINGKASAN FINAL PROYEK

In [170]:
# 17.1. Summary lengkap proyek
print(f"\n--- Final Summary ---")
unique_labels = sorted(y.unique())
print(f"📊 Dataset: {len(df)} samples")
print(f"🏷️ Classes: {', '.join(unique_labels)}")
print(f"🏆 Best model: {best_model_info['Model']} - {best_model_info['Test Acc']:.4f}")
print(f"✅ Criteria: {'✅ PASSED' if len(models_above_85) >= 3 else '⚠ FAILED'}")
print("🎯 SENTIMENT ANALYSIS PROJECT COMPLETED WITH GPU ACCELERATION")


--- Final Summary ---
📊 Dataset: 35426 samples
🏷️ Classes: negative, neutral, positive
🏆 Best model: Logistic Regression (TF-IDF) - 0.9074
✅ Criteria: ✅ PASSED
🎯 SENTIMENT ANALYSIS PROJECT COMPLETED WITH GPU ACCELERATION


# 18. FUNGSI UPDATE REPORT.MD DENGAN HASIL AKTUAL

In [171]:
def update_report_with_results():
    """Update report.md file dengan hasil aktual dari analisis"""
    
    # 18.1. Path ke file report
    report_path = os.path.join(current_dir, 'report.md')
    
    # 18.2. Baca file report
    try:
        with open(report_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # 18.3. Hitung statistik dataset
        dataset_size = len(df)
        sentiment_counts = df['polarity'].value_counts()
        total_samples = sentiment_counts.sum()
        
        positive_count = sentiment_counts.get('positive', 0)
        negative_count = sentiment_counts.get('negative', 0)
        neutral_count = sentiment_counts.get('neutral', 0)
        
        positive_percent = round((positive_count / total_samples) * 100, 1)
        negative_percent = round((negative_count / total_samples) * 100, 1)
        neutral_percent = round((neutral_count / total_samples) * 100, 1)
        
        # 18.4. Update placeholders dengan nilai aktual
        replacements = {
            '{DATASET_SIZE}': f"{dataset_size:,}",
            '{POSITIVE_COUNT}': f"{positive_count:,}",
            '{NEGATIVE_COUNT}': f"{negative_count:,}",
            '{NEUTRAL_COUNT}': f"{neutral_count:,}",
            '{POSITIVE_PERCENT}': f"{positive_percent}",
            '{NEGATIVE_PERCENT}': f"{negative_percent}",
            '{NEUTRAL_PERCENT}': f"{neutral_percent}",
            
            # Model accuracies
            '{LR_TRAIN_ACC}': f"{train_acc1:.4f}",
            '{LR_TEST_ACC}': f"{test_acc1:.4f}",
            '{LR_TRAIN_ACC_PERCENT}': f"{train_acc1*100:.2f}",
            '{LR_TEST_ACC_PERCENT}': f"{test_acc1*100:.2f}",
            '{LR_STATUS}': "✅ PASS" if test_acc1 >= 0.85 else "❌ FAIL",
            
            '{RF_TFIDF_TRAIN_ACC}': f"{train_acc2:.4f}",
            '{RF_TFIDF_TEST_ACC}': f"{test_acc2:.4f}",
            '{RF_TFIDF_TRAIN_ACC_PERCENT}': f"{train_acc2*100:.2f}",
            '{RF_TFIDF_TEST_ACC_PERCENT}': f"{test_acc2*100:.2f}",
            '{RF_TFIDF_STATUS}': "✅ PASS" if test_acc2 >= 0.85 else "❌ FAIL",
            
            '{RF_BOW_TRAIN_ACC}': f"{train_acc3:.4f}",
            '{RF_BOW_TEST_ACC}': f"{test_acc3:.4f}",
            '{RF_BOW_TRAIN_ACC_PERCENT}': f"{train_acc3*100:.2f}",
            '{RF_BOW_TEST_ACC_PERCENT}': f"{test_acc3*100:.2f}",
            '{RF_BOW_STATUS}': "✅ PASS" if test_acc3 >= 0.85 else "❌ FAIL",
            
            '{LSTM_TRAIN_ACC}': f"{train_acc_lstm:.4f}",
            '{LSTM_TEST_ACC}': f"{test_acc_lstm:.4f}",
            '{LSTM_TRAIN_ACC_PERCENT}': f"{train_acc_lstm*100:.2f}",
            '{LSTM_TEST_ACC_PERCENT}': f"{test_acc_lstm*100:.2f}",
            '{LSTM_STATUS}': "✅ PASS" if test_acc_lstm >= 0.85 else "❌ FAIL",
            
            # Best model info
            '{BEST_MODEL}': best_model_info['Model'],
            '{BEST_ACCURACY}': f"{best_model_info['Test Acc']:.4f}",
            
            # Criteria validation
            '{MODELS_ABOVE_85_COUNT}': str(len(models_above_85)),
            '{FINAL_STATUS}': "✅ PASSED" if len(models_above_85) >= 3 else "❌ FAILED"
        }
        
        # 18.5. Replace semua placeholders
        for placeholder, value in replacements.items():
            content = content.replace(placeholder, str(value))
        
        # 18.6. Tulis kembali file report
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(content)
        
        print(f"✅ Report updated successfully: {report_path}")
        
        # 18.7. Summary update yang dilakukan
        print(f"\n--- Report Update Summary ---")
        print(f"📊 Dataset size: {dataset_size:,} samples")
        print(f"📈 Best model: {best_model_info['Model']} ({best_model_info['Test Acc']:.4f})")
        print(f"✅ Models above 85%: {len(models_above_85)}/4")
        print(f"🎯 Final status: {'PASSED' if len(models_above_85) >= 3 else 'FAILED'}")
        
    except FileNotFoundError:
        print(f"❌ Report file not found: {report_path}")
    except Exception as e:
        print(f"❌ Error updating report: {e}")

# 18.8. Panggil fungsi update report
print(f"\n🔄 Updating report.md with actual results...")
update_report_with_results()


🔄 Updating report.md with actual results...
✅ Report updated successfully: /home/ubuntu/Documents/Machine_Learning/sentimen_analysis_honkai_impact_3/report.md

--- Report Update Summary ---
📊 Dataset size: 35,426 samples
📈 Best model: Logistic Regression (TF-IDF) (0.9074)
✅ Models above 85%: 4/4
🎯 Final status: PASSED
