In [None]:
# ============================================
# SETUP GOOGLE COLAB
# ============================================

# Install dependencies
!pip install transformers torch pandas numpy scikit-learn matplotlib seaborn tqdm -q

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path ke folder skripsi di Google Drive
DRIVE_PATH = '/content/drive/MyDrive/skripsi'

import os

# Check apakah folder exists
if os.path.exists(DRIVE_PATH):
    os.chdir(DRIVE_PATH)
    print(f'‚úì Working directory: {os.getcwd()}')
    print(f'‚úì Files in folder skripsi:')
    for f in os.listdir('.'):
        print(f'   - {f}')
else:
    print(f'‚ùå Folder tidak ditemukan: {DRIVE_PATH}')
    print('Pastikan folder "skripsi" ada di Google Drive kamu')

# Check GPU
import torch
if torch.cuda.is_available():
    print(f'\n‚úì GPU Available: {torch.cuda.get_device_name(0)}')
    print(f'‚úì GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
else:
    print('\n‚ö†Ô∏è GPU not available, using CPU (akan lebih lambat)')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, 
    classification_report, confusion_matrix, f1_score
)
from sklearn.utils import resample
import random
import os
import copy
import json
from datetime import datetime

# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')


def set_seed(seed=42):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's `random`, NumPy, and PyTorch
            (CPU and all CUDA devices).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Prefer reproducible cuDNN kernels over auto-tuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

# Pick the compute device once; later cells move the model and batches onto it.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f'üñ•Ô∏è  Device: {device}')
if use_cuda:
    print(f'üéÆ GPU: {torch.cuda.get_device_name(0)}')

## 📊 1. Load & Explore Data

In [None]:
# Load the cleaned 5-class review data from the Drive project folder.

# Label schema for the five sentiment classes. Defined before the file lookup
# so these names exist even when the data file is missing.
LABEL_MAP = {
    'sangat_negatif': 0,
    'negatif': 1,
    'netral': 2,
    'positif': 3,
    'sangat_positif': 4
}
LABEL_NAMES = ['sangat_negatif', 'negatif', 'netral', 'positif', 'sangat_positif']
NUM_CLASSES = len(LABEL_NAMES)

# Candidate locations, checked in order relative to the working directory.
DATA_FILES = [
    'gojek_reviews_5class_clean.csv',
    'data/gojek_reviews_5class_clean.csv',
]

DATA_PATH = next((path for path in DATA_FILES if os.path.exists(path)), None)

if DATA_PATH is None:
    # Show what is actually present so the user can spot a misplaced file.
    print('‚ùå Data file tidak ditemukan!')
    print(f'\nüìÅ Files yang ada di folder skripsi:')
    for f in os.listdir('.'):
        print(f'   - {f}')
    if os.path.exists('data'):
        print(f'\nüìÅ Files di folder data:')
        for f in os.listdir('data'):
            print(f'   - data/{f}')
    print(f'\nüí° Upload file "gojek_reviews_5class_clean.csv" ke folder skripsi')
else:
    print(f'‚úì Using data file: {DATA_PATH}')
    df = pd.read_csv(DATA_PATH)

    # Create the integer label column and fail loudly on labels outside the
    # schema: an unmapped value would otherwise become NaN and only surface
    # later as an obscure error during splitting or tensor conversion.
    df['label'] = df['sentiment_label'].map(LABEL_MAP)
    unmapped_values = df.loc[df['label'].isna(), 'sentiment_label'].unique().tolist()
    if unmapped_values:
        raise ValueError(
            f'Unexpected sentiment_label values {unmapped_values}; '
            f'expected one of {LABEL_NAMES}'
        )
    df['label'] = df['label'].astype(int)

    print('=' * 60)
    print('üìä DATA OVERVIEW')
    print('=' * 60)
    print(f'Total samples: {len(df):,}')
    print(f'\nColumns: {df.columns.tolist()}')
    print(f'\nüìà Sentiment Distribution:')
    print(df['sentiment_label'].value_counts())

    # Visualize the class balance as a bar chart and a pie chart.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    colors = ['#e74c3c', '#f39c12', '#95a5a6', '#3498db', '#2ecc71']
    # fill_value=0 keeps a class with no rows plottable instead of NaN.
    sentiment_counts = df['sentiment_label'].value_counts().reindex(LABEL_NAMES, fill_value=0)
    axes[0].bar(range(NUM_CLASSES), sentiment_counts.values, color=colors)
    axes[0].set_xticks(range(NUM_CLASSES))
    axes[0].set_xticklabels(LABEL_NAMES, rotation=45, ha='right')
    axes[0].set_title('Sentiment Distribution (5 Classes)')
    axes[0].set_ylabel('Count')

    axes[1].pie(sentiment_counts.values, labels=LABEL_NAMES,
                autopct='%1.1f%%', colors=colors)
    axes[1].set_title('Sentiment Percentage')

    plt.tight_layout()
    plt.show()

    # Show one random review per class.
    print('\nüìù Sample Reviews per Class:')
    for label_name in LABEL_NAMES:
        class_rows = df[df['sentiment_label'] == label_name]
        if class_rows.empty:
            # Sampling from an empty frame would raise; report and move on.
            print(f'\n[{label_name.upper()}] (no samples)')
            continue
        sample = class_rows.sample(1).iloc[0]
        print(f'\n[{label_name.upper()}] Rating {sample["rating"]}:')
        # str() guards against non-string cells (e.g. a missing review).
        print(f'   "{str(sample["review"])[:100]}..."')

## ⚖️ 2. Data Preparation & Split

In [None]:
# Stratified split into Train (70%), Val (15%) and Test (15%).
# Stratifying on the label keeps class proportions the same in every split.

# Step 1: carve out the 70% training portion.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['label']
)

# Step 2: halve the remaining 30% into validation and test.
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

print('=' * 60)
print('üìä DATA SPLIT')
print('=' * 60)
for title, part in (('Training set', train_df), ('Validation set', val_df), ('Test set', test_df)):
    print(f'{title}: {len(part):,} samples ({len(part)/len(df)*100:.1f}%)')

# Verify stratification: per-class percentage within each split.
print('\nüìà Distribution per split:')
for split_name, split_df in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    pct = split_df['label'].value_counts(normalize=True).sort_index() * 100
    cells = [f'{LABEL_NAMES[i][:4]}: {pct[i]:.1f}%' for i in range(5)]
    print(f'{split_name}: ' + ' | '.join(cells))

## 🔧 3. Load IndoBERT Tokenizer

In [None]:
# Load the IndoBERT tokenizer.
MODEL_NAME = 'indobenchmark/indobert-base-p1'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Token count (special tokens included) of every review, used to choose MAX_LEN.
text_lengths = df['review'].map(
    lambda review: len(tokenizer.encode(str(review), add_special_tokens=True))
)
p95 = text_lengths.quantile(0.95)
p99 = text_lengths.quantile(0.99)

print('=' * 60)
print('üìè TEXT LENGTH ANALYSIS')
print('=' * 60)
print(f'Min tokens: {text_lengths.min()}')
print(f'Max tokens: {text_lengths.max()}')
print(f'Mean tokens: {text_lengths.mean():.1f}')
print(f'Median tokens: {text_lengths.median()}')
print(f'95th percentile: {p95:.0f}')
print(f'99th percentile: {p99:.0f}')

# MAX_LEN = 95th percentile plus a margin of 10 tokens, capped at 128.
MAX_LEN = min(int(p95) + 10, 128)
print(f'\n‚úì Using MAX_LEN = {MAX_LEN}')

# Histogram of token counts with the chosen cut-off marked.
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(text_lengths, bins=50, edgecolor='black', alpha=0.7)
ax.axvline(x=MAX_LEN, color='r', linestyle='--', label=f'MAX_LEN = {MAX_LEN}')
ax.set_xlabel('Token Count')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Text Lengths')
ax.legend()
plt.show()

## 📦 4. Dataset Class dengan Data Augmentation

In [None]:
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes review texts on the fly.

    Each item is a dict with `input_ids` and `attention_mask` (1-D tensors of
    length `max_len`) and `label` (a scalar long tensor).

    Args:
        reviews: Sequence of review texts (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids, aligned with `reviews`.
        tokenizer: Callable tokenizer following the Hugging Face `__call__` API.
        max_len: Sequences are padded/truncated to this many tokens.
        augment: When True, random word dropout is applied to some samples.

    Raises:
        ValueError: If `reviews` and `labels` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_len, augment=False):
        if len(reviews) != len(labels):
            raise ValueError(
                f'reviews and labels must have the same length, '
                f'got {len(reviews)} and {len(labels)}'
            )
        # Materialise as lists so __getitem__ always indexes by position, even
        # if a pandas Series with a non-contiguous index is passed in.
        self.reviews = list(reviews)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment

    def __len__(self):
        return len(self.reviews)

    def _augment_text(self, text):
        """Random word dropout: return `text` with some words removed.

        The text is returned unchanged when augmentation is disabled, on
        roughly 70% of calls, when it has three words or fewer, or when the
        dropout would leave fewer than two words.
        """
        if not self.augment or random.random() > 0.3:  # 30% chance to augment
            return text

        words = text.split()
        if len(words) <= 3:
            return text

        # Each word is dropped independently with a rate drawn from [0.1, 0.2].
        drop_rate = random.uniform(0.1, 0.2)
        keep_words = [w for w in words if random.random() > drop_rate]

        if len(keep_words) < 2:
            return text

        return ' '.join(keep_words)

    def __getitem__(self, idx):
        review = self._augment_text(str(self.reviews[idx]))
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # Hugging Face Transformers in favour of `__call__`.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension; flatten removes it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long)
        }

# Build the three datasets; only the training set applies augmentation.
train_dataset = SentimentDataset(
    train_df['review'].values,
    train_df['label'].values,
    tokenizer,
    MAX_LEN,
    augment=True  # Enable augmentation for training
)

val_dataset = SentimentDataset(
    val_df['review'].values,
    val_df['label'].values,
    tokenizer,
    MAX_LEN,
    augment=False
)

test_dataset = SentimentDataset(
    test_df['review'].values,
    test_df['label'].values,
    tokenizer,
    MAX_LEN,
    augment=False
)

# Create data loaders
BATCH_SIZE = 16  # Smaller batch for 5 classes

# nn.BatchNorm1d raises in training mode when a batch holds a single sample
# ("Expected more than 1 value per channel when training"). Drop the trailing
# training batch only when it would contain exactly one sample; otherwise every
# sample is kept. Evaluation runs in eval mode, so val/test keep all samples.
drop_singleton_tail = len(train_dataset) % BATCH_SIZE == 1

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    drop_last=drop_singleton_tail
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

print(f'‚úì Train batches: {len(train_loader)}')
print(f'‚úì Val batches: {len(val_loader)}')
print(f'‚úì Test batches: {len(test_loader)}')

## 🏗️ 5. Model Architecture dengan Anti-Overfitting

In [None]:
class IndoBERTSentiment5Class(nn.Module):
    """Pretrained BERT encoder followed by a three-layer MLP classification head.

    Args:
        model_name: Identifier passed to `BertModel.from_pretrained`.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability in the head (halved before the
            final linear layer).
        freeze_bert_layers: When > 0, the embeddings and the first N encoder
            layers are frozen so that only the upper layers are fine-tuned.
    """

    def __init__(self, model_name, num_classes=5, dropout_rate=0.3, freeze_bert_layers=6):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.hidden_size = self.bert.config.hidden_size

        # Freeze the lower part of the encoder to limit overfitting.
        if freeze_bert_layers > 0:
            self.bert.embeddings.requires_grad_(False)
            for layer_idx in range(freeze_bert_layers):
                self.bert.encoder.layer[layer_idx].requires_grad_(False)

        # MLP head: hidden_size -> 512 -> 256 -> num_classes.
        head_layers = [
            nn.Dropout(dropout_rate),
            nn.Linear(self.hidden_size, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(dropout_rate * 0.5),  # lighter dropout before the output layer
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every linear layer in the head."""
        linear_layers = (m for m in self.classifier.modules() if isinstance(m, nn.Linear))
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
            if linear.bias is not None:
                nn.init.zeros_(linear.bias)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids and masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The head consumes the encoder's pooled output (`pooler_output`).
        return self.classifier(encoded.pooler_output)

# Build the model (first 6 of 12 encoder layers frozen) and move it to the device.
model = IndoBERTSentiment5Class(
    MODEL_NAME,
    num_classes=NUM_CLASSES,
    dropout_rate=0.3,
    freeze_bert_layers=6
).to(device)

# Tally parameters by whether they will receive gradient updates.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)
frozen_params = total_params - trainable_params

print('=' * 60)
print('üèóÔ∏è  MODEL ARCHITECTURE')
print('=' * 60)
print(f'Total parameters: {total_params:,}')
print(f'Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.1f}%)')
print(f'Frozen parameters: {frozen_params:,} ({frozen_params/total_params*100:.1f}%)')
print(f'\n‚úì Freezing helps prevent overfitting by limiting trainable params')

## 📉 6. Loss Function dengan Label Smoothing dan Focal Loss

In [None]:
class FocalLossWithLabelSmoothing(nn.Module):
    """Focal loss computed against label-smoothed targets.

    - Focal term: each class term is scaled by (1 - p_c) ** gamma, which
      down-weights classes the model already predicts confidently.
    - Label smoothing: the target distribution puts `1 - smoothing` on the
      true class and spreads `smoothing` uniformly over all classes.

    Args:
        num_classes: Number of classes (used for the smoothing mass).
        gamma: Focusing exponent; 0 disables the focal term.
        alpha: Optional per-class weights (list, array or tensor).
        smoothing: Label-smoothing factor in [0, 1).
    """

    def __init__(self, num_classes=5, gamma=2.0, alpha=None, smoothing=0.1):
        super().__init__()
        self.num_classes = num_classes
        self.gamma = gamma  # Focusing parameter
        self.smoothing = smoothing

        # Keep alpha as a (non-persistent) buffer so it follows the module on
        # `.to(device)` / `.cuda()` without changing the state_dict.
        # `as_tensor` also accepts an existing tensor without the
        # copy-construct warning that `torch.tensor(tensor)` emits.
        if alpha is not None:
            alpha = torch.as_tensor(alpha, dtype=torch.float32).detach().clone()
        self.register_buffer('alpha', alpha, persistent=False)

    def forward(self, inputs, targets):
        """Return the mean loss.

        Args:
            inputs: Raw logits; indexed as (batch, classes) by the code below.
            targets: Integer class ids, one per row of `inputs`.
        """
        # Smoothed one-hot targets.
        confidence = 1.0 - self.smoothing
        smooth_labels = torch.zeros_like(inputs).scatter_(
            1, targets.unsqueeze(1), confidence
        )
        smooth_labels += self.smoothing / self.num_classes

        # Class probabilities via log-softmax.
        log_probs = F.log_softmax(inputs, dim=1)
        probs = torch.exp(log_probs)

        # Per-class focal weight: (1 - p_c)^gamma
        focal_weight = (1 - probs) ** self.gamma

        focal_loss = -focal_weight * smooth_labels * log_probs

        # Optional per-class weighting. The `.to` is a no-op once the module
        # lives on the same device as the inputs, and keeps the loss usable
        # when the criterion itself was never moved.
        if self.alpha is not None:
            alpha = self.alpha.to(inputs.device)
            focal_loss = alpha.unsqueeze(0) * focal_loss

        return focal_loss.sum(dim=1).mean()

# Inverse-frequency class weights from the training split, rescaled so that
# the weights sum to NUM_CLASSES.
class_counts = train_df['label'].value_counts().sort_index().values
inverse_freq = 1.0 / class_counts
class_weights = inverse_freq / inverse_freq.sum() * NUM_CLASSES

print('üìä Class Weights:')
for class_name, class_weight in zip(LABEL_NAMES, class_weights):
    print(f'   {class_name}: {class_weight:.4f}')

# Loss: focal term (gamma) + per-class weights (alpha) + label smoothing.
criterion = FocalLossWithLabelSmoothing(
    num_classes=NUM_CLASSES,
    gamma=2.0,
    alpha=class_weights.tolist(),
    smoothing=0.1
)

print('\n‚úì Loss Function: Focal Loss + Label Smoothing')
print(f'   - Gamma (focal): 2.0')
print(f'   - Label Smoothing: 0.1')

## ⚙️ 7. Training Configuration

In [None]:
# Hyperparameters chosen to limit overfitting.
EPOCHS = 10
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01  # L2 regularization
WARMUP_RATIO = 0.1
MAX_GRAD_NORM = 1.0  # Gradient clipping
PATIENCE = 3  # Early stopping patience

# AdamW applies the weight decay configured above.
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    eps=1e-8
)

# Linear warmup followed by linear decay, stepped once per batch.
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print('=' * 60)
print('‚öôÔ∏è  TRAINING CONFIGURATION')
print('=' * 60)
config_rows = [
    ('Epochs', EPOCHS),
    ('Batch size', BATCH_SIZE),
    ('Learning rate', LEARNING_RATE),
    ('Weight decay (L2)', WEIGHT_DECAY),
    ('Warmup steps', warmup_steps),
    ('Total steps', total_steps),
    ('Gradient clipping', MAX_GRAD_NORM),
    ('Early stopping patience', PATIENCE),
]
for row_label, row_value in config_rows:
    print(f'{row_label}: {row_value}')

## 🚀 8. Training Loop dengan Early Stopping

In [None]:
def train_epoch(model, data_loader, criterion, optimizer, scheduler, device, max_grad_norm):
    """Run one training pass over `data_loader`.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    to `max_grad_norm`, then one optimizer step and one scheduler step.

    Returns:
        (avg_loss, accuracy, macro_f1), where `avg_loss` is the mean of the
        per-batch losses and the two scores are computed over all predictions
        made during the epoch.
    """
    model.train()
    running_loss = 0
    all_preds, all_targets = [], []

    batches = tqdm(data_loader, desc='Training', leave=False)
    for batch in batches:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, targets)
        loss.backward()

        # Clip before stepping so a single bad batch cannot blow up the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        batch_preds = torch.max(logits, dim=1)[1]
        all_preds.extend(batch_preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

        batches.set_postfix({'loss': batch_loss})

    return (
        running_loss / len(data_loader),
        accuracy_score(all_targets, all_preds),
        f1_score(all_targets, all_preds, average='macro'),
    )

def eval_model(model, data_loader, criterion, device):
    """Evaluate `model` on `data_loader` without computing gradients.

    Returns:
        (avg_loss, accuracy, macro_f1, predictions, actual_labels), where
        `avg_loss` is the mean of the per-batch losses and the last two items
        are flat lists covering every sample in the loader.
    """
    model.eval()
    running_loss = 0
    predictions, actual_labels = [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating', leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            running_loss += criterion(logits, targets).item()

            batch_preds = torch.max(logits, dim=1)[1]
            predictions.extend(batch_preds.cpu().numpy())
            actual_labels.extend(targets.cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    acc = accuracy_score(actual_labels, predictions)
    macro_f1 = f1_score(actual_labels, predictions, average='macro')

    return mean_loss, acc, macro_f1, predictions, actual_labels

In [None]:
# Training with early stopping on validation macro-F1.
print('=' * 60)
print('üöÄ STARTING TRAINING')
print('=' * 60)

# Start below any attainable F1 so the first completed epoch always records a
# checkpoint; starting at 0 with a strict '>' could leave best_model_state as
# None (validation F1 of exactly 0) and crash when restoring below.
best_val_f1 = float('-inf')
best_model_state = None
patience_counter = 0
history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': []
}

for epoch in range(EPOCHS):
    print(f'\nüìÖ Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 40)

    # Training
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, scheduler, device, MAX_GRAD_NORM
    )

    # Validation
    val_loss, val_acc, val_f1, _, _ = eval_model(
        model, val_loader, criterion, device
    )

    # Record history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['train_f1'].append(train_f1)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1)

    # Print metrics
    print(f'Train - Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}')
    print(f'Val   - Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}')

    # Warn when training accuracy runs more than 10 points ahead of validation.
    overfit_gap = train_acc - val_acc
    if overfit_gap > 0.1:
        print(f'‚ö†Ô∏è  Overfitting warning! Gap: {overfit_gap:.4f}')

    # Early stopping check: keep a copy of the weights from the best epoch.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        # deepcopy detaches the snapshot from the live weights, which keep training.
        best_model_state = copy.deepcopy(model.state_dict())
        patience_counter = 0
        print(f'‚úì New best model! Val F1: {val_f1:.4f}')
    else:
        patience_counter += 1
        print(f'No improvement. Patience: {patience_counter}/{PATIENCE}')

        if patience_counter >= PATIENCE:
            print(f'\n‚èπÔ∏è  Early stopping triggered at epoch {epoch + 1}')
            break

# Restore the best checkpoint (absent only when no epoch was run).
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f'\n‚úì Loaded best model with Val F1: {best_val_f1:.4f}')
else:
    print('\nNo epoch was run; model weights left unchanged.')

## 📊 9. Training History Visualization

In [None]:
# Plot the per-epoch training curves: one panel per metric.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

epochs_range = range(1, len(history['train_loss']) + 1)

# (history key suffix, legend suffix, y-axis label / title suffix)
panels = [
    ('loss', 'Loss', 'Loss'),
    ('acc', 'Acc', 'Accuracy'),
    ('f1', 'F1', 'F1 Score'),
]
for ax, (key, short_name, full_name) in zip(axes, panels):
    ax.plot(epochs_range, history[f'train_{key}'], 'b-', label=f'Train {short_name}', marker='o')
    ax.plot(epochs_range, history[f'val_{key}'], 'r-', label=f'Val {short_name}', marker='o')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(full_name)
    ax.set_title(f'Training vs Validation {full_name}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_history_5class.png', dpi=150, bbox_inches='tight')
plt.show()

# Overfitting analysis based on the metrics of the last epoch that was run.
final_train_acc = history['train_acc'][-1]
final_val_acc = history['val_acc'][-1]
gap = final_train_acc - final_val_acc

print('\nüìä OVERFITTING ANALYSIS')
print('=' * 40)
print(f'Final Train Accuracy: {final_train_acc:.4f}')
print(f'Final Val Accuracy: {final_val_acc:.4f}')
print(f'Gap (Train - Val): {gap:.4f}')

if gap < 0.05:
    print('‚úì Model is NOT overfitting (gap < 5%)')
elif gap < 0.10:
    print('‚ö†Ô∏è  Model shows slight overfitting (gap 5-10%)')
else:
    print('‚ùå Model is overfitting (gap > 10%)')

## 🧪 10. Final Evaluation on Test Set

In [None]:
# Evaluate the restored model on the held-out test set.
print('=' * 60)
print('üß™ FINAL EVALUATION ON TEST SET')
print('=' * 60)

test_loss, test_acc, test_f1, test_preds, test_labels = eval_model(
    model, test_loader, criterion, device
)

print(f'\nTest Results:')
print(f'  Loss: {test_loss:.4f}')
print(f'  Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)')
print(f'  Macro F1: {test_f1:.4f}')

# Detailed per-class report. `labels` is passed explicitly so the rows always
# line up with LABEL_NAMES, even if some class never appears in the test
# labels or predictions (otherwise the label/target_names counts can mismatch).
print('\nüìã CLASSIFICATION REPORT')
print('=' * 60)
print(classification_report(
    test_labels,
    test_preds,
    labels=list(range(len(LABEL_NAMES))),
    target_names=LABEL_NAMES,
    digits=4
))

In [None]:
# Confusion matrix. `labels` is fixed to the full class list so the matrix is
# always len(LABEL_NAMES) x len(LABEL_NAMES), matching the tick labels and the
# index-based analysis below even if a class is absent from the test results.
class_ids = list(range(len(LABEL_NAMES)))
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)

# Row-normalize; rows with no samples stay at zero instead of dividing by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Raw counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=axes[0])
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
axes[0].set_title('Confusion Matrix (Counts)')

# Normalized (percentages)
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues',
            xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=axes[1])
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
axes[1].set_title('Confusion Matrix (Normalized)')

plt.tight_layout()
plt.savefig('confusion_matrix_5class.png', dpi=150, bbox_inches='tight')
plt.show()

# Per-class recall plus the classes each one is most often confused with.
print('\nüìä CONFUSION ANALYSIS')
print('=' * 60)
for i, label in enumerate(LABEL_NAMES):
    correct = cm[i, i]
    total = cm[i].sum()
    if total == 0:
        print(f'{label}: no samples in test set')
        continue
    acc = correct / total
    print(f'{label}: {acc:.2%} correct ({correct}/{total})')

    # Show main confusions
    for j, other_label in enumerate(LABEL_NAMES):
        if i != j and cm[i, j] > 0:
            conf_rate = cm[i, j] / total
            if conf_rate > 0.05:  # Show if > 5% confusion
                print(f'   ‚îî‚îÄ {conf_rate:.1%} confused with {other_label}')

## 💾 11. Save Model

In [None]:
# Create models directory
os.makedirs('models', exist_ok=True)

# Metric values may be NumPy scalars (e.g. returned by scikit-learn). Cast them
# to plain Python floats so the checkpoint holds only tensors and builtin
# types, which keeps it loadable with `torch.load(..., weights_only=True)`,
# and so the JSON dump never depends on NumPy scalar types.
history_plain = {
    metric: [float(value) for value in values]
    for metric, values in history.items()
}
test_metrics = {
    'accuracy': float(test_acc),
    'f1_score': float(test_f1),
    'loss': float(test_loss)
}

# Save model weights together with the metadata needed to rebuild and use it.
model_path = 'models/indobert_sentiment_5class_best.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': {
        'model_name': MODEL_NAME,
        'num_classes': NUM_CLASSES,
        'max_len': MAX_LEN,
        'label_map': LABEL_MAP,
        'label_names': LABEL_NAMES
    },
    'training_history': history_plain,
    'test_metrics': test_metrics
}, model_path)

print(f'‚úì Model saved to: {model_path}')

# Save a human-readable summary of the training configuration and results.
config = {
    'model_name': MODEL_NAME,
    'num_classes': NUM_CLASSES,
    'max_len': MAX_LEN,
    'batch_size': BATCH_SIZE,
    'epochs_trained': len(history['train_loss']),
    'learning_rate': LEARNING_RATE,
    'weight_decay': WEIGHT_DECAY,
    'dropout_rate': 0.3,
    'label_smoothing': 0.1,
    'focal_gamma': 2.0,
    'frozen_layers': 6,
    'test_accuracy': test_metrics['accuracy'],
    'test_f1': test_metrics['f1_score'],
    'label_map': LABEL_MAP,
    'label_names': LABEL_NAMES
}

with open('models/training_config_5class.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

print(f'‚úì Config saved to: models/training_config_5class.json')

## 🔮 12. Prediction Function

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_len=MAX_LEN):
    """Predict the sentiment of a single text.

    Puts `model` into eval mode as a side effect.

    Args:
        text: Input text; coerced with `str()` as the dataset class does.
        model: Classifier called as `model(input_ids, attention_mask)`.
        tokenizer: Callable tokenizer following the Hugging Face `__call__` API.
        device: Device the encoded inputs are moved to.
        max_len: Sequences are padded/truncated to this many tokens.

    Returns:
        (predicted_label, confidence, all_probabilities): the label name with
        the highest softmax probability, that probability as a float, and a
        dict mapping every label name to its probability.
    """
    model.eval()

    # Call the tokenizer directly: `encode_plus` is deprecated in
    # Hugging Face Transformers in favour of `__call__`.
    encoding = tokenizer(
        str(text),
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probs = F.softmax(outputs, dim=1)
        confidence, predicted = torch.max(probs, dim=1)

    predicted_label = LABEL_NAMES[predicted.item()]
    all_probs = {name: probs[0][i].item() for i, name in enumerate(LABEL_NAMES)}

    return predicted_label, confidence.item(), all_probs

# Run the prediction helper on a few hand-written example reviews.
print('=' * 60)
print('üîÆ SAMPLE PREDICTIONS')
print('=' * 60)

test_texts = [
    "Aplikasi ini sangat bagus dan membantu sekali!",
    "Lumayan lah aplikasinya, cukup membantu",
    "Biasa aja, tidak ada yang istimewa",
    "Kurang bagus, sering error dan lambat",
    "Aplikasi sampah! Sangat mengecewakan, tidak akan pakai lagi!"
]

for text in test_texts:
    label, conf, probs = predict_sentiment(text, model, tokenizer, device)
    print(f'\nüìù "{text[:50]}..."')
    print(f'   Prediction: {label.upper()} (confidence: {conf:.2%})')
    print(f'   Probabilities:')
    # Most likely class first; the bar is a 20-character-wide text gauge.
    ranked = sorted(probs.items(), key=lambda pair: pair[1], reverse=True)
    for class_name, class_prob in ranked:
        gauge = '‚ñà' * int(class_prob * 20)
        print(f'      {class_name:15s}: {class_prob:.2%} {gauge}')

## 📈 13. Final Summary

In [None]:
# Final report: test metrics, techniques used, last-epoch overfitting gap, saved files.
print('=' * 60)
print('üìà FINAL TRAINING SUMMARY')
print('=' * 60)

# Accuracy of the last epoch that was run, and the resulting train/val gap.
last_train_acc = history["train_acc"][-1]
last_val_acc = history["val_acc"][-1]
acc_gap = last_train_acc - last_val_acc
overfit_status = "‚úì Not Overfitting" if acc_gap < 0.05 else "‚ö†Ô∏è Check overfitting"

print(f'''
üéØ MODEL PERFORMANCE:
   ‚Ä¢ Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)
   ‚Ä¢ Test Macro F1: {test_f1:.4f}
   
üõ°Ô∏è ANTI-OVERFITTING TECHNIQUES USED:
   ‚úì Balanced Dataset (2,500 per class)
   ‚úì Dropout (0.3)
   ‚úì Label Smoothing (0.1)
   ‚úì Focal Loss (gamma=2.0)
   ‚úì Weight Decay / L2 Regularization (0.01)
   ‚úì Learning Rate Warmup
   ‚úì Gradient Clipping (max_norm=1.0)
   ‚úì Early Stopping (patience=3)
   ‚úì Layer Freezing (6 of 12 BERT layers)
   ‚úì Data Augmentation (random word dropout)
   ‚úì Stratified Train/Val/Test Split

üìä OVERFITTING CHECK:
   ‚Ä¢ Train Accuracy: {last_train_acc:.4f}
   ‚Ä¢ Val Accuracy: {last_val_acc:.4f}
   ‚Ä¢ Gap: {acc_gap:.4f}
   ‚Ä¢ Status: {overfit_status}

üíæ SAVED FILES:
   ‚Ä¢ Model: models/indobert_sentiment_5class_best.pt
   ‚Ä¢ Config: models/training_config_5class.json
   ‚Ä¢ Plots: training_history_5class.png, confusion_matrix_5class.png
''')

print('\nüéâ Training Complete!')