## 🔧 1. Setup & Check GPU

In [None]:
# Install the packages if they are not already present
# !pip install torch transformers scikit-learn pandas numpy matplotlib seaborn tqdm

import torch
import sys
import platform

# Report interpreter, framework and OS details.
_rule = '=' * 60
print(_rule)
print('üñ•Ô∏è  SYSTEM INFO')
print(_rule)
print(f'Python: {sys.version}')
print(f'PyTorch: {torch.__version__}')
print(f'Platform: {platform.system()} {platform.release()}')
print()

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if not torch.cuda.is_available():
    print('‚ö†Ô∏è CUDA not available, using CPU')
    print('   Training will be SLOW!')
    device = torch.device('cpu')
else:
    print('‚úÖ CUDA is available!')
    print(f'   GPU: {torch.cuda.get_device_name(0)}')
    print(f'   CUDA Version: {torch.version.cuda}')
    # total_memory is divided by 1024**3 to display GB.
    print(f'   Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB')
    device = torch.device('cuda')

print(f'\nüéÆ Using device: {device}')

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, 
    classification_report, confusion_matrix, f1_score
)
from sklearn.utils import resample
import random
import os
import copy
import json
from datetime import datetime

# Suppress all warnings to keep the notebook output readable.
# Note that this also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

# Reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices), then switches cuDNN to
    deterministic mode with benchmark autotuning turned off.

    Args:
        seed: Seed value passed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade some speed for repeatable cuDNN kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Fix all RNG seeds before any data shuffling or model initialisation,
# then confirm the imports succeeded and which device is active.
set_seed(seed=42)
print('‚úì Libraries imported')
print(f'‚úì Device: {device}')

## 📊 2. Load Data

In [None]:
# Load the review data from the data/ folder
DATA_PATH = 'data/gojek_reviews_final_augmented.csv'

if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)

    _rule = '=' * 60
    print(_rule)
    print('üìä DATA OVERVIEW')
    print(_rule)
    print(f'Total samples: {len(df):,}')
    print(f'Columns: {df.columns.tolist()}')
    print(f'\nüìà Sentiment Distribution:')
    print(df['sentiment'].value_counts())

    # Bar chart of class frequencies with one fixed colour per sentiment.
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    colors = {'negative': '#e74c3c', 'neutral': '#95a5a6', 'positive': '#2ecc71'}
    counts = df['sentiment'].value_counts()
    bar_colors = [colors[sentiment_name] for sentiment_name in counts.index]
    bars = ax.bar(counts.index, counts.values, color=bar_colors)
    ax.set_title('Sentiment Distribution')
    ax.set_ylabel('Count')
    # Write each count centred slightly above its bar.
    for bar, count in zip(bars, counts.values):
        x_center = bar.get_x() + bar.get_width()/2
        y_above = bar.get_height() + 50
        ax.text(x_center, y_above, str(count), ha='center', fontweight='bold')
    plt.tight_layout()
    plt.show()
else:
    # File is missing: report it and list what the data/ folder contains, if it exists.
    print(f'‚ùå File tidak ditemukan: {DATA_PATH}')
    print('\nüìÅ Files yang ada:')
    if os.path.exists('data'):
        for entry in os.listdir('data'):
            print(f'   - data/{entry}')

## ⚖️ 3. Prepare Data

In [None]:
# Check whether the classes are already (nearly) balanced: the relative
# difference between the largest and smallest class must be below 10%.
counts = df['sentiment'].value_counts()
min_count = counts.min()
max_count = counts.max()

if (max_count - min_count) / max_count < 0.1:
    print('‚úì Data sudah balanced!')
    df_balanced = df.copy()
else:
    print('‚ö†Ô∏è Melakukan undersampling...')
    # Undersample every class down to the size of the smallest one.
    # The per-class frames are collected first and concatenated once, which
    # avoids repeatedly copying a growing frame and avoids concatenating onto
    # an empty DataFrame.
    sampled_parts = [
        resample(
            df[df['sentiment'] == sentiment],
            replace=False,
            n_samples=min_count,
            random_state=42,
        )
        for sentiment in ['negative', 'neutral', 'positive']
    ]
    df_balanced = pd.concat(sampled_parts)
    # Shuffle so the classes are interleaved, then renumber the rows.
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f'\nüìä Data untuk training: {len(df_balanced):,} samples')
print(df_balanced['sentiment'].value_counts())

In [None]:
# Label mapping: the class index is the position of the name in LABEL_NAMES
LABEL_NAMES = ['negative', 'neutral', 'positive']
LABEL_MAP = {name: index for index, name in enumerate(LABEL_NAMES)}
NUM_CLASSES = len(LABEL_NAMES)

df_balanced['label'] = df_balanced['sentiment'].map(LABEL_MAP)

# Split: 70% train, 15% val, 15% test.
# First hold out 30%, then cut the held-out part in half; both cuts are stratified by label.
train_df, temp_df = train_test_split(
    df_balanced,
    test_size=0.3,
    random_state=42,
    stratify=df_balanced['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

_rule = '=' * 60
print(_rule)
print('üìÇ DATA SPLITS')
print(_rule)
_total_rows = len(df_balanced)
for _split_label, _split_df in (('Train:', train_df), ('Val:  ', val_df), ('Test: ', test_df)):
    print(f'{_split_label} {len(_split_df):,} samples ({len(_split_df)/_total_rows*100:.1f}%)')

## ⚙️ 4. Configuration

In [None]:
# === HYPERPARAMETERS ===
# Tuned to avoid overfitting

CONFIG = dict(
    # Model
    model_name='indobenchmark/indobert-base-p1',
    max_length=128,
    num_classes=NUM_CLASSES,

    # Training
    batch_size=32,
    epochs=30,
    learning_rate=5e-6,  # Very small, to avoid overfitting

    # Anti-overfitting
    dropout_rate=0.6,
    attention_dropout=0.3,
    weight_decay=0.05,
    label_smoothing=0.2,
    warmup_ratio=0.15,
    max_grad_norm=0.5,
    early_stopping_patience=7,

    # Data augmentation
    word_dropout_prob=0.2,

    # Layer freezing
    freeze_layers=9,  # Freeze 9 of the 12 layers

    # R-Drop
    rdrop_alpha=0.5,
)

_rule = '=' * 60
print(_rule)
print('‚öôÔ∏è  CONFIGURATION')
print(_rule)
# One "key: value" line per setting, in declaration order.
print('\n'.join(f'{key}: {value}' for key, value in CONFIG.items()))

## 📦 5. Dataset & DataLoader

In [None]:
# Load the tokenizer belonging to the checkpoint named in CONFIG
print('Loading tokenizer...')
_checkpoint_name = CONFIG['model_name']
tokenizer = BertTokenizer.from_pretrained(_checkpoint_name)
print(f'‚úì Tokenizer loaded: {_checkpoint_name}')

class SentimentDataset(Dataset):
    """Dataset that tokenizes texts on access, with optional random augmentation.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` (both
    flattened to 1-D) and a ``label`` tensor of dtype ``torch.long``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128,
                 augment=False, word_dropout_prob=0.2):
        """Store the samples and the tokenization/augmentation settings.

        Args:
            texts: Indexable collection of raw texts.
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Object exposing ``encode_plus``.
            max_length: ``max_length`` forwarded to the tokenizer.
            augment: Whether ``__getitem__`` applies random augmentation.
            word_dropout_prob: Per-word drop probability for word dropout.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length
        self.augment, self.word_dropout_prob = augment, word_dropout_prob

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def _augment_text(self, text):
        """Randomly perturb ``text`` when augmentation is enabled.

        The input is returned untouched when augmentation is off. Texts of
        three words or fewer are returned as ``str(text)``. Otherwise one
        random draw selects the operation: word dropout (< 0.3), swapping
        two adjacent words (< 0.5), deleting one word (< 0.7), or nothing.
        The words are then re-joined with single spaces. If every word was
        dropped, the original text is returned instead.
        """
        if not self.augment:
            return text

        text = str(text)
        tokens = text.split()
        if len(tokens) <= 3:
            return text

        draw = random.random()
        if draw < 0.3:
            # Word dropout: keep a word only when its own draw exceeds the drop probability.
            drop_prob = self.word_dropout_prob
            tokens = [token for token in tokens if random.random() > drop_prob]
        elif draw < 0.5:
            # Swap one randomly chosen pair of neighbouring words.
            if len(tokens) > 2:
                left = random.randint(0, len(tokens) - 2)
                right = left + 1
                tokens[left], tokens[right] = tokens[right], tokens[left]
        elif draw < 0.7:
            # Delete a single random word, but only from texts longer than four words.
            if len(tokens) > 4:
                tokens.pop(random.randint(0, len(tokens) - 1))

        if not tokens:
            return text
        return ' '.join(tokens)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        sample_text = str(self._augment_text(self.texts[idx]))

        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Create datasets: only the training split is augmented
_max_len = CONFIG['max_length']

train_dataset = SentimentDataset(
    train_df['content_clean'].values,
    train_df['label'].values,
    tokenizer,
    max_length=_max_len,
    augment=True,
    word_dropout_prob=CONFIG['word_dropout_prob'],
)

# Validation and test use the same settings, with augmentation switched off.
val_dataset, test_dataset = (
    SentimentDataset(
        _frame['content_clean'].values,
        _frame['label'].values,
        tokenizer,
        max_length=_max_len,
        augment=False,
    )
    for _frame in (val_df, test_df)
)

# Create dataloaders: training batches are shuffled and the last incomplete
# batch is dropped; evaluation loaders keep their order and every sample.
_batch_size = CONFIG['batch_size']
train_loader = DataLoader(train_dataset, batch_size=_batch_size, shuffle=True, drop_last=True)
val_loader, test_loader = (
    DataLoader(_dataset, batch_size=_batch_size, shuffle=False)
    for _dataset in (val_dataset, test_dataset)
)

print(f'\n‚úì Datasets created:')
for _label, _dataset, _loader in (
    ('Train:', train_dataset, train_loader),
    ('Val:  ', val_dataset, val_loader),
    ('Test: ', test_dataset, test_loader),
):
    print(f'  {_label} {len(_dataset)} samples, {len(_loader)} batches')

## 🧠 6. Model

In [None]:
class IndoBERTSentimentClassifier(nn.Module):
    """IndoBERT encoder with a heavily regularised linear classification head.

    Regularisation applied:
    - the embeddings and the first ``freeze_layers`` encoder layers are frozen
    - the attention dropout modules of the remaining encoder layers are
      replaced with ``nn.Dropout(attention_dropout)``
    - layer norm followed by two stacked dropout layers precedes a single
      linear classifier
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.6,
                 attention_dropout=0.3, freeze_layers=9):
        """Load the pretrained encoder and build the classification head.

        Args:
            model_name: Checkpoint name/path passed to ``BertModel.from_pretrained``.
            num_classes: Number of output logits.
            dropout_rate: Probability used by both head dropout layers.
            attention_dropout: Probability for the replaced attention dropouts.
            freeze_layers: Number of leading encoder layers to freeze.
        """
        super().__init__()

        # Load pretrained BERT
        self.bert = BertModel.from_pretrained(model_name)
        self.hidden_size = self.bert.config.hidden_size

        # Use the real encoder depth instead of assuming 12 layers, so
        # checkpoints with a different depth are handled correctly.
        encoder_layers = self.bert.encoder.layer
        num_layers = len(encoder_layers)

        # Freeze embeddings
        for param in self.bert.embeddings.parameters():
            param.requires_grad = False

        # Freeze first N encoder layers
        for i in range(freeze_layers):
            for param in encoder_layers[i].parameters():
                param.requires_grad = False

        # Swap in stronger dropout for the attention blocks of the unfrozen layers.
        # NOTE(review): some transformers attention implementations may take the
        # dropout probability from the config rather than from this module;
        # confirm the replacement takes effect for the installed version.
        for i in range(freeze_layers, num_layers):
            encoder_layers[i].attention.self.dropout = nn.Dropout(attention_dropout)
            encoder_layers[i].attention.output.dropout = nn.Dropout(attention_dropout)

        trainable_layers = num_layers - freeze_layers
        print(f'‚úì Froze embeddings and first {freeze_layers} encoder layers')
        if trainable_layers > 0:
            print(f'  Only layers {freeze_layers}-{num_layers - 1} are trainable '
                  f'({trainable_layers} layers)')
        else:
            print('  No encoder layers are trainable')

        # Regularization
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(self.hidden_size)

        # Simple classifier
        self.fc = nn.Linear(self.hidden_size, num_classes)

        # Initialize the head: Xavier-uniform weights, zero bias
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            The output of the linear head applied to the encoder's pooled output.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        x = self.layer_norm(outputs.pooler_output)

        # Two dropout layers back to back. nn.Dropout is the identity in eval
        # mode, so no explicit ``self.training`` check is needed.
        x = self.dropout2(self.dropout1(x))

        return self.fc(x)

In [None]:
# Initialize model: constructor arguments share their names with the CONFIG keys
print('Loading IndoBERT model...')
_model_kwargs = {
    _name: CONFIG[_name]
    for _name in ('model_name', 'num_classes', 'dropout_rate',
                  'attention_dropout', 'freeze_layers')
}
model = IndoBERTSentimentClassifier(**_model_kwargs).to(device)

# Count parameters in a single pass
total_params = 0
trainable_params = 0
for _param in model.parameters():
    _size = _param.numel()
    total_params += _size
    if _param.requires_grad:
        trainable_params += _size
frozen_params = total_params - trainable_params

print(f'\n‚úì Model loaded to {device}')
print(f'  Total parameters: {total_params:,}')
print(f'  Trainable: {trainable_params:,} ({trainable_params/total_params*100:.1f}%)')
print(f'  Frozen: {frozen_params:,} ({frozen_params/total_params*100:.1f}%)')

## 📉 7. Loss, Optimizer & Scheduler

In [None]:
# Loss function with label smoothing
criterion = nn.CrossEntropyLoss(label_smoothing=CONFIG['label_smoothing'])

# Optimizer - trainable parameters only.
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
trainable_params_list = [
    (name, param) for name, param in model.named_parameters() if param.requires_grad
]

_decay_params, _no_decay_params = [], []
for _name, _param in trainable_params_list:
    _is_exempt = any(fragment in _name for fragment in no_decay)
    (_no_decay_params if _is_exempt else _decay_params).append(_param)

optimizer_grouped_parameters = [
    {'params': _decay_params, 'weight_decay': CONFIG['weight_decay']},
    {'params': _no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=CONFIG['learning_rate'])

# Scheduler: linear warmup over a fixed fraction of all optimizer steps
total_steps = len(train_loader) * CONFIG['epochs']
warmup_steps = int(total_steps * CONFIG['warmup_ratio'])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

print(f'‚úì Optimizer: AdamW (lr={CONFIG["learning_rate"]}, wd={CONFIG["weight_decay"]})')
print(f'‚úì Scheduler: Linear warmup ({warmup_steps} warmup, {total_steps} total)')
print(f'‚úì Loss: CrossEntropy with label_smoothing={CONFIG["label_smoothing"]}')

## 🏋️ 8. Training Functions

In [None]:
def compute_kl_loss(p, q):
    """Symmetric KL divergence between two sets of logits (used for R-Drop).

    Both inputs are logits; softmax is taken over the last dimension. The
    result is the mean of the two directed ``batchmean`` KL terms.

    Args:
        p: Logits from the first forward pass.
        q: Logits from the second forward pass.

    Returns:
        Scalar tensor with the averaged two-way KL divergence.
    """
    def _directed_kl(input_logits, target_logits):
        # F.kl_div expects log-probabilities as input and probabilities as target.
        return F.kl_div(
            F.log_softmax(input_logits, dim=-1),
            F.softmax(target_logits, dim=-1),
            reduction='batchmean',
        )

    return (_directed_kl(p, q) + _directed_kl(q, p)) / 2

def train_epoch(model, dataloader, criterion, optimizer, scheduler, device,
                max_grad_norm, rdrop_alpha=0.5):
    """Run one training epoch with R-Drop regularization.

    Every batch is passed through the model twice. The loss is the mean
    cross-entropy of both passes plus ``rdrop_alpha`` times the symmetric KL
    divergence between them. Gradients are clipped to ``max_grad_norm``, and
    the scheduler is stepped after every optimizer step.

    Args:
        model: Model called as ``model(input_ids, attention_mask)``.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask``, ``label``.
        criterion: Loss applied to ``(logits, labels)``.
        optimizer: Optimizer to step.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        max_grad_norm: Maximum gradient norm for clipping.
        rdrop_alpha: Weight of the KL term.

    Returns:
        Tuple ``(avg_loss, accuracy, weighted_f1)``. Predictions come from
        the average of the two passes' logits.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_labels = [], []

    batches = tqdm(dataloader, desc='Training', leave=False)

    for batch in batches:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # R-Drop: two forward passes over the same batch
        logits_a = model(input_ids, attention_mask)
        logits_b = model(input_ids, attention_mask)

        # Mean cross-entropy of both passes plus the weighted consistency term
        ce_loss = (criterion(logits_a, labels) + criterion(logits_b, labels)) / 2
        kl_loss = compute_kl_loss(logits_a, logits_b)
        loss = ce_loss + rdrop_alpha * kl_loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        mean_logits = (logits_a + logits_b) / 2
        epoch_preds.extend(torch.argmax(mean_logits, dim=1).cpu().numpy())
        epoch_labels.extend(labels.cpu().numpy())

        batches.set_postfix({'loss': f'{loss.item():.4f}'})

    return (
        running_loss / len(dataloader),
        accuracy_score(epoch_labels, epoch_preds),
        f1_score(epoch_labels, epoch_preds, average='weighted'),
    )

def evaluate(model, dataloader, criterion, device):
    """Evaluate the model on a dataloader without computing gradients.

    Args:
        model: Model called as ``model(input_ids, attention_mask)``.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask``, ``label``.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, weighted_f1, predictions, labels)``,
        where the last two are flat lists collected over all batches.
    """
    model.eval()
    loss_sum = 0
    predictions, targets = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            loss_sum += criterion(logits, labels).item()

            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    mean_loss = loss_sum / len(dataloader)
    acc = accuracy_score(targets, predictions)
    weighted_f1 = f1_score(targets, predictions, average='weighted')

    return mean_loss, acc, weighted_f1, predictions, targets

class EarlyStopping:
    """Patience-based early stopping that keeps a copy of the best weights.

    A score counts as an improvement when it beats the best score so far by
    more than ``min_delta`` (higher is better for ``mode='max'``, lower for
    any other mode). ``max_gap`` is stored on the instance but is not
    consulted by ``__call__``.
    """

    def __init__(self, patience=7, min_delta=0.001, mode='max', max_gap=0.08):
        """Configure the stopper.

        Args:
            patience: Consecutive non-improving calls before stopping.
            min_delta: Minimum change that counts as an improvement.
            mode: ``'max'`` to maximise the score; anything else minimises.
            max_gap: Stored only; currently unused by this class.
        """
        self.patience, self.min_delta = patience, min_delta
        self.mode, self.max_gap = mode, max_gap
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_model = None
        self.best_gap = float('inf')

    def _improved(self, score):
        """Return True when ``score`` beats the best score by more than ``min_delta``."""
        if self.best_score is None:
            return True
        if self.mode == 'max':
            return score > self.best_score + self.min_delta
        return score < self.best_score - self.min_delta

    def __call__(self, score, model, train_score=None):
        """Record one epoch's score and report whether training should stop.

        Args:
            score: Monitored score for this epoch.
            model: Model whose ``state_dict`` is deep-copied on improvement.
            train_score: Optional training score. On improvement,
                ``best_gap`` is set to ``train_score - score``.

        Returns:
            The ``early_stop`` flag, which becomes True once ``patience``
            consecutive non-improving calls have been seen.
        """
        if not self._improved(score):
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return self.early_stop

        self.best_score = score
        # Deep copy so later training steps cannot mutate the snapshot.
        self.best_model = copy.deepcopy(model.state_dict())
        self.counter = 0
        if train_score is not None:
            self.best_gap = train_score - score
        return self.early_stop

# Signal that this cell finished executing.
print('‚úì Training functions defined')

## 🚀 9. Training Loop

In [None]:
# Training history: one entry per completed epoch for every tracked metric
history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': [],
    'gap': []
}

early_stopping = EarlyStopping(
    patience=CONFIG['early_stopping_patience'], 
    mode='max',
    max_gap=0.08
)

print('=' * 60)
print('üöÄ TRAINING STARTED')
print('=' * 60)
print(f'Device: {device}')
print(f'Epochs: {CONFIG["epochs"]} | Patience: {CONFIG["early_stopping_patience"]}')
print(f'LR: {CONFIG["learning_rate"]} | Batch: {CONFIG["batch_size"]}')
print(f'Frozen Layers: {CONFIG["freeze_layers"]}/12 | Dropout: {CONFIG["dropout_rate"]}')
print('-' * 60)

# Metrics of the epoch whose weights are snapshotted by early stopping.
# These describe the model that is restored after the loop.
best_val_f1 = 0
best_epoch = 0
best_gap = float('inf')

start_time = datetime.now()

for epoch in range(CONFIG['epochs']):
    epoch_start = datetime.now()
    print(f'\nüìç Epoch {epoch + 1}/{CONFIG["epochs"]}')
    
    # Train
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, scheduler, 
        device, CONFIG['max_grad_norm'], CONFIG['rdrop_alpha']
    )
    
    # Validate
    val_loss, val_acc, val_f1, _, _ = evaluate(
        model, val_loader, criterion, device
    )
    
    # Overfitting indicator: train accuracy minus validation accuracy
    gap = train_acc - val_acc
    epoch_time = datetime.now() - epoch_start
    
    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['train_f1'].append(train_f1)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1)
    history['gap'].append(gap)
    
    # Print metrics
    print(f'  Train - Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}')
    print(f'  Val   - Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}')
    
    # Update early stopping first so "best" tracking follows its snapshots.
    # The training F1 is passed (not accuracy) so its internal gap compares
    # the same metric on both splits.
    should_stop = early_stopping(val_f1, model, train_f1)
    
    # Track best: the counter is reset to 0 exactly when early stopping took a
    # new snapshot this epoch, so the reported best epoch/F1/gap always
    # describe the weights restored below.
    if early_stopping.counter == 0:
        best_val_f1 = val_f1
        best_epoch = epoch + 1
        best_gap = gap
        print(f'  ‚≠ê New best! F1: {val_f1:.4f}, Gap: {gap*100:.2f}%')
    
    # Gap status
    gap_status = '‚úÖ Good' if gap < 0.05 else '‚ö° OK' if gap < 0.10 else '‚ö†Ô∏è High'
    print(f'  üìä Gap: {gap*100:.2f}% {gap_status} | Time: {epoch_time}')
    
    # Early stopping
    if should_stop:
        print(f'\nüõë Early stopping at epoch {epoch + 1}')
        break

total_time = datetime.now() - start_time

# Load best model (the snapshot kept by early stopping)
if early_stopping.best_model is not None:
    model.load_state_dict(early_stopping.best_model)

print(f'\n‚úì Training completed in {total_time}')
print(f'  Best epoch: {best_epoch}')
print(f'  Best Val F1: {best_val_f1:.4f}')
print(f'  Best Gap: {best_gap*100:.2f}%')

## 📈 10. Training Visualization

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
epochs_range = range(1, len(history['train_loss']) + 1)

# Loss, accuracy and F1 share the same layout: train vs. validation curves.
_metric_panels = (
    (axes[0, 0], 'loss', 'Loss', 'Training & Validation Loss'),
    (axes[0, 1], 'acc', 'Accuracy', 'Training & Validation Accuracy'),
    (axes[1, 0], 'f1', 'F1 Score', 'Training & Validation F1 Score'),
)
for _panel, _metric, _y_label, _title in _metric_panels:
    _panel.plot(epochs_range, history[f'train_{_metric}'], 'b-o', label='Train')
    _panel.plot(epochs_range, history[f'val_{_metric}'], 'r-s', label='Val')
    _panel.set_xlabel('Epoch')
    _panel.set_ylabel(_y_label)
    _panel.set_title(_title)
    _panel.legend()
    _panel.grid(True, alpha=0.3)

# Gap panel: train-val accuracy gap in percent with the 5% / 10% reference lines
_gap_panel = axes[1, 1]
_gap_percent = [g*100 for g in history['gap']]
_gap_panel.plot(epochs_range, _gap_percent, 'g-o')
_gap_panel.axhline(y=5, color='orange', linestyle='--', label='Good threshold (5%)')
_gap_panel.axhline(y=10, color='red', linestyle='--', label='Warning threshold (10%)')
_gap_panel.set_xlabel('Epoch')
_gap_panel.set_ylabel('Train-Val Gap (%)')
_gap_panel.set_title('Overfitting Monitor (Train-Val Gap)')
_gap_panel.legend()
_gap_panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_history.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úì Saved: training_history.png')

## 🧪 11. Evaluation on Test Set

In [None]:
# Evaluate the current model weights on the held-out test split
test_loss, test_acc, test_f1, test_preds, test_labels = evaluate(
    model, test_loader, criterion, device
)

_rule = '=' * 60
print(_rule)
print('üß™ TEST SET EVALUATION')
print(_rule)
print(f'Test Accuracy: {test_acc*100:.2f}%')
print(f'Test F1 Score: {test_f1*100:.2f}%')
print(f'Test Loss: {test_loss:.4f}')

# Per-class precision/recall/F1 breakdown
print('\nüìä Classification Report:')
_report = classification_report(test_labels, test_preds, target_names=LABEL_NAMES)
print(_report)

In [None]:
# Confusion matrix: raw counts, plus a copy normalised per true class (rows sum to 100)
cm = confusion_matrix(test_labels, test_preds)
_row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype('float') / _row_totals * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel shows counts, right panel shows percentages; both share the same labelling.
_heatmaps = (
    (axes[0], cm, 'd', 'Confusion Matrix (Counts)'),
    (axes[1], cm_normalized, '.1f', 'Confusion Matrix (%)'),
)
for _panel, _matrix, _fmt, _title in _heatmaps:
    sns.heatmap(_matrix, annot=True, fmt=_fmt, cmap='Blues',
                xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=_panel)
    _panel.set_xlabel('Predicted')
    _panel.set_ylabel('Actual')
    _panel.set_title(_title)

plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()
print('‚úì Saved: confusion_matrix.png')

## 💾 12. Save Model

In [None]:
# Create models directory
os.makedirs('models', exist_ok=True)

# Metric values may be NumPy scalars depending on the scikit-learn version.
# Convert them to builtin floats so the checkpoint holds only plain Python
# types and tensors. This keeps it loadable by torch.load in its restricted
# (weights_only) mode and guarantees the history is JSON-serialisable.
_history_plain = {
    _name: [float(_value) for _value in _values]
    for _name, _values in history.items()
}

# Save model weights together with the metadata needed to rebuild and interpret it
model_path = 'models/indobert_sentiment_3class.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'config': CONFIG,
    'label_map': LABEL_MAP,
    'label_names': LABEL_NAMES,
    'test_accuracy': float(test_acc),
    'test_f1': float(test_f1),
    'best_val_f1': float(best_val_f1),
    'best_gap': float(best_gap),
    'history': _history_plain,
}, model_path)
print(f'‚úì Model saved: {model_path}')

# Save tokenizer
tokenizer.save_pretrained('models/tokenizer')
print(f'‚úì Tokenizer saved: models/tokenizer/')

# Save history
with open('models/training_history.json', 'w', encoding='utf-8') as f:
    json.dump(_history_plain, f, indent=2)
print(f'‚úì History saved: models/training_history.json')

# List saved files with their size in MB
print('\nüìÅ Saved files:')
for root, dirs, files in os.walk('models'):
    for file in files:
        filepath = os.path.join(root, file)
        size = os.path.getsize(filepath) / (1024*1024)
        print(f'   {filepath} ({size:.2f} MB)')

## 🔮 13. Inference Demo

In [None]:
def predict_sentiment(text, model, tokenizer, device, label_names, max_length=128):
    """Predict the sentiment of a single text.

    The model is switched to eval mode and is left in that mode.

    Args:
        text: Text to classify.
        model: Model called as ``model(input_ids, attention_mask)`` returning logits.
        tokenizer: Tokenizer exposing ``encode_plus``.
        device: Device the encoded tensors are moved to.
        label_names: Class names indexed by class id.
        max_length: Padding/truncation length passed to the tokenizer. The
            default of 128 is the value that was previously hard-coded; pass
            the length used during training if it differs.

    Returns:
        Dict with ``sentiment`` (predicted class name), ``confidence``
        (probability of that class) and ``probabilities`` (class name ->
        probability for every entry of ``label_names``).
    """
    model.eval()

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        # Single input, so only the first row of the batch is relevant.
        probs = torch.softmax(logits, dim=1)[0]
        pred = torch.argmax(probs).item()

    class_probs = probs.tolist()
    return {
        'sentiment': label_names[pred],
        'confidence': class_probs[pred],
        'probabilities': {
            name: class_probs[i] for i, name in enumerate(label_names)
        }
    }

# Sample reviews for a quick qualitative check
test_reviews = [
    "Aplikasi gojek sangat membantu, driver ramah dan cepat",
    "Driver nya lama banget, udah nunggu 1 jam gak datang",
    "Biasa aja sih aplikasinya",
    "Pelayanan buruk, tidak akan pakai lagi",
    "Mantap, makanan sampai dengan selamat dan masih hangat",
]

_rule = '=' * 60
print(_rule)
print('üîÆ INFERENCE DEMO')
print(_rule)

_sentiment_emoji = {'negative': 'üò†', 'neutral': 'üòê', 'positive': 'üòä'}

for review in test_reviews:
    result = predict_sentiment(review, model, tokenizer, device, LABEL_NAMES)
    emoji = _sentiment_emoji[result['sentiment']]
    # Long reviews are cut to 50 characters and marked with an ellipsis.
    _shown = f'{review[:50]}...' if len(review) > 50 else review
    print(f'\nüìù "{_shown}"')
    print(f'   {emoji} {result["sentiment"].upper()} ({result["confidence"]*100:.1f}%)')

## 📊 14. Final Summary

In [None]:
# Gap of the last epoch that ran, and the smallest gap seen in any epoch
final_gap = history['train_acc'][-1] - history['val_acc'][-1]
min_gap = min(history['gap'])

_rule = '=' * 60
print(_rule)
print('üìä FINAL SUMMARY')
print(_rule)

# Overfitting status from the final gap: < 5% good, < 10% check, otherwise overfitting
if final_gap < 0.05:
    _overfit_status = "‚úÖ Good"
elif final_gap < 0.10:
    _overfit_status = "‚ö†Ô∏è Check"
else:
    _overfit_status = "‚ùå Overfitting"

print(f'''
üéØ MODEL PERFORMANCE:
   ‚Ä¢ Test Accuracy: {test_acc*100:.2f}%
   ‚Ä¢ Test F1 Score: {test_f1*100:.2f}%
   ‚Ä¢ Best Val F1: {best_val_f1*100:.2f}%

üìà OVERFITTING CHECK:
   ‚Ä¢ Final Gap: {final_gap*100:.2f}%
   ‚Ä¢ Best Gap: {best_gap*100:.2f}%
   ‚Ä¢ Status: {_overfit_status}

‚öôÔ∏è TECHNIQUES USED:
   ‚Ä¢ Layer Freezing: {CONFIG['freeze_layers']}/12
   ‚Ä¢ Dropout: {CONFIG['dropout_rate']}
   ‚Ä¢ R-Drop Alpha: {CONFIG['rdrop_alpha']}
   ‚Ä¢ Weight Decay: {CONFIG['weight_decay']}
   ‚Ä¢ Label Smoothing: {CONFIG['label_smoothing']}
   ‚Ä¢ Learning Rate: {CONFIG['learning_rate']}

üíæ SAVED FILES:
   ‚Ä¢ models/indobert_sentiment_3class.pt
   ‚Ä¢ models/tokenizer/
   ‚Ä¢ models/training_history.json
   ‚Ä¢ training_history.png
   ‚Ä¢ confusion_matrix.png
''')

# Overall verdict, keyed on (accuracy reached 75%, final gap below 5%)
_verdicts = {
    (True, True): 'üéâ EXCELLENT! Model has good accuracy and generalization!',
    (True, False): '‚ö†Ô∏è Good accuracy but watch for overfitting.',
    (False, True): '‚úÖ Good generalization but accuracy could improve.',
    (False, False): '‚ùå Consider adjusting hyperparameters.',
}
print(_verdicts[(bool(test_acc >= 0.75), bool(final_gap < 0.05))])

print(_rule)
print('‚úÖ Training completed!')
print(_rule)