In [None]:
# ============================================
# CELL 1: SETUP & GPU CHECK
# ============================================

!pip install transformers accelerate -q

import torch
import os

# Environment report: PyTorch version, visible CUDA devices, and the files
# mounted under /kaggle/input.
print('='*60)
print('üñ•Ô∏è  SYSTEM INFO')
print('='*60)
print(f'PyTorch: {torch.__version__}')

# One line per CUDA device: index, name and total memory (bytes / 1024**3).
if torch.cuda.is_available():
    n_gpu = torch.cuda.device_count()
    print(f'‚úÖ GPU Available: {n_gpu} GPU(s)')
    for i in range(n_gpu):
        gpu_name = torch.cuda.get_device_name(i)
        gpu_mem = torch.cuda.get_device_properties(i).total_memory / 1024**3
        print(f'   GPU {i}: {gpu_name} ({gpu_mem:.1f} GB)')
else:
    print('‚ö†Ô∏è GPU not available!')

# List input files
# Recursively prints the full path of every file below /kaggle/input.
print('\nüìÅ Input files:')
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(f'   {os.path.join(dirname, filename)}')

In [None]:
# ============================================
# CELL 2: IMPORTS & SEED
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    classification_report, confusion_matrix, f1_score
)
import random
import copy
import json
from datetime import datetime
import time

warnings.filterwarnings('ignore')

# Reproducibility
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs.

    Also switches cuDNN to deterministic mode and turns off its autotuner
    (benchmark) so repeated runs pick the same kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Same seeding order as listed: random, numpy, torch CPU, torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

# Seed all RNGs once, before any data splitting or model construction below.
set_seed(42)

# Device
# Use CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'üéÆ Using device: {device}')

In [None]:
# ============================================
# CELL 3: LOAD DATA
# ============================================

# Auto-detect data file
DATA_PATH = None
# Ordered from most to least specific: an earlier pattern always wins over a
# later one, no matter which directory the matching file lives in.
search_patterns = ['gojek_reviews_3class_clean', 'gojek_reviews_3class', 'gojek']

# Collect every CSV under /kaggle/input once. Sorting makes the selection
# deterministic (os.walk order depends on the filesystem).
csv_files = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
    if filename.endswith('.csv')
)

# Pattern is the OUTER loop so that a weak match ('gojek') in one directory
# can never shadow the preferred file sitting in another directory.
for pattern in search_patterns:
    DATA_PATH = next(
        (path for path in csv_files if pattern in os.path.basename(path)),
        None,
    )
    if DATA_PATH:
        break

if DATA_PATH:
    print(f'‚úÖ Found: {DATA_PATH}')
    df = pd.read_csv(DATA_PATH)
else:
    # Show everything that IS mounted so the user can spot a naming mismatch.
    print('‚ùå Data not found! Available files:')
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for f in filenames:
            print(f'   {os.path.join(dirname, f)}')
    raise FileNotFoundError('Please upload gojek_reviews_3class_clean.csv')

# Data overview
print('\n' + '='*60)
print('üìä DATA OVERVIEW')
print('='*60)
print(f'Total samples: {len(df):,}')
print(f'Columns: {df.columns.tolist()}')

# Check required columns
# Prefer the pre-cleaned text column; fall back to the raw one.
text_col = 'content_clean' if 'content_clean' in df.columns else 'content'
# Fail here with a readable message instead of a bare KeyError several cells later.
missing_cols = [col for col in (text_col, 'sentiment') if col not in df.columns]
if missing_cols:
    raise KeyError(f'Required column(s) missing from {DATA_PATH}: {missing_cols}')
print(f'\nText column: {text_col}')
print(f'\nüìà Sentiment Distribution:')
# Computed once and reused for both the printout and the bar chart.
counts = df['sentiment'].value_counts()
print(counts)

# Visualize
fig, ax = plt.subplots(figsize=(8, 4))
# Unknown sentiment values fall back to blue ('#3498db').
colors = {'negative': '#e74c3c', 'neutral': '#f39c12', 'positive': '#27ae60'}
bars = ax.bar(counts.index, counts.values, color=[colors.get(s, '#3498db') for s in counts.index])
ax.set_title('Sentiment Distribution', fontsize=14, fontweight='bold')
ax.set_ylabel('Count')
# Offset the count labels relative to the tallest bar so they sit just above
# the bars for any dataset size (a fixed offset in count units does not scale).
label_offset = 0.01 * counts.max() if len(counts) else 0
for bar, count in zip(bars, counts.values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
            f'{count:,}', ha='center', va='bottom', fontweight='bold')
# Extra headroom so the labels are not clipped by the top of the axes.
ax.margins(y=0.1)
plt.tight_layout()
plt.show()

In [None]:
# ============================================
# CELL 4: CONFIGURATION - OPTIMIZED v2
# ============================================
# IMPROVEMENT: more trainable layers, stronger regularization

# Label mapping
# LABEL_MAP turns the string sentiment into a class index; LABEL_NAMES is the
# inverse lookup (index -> name) and must stay in the same order.
LABEL_MAP = {'negative': 0, 'neutral': 1, 'positive': 2}
LABEL_NAMES = ['negative', 'neutral', 'positive']
NUM_CLASSES = 3

# OPTIMIZED CONFIG v2 - Fix overfitting & improve accuracy
# Single source of hyperparameters read by the dataset, model, optimizer and
# training-loop cells below.
CONFIG = {
    # Model
    'model_name': 'indobenchmark/indobert-base-p1',
    'max_length': 128,
    'num_classes': NUM_CLASSES,

    # Training - ADJUSTED
    'batch_size': 16,  # Smaller batch = more updates = better generalization
    'epochs': 15,
    'learning_rate': 1e-5,  # Lower LR for stability

    # Anti-Overfitting - STRENGTHENED
    'dropout_rate': 0.5,  # Increased from 0.3
    'hidden_dropout': 0.4,  # New: dropout for hidden layer
    'weight_decay': 0.05,  # Increased from 0.01
    'label_smoothing': 0.15,  # Increased from 0.1
    'warmup_ratio': 0.2,  # Longer warmup
    'max_grad_norm': 0.5,  # Tighter gradient clipping
    'early_stopping_patience': 4,  # Stop earlier if no improvement

    # Layer Freezing - LESS FREEZING for better learning
    'freeze_embeddings': True,
    'freeze_layers': 4,  # CHANGED: Freeze only 0-3, train 4-11 (8 layers trainable)

    # R-Drop regularization - ADJUSTED
    'use_rdrop': True,
    'rdrop_alpha': 1.0,  # Increased for stronger regularization

    # Data augmentation - ENHANCED
    'augment_train': True,
    'word_dropout_prob': 0.15,  # Increased
    'augment_prob': 0.7,  # Probability to augment each sample

    # New: Use hidden layer in classifier
    'use_hidden_layer': True,
    'hidden_size': 256,
}

# Human-readable dump of the configuration. The "Key changes" lines are
# hard-coded text and are not derived from CONFIG, so they must be edited by
# hand whenever the values above change.
print('='*60)
print('‚öôÔ∏è  TRAINING CONFIGURATION v2 (OPTIMIZED)')
print('='*60)
print('üîß Key changes from v1:')
print('   ‚Ä¢ freeze_layers: 8 ‚Üí 4 (more trainable)')
print('   ‚Ä¢ dropout: 0.3 ‚Üí 0.5')
print('   ‚Ä¢ weight_decay: 0.01 ‚Üí 0.05')
print('   ‚Ä¢ learning_rate: 2e-5 ‚Üí 1e-5')
print('   ‚Ä¢ batch_size: 32 ‚Üí 16')
print('   ‚Ä¢ rdrop_alpha: 0.7 ‚Üí 1.0')
print('-'*60)
for k, v in CONFIG.items():
    print(f'  {k}: {v}')

In [None]:
# ============================================
# CELL 5: PREPARE DATA SPLITS
# ============================================

# Add label column
df['label'] = df['sentiment'].map(LABEL_MAP)

# Series.map yields NaN for any sentiment that is not a LABEL_MAP key (typos,
# different casing, missing values). Left alone, that turns the column into
# floats and surfaces later as an unclear error, so fail here with the
# offending values spelled out.
unmapped = df.loc[df['label'].isna(), 'sentiment']
if not unmapped.empty:
    raise ValueError(
        f'{len(unmapped):,} row(s) have a sentiment outside LABEL_MAP '
        f'{list(LABEL_MAP)}: {unmapped.unique().tolist()[:10]}'
    )
# Every row is mapped, so the column can safely be integer-typed.
df['label'] = df['label'].astype('int64')

# Stratified split: 80% train, 10% val, 10% test
# First cut 20% off as a holdout, then halve the holdout into val and test.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

print('='*60)
print('üìÇ DATA SPLITS (Stratified)')
print('='*60)
print(f'Train: {len(train_df):,} ({len(train_df)/len(df)*100:.0f}%)')
print(f'Val:   {len(val_df):,} ({len(val_df)/len(df)*100:.0f}%)')
print(f'Test:  {len(test_df):,} ({len(test_df)/len(df)*100:.0f}%)')

# Per-split class counts, to confirm stratification kept the class balance.
print(f'\nüìä Distribution per split:')
for name, split_df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
    dist = split_df['sentiment'].value_counts()
    print(f'  {name}: {dict(dist)}')

In [None]:
# ============================================
# CELL 6: DATASET CLASS - ENHANCED AUGMENTATION
# ============================================

# Load tokenizer
# Loaded from the same checkpoint name the model cell uses (CONFIG['model_name']).
tokenizer = BertTokenizer.from_pretrained(CONFIG['model_name'])
print(f'‚úÖ Tokenizer loaded: {CONFIG["model_name"]}')

class SentimentDataset(Dataset):
    """Tokenizing dataset for sentiment texts with optional random augmentation.

    Each item is a dict with 1-D `input_ids` and `attention_mask` tensors of
    length `max_length` plus a scalar long `label` tensor.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of integer class ids, aligned with `texts`.
        tokenizer: Object exposing `encode_plus` (as used in `__getitem__`).
        max_length: Pad/truncate length passed to the tokenizer.
        augment: Enable random text perturbation in `__getitem__`.
        word_dropout_prob: Per-word drop probability for the dropout branch.
        augment_prob: Probability that a given sample is perturbed at all.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128,
                 augment=False, word_dropout_prob=0.15, augment_prob=0.7):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
        self.word_dropout_prob = word_dropout_prob
        self.augment_prob = augment_prob

    def __len__(self):
        return len(self.texts)

    def _augment_text(self, text):
        """Return `text`, possibly perturbed by one randomly chosen technique.

        The input is returned untouched when augmentation is disabled, when the
        `augment_prob` draw says to skip, or when it has four words or fewer.
        """
        if not self.augment:
            return text
        if random.random() > self.augment_prob:
            return text

        tokens = str(text).split()
        n_tokens = len(tokens)
        if n_tokens < 5:
            return text

        # One draw selects the technique:
        # [0, .25) dropout | [.25, .45) swap | [.45, .60) deletion |
        # [.60, .75) middle shuffle | [.75, .90) duplication | [.90, 1) none
        roll = random.random()

        if roll < 0.25:
            # Drop each word independently.
            tokens = [tok for tok in tokens if random.random() > self.word_dropout_prob]
        elif roll < 0.45:
            # Swap one pair of neighbours (at least five tokens exist here).
            left = random.randint(0, n_tokens - 2)
            tokens[left], tokens[left + 1] = tokens[left + 1], tokens[left]
        elif roll < 0.60:
            # Remove one or two interior words; first and last are never picked.
            if n_tokens >= 6:
                removals = random.randint(1, 2)
                for _ in range(removals):
                    if len(tokens) > 4:
                        tokens.pop(random.randint(1, len(tokens) - 2))
        elif roll < 0.75:
            # Shuffle the middle half, keeping the outer quarters in place.
            if n_tokens >= 6:
                lo, hi = n_tokens // 4, 3 * n_tokens // 4
                core = tokens[lo:hi]
                random.shuffle(core)
                tokens = tokens[:lo] + core + tokens[hi:]
        elif roll < 0.90:
            # Repeat one word in place.
            at = random.randint(0, n_tokens - 1)
            tokens.insert(at, tokens[at])

        # Word dropout may remove everything; fall back to the original then.
        if not tokens:
            return text
        return ' '.join(tokens)

    def __getitem__(self, idx):
        sample_text = self._augment_text(self.texts[idx])

        encoded = self.tokenizer.encode_plus(
            str(sample_text),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns (1, max_length) tensors; flatten to 1-D so the
        # DataLoader can stack samples into (batch, max_length).
        item = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create datasets
def _split_arrays(frame):
    """Return (texts, labels) numpy arrays for one split DataFrame."""
    return frame[text_col].values, frame['label'].values

# Only the training split is augmented, with the probabilities from CONFIG.
train_dataset = SentimentDataset(
    *_split_arrays(train_df), tokenizer,
    max_length=CONFIG['max_length'],
    augment=CONFIG['augment_train'],
    word_dropout_prob=CONFIG['word_dropout_prob'],
    augment_prob=CONFIG['augment_prob'],
)
val_dataset = SentimentDataset(
    *_split_arrays(val_df), tokenizer,
    max_length=CONFIG['max_length'], augment=False,
)
test_dataset = SentimentDataset(
    *_split_arrays(test_df), tokenizer,
    max_length=CONFIG['max_length'], augment=False,
)

# Create dataloaders - smaller batch for better generalization
_worker_opts = dict(num_workers=2, pin_memory=True)
# Evaluation loaders run at twice the training batch size.
_eval_batch_size = CONFIG['batch_size']*2

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(
    train_dataset, batch_size=CONFIG['batch_size'],
    shuffle=True, drop_last=True, **_worker_opts
)
val_loader = DataLoader(
    val_dataset, batch_size=_eval_batch_size, shuffle=False, **_worker_opts
)
test_loader = DataLoader(
    test_dataset, batch_size=_eval_batch_size, shuffle=False, **_worker_opts
)

print(f'\n‚úÖ DataLoaders created:')
print(f'  Train: {len(train_dataset):,} samples, {len(train_loader)} batches (batch_size={CONFIG["batch_size"]})')
print(f'  Val:   {len(val_dataset):,} samples, {len(val_loader)} batches')
print(f'  Test:  {len(test_dataset):,} samples, {len(test_loader)} batches')

In [None]:
# ============================================
# CELL 7: MODEL ARCHITECTURE - IMPROVED
# ============================================

class IndoBERTClassifier(nn.Module):
    """
    IndoBERT for Sentiment Classification - IMPROVED v2

    The encoder's pooled output goes through dropout, an optional
    Linear -> GELU -> LayerNorm -> Dropout hidden block, and a final linear
    layer producing one logit per class.

    Args:
        model_name: Checkpoint name/path given to `BertModel.from_pretrained`.
        num_classes: Number of output logits.
        dropout_rate: Dropout applied to the pooled encoder output.
        hidden_dropout: Dropout applied after the hidden block.
        freeze_embeddings: Freeze the embedding parameters when True.
        freeze_layers: Number of leading encoder layers to freeze.
        use_hidden_layer: Insert the hidden block before the classifier.
        hidden_size: Width of the hidden block.

    Raises:
        ValueError: If `freeze_layers` is negative or larger than the number
            of encoder layers in the loaded checkpoint.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.5,
                 hidden_dropout=0.4, freeze_embeddings=True, freeze_layers=4,
                 use_hidden_layer=True, hidden_size=256):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.bert_hidden_size = self.bert.config.hidden_size
        self.use_hidden_layer = use_hidden_layer

        # Take the encoder depth from the loaded checkpoint rather than
        # assuming 12, and reject a freeze count that would index past the
        # last layer (or silently do nothing when negative).
        num_layers = len(self.bert.encoder.layer)
        if not 0 <= freeze_layers <= num_layers:
            raise ValueError(
                f'freeze_layers must be between 0 and {num_layers}, got {freeze_layers}'
            )

        # Freeze embeddings
        if freeze_embeddings:
            for param in self.bert.embeddings.parameters():
                param.requires_grad = False

        # Freeze first N encoder layers
        for i in range(freeze_layers):
            for param in self.bert.encoder.layer[i].parameters():
                param.requires_grad = False

        # Classifier head with optional hidden layer
        self.dropout1 = nn.Dropout(dropout_rate)

        if use_hidden_layer:
            self.hidden = nn.Linear(self.bert_hidden_size, hidden_size)
            self.layer_norm = nn.LayerNorm(hidden_size)
            self.dropout2 = nn.Dropout(hidden_dropout)
            self.classifier = nn.Linear(hidden_size, num_classes)
        else:
            self.classifier = nn.Linear(self.bert_hidden_size, num_classes)

        # Initialize weights
        self._init_weights()

        # Print info
        trainable_layers = num_layers - freeze_layers
        # An empty range is reported as 'none' instead of e.g. '0--1'.
        frozen_desc = f'0-{freeze_layers-1}' if freeze_layers > 0 else 'none'
        trainable_desc = f'{freeze_layers}-{num_layers-1}' if trainable_layers > 0 else 'none'
        print(f'‚úÖ Model initialized (IMPROVED v2)')
        print(f'   Embeddings frozen: {freeze_embeddings}')
        print(f'   Layers frozen: {frozen_desc} ({freeze_layers} layers)')
        print(f'   Layers trainable: {trainable_desc} ({trainable_layers} layers)')
        print(f'   Hidden layer: {use_hidden_layer} (size={hidden_size})')
        print(f'   Dropout: {dropout_rate} (classifier), {hidden_dropout} (hidden)')

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for the head's linear layers."""
        if self.use_hidden_layer:
            nn.init.xavier_uniform_(self.hidden.weight)
            nn.init.zeros_(self.hidden.bias)
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-softmax) class logits for a batch of token ids."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output

        x = self.dropout1(pooled)

        if self.use_hidden_layer:
            x = self.hidden(x)
            x = F.gelu(x)  # GELU activation
            x = self.layer_norm(x)
            x = self.dropout2(x)

        logits = self.classifier(x)
        return logits

# Initialize model with new config
# Every constructor argument has a CONFIG entry of the same name, so the
# keyword dict is built straight from those keys.
_model_kwargs = {
    key: CONFIG[key]
    for key in (
        'model_name', 'num_classes', 'dropout_rate', 'hidden_dropout',
        'freeze_embeddings', 'freeze_layers', 'use_hidden_layer', 'hidden_size',
    )
}
model = IndoBERTClassifier(**_model_kwargs).to(device)

# Count parameters
# One pass over the parameters: total element count, and the subset that
# still has requires_grad=True after freezing.
total_params = 0
trainable_params = 0
for _param in model.parameters():
    total_params += _param.numel()
    if _param.requires_grad:
        trainable_params += _param.numel()

print(f'\nüìä Parameters:')
print(f'   Total: {total_params:,}')
print(f'   Trainable: {trainable_params:,} ({trainable_params/total_params*100:.1f}%)')
print(f'   Frozen: {total_params - trainable_params:,}')

In [None]:
# ============================================
# CELL 8: OPTIMIZER & SCHEDULER
# ============================================

# Loss with label smoothing
criterion = nn.CrossEntropyLoss(label_smoothing=CONFIG['label_smoothing'])

# Optimizer - only trainable parameters
# Biases and LayerNorm scales are exempt from weight decay. The name patterns
# must cover both spellings present in this model: 'LayerNorm.weight' (the
# pattern originally used for the encoder's norms) and 'layer_norm.weight'
# (the classifier head's nn.LayerNorm, stored as `self.layer_norm`). Without
# the second pattern the head's LayerNorm scale would be decayed.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']

# Single pass over the named parameters; frozen ones are skipped entirely.
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_params = [
    {'params': decay_params, 'weight_decay': CONFIG['weight_decay']},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_params, lr=CONFIG['learning_rate'])

# Scheduler with warmup
# The scheduler is stepped once per batch, so the horizon is
# batches-per-epoch times the number of epochs.
total_steps = len(train_loader) * CONFIG['epochs']
warmup_steps = int(total_steps * CONFIG['warmup_ratio'])

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

print('‚úÖ Optimizer & Scheduler configured')
print(f'   LR: {CONFIG["learning_rate"]}')
print(f'   Weight Decay: {CONFIG["weight_decay"]}')
print(f'   Warmup Steps: {warmup_steps}')
print(f'   Total Steps: {total_steps}')

In [None]:
# ============================================
# CELL 9: TRAINING FUNCTIONS - IMPROVED
# ============================================

def compute_kl_loss(p, q):
    """Symmetric KL divergence between two sets of logits (R-Drop term).

    Args:
        p: Logits from the first forward pass, classes on the last dim.
        q: Logits from the second forward pass, same shape as `p`.

    Returns:
        Scalar tensor: the mean of the two directed `batchmean` KL terms.
    """
    def _directed(pred_logits, target_logits):
        # F.kl_div expects log-probabilities as input and probabilities as target.
        return F.kl_div(
            F.log_softmax(pred_logits, dim=-1),
            F.softmax(target_logits, dim=-1),
            reduction='batchmean',
        )

    return (_directed(p, q) + _directed(q, p)) / 2

def train_epoch(model, loader, criterion, optimizer, scheduler, device, 
                use_rdrop=True, rdrop_alpha=1.0, max_grad_norm=0.5):
    """Run one training pass over `loader`.

    Args:
        model: Callable as `model(input_ids, attention_mask)` returning logits.
        loader: Iterable of dict batches with 'input_ids', 'attention_mask'
            and 'label' tensors.
        criterion: Loss called as `criterion(logits, labels)`.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.
        use_rdrop: When True, run two forward passes per batch and add the
            symmetric KL between them (weighted by `rdrop_alpha`) to the loss.
        rdrop_alpha: Weight of the KL term.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        (avg_loss, accuracy, weighted_f1): mean per-batch total loss and the
        metrics of the training-mode predictions collected during the pass.
    """
    model.train()
    total_loss = 0.0
    all_preds, all_labels = [], []

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        if use_rdrop:
            # R-Drop: 2 forward passes with different dropout
            logits1 = model(input_ids, attention_mask)
            logits2 = model(input_ids, attention_mask)

            ce_loss = (criterion(logits1, labels) + criterion(logits2, labels)) / 2
            kl_loss = compute_kl_loss(logits1, logits2)
            loss = ce_loss + rdrop_alpha * kl_loss

            # Predictions come from the average of the two passes.
            logits = (logits1 + logits2) / 2
        else:
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            # Without R-Drop the total loss is the cross-entropy.
            ce_loss = loss

        loss.backward()

        # Gradient clipping - prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()

        # Read the scalar once and reuse it for both the running total and
        # the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        preds = torch.argmax(logits.detach(), dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

        pbar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'ce': f'{ce_loss.item():.4f}'
        })

    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return avg_loss, accuracy, f1

def evaluate(model, loader, criterion, device, mc_dropout=False, n_samples=5):
    """
    Evaluate with optional Monte Carlo Dropout for uncertainty

    Args:
        model: Callable as `model(input_ids, attention_mask)` returning logits.
        loader: Iterable of dict batches with 'input_ids', 'attention_mask'
            and 'label' tensors.
        criterion: Loss called as `criterion(logits, labels)`. The averaging
            below assumes it returns the mean over the batch.
        device: Device the batch tensors are moved to.
        mc_dropout: When True, keep dropout active and average the logits of
            `n_samples` stochastic forward passes per batch.
        n_samples: Number of stochastic passes used when `mc_dropout` is True.

    Returns:
        (avg_loss, accuracy, weighted_f1, preds, labels, probs) where the
        last three are per-sample lists in loader order. `avg_loss` is the
        per-sample mean loss (NaN if the loader yields no samples).

    The model is always left in eval mode, including after an MC-dropout run,
    so dropout never stays active for later inference.
    """
    if mc_dropout:
        model.train()  # Keep dropout active
    else:
        model.eval()

    total_loss = 0.0
    n_seen = 0
    all_preds, all_labels, all_probs = [], [], []

    try:
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                if mc_dropout:
                    # Average over multiple dropout samples
                    logits = torch.stack(
                        [model(input_ids, attention_mask) for _ in range(n_samples)]
                    ).mean(dim=0)
                else:
                    logits = model(input_ids, attention_mask)

                loss = criterion(logits, labels)

                probs = F.softmax(logits, dim=1)
                preds = torch.argmax(logits, dim=1)

                # Weight each batch by its size: the final batch is usually
                # smaller, and a plain mean of batch means would over-weight it.
                batch_size = labels.size(0)
                total_loss += loss.item() * batch_size
                n_seen += batch_size

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())
    finally:
        # Do not leak train mode (active dropout) out of an MC-dropout run.
        model.eval()

    avg_loss = total_loss / n_seen if n_seen else float('nan')
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return avg_loss, accuracy, f1, all_preds, all_labels, all_probs

class EarlyStopping:
    """Early stopping with gap monitoring

    Tracks the best validation score and a deep copy of the matching model
    weights. A higher validation score is accepted as the new best only when
    the train-val gap is below `max_gap` or smaller than the gap recorded for
    the current best.

    Args:
        patience: Consecutive non-improving calls before `early_stop` is set.
        min_delta: Minimum increase over `best_score` that counts as progress.
        max_gap: Largest train-val gap that is accepted unconditionally.

    Attributes:
        best_score / best_gap: Validation score and gap of the best call so far.
        best_model: Deep copy of `model.state_dict()` taken at the best call.
        counter: Consecutive non-improving calls since the last best.
        early_stop: True once `counter` has reached `patience`.
    """

    def __init__(self, patience=4, min_delta=0.001, max_gap=0.08):
        self.patience = patience
        self.min_delta = min_delta
        self.max_gap = max_gap  # Max allowed train-val gap
        self.counter = 0
        self.best_score = None
        self.best_model = None
        self.best_gap = None
        self.early_stop = False

    def __call__(self, val_score, model, train_score=None):
        """Record one epoch; return True if it became the new best."""
        # `is not None` rather than truthiness: a legitimate train score of
        # 0.0 must still produce a real gap.
        gap = (train_score - val_score) if train_score is not None else 0

        # Check if improved AND gap is acceptable
        if self.best_score is None:
            # The first call always becomes the baseline.
            improved = True
        elif val_score > self.best_score + self.min_delta:
            # Only accept if gap is not too large. `best_gap` may legitimately
            # be 0 (or negative), so compare against None explicitly.
            improved = gap < self.max_gap or (
                self.best_gap is not None and gap < self.best_gap
            )
        else:
            improved = False

        if improved:
            self.best_score = val_score
            # Deep copy so later training steps cannot mutate the snapshot.
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_gap = gap
            self.counter = 0
            return True

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
        return False

# Status banner only: confirms the helpers defined in this cell are available.
print('‚úÖ Training functions defined (IMPROVED)')
print('   ‚Ä¢ R-Drop with stronger alpha')
print('   ‚Ä¢ Early stopping with gap monitoring')
print('   ‚Ä¢ Optional MC Dropout for evaluation')

In [None]:
# ============================================
# CELL 10: TRAINING LOOP - IMPROVED
# ============================================

# Per-epoch metric log; one value is appended to every list each epoch.
history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': [],
    'gap': [], 'lr': []
}

# Early stopping with gap monitoring
early_stopping = EarlyStopping(
    patience=CONFIG['early_stopping_patience'],
    max_gap=0.08  # Stop if gap > 8% and no improvement
)

print('='*60)
print('üöÄ TRAINING STARTED (OPTIMIZED v2)')
print('='*60)
print(f'Strategy: Less freezing + stronger regularization')
print(f'Epochs: {CONFIG["epochs"]} | Batch: {CONFIG["batch_size"]} | LR: {CONFIG["learning_rate"]}')
print(f'Trainable layers: {12 - CONFIG["freeze_layers"]}/12 | Dropout: {CONFIG["dropout_rate"]}')
print(f'R-Drop: Œ±={CONFIG["rdrop_alpha"]} | Weight Decay: {CONFIG["weight_decay"]}')
print('-'*60)

# Bookkeeping for the best epoch; overwritten whenever early stopping
# reports an improvement.
start_time = time.time()
best_val_f1 = 0
best_val_acc = 0
best_epoch = 0
best_gap = 1.0

for epoch in range(CONFIG['epochs']):
    print(f'\nüìç Epoch {epoch + 1}/{CONFIG["epochs"]}')

    # Get current learning rate
    # Read before this epoch's optimizer steps, i.e. the LR at epoch start
    # for the first parameter group.
    current_lr = optimizer.param_groups[0]['lr']

    # Train
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, scheduler, device,
        use_rdrop=CONFIG['use_rdrop'], rdrop_alpha=CONFIG['rdrop_alpha'],
        max_grad_norm=CONFIG['max_grad_norm']
    )

    # Validate
    val_loss, val_acc, val_f1, _, _, _ = evaluate(model, val_loader, criterion, device)

    # Calculate gap
    # This logged gap is accuracy-based (train_acc - val_acc).
    gap = train_acc - val_acc

    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['train_f1'].append(train_f1)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    history['val_f1'].append(val_f1)
    history['gap'].append(gap)
    history['lr'].append(current_lr)

    # Print metrics
    print(f'  Train | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}')
    print(f'  Val   | Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}')
    print(f'  Gap   | {gap*100:.2f}% | LR: {current_lr:.2e}', end='')

    # Gap status
    if gap > 0.10:
        print(' ‚ö†Ô∏è OVERFITTING!')
    elif gap > 0.05:
        print(' ‚ö° Watch gap')
    else:
        print(' ‚úÖ Good')

    # Early stopping check (considers both F1 and gap)
    # NOTE(review): F1 scores are passed here, so the gap early stopping
    # checks against max_gap is the F1 gap, while `gap`/`best_gap` in this
    # loop are accuracy gaps - confirm the mismatch is intended.
    improved = early_stopping(val_f1, model, train_f1)

    if improved:
        best_val_f1 = val_f1
        best_val_acc = val_acc
        best_epoch = epoch + 1
        best_gap = gap
        print(f'  ‚≠ê New best! F1: {val_f1:.4f}, Acc: {val_acc:.4f}, Gap: {gap*100:.1f}%')
    else:
        print(f'  üìä No improvement ({early_stopping.counter}/{early_stopping.patience})')

    if early_stopping.early_stop:
        print(f'\nüõë Early stopping at epoch {epoch + 1}')
        print(f'   Best was epoch {best_epoch} with F1={best_val_f1:.4f}, Gap={best_gap*100:.1f}%')
        break

# Load best model
# Restores the snapshot taken at the best epoch, so the test and save cells
# below use those weights rather than the last epoch's.
if early_stopping.best_model is not None:
    model.load_state_dict(early_stopping.best_model)

total_time = time.time() - start_time
print(f'\n‚úÖ Training completed in {total_time/60:.1f} minutes')
print(f'   Best epoch: {best_epoch}')
print(f'   Best val F1: {best_val_f1:.4f}')
print(f'   Best val Acc: {best_val_acc:.4f}')
print(f'   Best gap: {best_gap*100:.2f}%')

In [None]:
# ============================================
# CELL 11: TRAINING VISUALIZATION
# ============================================

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

epochs_range = range(1, len(history['train_loss']) + 1)

# One panel per metric: (train history key, val history key, axis/title label).
_panels = [
    ('train_loss', 'val_loss', 'Loss'),
    ('train_acc', 'val_acc', 'Accuracy'),
    ('train_f1', 'val_f1', 'F1 Score'),
]
for _panel_ax, (_train_key, _val_key, _label) in zip(axes, _panels):
    _panel_ax.plot(epochs_range, history[_train_key], 'b-o', label='Train', markersize=4)
    _panel_ax.plot(epochs_range, history[_val_key], 'r-s', label='Val', markersize=4)
    if _val_key == 'val_f1':
        # Only the F1 panel carries the best-score reference line.
        _panel_ax.axhline(y=best_val_f1, color='g', linestyle='--', label=f'Best: {best_val_f1:.4f}')
    _panel_ax.set_xlabel('Epoch')
    _panel_ax.set_ylabel(_label)
    _panel_ax.set_title(_label, fontweight='bold')
    _panel_ax.legend()
    _panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_history_3class.png', dpi=150, bbox_inches='tight')
plt.show()

# Gap analysis
def _gap_color(gap_value):
    """Traffic-light colour for a train-val accuracy gap (fraction, not %)."""
    if gap_value > 0.10:
        return 'red'
    if gap_value > 0.05:
        return 'orange'
    return 'green'

plt.figure(figsize=(8, 4))
gaps = [train_value - val_value
        for train_value, val_value in zip(history['train_acc'], history['val_acc'])]
colors = [_gap_color(gap_value) for gap_value in gaps]
# Bars are drawn in percent; the dashed lines mark the same 10% / 5% cut-offs.
plt.bar(epochs_range, [gap_value*100 for gap_value in gaps], color=colors)
plt.axhline(y=10, color='red', linestyle='--', label='Overfitting threshold (10%)')
plt.axhline(y=5, color='orange', linestyle='--', label='Warning threshold (5%)')
plt.xlabel('Epoch')
plt.ylabel('Train-Val Gap (%)')
plt.title('Overfitting Analysis', fontweight='bold')
plt.legend()
plt.tight_layout()
plt.savefig('overfitting_analysis_3class.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ============================================
# CELL 12: TEST EVALUATION
# ============================================

print('='*60)
print('üß™ FINAL TEST EVALUATION')
print('='*60)

# Deterministic evaluation (no MC dropout) on the held-out test split.
test_loss, test_acc, test_f1, test_preds, test_labels, test_probs = evaluate(
    model, test_loader, criterion, device
)

print(f'\nüìä Test Results:')
print(f'   Loss: {test_loss:.4f}')
print(f'   Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)')
print(f'   F1 Score: {test_f1:.4f}')

# Pin the class ids explicitly so the report and the matrix always have one
# row per entry of LABEL_NAMES, even when a class is absent from both the
# test labels and the predictions. Otherwise the names and tick labels would
# not line up with the classes actually found.
class_ids = list(range(len(LABEL_NAMES)))

# Classification report
print('\nüìã Classification Report:')
print(classification_report(test_labels, test_preds, labels=class_ids,
                            target_names=LABEL_NAMES, digits=4))

# Confusion matrix
# Rows are the true class, columns the predicted class.
cm = confusion_matrix(test_labels, test_preds, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES)
plt.title('Confusion Matrix - Test Set', fontweight='bold')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.savefig('confusion_matrix_3class.png', dpi=150, bbox_inches='tight')
plt.show()

# Per-class accuracy
# Diagonal over row total = share of each true class predicted correctly.
print('\nüìä Per-Class Accuracy:')
class_totals = cm.sum(axis=1)
for i, name in enumerate(LABEL_NAMES):
    if class_totals[i] == 0:
        # Avoid dividing by zero for a class with no test samples.
        print(f'   {name}: n/a (no test samples)')
        continue
    class_acc = cm[i, i] / class_totals[i]
    print(f'   {name}: {class_acc:.4f} ({class_acc*100:.2f}%)')

In [None]:
# ============================================
# CELL 13: SAVE MODEL
# ============================================

# Save model
save_path = '/kaggle/working/indobert_sentiment_3class.pt'

# Metric values may be numpy scalars rather than plain Python floats. Cast
# everything to built-in types so the checkpoint holds no pickled numpy
# objects (which restricted loaders such as torch.load(weights_only=True)
# can reject) and so the JSON dump cannot trip over a non-serializable type.
history_plain = {key: [float(value) for value in values] for key, values in history.items()}

torch.save({
    'model_state_dict': model.state_dict(),
    'config': CONFIG,
    'label_map': LABEL_MAP,
    'label_names': LABEL_NAMES,
    'metrics': {
        'test_accuracy': float(test_acc),
        'test_f1': float(test_f1),
        'best_val_f1': float(best_val_f1),
        'best_epoch': int(best_epoch)
    },
    'history': history_plain
}, save_path)

print(f'‚úÖ Model saved to: {save_path}')
print(f'   File size: {os.path.getsize(save_path) / 1024 / 1024:.1f} MB')

# Save training history
history_path = '/kaggle/working/training_history_3class.json'
with open(history_path, 'w') as f:
    json.dump(history_plain, f, indent=2)
print(f'‚úÖ History saved to: {history_path}')

In [None]:
# ============================================
# CELL 14: INFERENCE FUNCTION
# ============================================

def predict_sentiment(text, model, tokenizer, device, label_names, max_length=128):
    """Predict sentiment for a single text

    Args:
        text: Input text; converted with `str()` before tokenizing.
        model: Callable as `model(input_ids, attention_mask)` returning logits
            of shape (1, len(label_names)). Put into eval mode here.
        tokenizer: Object exposing `encode_plus`.
        device: Device the encoded tensors are moved to.
        label_names: Class names indexed by class id.
        max_length: Pad/truncate length for the tokenizer. Defaults to 128;
            pass the value used at training time if it differs.

    Returns:
        dict with 'sentiment' (predicted class name), 'confidence' (its
        softmax probability) and 'probabilities' (class name -> probability).
    """
    model.eval()

    encoding = tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probs = F.softmax(logits, dim=1)
        pred = torch.argmax(logits, dim=1).item()

    return {
        'sentiment': label_names[pred],
        'confidence': probs[0][pred].item(),
        'probabilities': {name: probs[0][i].item() for i, name in enumerate(label_names)}
    }

# Test predictions
print('='*60)
print('üîÆ SAMPLE PREDICTIONS')
print('='*60)

# Hand-written Indonesian example reviews used as a smoke test.
test_texts = [
    "Aplikasi sangat membantu, driver ramah dan cepat sampai",
    "Biasa saja, tidak ada yang istimewa",
    "Aplikasi error terus, driver tidak profesional, sangat mengecewakan",
    "Gojek memudahkan transportasi sehari-hari saya",
    "Harga mahal, promo tidak jelas"
]

for text in test_texts:
    result = predict_sentiment(text, model, tokenizer, device, LABEL_NAMES)
    # Echo the input, cut to 50 characters plus an ellipsis when longer.
    if len(text) > 50:
        print(f'\nüìù "{text[:50]}..."')
    else:
        print(f'\nüìù "{text}"')
    print(f'   Sentiment: {result["sentiment"].upper()}')
    print(f'   Confidence: {result["confidence"]*100:.1f}%')
    print(f'   Probs: {result["probabilities"]}')

In [None]:
# ============================================
# CELL 15: FINAL SUMMARY
# ============================================

# Final report: metrics, the configuration actually used, split sizes and the
# artifacts written by the cells above.
print('='*60)
print('üìä TRAINING SUMMARY (OPTIMIZED v2)')
print('='*60)

# Everything inside this f-string is printed verbatim.
summary = f"""
üéØ MODEL: IndoBERT Sentiment 3-Class (Optimized v2)

üìà METRICS:
   ‚Ä¢ Test Accuracy: {test_acc*100:.2f}%
   ‚Ä¢ Test F1 Score: {test_f1:.4f}
   ‚Ä¢ Best Val F1:   {best_val_f1:.4f} (epoch {best_epoch})
   ‚Ä¢ Best Val Acc:  {best_val_acc*100:.2f}%

‚öôÔ∏è CONFIGURATION (Optimized):
   ‚Ä¢ Model: {CONFIG['model_name']}
   ‚Ä¢ Epochs trained: {len(history['train_loss'])}
   ‚Ä¢ Batch size: {CONFIG['batch_size']}
   ‚Ä¢ Learning rate: {CONFIG['learning_rate']}
   ‚Ä¢ Frozen layers: {CONFIG['freeze_layers']}/12 (trainable: {12-CONFIG['freeze_layers']})
   ‚Ä¢ Dropout: {CONFIG['dropout_rate']} + {CONFIG['hidden_dropout']}
   ‚Ä¢ Hidden layer: {CONFIG['hidden_size']}
   ‚Ä¢ R-Drop alpha: {CONFIG['rdrop_alpha']}
   ‚Ä¢ Weight decay: {CONFIG['weight_decay']}
   ‚Ä¢ Label smoothing: {CONFIG['label_smoothing']}

üìÇ DATA:
   ‚Ä¢ Train: {len(train_df):,}
   ‚Ä¢ Val: {len(val_df):,}
   ‚Ä¢ Test: {len(test_df):,}

üíæ SAVED FILES:
   ‚Ä¢ Model: indobert_sentiment_3class.pt
   ‚Ä¢ History: training_history_3class.json
   ‚Ä¢ Plots: training_history_3class.png, confusion_matrix_3class.png
"""

print(summary)

# Overfitting check
# final_gap is the accuracy gap of the last epoch run; best_gap is the
# accuracy gap recorded at the best epoch in the training loop.
final_gap = history['train_acc'][-1] - history['val_acc'][-1]
print(f'üîç OVERFITTING CHECK:')
print(f'   Final train-val gap: {final_gap*100:.2f}%')
print(f'   Best model gap: {best_gap*100:.2f}%')

# Verdict bands on the best-epoch gap: <5%, <8%, <10%, otherwise overfitting.
if best_gap < 0.05:
    print('   ‚úÖ Excellent generalization!')
elif best_gap < 0.08:
    print('   ‚úÖ Good generalization')
elif best_gap < 0.10:
    print('   ‚ö° Acceptable gap')
else:
    print('   ‚ö†Ô∏è Some overfitting - consider more regularization')

# Compare with baseline
# The baseline figures (63.18% accuracy, 0.6354 F1, 20.70% gap) are literals
# hard-coded here and in the three deltas below; nothing in this notebook
# computes them.
print(f'\nüìä IMPROVEMENT vs BASELINE:')
print(f'   Baseline: Acc=63.18%, F1=0.6354, Gap=20.70%')
print(f'   Current:  Acc={test_acc*100:.2f}%, F1={test_f1:.4f}, Gap={best_gap*100:.2f}%')

# Positive values mean better than baseline (gap_change is baseline minus current).
acc_change = (test_acc - 0.6318) * 100
f1_change = test_f1 - 0.6354
gap_change = 0.207 - best_gap

print(f'   Accuracy change: {acc_change:+.2f}%')
print(f'   F1 change: {f1_change:+.4f}')
print(f'   Gap reduction: {gap_change*100:+.2f}%')

print('\n' + '='*60)
print('‚úÖ Training complete! Download model from /kaggle/working/')
print('='*60)