In [None]:
# IMPORTS
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import os
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # IMPORT ADAMW FROM TORCH.OPTIM INSTEAD OF TRANSFORMERS
from tqdm.notebook import tqdm

# KAGGLE SPECIFIC CONFIGURATION
# INPUT CSV FILES LIVE UNDER THE COMPETITION'S INPUT DIRECTORY.
BASE_DIR = '/kaggle/input/ai-2-dl-for-nlp-2025-homework-3'
TRAIN_CSV_PATH = os.path.join(BASE_DIR, 'train_dataset.csv')
VAL_CSV_PATH = os.path.join(BASE_DIR, 'val_dataset.csv')
TEST_CSV_PATH = os.path.join(BASE_DIR, 'test_dataset.csv')

# OUTPUT DIRECTORY FOR SAVING MODELS AND SUBMISSIONS (KAGGLE'S WRITABLE DIRECTORY)
OUTPUT_DIR = '/kaggle/working'
# exist_ok=True KEEPS THIS CELL IDEMPOTENT ON RE-RUN AND AVOIDS THE
# CHECK-THEN-CREATE RACE OF A SEPARATE os.path.exists() TEST.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# MODEL AND TRAINING HYPERPARAMETERS
BATCH_SIZE = 64            # SAMPLES PER BATCH FOR ALL THREE DATALOADERS
MAX_LENGTH = 128           # MAX SEQUENCE LENGTH FOR TOKENIZATION
BERT_LEARNING_RATE = 2e-5  # TYPICAL LEARNING RATE FOR BERT FINE-TUNING
BERT_EPOCHS = 4            # NUMBER OF EPOCHS FOR FINE-TUNING
WARMUP_STEPS = 0           # NO WARMUP STEPS

# KAGGLE GPU OPTIMIZATION
print("=== CHECKING KAGGLE GPU CONFIGURATION ===")

# DEFINE THE MIXED-PRECISION GLOBALS UP FRONT SO EVERY LATER CELL CAN REFERENCE
# THEM, EVEN WHEN NO GPU IS PRESENT OR THE AMP SETUP BELOW FAILS.
mixed_precision_available = False
scaler = None

if torch.cuda.is_available():
    # CONFIGURE FOR KAGGLE T4 GPU
    print("KAGGLE T4 GPU DETECTED!")
    # REPORT TOTAL DEVICE MEMORY OF GPU 0 IN GB
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
    print(f"GPU MEMORY: {gpu_mem:.1f} GB")

    # ENABLE MIXED PRECISION TRAINING FOR FASTER PERFORMANCE
    try:
        # `autocast` IS KEPT AS A MODULE-LEVEL NAME BECAUSE LATER CELLS MAY CALL IT DIRECTLY
        from torch.cuda.amp import autocast
        from torch.amp import GradScaler
        scaler = GradScaler('cuda')
        mixed_precision_available = True
        print("MIXED PRECISION TRAINING ENABLED (FASTER TRAINING)")
    except Exception as amp_error:
        # CATCH `Exception` (NOT A BARE except) SO KeyboardInterrupt / SystemExit STILL
        # PROPAGATE, AND REPORT WHY AMP IS UNAVAILABLE INSTEAD OF HIDING THE CAUSE.
        scaler = None
        mixed_precision_available = False
        print(f"MIXED PRECISION TRAINING NOT AVAILABLE: {amp_error}")

    # SET CUDNN BENCHMARK FOR OPTIMAL PERFORMANCE
    # NOTE(REVIEW): BENCHMARK MODE TRADES SOME RUN-TO-RUN DETERMINISM FOR SPEED - CONFIRM
    # THIS IS ACCEPTABLE GIVEN THE SEEDING BELOW.
    torch.backends.cudnn.benchmark = True
    print("CUDNN BENCHMARK ENABLED FOR OPTIMAL PERFORMANCE")

    # PRINT GPU DETAILS
    print(f"CUDA VERSION: {torch.version.cuda}")
    print(f"GPU NAME: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: GPU NOT DETECTED! TRAINING WILL BE SLOW.")

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"USING DEVICE: {DEVICE}")

# SET RANDOM SEEDS FOR REPRODUCIBILITY (PYTHON, NUMPY, TORCH CPU AND ALL CUDA DEVICES)
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

CUDA AVAILABLE: False
USING DEVICE: cpu


In [None]:
# VERIFY DATASET PATHS FOR KAGGLE
# PRINTS True/False PER SPLIT DEPENDING ON WHETHER ITS CSV FILE EXISTS.
print("CHECKING KAGGLE DATASET PATHS...")
for split_name, csv_path in (
    ("TRAIN", TRAIN_CSV_PATH),
    ("VALIDATION", VAL_CSV_PATH),
    ("TEST", TEST_CSV_PATH),
):
    print(f"{split_name} DATASET: {os.path.exists(csv_path)}")

# GPU MEMORY MANAGEMENT FOR KAGGLE T4 GPU
def clear_gpu_memory():
    """RELEASE THE CUDA CACHING ALLOCATOR'S UNUSED MEMORY AND PRINT CURRENT USAGE.

    WITHOUT A CUDA DEVICE THE FUNCTION ONLY PRINTS A NOTICE. RETURNS None.
    """
    # GUARD CLAUSE: NOTHING TO CLEAR WHEN CUDA IS NOT AVAILABLE
    if not torch.cuda.is_available():
        print("NO GPU AVAILABLE")
        return

    torch.cuda.empty_cache()
    print("GPU MEMORY CACHE CLEARED")

    # REPORT USAGE OF DEVICE 0 IN GB AFTER THE CACHE HAS BEEN EMPTIED
    allocated_gb = torch.cuda.memory_allocated(0) / 1e9
    reserved_gb = torch.cuda.memory_reserved(0) / 1e9
    print(f"GPU MEMORY: ALLOCATED: {allocated_gb:.2f} GB, "
          f"CACHED: {reserved_gb:.2f} GB")
        
# PRINT INITIAL GPU MEMORY INFO
# ONLY REPORTED WHEN CUDA IS AVAILABLE; VALUES ARE FOR DEVICE 0, IN GB.
if torch.cuda.is_available():
    print("\nINITIAL GPU MEMORY USAGE:")
    for stat_name, read_bytes in (
        ("ALLOCATED", torch.cuda.memory_allocated),
        ("CACHED", torch.cuda.memory_reserved),
    ):
        print(f"{stat_name}: {read_bytes(0) / 1e9:.2f} GB")

In [None]:
# LOAD DATASETS
print("LOADING DATASETS...")

try:
    # READ EACH SPLIT FROM THE CSV PATHS CONFIGURED EARLIER IN THE NOTEBOOK
    train_df = pd.read_csv(TRAIN_CSV_PATH)
    val_df = pd.read_csv(VAL_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)

    print("DATASETS LOADED SUCCESSFULLY.")

    # ROW COUNT PER SPLIT
    print(f"TRAINING SET: {len(train_df)} SAMPLES")
    print(f"VALIDATION SET: {len(val_df)} SAMPLES")
    print(f"TEST SET: {len(test_df)} SAMPLES")

    # HOW MANY TRAINING ROWS CARRY EACH VALUE OF THE 'Label' COLUMN
    class_counts = train_df['Label'].value_counts()
    print("\nCLASS DISTRIBUTION IN TRAINING SET:")
    for label, count in class_counts.items():
        share = count / len(train_df) * 100
        print(f"CLASS {label}: {count} SAMPLES ({share:.2f}%)")

except Exception as load_error:
    # REPORT THE FAILURE, THEN LET THE SAME EXCEPTION STOP THE CELL
    print(f"ERROR DURING LOADING: {load_error}")
    raise

LOADING DATASETS...
DATASETS LOADED SUCCESSFULLY.
TRAINING SET: 148388 SAMPLES
VALIDATION SET: 42396 SAMPLES
TEST SET: 21199 SAMPLES

CLASS DISTRIBUTION IN TRAINING SET:
CLASS 1: 74196 SAMPLES (50.00%)
CLASS 0: 74192 SAMPLES (50.00%)


In [None]:
# INITIALIZE BERT TOKENIZER
# LOADS THE TOKENIZER FOR THE "bert-base-uncased" CHECKPOINT. THE RESULTING MODULE-LEVEL
# `tokenizer` IS PASSED TO EVERY SentimentDataset CREATED BELOW.
print("INITIALIZING BERT TOKENIZER...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# DEFINE CUSTOM DATASET FOR SENTIMENT ANALYSIS WITH BERT
class SentimentDataset(Dataset):
    """MAP-STYLE DATASET THAT TOKENIZES ONE TEXT PER ITEM FOR BERT.

    ARGS:
        texts: SEQUENCE OF TEXTS (LIST, NUMPY ARRAY OR PANDAS SERIES). ITEMS ARE
            ALWAYS ADDRESSED BY POSITION, SO A SERIES WITH A NON-DEFAULT INDEX
            (E.G. AFTER FILTERING OR SHUFFLING A DATAFRAME) IS HANDLED CORRECTLY.
        labels: OPTIONAL SEQUENCE OF INTEGER LABELS ALIGNED WITH `texts`.
        tokenizer: CALLABLE TOKENIZER; IT IS INVOKED AS `tokenizer(text, ...)` AND
            MUST RETURN A MAPPING WITH 'input_ids' AND 'attention_mask' TENSORS.
        max_length: LENGTH EVERY SEQUENCE IS PADDED / TRUNCATED TO.
        is_test: WHEN True, ITEMS CARRY AN 'id' (THE ITEM'S POSITION) AND NO 'labels'.

    EACH ITEM IS A DICT WITH 'input_ids' AND 'attention_mask' (FLATTENED TENSORS),
    PLUS 'labels' (torch.long) FOR LABELLED DATA OR 'id' FOR TEST DATA.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128, is_test=False):
        self.texts = self._as_positional(texts)
        self.labels = None if labels is None else self._as_positional(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    @staticmethod
    def _as_positional(values):
        """RETURN `values` IN A FORM WHERE `values[i]` IS THE i-TH ELEMENT.

        PANDAS OBJECTS INDEX BY LABEL WITH `[]`, WHICH RAISES KeyError OR PICKS THE
        WRONG ROW WHEN THE INDEX IS NOT 0..N-1, SO THEY ARE CONVERTED TO NUMPY ARRAYS.
        """
        if hasattr(values, 'to_numpy'):
            return values.to_numpy()
        return values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])

        # TOKENIZE THE TEXT FOR BERT. CALLING THE TOKENIZER DIRECTLY IS THE SUPPORTED
        # ENTRY POINT; `encode_plus` IS DEPRECATED IN TRANSFORMERS.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # ADDS [CLS] AND [SEP]
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # CONVERT TO FLAT TENSORS FROM BATCHED TENSORS
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        # ADD LABEL FOR TRAINING DATA
        if not self.is_test and self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        # ADD ID FOR TEST DATA (THE ITEM'S POSITION IN THE DATASET)
        if self.is_test:
            item['id'] = torch.tensor(idx)

        return item

# CREATE DATASETS
print("CREATING DATASETS...")

# LABELLED SPLITS: TEXTS FROM THE 'Text' COLUMN, LABELS FROM THE 'Label' COLUMN
train_dataset = SentimentDataset(
    texts=train_df['Text'],
    labels=train_df['Label'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
val_dataset = SentimentDataset(
    texts=val_df['Text'],
    labels=val_df['Label'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# UNLABELLED TEST SPLIT: ITEMS CARRY AN 'id' INSTEAD OF 'labels'
test_dataset = SentimentDataset(
    texts=test_df['Text'],
    labels=None,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    is_test=True,
)

# CREATE DATALOADERS - ONLY THE TRAINING LOADER SHUFFLES
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE, shuffle=reshuffle)
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

print(f"DATALOADERS CREATED WITH BATCH SIZE: {BATCH_SIZE}")

INITIALIZING BERT TOKENIZER...
CREATING DATASETS...
DATALOADERS CREATED WITH BATCH SIZE: 32
CREATING DATASETS...
DATALOADERS CREATED WITH BATCH SIZE: 32


In [None]:
# INITIALIZE BERT MODEL FOR SEQUENCE CLASSIFICATION
print("INITIALIZING BERT MODEL...")
bert_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # BINARY CLASSIFICATION (0 OR 1)
)
bert_model = bert_model.to(DEVICE)

# DEFINE OPTIMIZER WITH WEIGHT DECAY
# torch.optim.AdamW IS USED HERE; IT HAS NO `correct_bias` ARGUMENT.
optimizer = AdamW(bert_model.parameters(), lr=BERT_LEARNING_RATE, weight_decay=0.01)

# CALCULATE TRAINING STEPS AND SETUP SCHEDULER
# ONE SCHEDULER STEP IS TAKEN PER TRAINING BATCH, SO TOTAL STEPS = EPOCHS x BATCHES PER EPOCH
total_steps = BERT_EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)

print(f"BERT MODEL INITIALIZED WITH LEARNING RATE: {BERT_LEARNING_RATE}")
print(f"TRAINING FOR {BERT_EPOCHS} EPOCHS, {total_steps} TOTAL STEPS")

INITIALIZING BERT MODEL...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT MODEL INITIALIZED WITH LEARNING RATE: 2e-05
TRAINING FOR 4 EPOCHS, 18552 TOTAL STEPS


In [None]:
# BERT TRAINING FUNCTION
def train_bert(model, dataloader, optimizer, scheduler, device, use_amp=None, grad_scaler=None):
    """
    TRAINS THE BERT MODEL FOR ONE EPOCH WITH MIXED PRECISION IF AVAILABLE

    ARGS:
        model: MODEL CALLED AS `model(input_ids=..., attention_mask=..., labels=...)`
            WHOSE OUTPUT EXPOSES `.loss` AND `.logits`.
        dataloader: ITERABLE OF BATCH DICTS WITH 'input_ids', 'attention_mask', 'labels'.
        optimizer: OPTIMIZER UPDATING `model`'S PARAMETERS.
        scheduler: LEARNING-RATE SCHEDULER, STEPPED ONCE PER APPLIED OPTIMIZER STEP.
        device: DEVICE THE BATCH TENSORS ARE MOVED TO.
        use_amp: ENABLE CUDA MIXED PRECISION. None (DEFAULT) FALLS BACK TO THE
            NOTEBOOK GLOBAL `mixed_precision_available`.
        grad_scaler: GRADIENT SCALER FOR MIXED PRECISION. None (DEFAULT) FALLS BACK
            TO THE NOTEBOOK GLOBAL `scaler`.

    RETURNS:
        (avg_loss, accuracy): SAMPLE-WEIGHTED MEAN LOSS AND THE FRACTION OF CORRECT
        PREDICTIONS OVER THE EPOCH.

    RAISES:
        RuntimeError: MIXED PRECISION WAS REQUESTED BUT NO GRADIENT SCALER IS AVAILABLE.
    """
    # RESOLVE THE MIXED-PRECISION SETTINGS; THE DEFAULTS PRESERVE THE ORIGINAL
    # BEHAVIOR OF READING THE NOTEBOOK-LEVEL GLOBALS.
    if use_amp is None:
        use_amp = mixed_precision_available
    if use_amp:
        if grad_scaler is None:
            grad_scaler = scaler
        if grad_scaler is None:
            raise RuntimeError("MIXED PRECISION REQUESTED BUT NO GRADIENT SCALER IS AVAILABLE")

    # SET MODEL TO TRAINING MODE
    model.train()

    # TRACK METRICS
    epoch_loss = 0
    epoch_corrects = 0
    total_samples = 0

    # ADD PROGRESS BAR
    for batch in tqdm(dataloader, desc="TRAINING"):
        # MOVE BATCH TO DEVICE
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # CLEAR GRADIENTS
        optimizer.zero_grad()

        # USE MIXED PRECISION TRAINING IF AVAILABLE (FASTER ON T4 GPU)
        if use_amp:
            # torch.autocast REPLACES THE DEPRECATED torch.cuda.amp.autocast; THE AMP
            # PATH IS CUDA-ONLY, MATCHING THE 'cuda' GRADIENT SCALER.
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                # EXTRACT LOSS AND LOGITS
                loss = outputs.loss
                logits = outputs.logits

            # BACKWARD PASS WITH GRADIENT SCALING
            grad_scaler.scale(loss).backward()

            # CLIP GRADIENTS - UNSCALE FIRST SO THE THRESHOLD APPLIES TO THE TRUE GRADIENTS
            grad_scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # UPDATE WEIGHTS WITH SCALING. THE SCALER SKIPS optimizer.step() WHEN IT FINDS
            # INF/NAN GRADIENTS AND THEN LOWERS ITS SCALE, SO A DROP IN SCALE MEANS
            # NO WEIGHT UPDATE HAPPENED ON THIS BATCH.
            scale_before = grad_scaler.get_scale()
            grad_scaler.step(optimizer)
            grad_scaler.update()
            optimizer_stepped = grad_scaler.get_scale() >= scale_before
        else:
            # STANDARD TRAINING WITHOUT MIXED PRECISION
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # EXTRACT LOSS AND LOGITS
            loss = outputs.loss
            logits = outputs.logits

            # BACKWARD PASS
            loss.backward()

            # CLIP GRADIENTS - PREVENTS EXPLODING GRADIENTS
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # UPDATE WEIGHTS
            optimizer.step()
            optimizer_stepped = True

        # UPDATE LEARNING RATE SCHEDULE ONLY WHEN THE WEIGHTS WERE ACTUALLY UPDATED, SO
        # SKIPPED AMP STEPS DO NOT CONSUME THE SCHEDULE
        if optimizer_stepped:
            scheduler.step()

        # GET PREDICTIONS AND CALCULATE ACCURACY
        preds = torch.argmax(logits, dim=1)
        corrects = (preds == labels).sum().item()

        # ACCUMULATE METRICS (LOSS IS WEIGHTED BY BATCH SIZE SO A SHORT LAST BATCH
        # DOES NOT SKEW THE EPOCH AVERAGE)
        batch_size = labels.size(0)
        epoch_loss += loss.item() * batch_size
        epoch_corrects += corrects
        total_samples += batch_size

    # CALCULATE EPOCH METRICS
    avg_loss = epoch_loss / total_samples
    accuracy = epoch_corrects / total_samples

    return avg_loss, accuracy

# BERT EVALUATION FUNCTION
def evaluate_bert(model, dataloader, device):
    """
    SCORES THE BERT MODEL ON A LABELLED DATALOADER WITHOUT UPDATING ANY WEIGHTS

    RETURNS A TUPLE (avg_loss, accuracy, precision, recall, f1): THE SAMPLE-WEIGHTED
    MEAN LOSS, FOLLOWED BY THE CLASSIFICATION METRICS (BINARY AVERAGING) COMPUTED
    OVER ALL BATCHES.
    """
    # EVALUATION MODE FOR THE WHOLE PASS
    model.eval()

    weighted_loss_sum = 0
    sample_count = 0
    predicted = []
    expected = []

    # GRADIENTS ARE NOT NEEDED FOR SCORING
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="EVALUATING"):
            labels = batch['labels'].to(device)

            # FORWARD PASS - LABELS ARE PASSED SO THE MODEL ALSO RETURNS A LOSS
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=labels,
            )

            # WEIGHT THE BATCH LOSS BY ITS SIZE FOR A CORRECT EPOCH AVERAGE
            n_in_batch = labels.size(0)
            weighted_loss_sum += outputs.loss.item() * n_in_batch
            sample_count += n_in_batch

            # KEEP PREDICTIONS AND TRUE LABELS ON THE CPU FOR THE METRIC FUNCTIONS
            predicted.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    avg_loss = weighted_loss_sum / sample_count

    # CLASSIFICATION METRICS OVER THE FULL PASS
    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='binary', zero_division=0
    )

    return avg_loss, accuracy, precision, recall, f1

In [None]:
# TRAINING LOOP FOR BERT
# PER-EPOCH HISTORY; EVERY LIST RECEIVES EXACTLY ONE VALUE PER COMPLETED EPOCH
train_losses, val_losses = [], []
train_accs, val_accs = [], []
val_precisions, val_recalls, val_f1s = [], [], []

# BEST MODEL TRACKING - THE CHECKPOINT IS CHOSEN BY VALIDATION LOSS (LOWER IS BETTER), NOT F1
best_val_loss = float('inf')
BERT_MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, 'best_bert_model.pt')

print("STARTING BERT FINE-TUNING...")

# FREE CACHED GPU MEMORY BEFORE TRAINING IF MORE THAN 1e9 BYTES ARE ALREADY ALLOCATED
if torch.cuda.is_available() and torch.cuda.memory_allocated(0) > 1e9:
    clear_gpu_memory()

for epoch in range(BERT_EPOCHS):
    print(f"\nEPOCH {epoch+1}/{BERT_EPOCHS}")
    print("-" * 40)

    # TRAINING PHASE (TIMED)
    start_time = time.time()
    train_loss, train_acc = train_bert(bert_model, train_loader, optimizer, scheduler, DEVICE)
    train_time = time.time() - start_time

    # EVALUATION PHASE (TIMED)
    start_time = time.time()
    val_loss, val_acc, val_precision, val_recall, val_f1 = evaluate_bert(bert_model, val_loader, DEVICE)
    eval_time = time.time() - start_time

    # RECORD THIS EPOCH'S METRICS IN THE MATCHING HISTORY LIST
    for history, value in (
        (train_losses, train_loss),
        (val_losses, val_loss),
        (train_accs, train_acc),
        (val_accs, val_acc),
        (val_precisions, val_precision),
        (val_recalls, val_recall),
        (val_f1s, val_f1),
    ):
        history.append(value)

    # KEEP THE WEIGHTS OF THE EPOCH WITH THE LOWEST VALIDATION LOSS SO FAR
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(bert_model.state_dict(), BERT_MODEL_SAVE_PATH)
        print(f"NEW BEST VALIDATION LOSS: {val_loss:.4f} - SAVED MODEL TO {BERT_MODEL_SAVE_PATH}")

    # EPOCH SUMMARY
    print(f"TRAIN - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, Time: {train_time:.2f}s")
    print(f"EVAL  - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, Time: {eval_time:.2f}s")
    print(f"EVAL  - Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

    # CLEAR THE CUDA CACHE BETWEEN EPOCHS WHEN A GPU IS IN USE
    if torch.cuda.is_available():
        clear_gpu_memory()

print("\nBERT FINE-TUNING COMPLETE!")

STARTING BERT FINE-TUNING...

EPOCH 1/4
----------------------------------------


KeyboardInterrupt: 

In [None]:
# PREDICTION AND SUBMISSION GENERATION FOR BERT

def predict_test_bert(model, dataloader, device, ids=None):
    """
    GENERATE PREDICTIONS ON THE TEST SET USING THE FINE-TUNED BERT MODEL

    ARGS:
        model: MODEL CALLED AS `model(input_ids=..., attention_mask=...)` WHOSE
            OUTPUT EXPOSES `.logits`.
        dataloader: ITERABLE OF BATCH DICTS WITH 'input_ids' AND 'attention_mask'.
            WHEN A BATCH ALSO CARRIES 'id' (THE ROW POSITIONS EMITTED BY THE TEST
            DATASET), THOSE POSITIONS SELECT THE EXTERNAL IDS.
        device: DEVICE THE BATCH TENSORS ARE MOVED TO.
        ids: OPTIONAL SEQUENCE OF EXTERNAL IDS, ONE PER DATASET ROW. None (DEFAULT)
            USES THE 'ID' COLUMN OF THE NOTEBOOK-LEVEL `test_df`.

    RETURNS:
        (all_ids, all_preds): TWO LISTS OF EQUAL LENGTH - THE EXTERNAL ID AND THE
        PREDICTED CLASS INDEX FOR EVERY SAMPLE, IN THE ORDER THE LOADER YIELDED THEM.
    """
    model.eval()
    all_preds = []
    all_ids = []

    # POSITION -> EXTERNAL ID LOOKUP TABLE
    id_lookup = test_df['ID'].to_numpy() if ids is None else np.asarray(ids)

    # RUNNING ROW OFFSET, USED ONLY WHEN A BATCH HAS NO 'id' ENTRY. IT ADVANCES BY THE
    # ACTUAL BATCH SIZE, SO IT DOES NOT DEPEND ON THE GLOBAL BATCH_SIZE CONSTANT.
    offset = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="PREDICTING"):
            # MOVE BATCH TO DEVICE
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            n_in_batch = input_ids.size(0)

            # RESOLVE THE ROW POSITIONS OF THIS BATCH. PREFERRING THE POSITIONS CARRIED BY
            # THE BATCH KEEPS IDS AND PREDICTIONS PAIRED EVEN IF THE LOADER SHUFFLES, DROPS
            # THE LAST BATCH OR USES A DIFFERENT BATCH SIZE.
            if 'id' in batch:
                positions = batch['id'].cpu().numpy()
            else:
                positions = np.arange(offset, offset + n_in_batch)
            offset += n_in_batch
            batch_ids = id_lookup[positions]

            # FORWARD PASS
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # GET PREDICTIONS
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            # STORE PREDICTIONS AND IDS
            all_preds.extend(preds)
            all_ids.extend(batch_ids)

    return all_ids, all_preds

# LOAD THE BEST SAVED BERT MODEL
print(f"LOADING BEST BERT MODEL FROM {BERT_MODEL_SAVE_PATH}...")
# map_location REMAPS THE SAVED TENSORS ONTO THE CURRENT DEVICE, SO A CHECKPOINT WRITTEN
# DURING A GPU SESSION STILL LOADS WHEN THE NOTEBOOK IS RE-RUN ON CPU.
best_state_dict = torch.load(BERT_MODEL_SAVE_PATH, map_location=DEVICE)
bert_model.load_state_dict(best_state_dict)
bert_model = bert_model.to(DEVICE)

# GENERATE PREDICTIONS
print("GENERATING PREDICTIONS ON TEST SET...")
test_ids, test_preds = predict_test_bert(bert_model, test_loader, DEVICE)

# CREATE SUBMISSION DATAFRAME (ONE ROW PER TEST SAMPLE: ITS ID AND PREDICTED LABEL)
submission_df = pd.DataFrame({
    'ID': test_ids,
    'Label': test_preds
})

# SAVE SUBMISSION FILE TO KAGGLE WORKING DIRECTORY
BERT_SUBMISSION_PATH = os.path.join(OUTPUT_DIR, 'bert_submission.csv')
submission_df.to_csv(BERT_SUBMISSION_PATH, index=False)

print(f"BERT SUBMISSION SAVED TO {BERT_SUBMISSION_PATH}")
print("\nFIRST FEW ROWS OF SUBMISSION:")
print(submission_df.head())

In [None]:
# VISUALIZATION OF BERT MODEL PERFORMANCE

# X-AXIS: ONE TICK PER EPOCH THAT WAS ACTUALLY RECORDED. ALL HISTORY LISTS ARE APPENDED
# TOGETHER ONCE PER EPOCH, SO THEY SHARE THIS LENGTH. USING THE RECORDED LENGTH INSTEAD OF
# BERT_EPOCHS KEEPS THE PLOTS WORKING WHEN TRAINING STOPPED EARLY (E.G. WAS INTERRUPTED).
epochs_recorded = range(1, len(train_losses) + 1)

# PLOT LEARNING CURVES
plt.figure(figsize=(12, 5))

# LOSS CURVE
plt.subplot(1, 2, 1)
plt.plot(epochs_recorded, train_losses, 'b-', label='TRAINING LOSS')
plt.plot(epochs_recorded, val_losses, 'r-', label='VALIDATION LOSS')
plt.xlabel('EPOCH')
plt.ylabel('LOSS')
plt.title('BERT LOSS CURVES')
plt.legend()
plt.grid(True)

# ACCURACY CURVE
plt.subplot(1, 2, 2)
plt.plot(epochs_recorded, train_accs, 'b-', label='TRAINING ACCURACY')
plt.plot(epochs_recorded, val_accs, 'r-', label='VALIDATION ACCURACY')
plt.xlabel('EPOCH')
plt.ylabel('ACCURACY')
plt.title('BERT ACCURACY CURVES')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'bert_learning_curves.png'))
plt.show()

# PLOT F1, PRECISION, RECALL (VALIDATION SET, PER EPOCH)
plt.figure(figsize=(10, 6))
plt.plot(epochs_recorded, val_precisions, 'g-', label='PRECISION')
plt.plot(epochs_recorded, val_recalls, 'b-', label='RECALL')
plt.plot(epochs_recorded, val_f1s, 'r-', label='F1 SCORE')
plt.xlabel('EPOCH')
plt.ylabel('SCORE')
plt.title('BERT PERFORMANCE METRICS')
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(OUTPUT_DIR, 'bert_performance_metrics.png'))
plt.show()

# COMPUTE PREDICTIONS FOR CONFUSION MATRIX AND ROC CURVE
print("COMPUTING FINAL VALIDATION METRICS...")
bert_model.eval()
all_labels = []
all_probs = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="COMPUTING METRICS"):
        # FORWARD PASS WITHOUT LABELS - ONLY THE LOGITS ARE NEEDED HERE
        outputs = bert_model(
            input_ids=batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
        )

        # SOFTMAX OVER THE CLASS DIMENSION, THEN KEEP THE PROBABILITY OF CLASS 1
        positive_probs = torch.softmax(outputs.logits, dim=1)[:, 1]

        all_labels.extend(batch['labels'].cpu().numpy())
        all_probs.extend(positive_probs.cpu().numpy())

# ARRAYS FOR THE METRIC FUNCTIONS; A SAMPLE IS PREDICTED POSITIVE WHEN ITS
# CLASS-1 PROBABILITY IS AT LEAST 0.5
all_labels = np.array(all_labels)
all_probs = np.array(all_probs)
all_preds = (all_probs >= 0.5).astype(int)

# CONFUSION MATRIX
cm = confusion_matrix(all_labels, all_preds)
class_names = ['NEGATIVE (0)', 'POSITIVE (1)']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel('PREDICTED LABEL')
ax.set_ylabel('TRUE LABEL')
ax.set_title('BERT CONFUSION MATRIX')
fig.savefig(os.path.join(OUTPUT_DIR, 'bert_confusion_matrix.png'))
plt.show()

# ROC CURVE (BUILT FROM THE CLASS-1 PROBABILITIES, NOT THE THRESHOLDED PREDICTIONS)
fpr, tpr, _ = roc_curve(all_labels, all_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, 'b-', label=f'ROC CURVE (AUC = {roc_auc:.3f})')
# DASHED DIAGONAL FROM (0, 0) TO (1, 1) AS A VISUAL REFERENCE
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('FALSE POSITIVE RATE')
ax.set_ylabel('TRUE POSITIVE RATE')
ax.set_title('BERT ROC CURVE')
ax.legend(loc='lower right')
ax.grid(True)
fig.savefig(os.path.join(OUTPUT_DIR, 'bert_roc_curve.png'))
plt.show()

# PRINT FINAL METRICS
# ALL FINAL METRICS ARE COMPUTED FROM all_labels / all_preds, I.E. FROM THE MODEL THAT WAS
# JUST EVALUATED. READING PRECISION/RECALL/F1 FROM THE LAST TRAINING EPOCH INSTEAD WOULD
# MIX TWO DIFFERENT SETS OF WEIGHTS WHENEVER THE LOADED CHECKPOINT IS NOT THE LAST EPOCH,
# AND WOULD FAIL WITH AN IndexError IF NO EPOCH WAS RECORDED.
final_accuracy = accuracy_score(all_labels, all_preds)
final_precision, final_recall, final_f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='binary', zero_division=0
)

print(f"\nBERT FINAL VALIDATION METRICS:")
print(f"ACCURACY: {final_accuracy:.4f}")
print(f"PRECISION: {final_precision:.4f}")
print(f"RECALL: {final_recall:.4f}")
print(f"F1 SCORE: {final_f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# SAVE METRICS FOR FUTURE COMPARISON (PER-EPOCH TRAINING HISTORY)
try:
    np.savez(os.path.join(OUTPUT_DIR, 'bert_metrics.npz'),
             train_losses=train_losses,
             val_losses=val_losses,
             train_accs=train_accs,
             val_accs=val_accs,
             val_precisions=val_precisions,
             val_recalls=val_recalls,
             val_f1s=val_f1s)
    print("\nBERT metrics saved for future comparison with other models.")
except Exception as e:
    # BEST EFFORT: A FAILED HISTORY DUMP IS REPORTED BUT DOES NOT ABORT THE SUMMARY BELOW
    print(f"Error saving metrics: {e}")

# SAVE A SUMMARY OF THE RESULTS IN A TEXT FILE
results_summary_path = os.path.join(OUTPUT_DIR, 'bert_results_summary.txt')
with open(results_summary_path, 'w') as summary_file:
    summary_file.write("BERT SENTIMENT CLASSIFICATION RESULTS\n")
    summary_file.write("===================================\n\n")
    summary_file.write(f"Best Validation Loss: {best_val_loss:.4f}\n")
    summary_file.write(f"Final Accuracy: {final_accuracy:.4f}\n")
    summary_file.write(f"Final Precision: {final_precision:.4f}\n")
    summary_file.write(f"Final Recall: {final_recall:.4f}\n")
    summary_file.write(f"Final F1 Score: {final_f1:.4f}\n")
    summary_file.write(f"ROC AUC: {roc_auc:.4f}\n\n")
    summary_file.write("Training completed successfully.\n")
    summary_file.write(f"Submission file saved to: {BERT_SUBMISSION_PATH}\n")

print("\nResults summary saved to:", results_summary_path)

In [None]:
# GPU MEMORY MANAGEMENT
def clear_gpu_memory():
    """CLEAR GPU MEMORY CACHE TO FREE UP RESOURCES.

    WITH CUDA: EMPTIES THE CACHING ALLOCATOR AND PRINTS THE ALLOCATED / RESERVED
    MEMORY OF DEVICE 0 IN GB. WITH ONLY MPS (APPLE SILICON) OR NO ACCELERATOR AT
    ALL, A NOTICE IS PRINTED AND NOTHING IS CLEARED. RETURNS None.

    NOTE: THIS REDEFINES THE HELPER OF THE SAME NAME DECLARED IN AN EARLIER CELL,
    ADDING THE MPS BRANCH.
    """
    # LOOK UP THE MPS BACKEND ON `torch.backends` - THE SAME OBJECT WHOSE is_available()
    # IS CALLED BELOW - SO THE EXISTENCE CHECK AND THE CALL CANNOT DISAGREE.
    mps_backend = getattr(torch.backends, 'mps', None)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("GPU MEMORY CACHE CLEARED")
        # PRINT CURRENT MEMORY USAGE
        print(f"GPU MEMORY: ALLOCATED: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB, "
              f"CACHED: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")
    elif mps_backend is not None and mps_backend.is_available():
        # NO CACHE CLEARING IS ATTEMPTED ON MPS; ONLY A NOTICE IS PRINTED
        print("NOTE: MPS (APPLE SILICON) DOESN'T SUPPORT EXPLICIT MEMORY CLEARING")
    else:
        print("NO GPU AVAILABLE")
