In [None]:
# IMPORTS
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import os
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # Import AdamW from torch.optim instead of transformers
from tqdm.notebook import tqdm

# DATA LOCATIONS (KAGGLE COMPETITION INPUT DIRECTORY)
BASE_DIR = '/kaggle/input/ai-2-dl-for-nlp-2025-homework-3'
TRAIN_CSV_PATH, VAL_CSV_PATH, TEST_CSV_PATH = (
    os.path.join(BASE_DIR, f'{split}_dataset.csv')
    for split in ('train', 'val', 'test')
)

# MODEL AND TRAINING HYPERPARAMETERS
MODEL_NAME = "distilbert-base-uncased"  # CHECKPOINT TO FINE-TUNE
MAX_LENGTH = 128                        # TOKENS PER SEQUENCE AFTER PADDING/TRUNCATION
LEARNING_RATE = 5e-5                    # OPTIMIZER LEARNING RATE
EPOCHS = 4                              # FINE-TUNING EPOCHS
WARMUP_STEPS = 0                        # SCHEDULER WARMUP STEPS (NONE)
BATCH_SIZE = 64                         # DEFAULT; REPLACED BELOW WHEN A CUDA GPU IS PRESENT

# GPU MEMORY OPTIMIZATION: DERIVE THE BATCH SIZE FROM THE MEMORY OF GPU 0
if torch.cuda.is_available():
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
    # >10 GB -> 64, >6 GB -> 32, anything smaller -> 16
    BATCH_SIZE = 64 if gpu_mem > 10 else (32 if gpu_mem > 6 else 16)
    print(f"OPTIMIZED BATCH SIZE FOR GPU WITH {gpu_mem:.1f}GB MEMORY: {BATCH_SIZE}")

# DEVICE CONFIGURATION: REPORT WHAT CUDA EXPOSES, THEN PICK THE DEVICE
print(f"CUDA AVAILABLE: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA VERSION: {torch.version.cuda}")
    print(f"NUMBER OF GPUS: {torch.cuda.device_count()}")
    print(f"CURRENT GPU: {torch.cuda.current_device()}")
    print(f"GPU NAME: {torch.cuda.get_device_name(0)}")
    print(f"GPU MEMORY: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"USING DEVICE: {DEVICE}")

# REPRODUCIBILITY: SEED EVERY RANDOM NUMBER GENERATOR USED BY THE NOTEBOOK
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# DETERMINISTIC CUDNN KERNELS, NO AUTO-TUNER
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# LOAD DATASETS
print("LOADING DATASETS...")

try:
    # READ THE THREE SPLITS FROM THE CONFIGURED CSV PATHS
    train_df = pd.read_csv(TRAIN_CSV_PATH)
    val_df = pd.read_csv(VAL_CSV_PATH)
    test_df = pd.read_csv(TEST_CSV_PATH)

    print("DATASETS LOADED SUCCESSFULLY.")

    # BASIC DATA EXPLORATION: ROW COUNT OF EACH SPLIT
    for split_name, split_df in (
        ("TRAINING", train_df),
        ("VALIDATION", val_df),
        ("TEST", test_df),
    ):
        print(f"{split_name} SET: {split_df.shape[0]} SAMPLES")

    # CLASS BALANCE IN TRAINING SET (ABSOLUTE COUNT AND PERCENTAGE PER LABEL)
    class_counts = train_df['Label'].value_counts()
    n_train_rows = len(train_df)
    print("\nCLASS DISTRIBUTION IN TRAINING SET:")
    for label, count in class_counts.items():
        share = count / n_train_rows * 100
        print(f"CLASS {label}: {count} SAMPLES ({share:.2f}%)")

except Exception as load_error:
    # REPORT THE FAILURE, THEN RE-RAISE SO THE CELL STILL STOPS
    print(f"ERROR DURING LOADING: {load_error}")
    raise load_error

In [None]:
# INITIALIZE DISTILBERT TOKENIZER
# Loads the pretrained tokenizer for the checkpoint named by MODEL_NAME via
# AutoTokenizer. The resulting `tokenizer` object is passed to every
# SentimentDataset created further down in this cell.
print(f"INITIALIZING {MODEL_NAME} TOKENIZER...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# DEFINE CUSTOM DATASET FOR SENTIMENT ANALYSIS WITH DISTILBERT
class SentimentDataset(Dataset):
    """
    TORCH DATASET THAT TOKENIZES ONE TEXT PER ITEM FOR SEQUENCE CLASSIFICATION

    Args:
        texts: Sequence of raw texts (list, NumPy array or pandas Series).
        labels: Optional sequence of class labels aligned with `texts`.
        tokenizer: Hugging Face tokenizer; it is called once per item.
        max_length: Every sequence is padded/truncated to this many tokens.
        is_test: When True, items carry a positional 'id' and no 'labels'.

    Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors,
    plus 'labels' (training/validation) or 'id' (test).
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128, is_test=False):
        # Materialise as plain lists so __getitem__ always indexes BY POSITION.
        # A pandas Series indexed with an int does a LABEL lookup instead, which
        # raises KeyError (or silently returns a different row) as soon as the
        # frame's index is not 0..n-1, e.g. after filtering or shuffling rows.
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position `idx` and return its tensors."""
        text = str(self.texts[idx])

        # TOKENIZE THE TEXT FOR DISTILBERT
        # Calling the tokenizer directly is the supported replacement for the
        # deprecated `encode_plus`; the arguments and the result are the same.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # ADDS [CLS] AND [SEP]
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' YIELDS (1, max_length) TENSORS; FLATTEN TO (max_length,)
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        # ADD LABEL FOR TRAINING / VALIDATION DATA
        if not self.is_test and self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        # ADD POSITIONAL ID FOR TEST DATA
        if self.is_test:
            item['id'] = torch.tensor(idx)

        return item

# CREATE DATASETS
print("CREATING DATASETS...")

# LABELLED SPLITS: TEXT COLUMN PLUS LABEL ARRAY
train_dataset = SentimentDataset(
    texts=train_df['Text'],
    labels=train_df['Label'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)
val_dataset = SentimentDataset(
    texts=val_df['Text'],
    labels=val_df['Label'].values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
)

# UNLABELLED SPLIT: is_test=True MAKES ITEMS CARRY AN 'id' INSTEAD OF 'labels'
test_dataset = SentimentDataset(
    texts=test_df['Text'],
    labels=None,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    is_test=True,
)

# CREATE DATALOADERS (ONLY THE TRAINING SPLIT IS SHUFFLED)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"DATALOADERS CREATED WITH BATCH SIZE: {BATCH_SIZE}")

In [None]:
# INITIALIZE DISTILBERT MODEL FOR SEQUENCE CLASSIFICATION
print(f"INITIALIZING {MODEL_NAME} MODEL...")
# num_labels=2 -> BINARY CLASSIFICATION HEAD (CLASS 0 OR CLASS 1)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(DEVICE)

# OPTIMIZER: AdamW OVER ALL MODEL PARAMETERS WITH WEIGHT DECAY 0.01
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# SCHEDULER: ONE STEP PER BATCH, SO TOTAL STEPS = BATCHES PER EPOCH * EPOCHS
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps,
)

print(f"MODEL INITIALIZED WITH LEARNING RATE: {LEARNING_RATE}")
print(f"TRAINING FOR {EPOCHS} EPOCHS, {total_steps} TOTAL STEPS")

In [None]:
# TRAINING FUNCTION
def train_model(model, dataloader, optimizer, scheduler, device):
    """
    RUNS ONE TRAINING EPOCH OVER `dataloader`

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'.
    The model is called with the labels, so it returns the loss itself.

    Returns:
        (avg_loss, accuracy): sample-weighted mean loss and the fraction of
        correctly classified samples over the whole epoch.
    """
    model.train()

    # RUNNING TOTALS FOR THE EPOCH
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(dataloader, desc="TRAINING"):
        # MOVE BATCH TO DEVICE
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # RESET GRADIENTS LEFT OVER FROM THE PREVIOUS STEP
        optimizer.zero_grad()

        # FORWARD PASS - OUTPUT EXPOSES .loss AND .logits
        outputs = model(input_ids=token_ids, attention_mask=mask, labels=targets)
        batch_loss = outputs.loss

        # BACKWARD PASS, THEN CLIP THE GRADIENT NORM TO 1.0
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # WEIGHT UPDATE FOLLOWED BY THE LEARNING-RATE SCHEDULE UPDATE
        optimizer.step()
        scheduler.step()

        # ACCUMULATE METRICS; LOSS IS WEIGHTED BY BATCH SIZE SO A SHORT
        # FINAL BATCH DOES NOT SKEW THE EPOCH AVERAGE
        n_in_batch = targets.size(0)
        predicted = outputs.logits.argmax(dim=1)
        n_correct += (predicted == targets).sum().item()
        loss_sum += batch_loss.item() * n_in_batch
        n_seen += n_in_batch

    # EPOCH METRICS
    return loss_sum / n_seen, n_correct / n_seen

# EVALUATION FUNCTION
def evaluate_model(model, dataloader, device):
    """
    SCORES THE MODEL ON A LABELLED DATALOADER WITHOUT UPDATING WEIGHTS

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'.

    Returns:
        (avg_loss, accuracy, precision, recall, f1): sample-weighted mean
        loss, then the metrics computed over all collected predictions
        (precision/recall/F1 with average='binary', zero_division=0).
    """
    model.eval()

    # RUNNING TOTALS AND COLLECTED OUTPUTS
    loss_sum = 0
    n_seen = 0
    all_labels = []
    all_preds = []

    # NO GRADIENT TRACKING DURING EVALUATION
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="EVALUATING"):
            # MOVE BATCH TO DEVICE
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            # FORWARD PASS - LABELS ARE PASSED SO THE MODEL RETURNS THE LOSS
            outputs = model(input_ids=token_ids, attention_mask=mask, labels=targets)

            # LOSS IS WEIGHTED BY BATCH SIZE FOR A CORRECT EPOCH AVERAGE
            n_in_batch = targets.size(0)
            loss_sum += outputs.loss.item() * n_in_batch
            n_seen += n_in_batch

            # COLLECT PREDICTED CLASSES AND TRUE LABELS ON THE CPU
            all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    avg_loss = loss_sum / n_seen

    # CLASSIFICATION METRICS OVER THE WHOLE DATALOADER
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )

    return avg_loss, accuracy, precision, recall, f1

In [None]:
# TRAINING LOOP
# PER-EPOCH HISTORY (READ AGAIN BY THE PLOTTING CELL)
train_losses, train_accs = [], []
val_losses, val_accs = [], []
val_precisions, val_recalls, val_f1s = [], [], []

# BEST MODEL TRACKING - LOWEST VALIDATION LOSS WINS
best_val_loss = float('inf')
MODEL_SAVE_PATH = 'best_' + MODEL_NAME.replace("-", "_") + '_model.pt'

print(f"STARTING {MODEL_NAME} FINE-TUNING...")

for epoch in range(EPOCHS):
    print(f"\nEPOCH {epoch+1}/{EPOCHS}")
    print("-" * 40)

    # TRAINING PHASE (TIMED)
    start_time = time.time()
    train_loss, train_acc = train_model(model, train_loader, optimizer, scheduler, DEVICE)
    train_time = time.time() - start_time

    # EVALUATION PHASE (TIMED)
    start_time = time.time()
    val_metrics = evaluate_model(model, val_loader, DEVICE)
    eval_time = time.time() - start_time
    val_loss, val_acc, val_precision, val_recall, val_f1 = val_metrics

    # RECORD THIS EPOCH IN EVERY HISTORY LIST
    for history, value in zip(
        (train_losses, val_losses, train_accs, val_accs, val_precisions, val_recalls, val_f1s),
        (train_loss, val_loss, train_acc, val_acc, val_precision, val_recall, val_f1),
    ):
        history.append(value)

    # CHECKPOINT WHENEVER THE VALIDATION LOSS IMPROVES
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"NEW BEST VALIDATION LOSS: {val_loss:.4f} - SAVED MODEL TO {MODEL_SAVE_PATH}")

    # EPOCH SUMMARY
    print(f"TRAIN - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, Time: {train_time:.2f}s")
    print(f"EVAL  - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, Time: {eval_time:.2f}s")
    print(f"EVAL  - Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

print(f"\n{MODEL_NAME} FINE-TUNING COMPLETE!")

In [None]:
# PREDICTION AND SUBMISSION GENERATION

def predict_test(model, dataloader, device):
    """
    GENERATE PREDICTIONS ON THE TEST SET USING THE FINE-TUNED MODEL

    Each batch must provide 'input_ids' and 'attention_mask'. Submission IDs
    are read from the global `test_df['ID']` column.

    Rows are matched to IDs through the positional 'id' tensor that the
    dataset emits in test mode. If a batch has no 'id', a running row offset
    based on the ACTUAL batch sizes is used, which assumes the dataloader is
    not shuffled. Neither path depends on the global BATCH_SIZE, so the
    function stays correct for a loader built with any batch size.

    Returns:
        (all_ids, all_preds): two lists of equal length, aligned row by row.
    """
    model.eval()
    all_preds = []
    all_ids = []
    id_column = test_df['ID']
    rows_done = 0  # rows consumed so far; only used when batches carry no 'id'

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="PREDICTING"):
            # MOVE BATCH TO DEVICE
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            n_rows = input_ids.size(0)

            # RESOLVE THE TEST-DATAFRAME ROW POSITIONS COVERED BY THIS BATCH
            if 'id' in batch:
                positions = batch['id'].tolist()
            else:
                positions = list(range(rows_done, rows_done + n_rows))
            rows_done += n_rows
            batch_ids = id_column.iloc[positions].values

            # FORWARD PASS (NO LABELS, SO ONLY LOGITS ARE USED)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # GET PREDICTIONS
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

            # STORE PREDICTIONS AND IDS
            all_preds.extend(preds)
            all_ids.extend(batch_ids)

    return all_ids, all_preds

# LOAD THE BEST SAVED MODEL
print(f"LOADING BEST MODEL FROM {MODEL_SAVE_PATH}...")
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written during a GPU session still loads in a CPU-only session
# (without it torch.load tries to restore onto the original CUDA device).
best_state_dict = torch.load(MODEL_SAVE_PATH, map_location=DEVICE)
model.load_state_dict(best_state_dict)
model = model.to(DEVICE)

# GENERATE PREDICTIONS
print("GENERATING PREDICTIONS ON TEST SET...")
test_ids, test_preds = predict_test(model, test_loader, DEVICE)

# CREATE SUBMISSION DATAFRAME (ONE ROW PER TEST SAMPLE: ID + PREDICTED LABEL)
submission_df = pd.DataFrame({
    'ID': test_ids,
    'Label': test_preds
})

# SAVE SUBMISSION FILE
SUBMISSION_PATH = f'{MODEL_NAME.replace("-", "_")}_submission.csv'
submission_df.to_csv(SUBMISSION_PATH, index=False)

print(f"SUBMISSION SAVED TO {SUBMISSION_PATH}")
print("\nFIRST FEW ROWS OF SUBMISSION:")
print(submission_df.head())

In [None]:
# VISUALIZATION OF MODEL PERFORMANCE
epoch_axis = range(1, EPOCHS + 1)

# FIGURE 1: LOSS (LEFT) AND ACCURACY (RIGHT) LEARNING CURVES
fig_curves, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))

ax_loss.plot(epoch_axis, train_losses, 'b-', label='TRAINING LOSS')
ax_loss.plot(epoch_axis, val_losses, 'r-', label='VALIDATION LOSS')
ax_loss.set_xlabel('EPOCH')
ax_loss.set_ylabel('LOSS')
ax_loss.set_title(f'{MODEL_NAME} LOSS CURVES')
ax_loss.legend()
ax_loss.grid(True)

ax_acc.plot(epoch_axis, train_accs, 'b-', label='TRAINING ACCURACY')
ax_acc.plot(epoch_axis, val_accs, 'r-', label='VALIDATION ACCURACY')
ax_acc.set_xlabel('EPOCH')
ax_acc.set_ylabel('ACCURACY')
ax_acc.set_title(f'{MODEL_NAME} ACCURACY CURVES')
ax_acc.legend()
ax_acc.grid(True)

fig_curves.tight_layout()
fig_curves.savefig(f'{MODEL_NAME.replace("-", "_")}_learning_curves.png')
plt.show()

# FIGURE 2: VALIDATION PRECISION, RECALL AND F1 PER EPOCH
fig_scores, ax_scores = plt.subplots(figsize=(10, 6))
ax_scores.plot(epoch_axis, val_precisions, 'g-', label='PRECISION')
ax_scores.plot(epoch_axis, val_recalls, 'b-', label='RECALL')
ax_scores.plot(epoch_axis, val_f1s, 'r-', label='F1 SCORE')
ax_scores.set_xlabel('EPOCH')
ax_scores.set_ylabel('SCORE')
ax_scores.set_title(f'{MODEL_NAME} PERFORMANCE METRICS')
ax_scores.legend()
ax_scores.grid(True)
fig_scores.savefig(f'{MODEL_NAME.replace("-", "_")}_performance_metrics.png')
plt.show()

# COMPUTE PREDICTIONS FOR CONFUSION MATRIX AND ROC CURVE
print("COMPUTING FINAL VALIDATION METRICS...")
model.eval()
all_labels, all_probs = [], []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="COMPUTING METRICS"):
        # FORWARD PASS WITHOUT LABELS - ONLY THE LOGITS ARE NEEDED HERE
        outputs = model(
            input_ids=batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
        )

        # SOFTMAX OVER THE CLASS AXIS, KEEP THE POSITIVE CLASS (COLUMN 1)
        positive_probs = torch.softmax(outputs.logits, dim=1)[:, 1]

        all_labels.extend(batch['labels'].cpu().numpy())
        all_probs.extend(positive_probs.cpu().numpy())

# ARRAYS FOR THE METRIC FUNCTIONS; A 0.5 THRESHOLD GIVES THE HARD PREDICTIONS
all_labels = np.array(all_labels)
all_probs = np.array(all_probs)
all_preds = (all_probs >= 0.5).astype(int)

# CONFUSION MATRIX (ROWS = TRUE LABEL, COLUMNS = PREDICTED LABEL)
cm = confusion_matrix(all_labels, all_preds)
class_names = ['NEGATIVE (0)', 'POSITIVE (1)']
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax_cm,
)
ax_cm.set_xlabel('PREDICTED LABEL')
ax_cm.set_ylabel('TRUE LABEL')
ax_cm.set_title(f'{MODEL_NAME} CONFUSION MATRIX')
fig_cm.savefig(f'{MODEL_NAME.replace("-", "_")}_confusion_matrix.png')
plt.show()

# ROC CURVE FROM THE POSITIVE-CLASS PROBABILITIES
fpr, tpr, _ = roc_curve(all_labels, all_probs)
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, 'b-', label=f'ROC CURVE (AUC = {roc_auc:.3f})')
ax_roc.plot([0, 1], [0, 1], 'k--')  # CHANCE-LEVEL DIAGONAL
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('FALSE POSITIVE RATE')
ax_roc.set_ylabel('TRUE POSITIVE RATE')
ax_roc.set_title(f'{MODEL_NAME} ROC CURVE')
ax_roc.legend(loc='lower right')
ax_roc.grid(True)
fig_roc.savefig(f'{MODEL_NAME.replace("-", "_")}_roc_curve.png')
plt.show()

# PRINT FINAL METRICS
# Precision/recall/F1 are recomputed from all_labels/all_preds, i.e. from the
# reloaded best checkpoint. The per-epoch history lists (val_precisions[-1],
# ...) describe the LAST-epoch model, which is a different set of weights
# whenever the best validation loss was reached in an earlier epoch; mixing
# them with the accuracy and AUC below would report two models in one table.
final_precision, final_recall, final_f1 = precision_recall_fscore_support(
    all_labels, all_preds, average='binary', zero_division=0
)[:3]

print(f"\n{MODEL_NAME} FINAL VALIDATION METRICS:")
print(f"ACCURACY: {accuracy_score(all_labels, all_preds):.4f}")
print(f"PRECISION: {final_precision:.4f}")
print(f"RECALL: {final_recall:.4f}")
print(f"F1 SCORE: {final_f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

In [None]:
# GPU MEMORY MANAGEMENT [STACKOVERFLOW]
def clear_gpu_memory():
    """
    Release PyTorch's cached CUDA memory and print the current usage.

    With CUDA available: empties the cache, then prints the allocated and
    reserved memory of device 0 in GB. Otherwise only prints a note, either
    the MPS message or "No GPU available". Returns None.
    """
    if not torch.cuda.is_available():
        # Non-CUDA paths only report; nothing is freed here.
        if hasattr(torch, 'mps') and torch.backends.mps.is_available():
            print("Note: MPS (Apple Silicon) doesn't support explicit memory clearing")
        else:
            print("No GPU available")
        return

    torch.cuda.empty_cache()
    print("GPU memory cache cleared")

    # Current memory usage of device 0, converted from bytes to GB
    allocated_gb = torch.cuda.memory_allocated(0) / 1e9
    reserved_gb = torch.cuda.memory_reserved(0) / 1e9
    print(f"GPU Memory: Allocated: {allocated_gb:.2f} GB, Cached: {reserved_gb:.2f} GB")

clear_gpu_memory()