In [1]:
# ==================== COMPLETE ENHANCED MEME ANALYSIS PIPELINE ====================
# This code includes BOTH data preparation AND enhanced training
# Run this entire script from start to finish

# ==================== PART 0: SETUP & DEPENDENCIES ====================
import sys, subprocess, os, json, zipfile, shutil, random, warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.cuda.amp import GradScaler, autocast
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
import yaml

warnings.filterwarnings('ignore')

print("=" * 80)
print("INSTALLING DEPENDENCIES...")
print("=" * 80)

# Install required packages
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers>=4.40.0", "accelerate", "torch", "timm",
    "scikit-learn", "pandas", "matplotlib", "seaborn",
    "huggingface_hub>=0.18.0", "gdown", "iterative-stratification"
])

# Import additional packages
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    mean_squared_error, mean_absolute_error, f1_score
)
from transformers import AutoModel, AutoTokenizer, CLIPModel
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

print("âœ“ All dependencies installed\n")

# ==================== PART 1: DATA PREPARATION ====================
print("=" * 80)
print("PART 1: DATA PREPARATION")
print("=" * 80)

# Download and extract dataset
print("\nDownloading dataset...")
subprocess.run(["gdown", "1jEJ2nf5CDJknq80ogzU-Uyz7jbBi-1LZ", "--fuzzy"], 
               check=False, capture_output=True)

print("Extracting dataset...")
zip_files = [f for f in os.listdir('.') if f.endswith('.zip')]
if zip_files:
    subprocess.run(["unzip", "-q", "-o", zip_files[0]], check=False, capture_output=True)

# Download additional files
subprocess.run([
    "gdown", "--folder", "19yaav8ORSVj9DeJUaHKq1H3HtVnkClBw", "--remaining-ok"
], check=False, capture_output=True)

# Extract password-protected archive
print("Extracting protected archive...")
zip_path = '/kaggle/working/Memotion 3/memotion3.zip'
extract_to = '/kaggle/working/'
password = b'memotion3taskaaai@22'

if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to, pwd=password)
    print(f"âœ“ Extracted to: {extract_to}")

# Setup paths
ORIGINAL_TRAIN_IMG_DIR = '/kaggle/working/trainImages/'
ORIGINAL_CSV_PATH = '/kaggle/working/memotion3/train.csv'
VALIDATION_SPLIT_RATIO = 0.15

OUTPUT_BASE_DIR = '/kaggle/working/'
NEW_VAL_DIR = os.path.join(OUTPUT_BASE_DIR, 'validation_images/')
NEW_TRAIN_DIR = os.path.join(OUTPUT_BASE_DIR, 'new_train_images/')

os.makedirs(NEW_VAL_DIR, exist_ok=True)
os.makedirs(NEW_TRAIN_DIR, exist_ok=True)

# Load and process CSV
print("\nLoading CSV file...")
df = pd.read_csv(ORIGINAL_CSV_PATH)

# Detect image column
possible_image_cols = ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0']
IMAGE_FILENAME_COLUMN = next((col for col in possible_image_cols if col in df.columns), df.columns[0])
print(f"âœ“ Image column: {IMAGE_FILENAME_COLUMN}")

# Normalize labels
for col in ['offensive', 'motivational', 'humour', 'humor', 'sarcastic', 'sarcasm', 'overall', 'sentiment']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().str.strip()

# Create binary labels
def create_binary_label(value, positive_values):
    if pd.isna(value) or value in ['nan', 'none', '']:
        return 0
    return 1 if value in positive_values else 0

if 'offensive' in df.columns:
    df['offensive_bin'] = df['offensive'].apply(
        lambda x: create_binary_label(x, ['slight', 'very_offensive', 'hateful_offensive'])
    )
else:
    df['offensive_bin'] = 0

if 'motivational' in df.columns:
    df['motivational_bin'] = df['motivational'].apply(
        lambda x: create_binary_label(x, ['motivational'])
    )
else:
    df['motivational_bin'] = 0

if 'humour' in df.columns or 'humor' in df.columns:
    humor_col = 'humour' if 'humour' in df.columns else 'humor'
    df['humor_bin'] = df[humor_col].apply(
        lambda x: create_binary_label(x, ['funny', 'very_funny', 'hilarious'])
    )
else:
    df['humor_bin'] = 0

if 'sarcastic' in df.columns or 'sarcasm' in df.columns:
    sarcasm_col = 'sarcastic' if 'sarcastic' in df.columns else 'sarcasm'
    df['sarcasm_bin'] = df[sarcasm_col].apply(
        lambda x: create_binary_label(x, ['general', 'twisted_meaning', 'very_twisted'])
    )
else:
    df['sarcasm_bin'] = 0

# Stratified split
stratify_columns = ['offensive_bin', 'motivational_bin', 'humor_bin', 'sarcasm_bin']
y_stratify = df[stratify_columns].values

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=VALIDATION_SPLIT_RATIO, random_state=42)
train_idx, val_idx = next(msss.split(df, y_stratify))

train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

print(f"\nâœ“ Stratified split complete:")
print(f"  Training: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")

# Calculate label priors
label_priors = {
    'offensive_pos_rate': float(train_df['offensive_bin'].sum() / len(train_df)),
    'motivational_pos_rate': float(train_df['motivational_bin'].sum() / len(train_df)),
    'humor_pos_rate': float(train_df['humor_bin'].sum() / len(train_df)),
    'sarcasm_pos_rate': float(train_df['sarcasm_bin'].sum() / len(train_df))
}

priors_path = os.path.join(OUTPUT_BASE_DIR, 'label_priors.json')
with open(priors_path, 'w') as f:
    json.dump(label_priors, f, indent=2)

print(f"\nâœ“ Label priors calculated:")
for key, val in label_priors.items():
    print(f"  {key}: {val:.4f}")

# Copy images
def copy_images(df_subset, dest_dir, source_dir, image_col):
    copied = 0
    missing = 0
    
    for idx in tqdm(df_subset[image_col], desc=f"Copying to {dest_dir}"):
        filename = str(idx)
        if not any(filename.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
            for ext in ['.jpg', '.jpeg', '.png']:
                test_path = os.path.join(source_dir, f"{filename}{ext}")
                if os.path.exists(test_path):
                    filename = f"{filename}{ext}"
                    break
            else:
                filename = f"{filename}.jpg"
        
        source_path = os.path.join(source_dir, filename)
        if os.path.exists(source_path):
            shutil.copy(source_path, os.path.join(dest_dir, filename))
            copied += 1
        else:
            missing += 1
    
    return copied, missing

print("\nCopying images...")
copied_val, missing_val = copy_images(val_df, NEW_VAL_DIR, ORIGINAL_TRAIN_IMG_DIR, IMAGE_FILENAME_COLUMN)
copied_train, missing_train = copy_images(train_df, NEW_TRAIN_DIR, ORIGINAL_TRAIN_IMG_DIR, IMAGE_FILENAME_COLUMN)

print(f"âœ“ Validation: {copied_val} copied, {missing_val} missing")
print(f"âœ“ Training: {copied_train} copied, {missing_train} missing")

# Save CSVs
train_csv_path = os.path.join(OUTPUT_BASE_DIR, 'train_split.csv')
val_csv_path = os.path.join(OUTPUT_BASE_DIR, 'validation_split.csv')

train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

print(f"\nâœ“ Saved train CSV: {train_csv_path}")
print(f"âœ“ Saved validation CSV: {val_csv_path}")
print("\nâœ… DATA PREPARATION COMPLETE\n")

# ==================== PART 2: CONFIGURATION ====================
print("=" * 80)
print("PART 2: CONFIGURATION")
print("=" * 80)

CONFIG_YAML = """
TEXT_MODEL: "google/muril-base-cased"
IMAGE_MODEL: "openai/clip-vit-base-patch32"
TEXT_DIM: 768
IMAGE_DIM: 768
FUSION_DIM: 512
FUSION_OUT_DIM: 512

MAX_LEN: 128
IMG_SIZE: 224
BATCH_SIZE: 16
GRADIENT_ACCUMULATION_STEPS: 2
LR_HEADS: 0.001
LR_BACKBONE: 0.00002
WEIGHT_DECAY: 0.01
EPOCHS: 20
SEED: 42
DEVICE: "cuda"
CHECKPOINT_PATH: "/kaggle/working/checkpoints"

NUM_SENTIMENT_CLASSES: 5
NUM_EMOTION_CLASSES: 4

USE_ORDINAL_REGRESSION: true
ORDINAL_LINK: "logit"

LOSS_WEIGHTS:
  sentiment: 2.0
  emotion: 1.5
  intensity: 0.5

ASL_GAMMA_NEG: 6.0
ASL_GAMMA_POS: 0.5
ASL_CLIP: 0.05
ASL_PRIOR_TAU: 1.2

EMOTION_LABELS: ["humor", "sarcasm", "offensive", "motivational"]
EMO_THRESHOLDS: [0.5, 0.5, 0.60, 0.60]

POOLING: "mean"
USE_AMP: true
GRADIENT_CLIP: 1.0
SCHEDULER: "cosine"
UNFREEZE_BACKBONE_EPOCH: 2
UNFREEZE_LAYERS: 3

MOTIVATIONAL_OVERSAMPLE_FACTOR: 8.0

CROSS_ATTN_HEADS: 8
CROSS_ATTN_DROPOUT: 0.1

SENTIMENT_MAP_REV:
  0: "very_positive"
  1: "positive"
  2: "neutral"
  3: "negative"
  4: "very_negative"
"""

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

cfg = yaml.safe_load(CONFIG_YAML)
set_seed(cfg['SEED'])

# Load priors
with open(priors_path, 'r') as f:
    priors = json.load(f)

cfg['EMO_PRIORS'] = [
    priors['humor_pos_rate'],
    priors['sarcasm_pos_rate'],
    priors['offensive_pos_rate'],
    priors['motivational_pos_rate']
]

device = torch.device(cfg['DEVICE'] if torch.cuda.is_available() else 'cpu')

print(f"\nâœ“ Configuration loaded:")
print(f"  Device: {device}")
print(f"  Epochs: {cfg['EPOCHS']}")
print(f"  Batch size: {cfg['BATCH_SIZE']}")
print(f"  Motivational oversampling: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x")
print(f"  Emotion priors: {[f'{p:.3f}' for p in cfg['EMO_PRIORS']]}")

# ==================== PART 3: MODEL COMPONENTS ====================
print("\n" + "=" * 80)
print("PART 3: MODEL COMPONENTS")
print("=" * 80)

class EnhancedAsymmetricLoss(nn.Module):
    """Enhanced ASL with prior adjustment"""
    def __init__(self, gamma_neg=6.0, gamma_pos=0.5, clip=0.05, priors=None, prior_tau=1.2, eps=1e-8):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps
        self.priors = priors
        self.prior_tau = prior_tau
    
    def forward(self, logits, targets):
        # Prior adjustment
        if self.priors is not None:
            priors_tensor = torch.tensor(self.priors, device=logits.device, dtype=logits.dtype)
            adjustment = self.prior_tau * torch.log(priors_tensor.clamp(min=self.eps))
            logits = logits - adjustment
        
        xs_pos = torch.sigmoid(logits)
        xs_neg = 1 - xs_pos
        
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)
        
        los_pos = targets * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - targets) * torch.log(xs_neg.clamp(min=self.eps))
        
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            pt0 = xs_pos * targets
            pt1 = xs_neg * (1 - targets)
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * targets + self.gamma_neg * (1 - targets)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            loss = one_sided_w * (los_pos + los_neg)
        else:
            loss = los_pos + los_neg
        
        return -loss.mean()

class OrdinalRegressionHead(nn.Module):
    """Ordinal regression using cumulative link model"""
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.num_thresholds = num_classes - 1
        
        self.projection = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
        
        initial_thresholds = torch.linspace(-2, 2, self.num_thresholds)
        self.thresholds = nn.Parameter(initial_thresholds)
    
    def forward(self, x):
        score = self.projection(x).squeeze(-1)
        ordered_thresholds = torch.cumsum(F.softplus(self.thresholds), dim=0)
        cumulative_logits = ordered_thresholds.unsqueeze(0) - score.unsqueeze(1)
        cumulative_probs = torch.sigmoid(cumulative_logits)
        
        batch_size = cumulative_probs.size(0)
        class_probs = torch.zeros(batch_size, self.num_classes, device=x.device)
        
        class_probs[:, 0] = cumulative_probs[:, 0]
        for k in range(1, self.num_thresholds):
            class_probs[:, k] = cumulative_probs[:, k] - cumulative_probs[:, k-1]
        class_probs[:, -1] = 1.0 - cumulative_probs[:, -1]
        class_probs = torch.clamp(class_probs, min=1e-7, max=1.0)
        
        return {'cumulative_logits': cumulative_logits, 'class_probs': class_probs}

class CrossAttentionFusion(nn.Module):
    """Bidirectional cross-attention"""
    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.text_to_image_attn = nn.MultiheadAttention(dim, num_heads, dropout, batch_first=True)
        self.image_to_text_attn = nn.MultiheadAttention(dim, num_heads, dropout, batch_first=True)
        self.text_norm = nn.LayerNorm(dim)
        self.image_norm = nn.LayerNorm(dim)
        self.text_ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(dim * 4, dim), nn.Dropout(dropout)
        )
        self.image_ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(dim * 4, dim), nn.Dropout(dropout)
        )
        self.ffn_norm_text = nn.LayerNorm(dim)
        self.ffn_norm_image = nn.LayerNorm(dim)
    
    def forward(self, text_emb, image_emb):
        text_seq = text_emb.unsqueeze(1)
        image_seq = image_emb.unsqueeze(1)
        
        text_attended, _ = self.text_to_image_attn(text_seq, image_seq, image_seq)
        text_out = self.text_norm(text_emb + text_attended.squeeze(1))
        
        image_attended, _ = self.image_to_text_attn(image_seq, text_seq, text_seq)
        image_out = self.image_norm(image_emb + image_attended.squeeze(1))
        
        text_final = self.ffn_norm_text(text_out + self.text_ffn(text_out))
        image_final = self.ffn_norm_image(image_out + self.image_ffn(image_out))
        
        return text_final, image_final

class EnhancedFusionModel(nn.Module):
    """Multi-modal model with ordinal regression + enhanced ASL"""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        
        self.text_model = AutoModel.from_pretrained(cfg['TEXT_MODEL'])
        clip_model = CLIPModel.from_pretrained(cfg['IMAGE_MODEL'])
        self.image_model = clip_model.vision_model
        
        self._freeze_encoders()
        
        self.text_proj = nn.Linear(cfg['TEXT_DIM'], cfg['FUSION_DIM'])
        self.image_proj = nn.Linear(cfg['IMAGE_DIM'], cfg['FUSION_DIM'])
        
        self.cross_attention = CrossAttentionFusion(
            dim=cfg['FUSION_DIM'],
            num_heads=cfg['CROSS_ATTN_HEADS'],
            dropout=cfg['CROSS_ATTN_DROPOUT']
        )
        
        fusion_input_dim = cfg['FUSION_DIM'] * 2
        self.fusion_norm = nn.LayerNorm(fusion_input_dim)
        self.fusion_mlp = nn.Sequential(
            nn.Linear(fusion_input_dim, 512), nn.GELU(), nn.Dropout(0.2),
            nn.Linear(512, cfg['FUSION_OUT_DIM']), nn.LayerNorm(cfg['FUSION_OUT_DIM'])
        )
        
        self.sentiment_head = OrdinalRegressionHead(cfg['FUSION_OUT_DIM'], cfg['NUM_SENTIMENT_CLASSES'])
        self.emotion_head = nn.Sequential(
            nn.Linear(cfg['FUSION_OUT_DIM'], 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, cfg['NUM_EMOTION_CLASSES'])
        )
        self.intensity_head = nn.Sequential(
            nn.Linear(cfg['FUSION_OUT_DIM'], 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
    
    def _freeze_encoders(self):
        for param in self.text_model.parameters():
            param.requires_grad = False
        for param in self.image_model.parameters():
            param.requires_grad = False
    
    def unfreeze_backbone(self, layers_to_unfreeze=3):
        if hasattr(self.text_model, 'encoder') and hasattr(self.text_model.encoder, 'layer'):
            for layer in list(self.text_model.encoder.layer[-layers_to_unfreeze:]):
                for param in layer.parameters():
                    param.requires_grad = True
        
        if hasattr(self.image_model, 'encoder') and hasattr(self.image_model.encoder, 'layers'):
            for layer in list(self.image_model.encoder.layers[-layers_to_unfreeze:]):
                for param in layer.parameters():
                    param.requires_grad = True
    
    def pool_text(self, model_output, attention_mask):
        last_hidden = model_output.last_hidden_state
        if self.cfg['POOLING'] == 'cls':
            return last_hidden[:, 0]
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        sum_embeddings = torch.sum(last_hidden * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
    
    def forward(self, input_ids, attention_mask, image):
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = self.pool_text(text_output, attention_mask)
        
        image_output = self.image_model(pixel_values=image)
        image_emb = image_output.pooler_output
        
        text_proj = self.text_proj(text_emb)
        image_proj = self.image_proj(image_emb)
        
        text_cross, image_cross = self.cross_attention(text_proj, image_proj)
        
        fused = torch.cat([text_cross, image_cross], dim=1)
        fused = self.fusion_norm(fused)
        fused = self.fusion_mlp(fused)
        
        sentiment_outputs = self.sentiment_head(fused)
        emotion_logits = self.emotion_head(fused)
        intensity = self.intensity_head(fused).squeeze(-1)
        
        return {
            'sentiment': sentiment_outputs,
            'emotion_logits': emotion_logits,
            'intensity': intensity
        }

print("âœ“ Model components defined")

# ==================== PART 4: DATASET ====================
print("\n" + "=" * 80)
print("PART 4: DATASET")
print("=" * 80)

class MemeDataset(Dataset):
    def __init__(self, df, tokenizer, image_transform, image_dir, cfg):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.image_dir = image_dir
        self.cfg = cfg
        self._detect_columns()
    
    def _detect_columns(self):
        cols = self.df.columns.tolist()
        self.image_col = next((c for c in ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0'] if c in cols), cols[0])
        self.text_col = next((c for c in ['text', 'ocr_text', 'caption', 'OCR_extracted_text'] if c in cols), None)
        self.sentiment_col = next((c for c in ['sentiment', 'overall_sentiment', 'overall'] if c in cols), None)
        
        self.sentiment_map = {'very_positive': 0, 'positive': 1, 'neutral': 2, 'negative': 3, 'very_negative': 4}
        self.humor_map = {'not_funny': 0, 'funny': 1, 'very_funny': 1, 'hilarious': 1}
        self.sarcasm_map = {'not_sarcastic': 0, 'general': 1, 'twisted_meaning': 1, 'very_twisted': 1}
        self.offensive_map = {'not_offensive': 0, 'slight': 1, 'very_offensive': 1, 'hateful_offensive': 1}
        self.motivational_map = {'not_motivational': 0, 'motivational': 1}
    
    def _map_label(self, value, mapping, default=0):
        if pd.isna(value):
            return default
        if isinstance(value, str):
            return mapping.get(value.lower().strip(), default)
        return int(value)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        image_name = str(row[self.image_col])
        if not any(image_name.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png']):
            image_name = f"{image_name}.jpg"
        image_path = os.path.join(self.image_dir, image_name)
        
        try:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transform(image)
        except:
            image = torch.zeros(3, self.cfg['IMG_SIZE'], self.cfg['IMG_SIZE'])
        
        text = str(row.get(self.text_col, '')) if self.text_col else 'No text'
        encoding = self.tokenizer(text, max_length=self.cfg['MAX_LEN'], padding='max_length', truncation=True, return_tensors='pt')
        
        sentiment_val = row.get(self.sentiment_col, 'neutral') if self.sentiment_col else 'neutral'
        sentiment_label = self._map_label(sentiment_val, self.sentiment_map, default=2)
        
        emotion_labels = torch.tensor([
            float(self._map_label(row.get('humour', row.get('humor', 0)), self.humor_map, 0)),
            float(self._map_label(row.get('sarcastic', row.get('sarcasm', 0)), self.sarcasm_map, 0)),
            float(self._map_label(row.get('offensive', 0), self.offensive_map, 0)),
            float(self._map_label(row.get('motivational', 0), self.motivational_map, 0))
        ], dtype=torch.float)
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': image,
            'sentiment_label': torch.tensor(sentiment_label, dtype=torch.long),
            'emotion_labels': emotion_labels,
            'intensity': torch.tensor(0.5, dtype=torch.float),
            'motivational_flag': emotion_labels[3]
        }

print("âœ“ Dataset class defined")

# ==================== PART 5: LOSS & METRICS ====================
print("\n" + "=" * 80)
print("PART 5: LOSS FUNCTIONS & METRICS")
print("=" * 80)

def ordinal_regression_loss(cumulative_logits, labels):
    batch_size = labels.size(0)
    num_thresholds = cumulative_logits.size(1)
    target_cumulative = torch.zeros_like(cumulative_logits)
    
    for i in range(batch_size):
        y = int(labels[i].item())
        if y < num_thresholds:
            target_cumulative[i, y:] = 1.0
    
    return F.binary_cross_entropy_with_logits(cumulative_logits, target_cumulative, reduction='mean')

def combined_loss(outputs, batch, cfg, emotion_loss_fn):
    loss_sent = ordinal_regression_loss(outputs['sentiment']['cumulative_logits'], batch['sentiment_label'])
    loss_emotion = emotion_loss_fn(outputs['emotion_logits'], batch['emotion_labels'])
    loss_intensity = F.smooth_l1_loss(outputs['intensity'], batch['intensity'])
    
    total_loss = (
        cfg['LOSS_WEIGHTS']['sentiment'] * loss_sent +
        cfg['LOSS_WEIGHTS']['emotion'] * loss_emotion +
        cfg['LOSS_WEIGHTS']['intensity'] * loss_intensity
    )
    
    return total_loss, loss_sent, loss_emotion, loss_intensity

def compute_metrics(sentiment_outputs, sentiment_labels, emotion_logits, emotion_labels, thresholds):
    # Sentiment
    class_probs = sentiment_outputs['class_probs']
    y_pred = torch.argmax(class_probs, dim=1).cpu().numpy()
    y_true = sentiment_labels.cpu().numpy()
    
    sent_acc = accuracy_score(y_true, y_pred)
    _, _, sent_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    sent_mae = mean_absolute_error(y_true, y_pred)
    sent_1off = np.mean(np.abs(y_true - y_pred) <= 1)
    
    # Emotions
    emo_probs = torch.sigmoid(emotion_logits).cpu().numpy()
    emo_true = emotion_labels.cpu().numpy()
    thresholds = np.array(thresholds)
    emo_pred = (emo_probs >= thresholds).astype(float)
    
    _, _, emo_f1, _ = precision_recall_fscore_support(emo_true, emo_pred, average='samples', zero_division=0)
    
    return {
        'sentiment_accuracy': sent_acc,
        'sentiment_f1': sent_f1,
        'sentiment_mae': sent_mae,
        'sentiment_1off_accuracy': sent_1off,
        'emotion_f1': emo_f1
    }

print("âœ“ Loss functions and metrics defined")

# ==================== PART 6: TRAINER ====================
print("\n" + "=" * 80)
print("PART 6: TRAINER")
print("=" * 80)

class Trainer:
    def __init__(self, model, cfg, train_loader, val_loader, device, emotion_loss_fn):
        self.model = model
        self.cfg = cfg
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.emotion_loss_fn = emotion_loss_fn
        
        self.optimizer = self.make_optimizer()
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=cfg['EPOCHS'])
        self.scaler = GradScaler() if cfg['USE_AMP'] else None
        self.best_metric = -float('inf')
    
    def make_optimizer(self):
        head_params = []
        backbone_params = []
        
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if 'text_model' in name or 'image_model' in name:
                    backbone_params.append(param)
                else:
                    head_params.append(param)
        
        param_groups = [{'params': head_params, 'lr': self.cfg['LR_HEADS']}]
        if backbone_params:
            param_groups.append({'params': backbone_params, 'lr': self.cfg['LR_BACKBONE']})
        
        return torch.optim.AdamW(param_groups, weight_decay=self.cfg['WEIGHT_DECAY'])
    
    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0.0
        
        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.cfg['EPOCHS']} [Train]")
        self.optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(pbar):
            batch_device = {k: v.to(self.device) for k, v in batch.items() if k != 'motivational_flag'}
            
            if self.cfg['USE_AMP']:
                with autocast():
                    outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                    loss, l_sent, l_emo, l_int = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                    loss = loss / self.cfg['GRADIENT_ACCUMULATION_STEPS']
                
                self.scaler.scale(loss).backward()
                
                if (batch_idx + 1) % self.cfg['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if self.cfg['GRADIENT_CLIP'] > 0:
                        self.scaler.unscale_(self.optimizer)
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg['GRADIENT_CLIP'])
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                    self.optimizer.zero_grad()
            else:
                outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                loss, l_sent, l_emo, l_int = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                loss = loss / self.cfg['GRADIENT_ACCUMULATION_STEPS']
                loss.backward()
                
                if (batch_idx + 1) % self.cfg['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if self.cfg['GRADIENT_CLIP'] > 0:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg['GRADIENT_CLIP'])
                    self.optimizer.step()
                    self.optimizer.zero_grad()
            
            total_loss += loss.item() * self.cfg['GRADIENT_ACCUMULATION_STEPS']
            pbar.set_postfix({'loss': f"{loss.item() * self.cfg['GRADIENT_ACCUMULATION_STEPS']:.4f}"})
        
        return total_loss / len(self.train_loader)
    
    def validate(self, epoch):
        self.model.eval()
        total_loss = 0.0
        
        all_sentiment_labels = []
        all_sentiment_outputs = []
        all_emotion_labels = []
        all_emotion_logits = []
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc=f"Epoch {epoch+1} [Val]"):
                batch_device = {k: v.to(self.device) for k, v in batch.items() if k != 'motivational_flag'}
                
                outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                loss, _, _, _ = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                total_loss += loss.item()
                
                all_sentiment_labels.append(batch_device['sentiment_label'].cpu())
                all_sentiment_outputs.append({
                    'cumulative_logits': outputs['sentiment']['cumulative_logits'].cpu(),
                    'class_probs': outputs['sentiment']['class_probs'].cpu()
                })
                all_emotion_labels.append(batch_device['emotion_labels'].cpu())
                all_emotion_logits.append(outputs['emotion_logits'].cpu())
        
        all_sentiment_labels = torch.cat(all_sentiment_labels)
        combined_sentiment = {
            'cumulative_logits': torch.cat([o['cumulative_logits'] for o in all_sentiment_outputs]),
            'class_probs': torch.cat([o['class_probs'] for o in all_sentiment_outputs])
        }
        all_emotion_labels = torch.cat(all_emotion_labels)
        all_emotion_logits = torch.cat(all_emotion_logits)
        
        metrics = compute_metrics(combined_sentiment, all_sentiment_labels, all_emotion_logits, 
                                 all_emotion_labels, self.cfg['EMO_THRESHOLDS'])
        
        avg_loss = total_loss / len(self.val_loader)
        
        print(f"\n{'='*70}")
        print(f"Validation Results (Epoch {epoch+1}):")
        print(f"  Loss: {avg_loss:.4f}")
        print(f"  Sentiment Accuracy: {metrics['sentiment_accuracy']:.4f}")
        print(f"  Sentiment F1: {metrics['sentiment_f1']:.4f}")
        print(f"  Sentiment MAE: {metrics['sentiment_mae']:.4f}")
        print(f"  Sentiment 1-off Acc: {metrics['sentiment_1off_accuracy']:.4f}")
        print(f"  Emotion F1: {metrics['emotion_f1']:.4f}")
        print(f"{'='*70}\n")
        
        return {**metrics, 'val_loss': avg_loss}
    
    def fit(self):
        print(f"\n{'='*70}")
        print(f"STARTING TRAINING: {self.cfg['EPOCHS']} EPOCHS")
        print(f"{'='*70}\n")
        
        for epoch in range(self.cfg['EPOCHS']):
            # Early backbone unfreezing
            if epoch == self.cfg['UNFREEZE_BACKBONE_EPOCH']:
                print(f"\n{'='*70}")
                print(f"ðŸ”“ UNFREEZING BACKBONE at epoch {epoch+1}")
                print(f"{'='*70}\n")
                self.model.unfreeze_backbone(layers_to_unfreeze=self.cfg['UNFREEZE_LAYERS'])
                self.optimizer = self.make_optimizer()
                self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.cfg['EPOCHS'])
            
            train_loss = self.train_epoch(epoch)
            print(f"\nTrain Loss: {train_loss:.4f}")
            
            val_metrics = self.validate(epoch)
            
            if self.scheduler:
                self.scheduler.step()
            
            # Composite metric (emphasis on emotion F1)
            composite = (
                val_metrics['sentiment_f1'] +
                val_metrics['sentiment_1off_accuracy'] -
                val_metrics['sentiment_mae'] +
                val_metrics['emotion_f1'] * 1.5
            )
            
            if composite > self.best_metric:
                self.best_metric = composite
                os.makedirs(self.cfg['CHECKPOINT_PATH'], exist_ok=True)
                checkpoint_path = os.path.join(self.cfg['CHECKPOINT_PATH'], 'best_model_enhanced.pt')
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'best_metric': self.best_metric,
                    'metrics': val_metrics,
                    'config': self.cfg
                }, checkpoint_path)
                print(f"âœ“ Saved best model (composite: {composite:.4f})")
        
        print("\nâœ… TRAINING COMPLETED!")
        return self.best_metric

print("âœ“ Trainer class defined")

# ==================== PART 7: DATA LOADING ====================
print("\n" + "=" * 80)
print("PART 7: DATA LOADING & PREPARATION")
print("=" * 80)

# Initialize tokenizer and transforms
tokenizer = AutoTokenizer.from_pretrained(cfg['TEXT_MODEL'])

train_transform = transforms.Compose([
    transforms.Resize((cfg['IMG_SIZE'], cfg['IMG_SIZE'])),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                       std=[0.26862954, 0.26130258, 0.27577711])
])

val_transform = transforms.Compose([
    transforms.Resize((cfg['IMG_SIZE'], cfg['IMG_SIZE'])),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                       std=[0.26862954, 0.26130258, 0.27577711])
])

print("âœ“ Tokenizer and transforms initialized")

# Create datasets
train_dataset = MemeDataset(train_df, tokenizer, train_transform, NEW_TRAIN_DIR, cfg)
val_dataset = MemeDataset(val_df, tokenizer, val_transform, NEW_VAL_DIR, cfg)

print(f"âœ“ Train dataset: {len(train_dataset)} samples")
print(f"âœ“ Val dataset: {len(val_dataset)} samples")

# Create weighted sampler for motivational oversampling
print("\nCreating weighted sampler...")
sample_weights = []
motivational_count = 0

for idx in range(len(train_dataset)):
    item = train_dataset[idx]
    is_motivational = int(item['motivational_flag'].item())
    
    if is_motivational:
        weight = cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']
        motivational_count += 1
    else:
        weight = 1.0
    
    sample_weights.append(weight)

print(f"  Motivational samples: {motivational_count} ({motivational_count/len(train_dataset)*100:.2f}%)")
print(f"  Oversampling factor: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x")
print(f"  Effective representation: {motivational_count * cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR'] / len(train_dataset) * 100:.1f}%")

sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Create dataloaders
train_loader = DataLoader(
    train_dataset, batch_size=cfg['BATCH_SIZE'], sampler=sampler,
    num_workers=2, pin_memory=True, drop_last=True
)

val_loader = DataLoader(
    val_dataset, batch_size=cfg['BATCH_SIZE'], shuffle=False,
    num_workers=2, pin_memory=True
)

print(f"âœ“ Train batches: {len(train_loader)}")
print(f"âœ“ Val batches: {len(val_loader)}")

# ==================== PART 8: MODEL INITIALIZATION & TRAINING ====================
print("\n" + "=" * 80)
print("PART 8: MODEL INITIALIZATION")
print("=" * 80)

model = EnhancedFusionModel(cfg).to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Frozen parameters: {total_params - trainable_params:,}")

# Initialize enhanced emotion loss
emotion_loss_fn = EnhancedAsymmetricLoss(
    gamma_neg=cfg['ASL_GAMMA_NEG'],
    gamma_pos=cfg['ASL_GAMMA_POS'],
    clip=cfg['ASL_CLIP'],
    priors=cfg['EMO_PRIORS'],
    prior_tau=cfg['ASL_PRIOR_TAU']
)

print(f"\nâœ“ Enhanced ASL initialized:")
print(f"  Î³_neg={cfg['ASL_GAMMA_NEG']}, Î³_pos={cfg['ASL_GAMMA_POS']}")
print(f"  Prior adjustment: Ï„={cfg['ASL_PRIOR_TAU']}")
print(f"  Priors: {[f'{p:.3f}' for p in cfg['EMO_PRIORS']]}")

# Initialize trainer
trainer = Trainer(model, cfg, train_loader, val_loader, device, emotion_loss_fn)

print("\nâœ“ Trainer initialized")

# ==================== START TRAINING ====================
print("\n" + "=" * 80)
print("STARTING TRAINING")
print("=" * 80)

best_metric = trainer.fit()

print(f"\n{'='*80}")
print(f"âœ… TRAINING COMPLETED!")
print(f"{'='*80}")
print(f"Best composite metric: {best_metric:.4f}")
print(f"Model saved to: {cfg['CHECKPOINT_PATH']}/best_model_enhanced.pt")

# Generate model card
model_card = f"""# Enhanced Multi-modal Meme Analysis Model

## Overview
This model uses a hybrid loss strategy combining ordinal regression for sentiment 
and enhanced asymmetric loss (ASL) with prior adjustment for emotions.

## Key Improvements

### 1. Hybrid Loss Strategy
- **Sentiment**: Ordinal regression respects natural class ordering
- **Emotions**: Enhanced ASL with positive focusing (Î³_pos={cfg['ASL_GAMMA_POS']}) 
  and prior adjustment (Ï„={cfg['ASL_PRIOR_TAU']})
- **Intensity**: Smooth L1 loss

### 2. Motivational Oversampling
- Factor: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x
- Original representation: {motivational_count/len(train_dataset)*100:.2f}%
- Effective representation: {motivational_count * cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR'] / len(train_dataset) * 100:.1f}%

### 3. Early Backbone Unfreezing
- Unfreezes at epoch {cfg['UNFREEZE_BACKBONE_EPOCH']}
- Layers unfrozen: {cfg['UNFREEZE_LAYERS']}

## Architecture
- **Text**: {cfg['TEXT_MODEL']}
- **Image**: {cfg['IMAGE_MODEL']}
- **Fusion**: Bidirectional cross-attention
- **Params**: {total_params:,} total, {trainable_params:,} trainable

## Training Details
- Epochs: {cfg['EPOCHS']}
- Batch size: {cfg['BATCH_SIZE']}
- LR (heads): {cfg['LR_HEADS']}
- LR (backbone): {cfg['LR_BACKBONE']}
- Loss weights: Sentiment={cfg['LOSS_WEIGHTS']['sentiment']}, 
  Emotion={cfg['LOSS_WEIGHTS']['emotion']}, Intensity={cfg['LOSS_WEIGHTS']['intensity']}

## Performance
- Best composite metric: {best_metric:.4f}

## Dataset
- Training samples: {len(train_df):,}
- Validation samples: {len(val_df):,}

## Usage

```python
checkpoint = torch.load('best_model_enhanced.pt')
model = EnhancedFusionModel(checkpoint['config']).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

with torch.no_grad():
    outputs = model(input_ids, attention_mask, image)
    sentiment = outputs['sentiment']['class_probs']
    emotions = torch.sigmoid(outputs['emotion_logits'])
```
"""

model_card_path = os.path.join(cfg['CHECKPOINT_PATH'], 'model_card.md')
with open(model_card_path, 'w') as f:
    f.write(model_card)

print(f"\nâœ“ Model card saved to: {model_card_path}")
print(f"\n{'='*80}")
print("ALL DONE! ðŸŽ‰")
print(f"{'='*80}\n")

INSTALLING DEPENDENCIES...


2025-11-20 21:49:04.146254: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763675344.168900     202 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763675344.175760     202 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

âœ“ All dependencies installed

PART 1: DATA PREPARATION

Downloading dataset...
Extracting dataset...
Extracting protected archive...
âœ“ Extracted to: /kaggle/working/

Loading CSV file...
âœ“ Image column: Unnamed: 0

âœ“ Stratified split complete:
  Training: 5950 samples
  Validation: 1050 samples

âœ“ Label priors calculated:
  offensive_pos_rate: 0.3909
  motivational_pos_rate: 0.1187
  humor_pos_rate: 0.8558
  sarcasm_pos_rate: 0.7891

Copying images...


Copying to /kaggle/working/validation_images/: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1050/1050 [00:00<00:00, 5463.37it/s]
Copying to /kaggle/working/new_train_images/: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5950/5950 [00:01<00:00, 5799.96it/s]


âœ“ Validation: 1050 copied, 0 missing
âœ“ Training: 5950 copied, 0 missing

âœ“ Saved train CSV: /kaggle/working/train_split.csv
âœ“ Saved validation CSV: /kaggle/working/validation_split.csv

âœ… DATA PREPARATION COMPLETE

PART 2: CONFIGURATION

âœ“ Configuration loaded:
  Device: cuda
  Epochs: 20
  Batch size: 16
  Motivational oversampling: 8.0x
  Emotion priors: ['0.856', '0.789', '0.391', '0.119']

PART 3: MODEL COMPONENTS
âœ“ Model components defined

PART 4: DATASET
âœ“ Dataset class defined

PART 5: LOSS FUNCTIONS & METRICS
âœ“ Loss functions and metrics defined

PART 6: TRAINER
âœ“ Trainer class defined

PART 7: DATA LOADING & PREPARATION
âœ“ Tokenizer and transforms initialized
âœ“ Train dataset: 5950 samples
âœ“ Val dataset: 1050 samples

Creating weighted sampler...
  Motivational samples: 706 (11.87%)
  Oversampling factor: 8.0x
  Effective representation: 94.9%
âœ“ Train batches: 371
âœ“ Val batches: 66

PART 8: MODEL INITIALIZATION

Model Statistics:
  Total parameters

Epoch 1/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:38<00:00,  9.57it/s, loss=0.9950]



Train Loss: 1.1381


Epoch 1 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.31it/s]



Validation Results (Epoch 1):
  Loss: 1.0792
  Sentiment Accuracy: 0.1524
  Sentiment F1: 0.1141
  Sentiment MAE: 1.4781
  Sentiment 1-off Acc: 0.5200
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 0.3147)


Epoch 2/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 2/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:37<00:00,  9.83it/s, loss=0.9703]



Train Loss: 1.0789


Epoch 2 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 2 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.48it/s]



Validation Results (Epoch 2):
  Loss: 1.0800
  Sentiment Accuracy: 0.2533
  Sentiment F1: 0.1556
  Sentiment MAE: 1.0990
  Sentiment 1-off Acc: 0.7019
  Emotion F1: 0.7721

âœ“ Saved best model (composite: 0.9165)

ðŸ”“ UNFREEZING BACKBONE at epoch 3



Epoch 3/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 3/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:53<00:00,  6.88it/s, loss=0.8796]



Train Loss: 1.0138


Epoch 3 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 3 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.42it/s]



Validation Results (Epoch 3):
  Loss: 1.1298
  Sentiment Accuracy: 0.2733
  Sentiment F1: 0.1728
  Sentiment MAE: 1.1486
  Sentiment 1-off Acc: 0.6667
  Emotion F1: 0.7724



Epoch 4/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 4/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:53<00:00,  6.89it/s, loss=0.7981]



Train Loss: 0.8568


Epoch 4 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 4 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.41it/s]



Validation Results (Epoch 4):
  Loss: 1.0088
  Sentiment Accuracy: 0.3686
  Sentiment F1: 0.2045
  Sentiment MAE: 0.8781
  Sentiment 1-off Acc: 0.7981
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 1.2832)


Epoch 5/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 5/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=1.1279]



Train Loss: 0.7734


Epoch 5 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 5 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.38it/s]



Validation Results (Epoch 5):
  Loss: 1.0306
  Sentiment Accuracy: 0.3743
  Sentiment F1: 0.1956
  Sentiment MAE: 0.8495
  Sentiment 1-off Acc: 0.8190
  Emotion F1: 0.7721

âœ“ Saved best model (composite: 1.3233)


Epoch 6/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 6/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.82it/s, loss=0.6593]



Train Loss: 0.7128


Epoch 6 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 6 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.40it/s]



Validation Results (Epoch 6):
  Loss: 0.9872
  Sentiment Accuracy: 0.4276
  Sentiment F1: 0.1934
  Sentiment MAE: 0.7048
  Sentiment 1-off Acc: 0.8790
  Emotion F1: 0.7721

âœ“ Saved best model (composite: 1.5258)


Epoch 7/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 7/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.7044]



Train Loss: 0.6817


Epoch 7 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 7 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.38it/s]



Validation Results (Epoch 7):
  Loss: 1.0010
  Sentiment Accuracy: 0.4114
  Sentiment F1: 0.2067
  Sentiment MAE: 0.7562
  Sentiment 1-off Acc: 0.8562
  Emotion F1: 0.7723



Epoch 8/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 8/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.6159]



Train Loss: 0.6349


Epoch 8 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 8 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.38it/s]



Validation Results (Epoch 8):
  Loss: 1.0518
  Sentiment Accuracy: 0.3781
  Sentiment F1: 0.1987
  Sentiment MAE: 0.8686
  Sentiment 1-off Acc: 0.8048
  Emotion F1: 0.7719



Epoch 9/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 9/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.6831]



Train Loss: 0.6123


Epoch 9 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 9 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.42it/s]



Validation Results (Epoch 9):
  Loss: 1.0411
  Sentiment Accuracy: 0.3990
  Sentiment F1: 0.1990
  Sentiment MAE: 0.8067
  Sentiment 1-off Acc: 0.8343
  Emotion F1: 0.7718



Epoch 10/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 10/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.4957]



Train Loss: 0.5884


Epoch 10 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 10 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.41it/s]



Validation Results (Epoch 10):
  Loss: 1.0069
  Sentiment Accuracy: 0.4124
  Sentiment F1: 0.2019
  Sentiment MAE: 0.7438
  Sentiment 1-off Acc: 0.8648
  Emotion F1: 0.7723



Epoch 11/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 11/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.5324]



Train Loss: 0.5470


Epoch 11 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 11 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.39it/s]



Validation Results (Epoch 11):
  Loss: 1.0307
  Sentiment Accuracy: 0.3924
  Sentiment F1: 0.1944
  Sentiment MAE: 0.8076
  Sentiment 1-off Acc: 0.8352
  Emotion F1: 0.7723



Epoch 12/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 12/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.5565]



Train Loss: 0.5366


Epoch 12 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 12 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.39it/s]



Validation Results (Epoch 12):
  Loss: 1.0420
  Sentiment Accuracy: 0.3886
  Sentiment F1: 0.2020
  Sentiment MAE: 0.8333
  Sentiment 1-off Acc: 0.8210
  Emotion F1: 0.7725



Epoch 13/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 13/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.7034]



Train Loss: 0.5215


Epoch 13 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 13 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.41it/s]



Validation Results (Epoch 13):
  Loss: 1.0205
  Sentiment Accuracy: 0.4076
  Sentiment F1: 0.1921
  Sentiment MAE: 0.7686
  Sentiment 1-off Acc: 0.8495
  Emotion F1: 0.7723



Epoch 14/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 14/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.5655]



Train Loss: 0.4980


Epoch 14 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 14 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.38it/s]



Validation Results (Epoch 14):
  Loss: 1.0112
  Sentiment Accuracy: 0.4248
  Sentiment F1: 0.2011
  Sentiment MAE: 0.7057
  Sentiment 1-off Acc: 0.8829
  Emotion F1: 0.7723

âœ“ Saved best model (composite: 1.5367)


Epoch 15/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 15/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.4832]



Train Loss: 0.4759


Epoch 15 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 15 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.40it/s]



Validation Results (Epoch 15):
  Loss: 1.0116
  Sentiment Accuracy: 0.4219
  Sentiment F1: 0.2154
  Sentiment MAE: 0.7133
  Sentiment 1-off Acc: 0.8810
  Emotion F1: 0.7721

âœ“ Saved best model (composite: 1.5412)


Epoch 16/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 16/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.4140]



Train Loss: 0.4620


Epoch 16 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 16 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.38it/s]



Validation Results (Epoch 16):
  Loss: 1.0295
  Sentiment Accuracy: 0.4229
  Sentiment F1: 0.2226
  Sentiment MAE: 0.7181
  Sentiment 1-off Acc: 0.8752
  Emotion F1: 0.7723



Epoch 17/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 17/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.6387]



Train Loss: 0.4498


Epoch 17 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 17 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.39it/s]



Validation Results (Epoch 17):
  Loss: 1.0208
  Sentiment Accuracy: 0.4390
  Sentiment F1: 0.2266
  Sentiment MAE: 0.7000
  Sentiment 1-off Acc: 0.8771
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 1.5625)


Epoch 18/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 18/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.86it/s, loss=0.4542]



Train Loss: 0.4430


Epoch 18 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 18 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.40it/s]



Validation Results (Epoch 18):
  Loss: 1.0401
  Sentiment Accuracy: 0.4343
  Sentiment F1: 0.2351
  Sentiment MAE: 0.7057
  Sentiment 1-off Acc: 0.8762
  Emotion F1: 0.7723

âœ“ Saved best model (composite: 1.5640)


Epoch 19/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 19/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:53<00:00,  6.87it/s, loss=0.4440]



Train Loss: 0.4292


Epoch 19 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 19 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.41it/s]



Validation Results (Epoch 19):
  Loss: 1.0500
  Sentiment Accuracy: 0.4390
  Sentiment F1: 0.2273
  Sentiment MAE: 0.6943
  Sentiment 1-off Acc: 0.8800
  Emotion F1: 0.7723

âœ“ Saved best model (composite: 1.5714)


Epoch 20/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 20/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.3672]



Train Loss: 0.4199


Epoch 20 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 20 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.40it/s]


Validation Results (Epoch 20):
  Loss: 1.0659
  Sentiment Accuracy: 0.4371
  Sentiment F1: 0.2295
  Sentiment MAE: 0.7048
  Sentiment 1-off Acc: 0.8752
  Emotion F1: 0.7723


âœ… TRAINING COMPLETED!

âœ… TRAINING COMPLETED!
Best composite metric: 1.5714
Model saved to: /kaggle/working/checkpoints/best_model_enhanced.pt

âœ“ Model card saved to: /kaggle/working/checkpoints/model_card.md

ALL DONE! ðŸŽ‰






In [4]:
# ==================== PART 9: INFERENCE & EVALUATION ====================
print("\n" + "=" * 80)
print("PART 9: INFERENCE & COMPREHENSIVE EVALUATION")
print("=" * 80)

class Predictor:
    """Inference class for the enhanced model"""
    def __init__(self, checkpoint_path, device='cuda'):
        # Handle both string and torch.device inputs
        if isinstance(device, torch.device):
            self.device = device if torch.cuda.is_available() else torch.device('cpu')
        else:
            self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        
        print(f"Loading checkpoint from: {checkpoint_path}")
        # IMPORTANT: weights_only=False to avoid UnpicklingError in PyTorch 2.6+
        checkpoint = torch.load(
            checkpoint_path,
            map_location=self.device,
            weights_only=False
        )
        self.cfg = checkpoint['config']

        # ---------- SAFE DEFAULTS / BACKWARD COMPATIBILITY ----------
        # Default sentiment mapping (index -> label)
        default_sentiment_map = {
            0: "very_positive",
            1: "positive",
            2: "neutral",
            3: "negative",
            4: "very_negative"
        }
        sentiment_map_cfg = self.cfg.get('SENTIMENT_MAP_REV', default_sentiment_map)

        # Normalize keys to int in case YAML stored them as strings
        self.sentiment_map = {int(k): v for k, v in sentiment_map_cfg.items()}

        # Emotion labels and thresholds
        self.emotion_labels = self.cfg.get('EMOTION_LABELS', ["humor", "sarcasm", "offensive", "motivational"])
        self.emotion_thresholds = self.cfg.get('EMO_THRESHOLDS', [0.5, 0.5, 0.60, 0.60])

        print("Initializing model...")
        self.model = EnhancedFusionModel(self.cfg).to(self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()
        
        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg['TEXT_MODEL'])
        
        self.transform = transforms.Compose([
            transforms.Resize((self.cfg['IMG_SIZE'], self.cfg['IMG_SIZE'])),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.48145466, 0.4578275, 0.40821073],
                std=[0.26862954, 0.26130258, 0.27577711]
            )
        ])
        
        print(f"âœ“ Predictor ready on {self.device}")
    
    def predict(self, text, image_path):
        """Predict sentiment and emotions for a single sample"""
        # Tokenize text
        encoding = self.tokenizer(
            text,
            max_length=self.cfg['MAX_LEN'],
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        # Load and transform image
        try:
            image = Image.open(image_path).convert('RGB')
            image_tensor = self.transform(image).unsqueeze(0).to(self.device)
        except Exception as e:
            print(f"âš  Could not load image: {image_path} ({e})")
            image_tensor = torch.zeros(
                1, 3,
                self.cfg['IMG_SIZE'],
                self.cfg['IMG_SIZE']
            ).to(self.device)
        
        # Predict
        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask, image_tensor)
        
        # ----- Sentiment -----
        class_probs = outputs['sentiment']['class_probs'].squeeze(0).cpu().numpy()
        sentiment_pred = int(np.argmax(class_probs))
        sentiment_label = self.sentiment_map.get(sentiment_pred, str(sentiment_pred))
        expected_class = float(np.sum(class_probs * np.arange(len(class_probs))))
        
        sentiment_probs_dict = {
            self.sentiment_map.get(i, str(i)): f"{p:.2%}"
            for i, p in enumerate(class_probs)
        }
        
        # ----- Emotions -----
        emotion_logits = outputs['emotion_logits'].squeeze(0).cpu().numpy()
        emotion_probs = 1 / (1 + np.exp(-emotion_logits))
        emotion_preds = (emotion_probs >= np.array(self.emotion_thresholds)).astype(int)
        
        emotion_results = {
            label: {
                'probability': f"{prob:.2%}",
                'predicted': bool(pred),
                'threshold': f"{thr:.3f}"
            }
            for label, prob, pred, thr in zip(
                self.emotion_labels, emotion_probs, emotion_preds, self.emotion_thresholds
            )
        }
        
        # ----- Intensity -----
        intensity = float(outputs['intensity'].squeeze().cpu().item())
        
        return {
            'sentiment': sentiment_label,
            'sentiment_confidence': f"{class_probs[sentiment_pred]:.2%}",
            'sentiment_expected_class': f"{expected_class:.2f}",
            'sentiment_probs': sentiment_probs_dict,
            'emotions': emotion_results,
            'intensity': f"{intensity:.4f}"
        }


# Initialize predictor
checkpoint_path = os.path.join(cfg['CHECKPOINT_PATH'], 'best_model_enhanced.pt')
if os.path.exists(checkpoint_path):
    predictor = Predictor(checkpoint_path, device=device)
    
    # ==================== SAMPLE PREDICTIONS ====================
    print("\n" + "=" * 80)
    print("SAMPLE PREDICTIONS")
    print("=" * 80)
    
    # Detect columns
    image_col = next(
        (c for c in ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0']
         if c in val_df.columns),
        val_df.columns[0]
    )
    text_col = next(
        (c for c in ['text', 'ocr_text', 'caption', 'OCR_extracted_text']
         if c in val_df.columns),
        None
    )
    
    # Random samples
    random.seed(42)
    sample_indices = random.sample(range(len(val_df)), min(5, len(val_df)))
    
    for idx in sample_indices:
        row = val_df.iloc[idx]
        sample_text = str(row.get(text_col, '')) if text_col else ''
        sample_image_name = str(row.get(image_col, f'{idx}.jpg'))
        
        if not any(sample_image_name.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png']):
            sample_image_name = f"{sample_image_name}.jpg"
        
        sample_image_path = os.path.join(NEW_VAL_DIR, sample_image_name)
        
        if not os.path.exists(sample_image_path):
            print(f"\nâš  Image not found: {sample_image_path}")
            continue
        
        print(f"\n{'='*70}")
        print(f"Sample {idx}:")
        print(f"  Text: '{sample_text[:150]}...'")
        print(f"  Image: {sample_image_name}")
        
        result = predictor.predict(sample_text, sample_image_path)
        
        print(f"\nPredictions:")
        print(f"  Sentiment: {result['sentiment']} (confidence: {result['sentiment_confidence']})")
        print(f"  Expected class: {result['sentiment_expected_class']}")
        print(f"  Sentiment distribution:")
        for sent, prob in result['sentiment_probs'].items():
            print(f"    {sent}: {prob}")
        print(f"  Intensity: {result['intensity']}")
        print(f"  Emotions:")
        for emotion, info in result['emotions'].items():
            status = "âœ“" if info['predicted'] else "âœ—"
            print(f"    {status} {emotion.capitalize()}: {info['probability']} (threshold: {info['threshold']})")
    
    # ==================== FULL EVALUATION ====================
    print("\n" + "=" * 80)
    print("FULL EVALUATION ON VALIDATION SET")
    print("=" * 80)
    
    model.eval()
    all_results = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            batch_device = {
                k: v.to(device)
                for k, v in batch.items()
                if k != 'motivational_flag'
            }
            
            outputs = model(
                batch_device['input_ids'],
                batch_device['attention_mask'],
                batch_device['image']
            )
            
            # Sentiment predictions
            class_probs = outputs['sentiment']['class_probs'].cpu().numpy()
            sentiment_preds = np.argmax(class_probs, axis=1)
            expected_classes = np.sum(
                class_probs * np.arange(class_probs.shape[1])[None, :],
                axis=1
            )
            
            # Emotion predictions
            emotion_logits = outputs['emotion_logits'].cpu().numpy()
            emotion_probs = 1 / (1 + np.exp(-emotion_logits))
            emotion_preds = (emotion_probs >= np.array(cfg['EMO_THRESHOLDS'])).astype(int)
            
            # True labels
            sentiment_true = batch_device['sentiment_label'].cpu().numpy()
            emotion_true = batch_device['emotion_labels'].cpu().numpy()
            
            # Store results
            for i in range(len(sentiment_preds)):
                all_results.append({
                    'sentiment_pred': int(sentiment_preds[i]),
                    'sentiment_expected': float(expected_classes[i]),
                    'sentiment_true': int(sentiment_true[i]),
                    'emotion_pred': emotion_preds[i].tolist(),
                    'emotion_true': emotion_true[i].tolist(),
                    'emotion_probs': emotion_probs[i].tolist()
                })
    
    # Create results dataframe
    results_df = pd.DataFrame(all_results)

    # Use sentiment map from config if available, else default
    sentiment_map_for_eval = {
        0: "very_positive",
        1: "positive",
        2: "neutral",
        3: "negative",
        4: "very_negative"
    }
    if 'SENTIMENT_MAP_REV' in cfg:
        try:
            sentiment_map_for_eval = {int(k): v for k, v in cfg['SENTIMENT_MAP_REV'].items()}
        except Exception:
            pass

    results_df['sentiment_pred_label'] = results_df['sentiment_pred'].map(sentiment_map_for_eval)
    results_df['sentiment_true_label'] = results_df['sentiment_true'].map(sentiment_map_for_eval)
    results_df['sentiment_error'] = np.abs(results_df['sentiment_pred'] - results_df['sentiment_true'])
    results_df['sentiment_correct'] = results_df['sentiment_pred'] == results_df['sentiment_true']
    results_df['sentiment_1off'] = results_df['sentiment_error'] <= 1
    
    print("\n" + "=" * 80)
    print("OVERALL METRICS")
    print("=" * 80)
    
    # Sentiment metrics
    sent_acc = results_df['sentiment_correct'].mean()
    sent_1off = results_df['sentiment_1off'].mean()
    sent_mae = results_df['sentiment_error'].mean()
    
    print(f"\nSentiment:")
    print(f"  Accuracy: {sent_acc:.4f}")
    print(f"  1-off Accuracy: {sent_1off:.4f}")
    print(f"  MAE: {sent_mae:.4f}")
    
    # Per-class sentiment metrics
    print(f"\nPer-Class Sentiment:")
    for label in sorted(results_df['sentiment_true_label'].dropna().unique()):
        class_df = results_df[results_df['sentiment_true_label'] == label]
        class_acc = class_df['sentiment_correct'].mean()
        class_1off = class_df['sentiment_1off'].mean()
        class_mae = class_df['sentiment_error'].mean()
        print(
            f"  {label}: Acc={class_acc:.4f}, "
            f"1-off={class_1off:.4f}, "
            f"MAE={class_mae:.4f} (n={len(class_df)})"
        )
    
    # Emotion metrics
    print(f"\nEmotions:")
    for i, emotion in enumerate(cfg['EMOTION_LABELS']):
        true_labels = [row[i] for row in results_df['emotion_true']]
        pred_labels = [row[i] for row in results_df['emotion_pred']]
        
        acc = accuracy_score(true_labels, pred_labels)
        prec, rec, f1, _ = precision_recall_fscore_support(
            true_labels, pred_labels,
            average='binary',
            zero_division=0
        )
        
        print(f"  {emotion.capitalize()}:")
        print(
            f"    Accuracy: {acc:.4f}, "
            f"Precision: {prec:.4f}, "
            f"Recall: {rec:.4f}, "
            f"F1: {f1:.4f}"
        )
    
    # Save detailed results
    results_csv_path = os.path.join(cfg['CHECKPOINT_PATH'], 'detailed_results.csv')
    results_df.to_csv(results_csv_path, index=False)
    print(f"\nâœ“ Detailed results saved to: {results_csv_path}")

else:
    print(f"âš  Checkpoint not found: {checkpoint_path}")



PART 9: INFERENCE & COMPREHENSIVE EVALUATION
Loading checkpoint from: /kaggle/working/checkpoints/best_model_enhanced.pt
Initializing model...
Loading tokenizer...
âœ“ Predictor ready on cuda

SAMPLE PREDICTIONS

Sample 228:
  Text: '...'
  Image: 1708.jpg

Predictions:
  Sentiment: neutral (confidence: 57.59%)
  Expected class: 2.01
  Sentiment distribution:
    very_positive: 4.82%
    positive: 13.77%
    neutral: 57.59%
    negative: 22.92%
    very_negative: 0.90%
  Intensity: 0.4974
  Emotions:
    âœ“ Humor: 88.74% (threshold: 0.500)
    âœ“ Sarcasm: 77.98% (threshold: 0.500)
    âœ— Offensive: 41.95% (threshold: 0.600)
    âœ— Motivational: 12.66% (threshold: 0.600)

Sample 51:
  Text: '...'
  Image: 341.jpg

Predictions:
  Sentiment: positive (confidence: 35.93%)
  Expected class: 1.00
  Sentiment distribution:
    very_positive: 33.43%
    positive: 35.93%
    neutral: 27.58%
    negative: 2.96%
    very_negative: 0.09%
  Intensity: 0.4964
  Emotions:
    âœ“ Humor: 98.60% (

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.43it/s]


OVERALL METRICS

Sentiment:
  Accuracy: 0.4371
  1-off Accuracy: 0.8752
  MAE: 0.7048

Per-Class Sentiment:
  negative: Acc=0.2267, 1-off=0.9111, MAE=0.9111 (n=225)
  neutral: Acc=0.8367, 1-off=0.9620, MAE=0.2013 (n=447)
  positive: Acc=0.1053, 1-off=0.9404, MAE=0.9544 (n=285)
  very_negative: Acc=0.0000, 1-off=0.2041, MAE=1.9184 (n=49)
  very_positive: Acc=0.0909, 1-off=0.1364, MAE=1.7955 (n=44)

Emotions:
  Humor:
    Accuracy: 0.8552, Precision: 0.8552, Recall: 1.0000, F1: 0.9220
  Sarcasm:
    Accuracy: 0.7895, Precision: 0.7895, Recall: 1.0000, F1: 0.8824
  Offensive:
    Accuracy: 0.6086, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  Motivational:
    Accuracy: 0.8810, Precision: 0.0000, Recall: 0.0000, F1: 0.0000

âœ“ Detailed results saved to: /kaggle/working/checkpoints/detailed_results.csv





In [5]:
print("\n" + "=" * 80)
print("PART 10: VISUALIZATION & ANALYSIS")
print("=" * 80)

if os.path.exists(checkpoint_path) and 'results_df' in locals():
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    
    sns.set_style("whitegrid")
    
    # Create analysis directory
    analysis_dir = os.path.join(cfg['CHECKPOINT_PATH'], 'analysis')
    os.makedirs(analysis_dir, exist_ok=True)
    
    # 1. SENTIMENT CONFUSION MATRIX
    print("\nGenerating sentiment confusion matrix...")
    sentiment_labels = sorted(results_df['sentiment_true_label'].unique())
    cm = confusion_matrix(
        results_df['sentiment_true_label'],
        results_df['sentiment_pred_label'],
        labels=sentiment_labels
    )
    
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=sentiment_labels)
    disp.plot(cmap='Blues', ax=ax, xticks_rotation='vertical', values_format='d')
    plt.title('Sentiment Classification Confusion Matrix\n(Ordinal Regression)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    cm_path = os.path.join(analysis_dir, 'sentiment_confusion_matrix.png')
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Saved to: {cm_path}")
    
    # 2. SENTIMENT ERROR DISTRIBUTION
    print("\nGenerating error distribution plot...")
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Error distance histogram
    ax = axes[0]
    error_counts = results_df['sentiment_error'].value_counts().sort_index()
    ax.bar(error_counts.index, error_counts.values, color='coral', alpha=0.7, edgecolor='black')
    ax.set_xlabel('Ordinal Distance Error', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('Distribution of Ordinal Errors', fontsize=13, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    
    # Cumulative error percentage
    ax = axes[1]
    cumulative_pct = np.cumsum([
        (results_df['sentiment_error'] == i).sum() for i in range(5)
    ]) / len(results_df) * 100
    ax.plot(range(5), cumulative_pct, marker='o', linewidth=2, markersize=8, color='steelblue')
    ax.set_xlabel('Maximum Allowed Error', fontsize=12)
    ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax.set_title('Cumulative Error Tolerance', fontsize=13, fontweight='bold')
    ax.set_xticks(range(5))
    ax.grid(alpha=0.3)
    ax.axhline(y=90, color='red', linestyle='--', alpha=0.5, label='90% threshold')
    ax.legend()
    
    plt.tight_layout()
    error_path = os.path.join(analysis_dir, 'error_distribution.png')
    plt.savefig(error_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Saved to: {error_path}")
    
    # 3. EMOTION PERFORMANCE
    print("\nGenerating emotion performance plots...")
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    emotion_metrics = []
    for i, emotion in enumerate(cfg['EMOTION_LABELS']):
        true_labels = [row[i] for row in results_df['emotion_true']]
        pred_labels = [row[i] for row in results_df['emotion_pred']]
        probs = [row[i] for row in results_df['emotion_probs']]
        
        acc = accuracy_score(true_labels, pred_labels)
        prec, rec, f1, _ = precision_recall_fscore_support(
            true_labels, pred_labels, average='binary', zero_division=0
        )
        
        emotion_metrics.append({
            'emotion': emotion,
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'positive_rate': np.mean(true_labels),
            'pred_positive_rate': np.mean(pred_labels)
        })
    
    emotion_df = pd.DataFrame(emotion_metrics)
    
    # Plot 1: Overall metrics
    ax = axes[0, 0]
    x = np.arange(len(cfg['EMOTION_LABELS']))
    width = 0.2
    ax.bar(x - 1.5*width, emotion_df['accuracy'], width, label='Accuracy', alpha=0.8)
    ax.bar(x - 0.5*width, emotion_df['precision'], width, label='Precision', alpha=0.8)
    ax.bar(x + 0.5*width, emotion_df['recall'], width, label='Recall', alpha=0.8)
    ax.bar(x + 1.5*width, emotion_df['f1'], width, label='F1', alpha=0.8)
    ax.set_xlabel('Emotion', fontsize=11)
    ax.set_ylabel('Score', fontsize=11)
    ax.set_title('Emotion Classification Metrics', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([e.capitalize() for e in cfg['EMOTION_LABELS']], rotation=45, ha='right')
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis='y', alpha=0.3)
    
    # Plot 2: Class distribution
    ax = axes[0, 1]
    x = np.arange(len(cfg['EMOTION_LABELS']))
    width = 0.35
    ax.bar(x - width/2, emotion_df['positive_rate'] * 100, width, label='True', alpha=0.8)
    ax.bar(x + width/2, emotion_df['pred_positive_rate'] * 100, width, label='Predicted', alpha=0.8)
    ax.set_xlabel('Emotion', fontsize=11)
    ax.set_ylabel('Positive Class (%)', fontsize=11)
    ax.set_title('Class Distribution: True vs Predicted', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([e.capitalize() for e in cfg['EMOTION_LABELS']], rotation=45, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    
    # Plot 3 & 4: Individual emotion confusion matrices
    for idx, emotion in enumerate(cfg['EMOTION_LABELS'][:2]):
        ax = axes[1, idx]
        true_labels = [row[idx] for row in results_df['emotion_true']]
        pred_labels = [row[idx] for row in results_df['emotion_pred']]
        
        cm = np.array([
            [sum((t == 0) & (p == 0) for t, p in zip(true_labels, pred_labels)),
             sum((t == 0) & (p == 1) for t, p in zip(true_labels, pred_labels))],
            [sum((t == 1) & (p == 0) for t, p in zip(true_labels, pred_labels)),
             sum((t == 1) & (p == 1) for t, p in zip(true_labels, pred_labels))]
        ])
        
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                   xticklabels=['Negative', 'Positive'],
                   yticklabels=['Negative', 'Positive'])
        ax.set_title(f'{emotion.capitalize()} Confusion Matrix', fontsize=11, fontweight='bold')
        ax.set_ylabel('True')
        ax.set_xlabel('Predicted')
    
    plt.tight_layout()
    emotion_path = os.path.join(analysis_dir, 'emotion_performance.png')
    plt.savefig(emotion_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Saved to: {emotion_path}")
    
    # 4. WORST PREDICTIONS
    print("\nAnalyzing worst predictions...")
    worst_df = results_df.nlargest(20, 'sentiment_error')
    
    print(f"\nTop 10 Worst Sentiment Predictions:")
    print("=" * 70)
    for i, (idx, row) in enumerate(worst_df.head(10).iterrows(), 1):
        print(f"\n{i}. Index: {idx}")
        print(f"   True: {row['sentiment_true_label']} (class {row['sentiment_true']})")
        print(f"   Predicted: {row['sentiment_pred_label']} (class {row['sentiment_pred']})")
        print(f"   Error: {row['sentiment_error']:.0f} classes")
    
    worst_csv_path = os.path.join(analysis_dir, 'worst_predictions.csv')
    worst_df.to_csv(worst_csv_path, index=True)
    print(f"\nâœ“ Worst predictions saved to: {worst_csv_path}")
    
    # 5. SUMMARY REPORT
    print("\n" + "=" * 80)
    print("FINAL SUMMARY REPORT")
    print("=" * 80)
    
    summary = {
        'Total Samples': len(results_df),
        'Sentiment Accuracy': results_df['sentiment_correct'].mean(),
        'Sentiment 1-off Accuracy': results_df['sentiment_1off'].mean(),
        'Sentiment MAE': results_df['sentiment_error'].mean(),
    }
    
    for i, emotion in enumerate(cfg['EMOTION_LABELS']):
        true_labels = [row[i] for row in results_df['emotion_true']]
        pred_labels = [row[i] for row in results_df['emotion_pred']]
        acc = accuracy_score(true_labels, pred_labels)
        _, _, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary', zero_division=0)
        summary[f'{emotion.capitalize()} Accuracy'] = acc
        summary[f'{emotion.capitalize()} F1'] = f1
    
    summary_df = pd.DataFrame([summary]).T
    summary_df.columns = ['Value']
    
    print("\nðŸ“Š SUMMARY METRICS:")
    print(summary_df.to_string())
    
    summary_path = os.path.join(analysis_dir, 'summary_metrics.csv')
    summary_df.to_csv(summary_path)
    print(f"\nâœ“ Summary saved to: {summary_path}")
    
    print("\n" + "=" * 80)
    print("âœ… ALL ANALYSIS COMPLETE!")
    print("=" * 80)
    print(f"\nGenerated files in {analysis_dir}:")
    print("  1. sentiment_confusion_matrix.png")
    print("  2. error_distribution.png")
    print("  3. emotion_performance.png")
    print("  4. worst_predictions.csv")
    print("  5. summary_metrics.csv")
    print(f"\nDetailed results: {results_csv_path}")
    print(f"Model checkpoint: {checkpoint_path}")
    print(f"Model card: {model_card_path}")
    print("\n" + "=" * 80)

print("\nðŸŽ‰ COMPLETE PIPELINE FINISHED! ðŸŽ‰\n")


PART 10: VISUALIZATION & ANALYSIS

Generating sentiment confusion matrix...
âœ“ Saved to: /kaggle/working/checkpoints/analysis/sentiment_confusion_matrix.png

Generating error distribution plot...
âœ“ Saved to: /kaggle/working/checkpoints/analysis/error_distribution.png

Generating emotion performance plots...
âœ“ Saved to: /kaggle/working/checkpoints/analysis/emotion_performance.png

Analyzing worst predictions...

Top 10 Worst Sentiment Predictions:

1. Index: 810
   True: very_negative (class 4)
   Predicted: very_positive (class 0)
   Error: 4 classes

2. Index: 23
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

3. Index: 36
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

4. Index: 97
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

5. Index: 262
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

6. Index: 284
   True: negative (cl

In [6]:
print("\n" + "=" * 80)
print("PART 10: VISUALIZATION & ANALYSIS")
print("=" * 80)

if os.path.exists(checkpoint_path) and 'results_df' in locals():
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import (
        confusion_matrix,
        ConfusionMatrixDisplay,
        f1_score,
        accuracy_score,
        precision_recall_fscore_support
    )
    
    sns.set_style("whitegrid")
    
    # Create analysis directory
    analysis_dir = os.path.join(cfg['CHECKPOINT_PATH'], 'analysis')
    os.makedirs(analysis_dir, exist_ok=True)
    
    # 1. SENTIMENT CONFUSION MATRIX (TASK A)
    print("\nGenerating sentiment confusion matrix...")
    sentiment_labels = sorted(results_df['sentiment_true_label'].unique())
    cm = confusion_matrix(
        results_df['sentiment_true_label'],
        results_df['sentiment_pred_label'],
        labels=sentiment_labels
    )
    
    fig, ax = plt.subplots(figsize=(10, 8))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=sentiment_labels)
    disp.plot(cmap='Blues', ax=ax, xticks_rotation='vertical', values_format='d')
    plt.title('Sentiment Classification Confusion Matrix\n(Ordinal Regression)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    cm_path = os.path.join(analysis_dir, 'sentiment_confusion_matrix.png')
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Saved to: {cm_path}")
    
    # 2. SENTIMENT ERROR DISTRIBUTION
    print("\nGenerating error distribution plot...")
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Error distance histogram
    ax = axes[0]
    error_counts = results_df['sentiment_error'].value_counts().sort_index()
    ax.bar(error_counts.index, error_counts.values, color='coral', alpha=0.7, edgecolor='black')
    ax.set_xlabel('Ordinal Distance Error', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('Distribution of Ordinal Errors', fontsize=13, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    
    # Cumulative error percentage
    ax = axes[1]
    max_err = int(results_df['sentiment_error'].max())
    cumulative_pct = np.cumsum([
        (results_df['sentiment_error'] == i).sum() for i in range(max_err + 1)
    ]) / len(results_df) * 100
    ax.plot(range(max_err + 1), cumulative_pct, marker='o', linewidth=2, markersize=8, color='steelblue')
    ax.set_xlabel('Maximum Allowed Error', fontsize=12)
    ax.set_ylabel('Cumulative Percentage (%)', fontsize=12)
    ax.set_title('Cumulative Error Tolerance', fontsize=13, fontweight='bold')
    ax.set_xticks(range(max_err + 1))
    ax.grid(alpha=0.3)
    ax.axhline(y=90, color='red', linestyle='--', alpha=0.5, label='90% threshold')
    ax.legend()
    
    plt.tight_layout()
    error_path = os.path.join(analysis_dir, 'error_distribution.png')
    plt.savefig(error_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Saved to: {error_path}")
    
    # 3. EMOTION PERFORMANCE (TASK B)
    print("\nGenerating emotion performance plots...")
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    emotion_metrics = []
    for i, emotion in enumerate(cfg['EMOTION_LABELS']):
        true_labels = [row[i] for row in results_df['emotion_true']]
        pred_labels = [row[i] for row in results_df['emotion_pred']]
        probs = [row[i] for row in results_df['emotion_probs']]
        
        acc = accuracy_score(true_labels, pred_labels)
        prec, rec, f1_bin, _ = precision_recall_fscore_support(
            true_labels, pred_labels, average='binary', zero_division=0
        )
        # Weighted-F1 per subtask (this is what Memotion uses per B1â€“B4)
        f1_weighted = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
        
        emotion_metrics.append({
            'emotion': emotion,
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1_binary': f1_bin,
            'f1_weighted': f1_weighted,
            'positive_rate': np.mean(true_labels),
            'pred_positive_rate': np.mean(pred_labels)
        })
    
    emotion_df = pd.DataFrame(emotion_metrics)
    
    # Plot 1: Overall metrics (per emotion)
    ax = axes[0, 0]
    x = np.arange(len(cfg['EMOTION_LABELS']))
    width = 0.2
    ax.bar(x - 1.5*width, emotion_df['accuracy'], width, label='Accuracy', alpha=0.8)
    ax.bar(x - 0.5*width, emotion_df['precision'], width, label='Precision', alpha=0.8)
    ax.bar(x + 0.5*width, emotion_df['recall'], width, label='Recall', alpha=0.8)
    ax.bar(x + 1.5*width, emotion_df['f1_binary'], width, label='F1 (binary)', alpha=0.8)
    ax.set_xlabel('Emotion', fontsize=11)
    ax.set_ylabel('Score', fontsize=11)
    ax.set_title('Emotion Classification Metrics (Per Subtask)', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([e.capitalize() for e in cfg['EMOTION_LABELS']], rotation=45, ha='right')
    ax.legend()
    ax.set_ylim([0, 1])
    ax.grid(axis='y', alpha=0.3)
    
    # Plot 2: Class distribution
    ax = axes[0, 1]
    x = np.arange(len(cfg['EMOTION_LABELS']))
    width = 0.35
    ax.bar(x - width/2, emotion_df['positive_rate'] * 100, width, label='True', alpha=0.8)
    ax.bar(x + width/2, emotion_df['pred_positive_rate'] * 100, width, label='Predicted', alpha=0.8)
    ax.set_xlabel('Emotion', fontsize=11)
    ax.set_ylabel('Positive Class (%)', fontsize=11)
    ax.set_title('Class Distribution: True vs Predicted', fontsize=12, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([e.capitalize() for e in cfg['EMOTION_LABELS']], rotation=45, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    
    # Plot 3 & 4: Individual emotion confusion matrices (for first two emotions)
    for idx, emotion in enumerate(cfg['EMOTION_LABELS'][:2]):
        ax = axes[1, idx]
        true_labels = [row[idx] for row in results_df['emotion_true']]
        pred_labels = [row[idx] for row in results_df['emotion_pred']]
        
        cm = np.array([
            [sum((t == 0) & (p == 0) for t, p in zip(true_labels, pred_labels)),
             sum((t == 0) & (p == 1) for t, p in zip(true_labels, pred_labels))],
            [sum((t == 1) & (p == 0) for t, p in zip(true_labels, pred_labels)),
             sum((t == 1) & (p == 1) for t, p in zip(true_labels, pred_labels))]
        ])
        
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=['Negative', 'Positive'],
                    yticklabels=['Negative', 'Positive'])
        ax.set_title(f'{emotion.capitalize()} Confusion Matrix', fontsize=11, fontweight='bold')
        ax.set_ylabel('True')
        ax.set_xlabel('Predicted')
    
    plt.tight_layout()
    emotion_path = os.path.join(analysis_dir, 'emotion_performance.png')
    plt.savefig(emotion_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Saved to: {emotion_path}")
    
    # 4. WORST PREDICTIONS (TASK A VIEW)
    print("\nAnalyzing worst predictions...")
    worst_df = results_df.nlargest(20, 'sentiment_error')
    
    print(f"\nTop 10 Worst Sentiment Predictions:")
    print("=" * 70)
    for i, (idx, row) in enumerate(worst_df.head(10).iterrows(), 1):
        print(f"\n{i}. Index: {idx}")
        print(f"   True: {row['sentiment_true_label']} (class {row['sentiment_true']})")
        print(f"   Predicted: {row['sentiment_pred_label']} (class {row['sentiment_pred']})")
        print(f"   Error: {row['sentiment_error']:.0f} classes")
    
    worst_csv_path = os.path.join(analysis_dir, 'worst_predictions.csv')
    worst_df.to_csv(worst_csv_path, index=True)
    print(f"\nâœ“ Worst predictions saved to: {worst_csv_path}")
    
    # 5. FINAL SUMMARY REPORT (MEMOTION 3 TASKS A/B/C)
    print("\n" + "=" * 80)
    print("FINAL SUMMARY REPORT â€“ MEMOTION 3 TASKS")
    print("=" * 80)
    
    # ---------- Task A: Sentiment Analysis ----------
    sent_true = results_df['sentiment_true'].values
    sent_pred = results_df['sentiment_pred'].values
    
    taskA_weighted_f1 = f1_score(sent_true, sent_pred, average='weighted', zero_division=0)
    taskA_macro_f1   = f1_score(sent_true, sent_pred, average='macro', zero_division=0)
    taskA_acc        = results_df['sentiment_correct'].mean()
    taskA_1off       = results_df['sentiment_1off'].mean()
    taskA_mae        = results_df['sentiment_error'].mean()
    
    # ---------- Task B: Emotion Classification ----------
    # Compute weighted-F1 for each of the 4 subtasks (B1â€“B4), then average (Memotion official way)
    taskB_weighted_f1_per_emotion = []
    taskB_binary_f1_per_emotion   = []
    for i, emotion in enumerate(cfg['EMOTION_LABELS']):
        true_labels = np.array([row[i] for row in results_df['emotion_true']], dtype=int)
        pred_labels = np.array([row[i] for row in results_df['emotion_pred']], dtype=int)
        
        f1_weighted = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
        f1_bin      = f1_score(true_labels, pred_labels, average='binary', zero_division=0)
        
        taskB_weighted_f1_per_emotion.append(f1_weighted)
        taskB_binary_f1_per_emotion.append(f1_bin)
    
    taskB_avg_weighted_f1 = float(np.mean(taskB_weighted_f1_per_emotion))
    taskB_avg_binary_f1   = float(np.mean(taskB_binary_f1_per_emotion))
    
    # Optional: sample-based F1 across all 4 emotions at once
    emo_true = np.array(results_df['emotion_true'].tolist(), dtype=int)   # (N, 4)
    emo_pred = np.array(results_df['emotion_pred'].tolist(), dtype=int)   # (N, 4)
    taskB_sample_f1 = f1_score(emo_true, emo_pred, average='samples', zero_division=0)
    
    # ---------- Task C: Intensity of Emotions ----------
    # NOTE: We can only compute Task C if you store ground-truth intensity labels into results_df.
    has_taskC = all(
        col in results_df.columns
        for col in [
            'humor_int_true', 'sarcasm_int_true', 'offensive_int_true', 'motivational_int_true',
            'humor_int_pred', 'sarcasm_int_pred', 'offensive_int_pred', 'motivational_int_pred'
        ]
    )
    
    taskC_info = "Not computed (no intensity labels in results_df)"
    taskC_avg_weighted_f1 = None
    
    if has_taskC:
        taskC_f1_list = []
        for emo in ['humor', 'sarcasm', 'offensive', 'motivational']:
            y_true = results_df[f'{emo}_int_true'].values
            y_pred = results_df[f'{emo}_int_pred'].values
            f1_w = f1_score(y_true, y_pred, average='weighted', zero_division=0)
            taskC_f1_list.append(f1_w)
        taskC_avg_weighted_f1 = float(np.mean(taskC_f1_list))
        taskC_info = f"Average weighted F1 across 4 intensity subtasks: {taskC_avg_weighted_f1:.4f}"
    
    # ---------- Print Memotion-style summary ----------
    print("\nðŸ“Œ Task A â€“ Sentiment Analysis")
    print(f"  Weighted F1 : {taskA_weighted_f1:.4f}")
    print(f"  Macro F1    : {taskA_macro_f1:.4f}")
    print(f"  Accuracy    : {taskA_acc:.4f}")
    print(f"  1-off Acc   : {taskA_1off:.4f}")
    print(f"  MAE         : {taskA_mae:.4f}")
    
    print("\nðŸ“Œ Task B â€“ Emotion Classification")
    print(f"  Avg Weighted F1 over 4 emotions (B1â€“B4): {taskB_avg_weighted_f1:.4f}")
    print(f"  Avg Binary F1 over 4 emotions         : {taskB_avg_binary_f1:.4f}")
    print(f"  Sample-based F1 (multi-label)         : {taskB_sample_f1:.4f}")
    
    print("\nðŸ“Œ Task C â€“ Emotion Intensity")
    print(f"  {taskC_info}")
    
    # ---------- Detailed summary dict ----------
    summary = {
        'Total Samples': len(results_df),
        'TaskA_Sentiment_Accuracy': taskA_acc,
        'TaskA_Sentiment_WeightedF1': taskA_weighted_f1,
        'TaskA_Sentiment_MacroF1': taskA_macro_f1,
        'TaskA_Sentiment_1offAccuracy': taskA_1off,
        'TaskA_Sentiment_MAE': taskA_mae,
        'TaskB_AvgWeightedF1_4Emotions': taskB_avg_weighted_f1,
        'TaskB_AvgBinaryF1_4Emotions': taskB_avg_binary_f1,
        'TaskB_SampleBasedF1': taskB_sample_f1,
        'TaskC_Info': taskC_info,
        'TaskC_AvgWeightedF1_4Intensities': taskC_avg_weighted_f1 if taskC_avg_weighted_f1 is not None else 'N/A',
    }
    
    # Also log per-emotion metrics
    for i, emotion in enumerate(cfg['EMOTION_LABELS']):
        true_labels = [row[i] for row in results_df['emotion_true']]
        pred_labels = [row[i] for row in results_df['emotion_pred']]
        acc = accuracy_score(true_labels, pred_labels)
        _, _, f1_bin, _ = precision_recall_fscore_support(
            true_labels, pred_labels,
            average='binary', zero_division=0
        )
        f1_w = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
        summary[f'{emotion.capitalize()}_Accuracy'] = acc
        summary[f'{emotion.capitalize()}_F1_Binary'] = f1_bin
        summary[f'{emotion.capitalize()}_F1_Weighted'] = f1_w
    
    summary_df = pd.DataFrame([summary]).T
    summary_df.columns = ['Value']
    
    print("\nðŸ“Š SUMMARY METRICS (DETAILED):")
    print(summary_df.to_string())
    
    summary_path = os.path.join(analysis_dir, 'summary_metrics.csv')
    summary_df.to_csv(summary_path)
    print(f"\nâœ“ Summary saved to: {summary_path}")
    
    print("\n" + "=" * 80)
    print("âœ… ALL ANALYSIS COMPLETE!")
    print("=" * 80)
    print(f"\nGenerated files in {analysis_dir}:")
    print("  1. sentiment_confusion_matrix.png")
    print("  2. error_distribution.png")
    print("  3. emotion_performance.png")
    print("  4. worst_predictions.csv")
    print("  5. summary_metrics.csv")
    print(f"\nDetailed results: {results_csv_path}")
    print(f"Model checkpoint: {checkpoint_path}")
    print(f"Model card: {model_card_path}")
    print("\n" + "=" * 80)

print("\nðŸŽ‰ COMPLETE PIPELINE FINISHED! ðŸŽ‰\n")



PART 10: VISUALIZATION & ANALYSIS

Generating sentiment confusion matrix...
âœ“ Saved to: /kaggle/working/checkpoints/analysis/sentiment_confusion_matrix.png

Generating error distribution plot...
âœ“ Saved to: /kaggle/working/checkpoints/analysis/error_distribution.png

Generating emotion performance plots...
âœ“ Saved to: /kaggle/working/checkpoints/analysis/emotion_performance.png

Analyzing worst predictions...

Top 10 Worst Sentiment Predictions:

1. Index: 810
   True: very_negative (class 4)
   Predicted: very_positive (class 0)
   Error: 4 classes

2. Index: 23
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

3. Index: 36
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

4. Index: 97
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

5. Index: 262
   True: negative (class 3)
   Predicted: very_positive (class 0)
   Error: 3 classes

6. Index: 284
   True: negative (cl

In [7]:
# ==================== COMPLETE ENHANCED MEME ANALYSIS PIPELINE ====================
# This code includes BOTH data preparation AND enhanced training
# Run this entire script from start to finish

# ==================== PART 0: SETUP & DEPENDENCIES ====================
import sys, subprocess, os, json, zipfile, shutil, random, warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.cuda.amp import GradScaler, autocast
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
import yaml

warnings.filterwarnings('ignore')

print("=" * 80)
print("INSTALLING DEPENDENCIES...")
print("=" * 80)

# Install required packages
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers>=4.40.0", "accelerate", "torch", "timm",
    "scikit-learn", "pandas", "matplotlib", "seaborn",
    "huggingface_hub>=0.18.0", "gdown", "iterative-stratification"
])

# Import additional packages
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    mean_squared_error, mean_absolute_error, f1_score
)
from transformers import AutoModel, AutoTokenizer, CLIPModel
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

print("âœ“ All dependencies installed\n")

# ==================== PART 1: DATA PREPARATION ====================
print("=" * 80)
print("PART 1: DATA PREPARATION")
print("=" * 80)

# Download and extract dataset
print("\nDownloading dataset...")
subprocess.run(["gdown", "1jEJ2nf5CDJknq80ogzU-Uyz7jbBi-1LZ", "--fuzzy"], 
               check=False, capture_output=True)

print("Extracting dataset...")
zip_files = [f for f in os.listdir('.') if f.endswith('.zip')]
if zip_files:
    subprocess.run(["unzip", "-q", "-o", zip_files[0]], check=False, capture_output=True)

# Download additional files
subprocess.run([
    "gdown", "--folder", "19yaav8ORSVj9DeJUaHKq1H3HtVnkClBw", "--remaining-ok"
], check=False, capture_output=True)

# Extract password-protected archive
print("Extracting protected archive...")
zip_path = '/kaggle/working/Memotion 3/memotion3.zip'
extract_to = '/kaggle/working/'
password = b'memotion3taskaaai@22'

if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to, pwd=password)
    print(f"âœ“ Extracted to: {extract_to}")

# Setup paths
ORIGINAL_TRAIN_IMG_DIR = '/kaggle/working/trainImages/'
ORIGINAL_CSV_PATH = '/kaggle/working/memotion3/train.csv'
VALIDATION_SPLIT_RATIO = 0.15

OUTPUT_BASE_DIR = '/kaggle/working/'
NEW_VAL_DIR = os.path.join(OUTPUT_BASE_DIR, 'validation_images/')
NEW_TRAIN_DIR = os.path.join(OUTPUT_BASE_DIR, 'new_train_images/')

os.makedirs(NEW_VAL_DIR, exist_ok=True)
os.makedirs(NEW_TRAIN_DIR, exist_ok=True)

# Load and process CSV
print("\nLoading CSV file...")
df = pd.read_csv(ORIGINAL_CSV_PATH)

# Detect image column
possible_image_cols = ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0']
IMAGE_FILENAME_COLUMN = next((col for col in possible_image_cols if col in df.columns), df.columns[0])
print(f"âœ“ Image column: {IMAGE_FILENAME_COLUMN}")

# Normalize labels
for col in ['offensive', 'motivational', 'humour', 'humor', 'sarcastic', 'sarcasm', 'overall', 'sentiment']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().str.strip()

# Create binary labels
def create_binary_label(value, positive_values):
    if pd.isna(value) or value in ['nan', 'none', '']:
        return 0
    return 1 if value in positive_values else 0

if 'offensive' in df.columns:
    df['offensive_bin'] = df['offensive'].apply(
        lambda x: create_binary_label(x, ['slight', 'very_offensive', 'hateful_offensive'])
    )
else:
    df['offensive_bin'] = 0

if 'motivational' in df.columns:
    df['motivational_bin'] = df['motivational'].apply(
        lambda x: create_binary_label(x, ['motivational'])
    )
else:
    df['motivational_bin'] = 0

if 'humour' in df.columns or 'humor' in df.columns:
    humor_col = 'humour' if 'humour' in df.columns else 'humor'
    df['humor_bin'] = df[humor_col].apply(
        lambda x: create_binary_label(x, ['funny', 'very_funny', 'hilarious'])
    )
else:
    df['humor_bin'] = 0

if 'sarcastic' in df.columns or 'sarcasm' in df.columns:
    sarcasm_col = 'sarcastic' if 'sarcastic' in df.columns else 'sarcasm'
    df['sarcasm_bin'] = df[sarcasm_col].apply(
        lambda x: create_binary_label(x, ['general', 'twisted_meaning', 'very_twisted'])
    )
else:
    df['sarcasm_bin'] = 0

# Stratified split
stratify_columns = ['offensive_bin', 'motivational_bin', 'humor_bin', 'sarcasm_bin']
y_stratify = df[stratify_columns].values

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=VALIDATION_SPLIT_RATIO, random_state=42)
train_idx, val_idx = next(msss.split(df, y_stratify))

train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

print(f"\nâœ“ Stratified split complete:")
print(f"  Training: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")

# Calculate label priors
label_priors = {
    'offensive_pos_rate': float(train_df['offensive_bin'].sum() / len(train_df)),
    'motivational_pos_rate': float(train_df['motivational_bin'].sum() / len(train_df)),
    'humor_pos_rate': float(train_df['humor_bin'].sum() / len(train_df)),
    'sarcasm_pos_rate': float(train_df['sarcasm_bin'].sum() / len(train_df))
}

priors_path = os.path.join(OUTPUT_BASE_DIR, 'label_priors.json')
with open(priors_path, 'w') as f:
    json.dump(label_priors, f, indent=2)

print(f"\nâœ“ Label priors calculated:")
for key, val in label_priors.items():
    print(f"  {key}: {val:.4f}")

# Copy images
def copy_images(df_subset, dest_dir, source_dir, image_col):
    copied = 0
    missing = 0
    
    for idx in tqdm(df_subset[image_col], desc=f"Copying to {dest_dir}"):
        filename = str(idx)
        if not any(filename.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
            for ext in ['.jpg', '.jpeg', '.png']:
                test_path = os.path.join(source_dir, f"{filename}{ext}")
                if os.path.exists(test_path):
                    filename = f"{filename}{ext}"
                    break
            else:
                filename = f"{filename}.jpg"
        
        source_path = os.path.join(source_dir, filename)
        if os.path.exists(source_path):
            shutil.copy(source_path, os.path.join(dest_dir, filename))
            copied += 1
        else:
            missing += 1
    
    return copied, missing

print("\nCopying images...")
copied_val, missing_val = copy_images(val_df, NEW_VAL_DIR, ORIGINAL_TRAIN_IMG_DIR, IMAGE_FILENAME_COLUMN)
copied_train, missing_train = copy_images(train_df, NEW_TRAIN_DIR, ORIGINAL_TRAIN_IMG_DIR, IMAGE_FILENAME_COLUMN)

print(f"âœ“ Validation: {copied_val} copied, {missing_val} missing")
print(f"âœ“ Training: {copied_train} copied, {missing_train} missing")

# Save CSVs
train_csv_path = os.path.join(OUTPUT_BASE_DIR, 'train_split.csv')
val_csv_path = os.path.join(OUTPUT_BASE_DIR, 'validation_split.csv')

train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

print(f"\nâœ“ Saved train CSV: {train_csv_path}")
print(f"âœ“ Saved validation CSV: {val_csv_path}")
print("\nâœ… DATA PREPARATION COMPLETE\n")

# ==================== PART 2: CONFIGURATION ====================
print("=" * 80)
print("PART 2: CONFIGURATION")
print("=" * 80)

CONFIG_YAML = """
TEXT_MODEL: "google/muril-base-cased"
IMAGE_MODEL: "openai/clip-vit-base-patch32"
TEXT_DIM: 768
IMAGE_DIM: 768
FUSION_DIM: 512
FUSION_OUT_DIM: 512

MAX_LEN: 128
IMG_SIZE: 224
BATCH_SIZE: 16
GRADIENT_ACCUMULATION_STEPS: 2
LR_HEADS: 0.001
LR_BACKBONE: 0.00002
WEIGHT_DECAY: 0.01
EPOCHS: 20
SEED: 42
DEVICE: "cuda"
CHECKPOINT_PATH: "/kaggle/working/checkpoints"

NUM_SENTIMENT_CLASSES: 5
NUM_EMOTION_CLASSES: 4

USE_ORDINAL_REGRESSION: true
ORDINAL_LINK: "logit"

LOSS_WEIGHTS:
  sentiment: 2.0
  emotion: 1.5
  intensity: 0.5

ASL_GAMMA_NEG: 6.0
ASL_GAMMA_POS: 0.5
ASL_CLIP: 0.05
ASL_PRIOR_TAU: 1.2

EMOTION_LABELS: ["humor", "sarcasm", "offensive", "motivational"]
EMO_THRESHOLDS: [0.5, 0.5, 0.60, 0.60]

# NEW: thresholds for expected-value based sentiment decoding
SENTIMENT_EXPECTED_THRESHOLDS: [0.5, 1.5, 2.5, 3.5]

# NEW: class weights for ordinal loss (upweight extremes 0 and 4)
SENTIMENT_CLASS_WEIGHTS: [1.5, 1.0, 1.0, 1.0, 1.5]

POOLING: "mean"
USE_AMP: true
GRADIENT_CLIP: 1.0
SCHEDULER: "cosine"
UNFREEZE_BACKBONE_EPOCH: 2
UNFREEZE_LAYERS: 3

MOTIVATIONAL_OVERSAMPLE_FACTOR: 8.0

# NEW: oversampling factor for extreme sentiment classes (very_positive & very_negative)
EXTREME_SENTIMENT_OVERSAMPLE_FACTOR: 5.0

CROSS_ATTN_HEADS: 8
CROSS_ATTN_DROPOUT: 0.1

SENTIMENT_MAP_REV:
  0: "very_positive"
  1: "positive"
  2: "neutral"
  3: "negative"
  4: "very_negative"
"""

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

cfg = yaml.safe_load(CONFIG_YAML)
set_seed(cfg['SEED'])

# Load priors
with open(priors_path, 'r') as f:
    priors = json.load(f)

cfg['EMO_PRIORS'] = [
    priors['humor_pos_rate'],
    priors['sarcasm_pos_rate'],
    priors['offensive_pos_rate'],
    priors['motivational_pos_rate']
]

device = torch.device(cfg['DEVICE'] if torch.cuda.is_available() else 'cpu')

print(f"\nâœ“ Configuration loaded:")
print(f"  Device: {device}")
print(f"  Epochs: {cfg['EPOCHS']}")
print(f"  Batch size: {cfg['BATCH_SIZE']}")
print(f"  Motivational oversampling: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x")
print(f"  Extreme sentiment oversampling: {cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']}x")
print(f"  Emotion priors: {[f'{p:.3f}' for p in cfg['EMO_PRIORS']]}")

# ==================== PART 3: MODEL COMPONENTS ====================
print("\n" + "=" * 80)
print("PART 3: MODEL COMPONENTS")
print("=" * 80)

class EnhancedAsymmetricLoss(nn.Module):
    """Enhanced ASL with prior adjustment"""
    def __init__(self, gamma_neg=6.0, gamma_pos=0.5, clip=0.05, priors=None, prior_tau=1.2, eps=1e-8):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps
        self.priors = priors
        self.prior_tau = prior_tau
    
    def forward(self, logits, targets):
        # Prior adjustment
        if self.priors is not None:
            priors_tensor = torch.tensor(self.priors, device=logits.device, dtype=logits.dtype)
            adjustment = self.prior_tau * torch.log(priors_tensor.clamp(min=self.eps))
            logits = logits - adjustment
        
        xs_pos = torch.sigmoid(logits)
        xs_neg = 1 - xs_pos
        
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)
        
        los_pos = targets * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - targets) * torch.log(xs_neg.clamp(min=self.eps))
        
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            pt0 = xs_pos * targets
            pt1 = xs_neg * (1 - targets)
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * targets + self.gamma_neg * (1 - targets)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            loss = one_sided_w * (los_pos + los_neg)
        else:
            loss = los_pos + los_neg
        
        return -loss.mean()

class OrdinalRegressionHead(nn.Module):
    """Ordinal regression using cumulative link model"""
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.num_thresholds = num_classes - 1
        
        self.projection = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
        
        initial_thresholds = torch.linspace(-2, 2, self.num_thresholds)
        self.thresholds = nn.Parameter(initial_thresholds)
    
    def forward(self, x):
        score = self.projection(x).squeeze(-1)
        ordered_thresholds = torch.cumsum(F.softplus(self.thresholds), dim=0)
        cumulative_logits = ordered_thresholds.unsqueeze(0) - score.unsqueeze(1)
        cumulative_probs = torch.sigmoid(cumulative_logits)
        
        batch_size = cumulative_probs.size(0)
        class_probs = torch.zeros(batch_size, self.num_classes, device=x.device)
        
        class_probs[:, 0] = cumulative_probs[:, 0]
        for k in range(1, self.num_thresholds):
            class_probs[:, k] = cumulative_probs[:, k] - cumulative_probs[:, k-1]
        class_probs[:, -1] = 1.0 - cumulative_probs[:, -1]
        class_probs = torch.clamp(class_probs, min=1e-7, max=1.0)
        
        return {'cumulative_logits': cumulative_logits, 'class_probs': class_probs}

class CrossAttentionFusion(nn.Module):
    """Bidirectional cross-attention"""
    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.text_to_image_attn = nn.MultiheadAttention(dim, num_heads, dropout, batch_first=True)
        self.image_to_text_attn = nn.MultiheadAttention(dim, num_heads, dropout, batch_first=True)
        self.text_norm = nn.LayerNorm(dim)
        self.image_norm = nn.LayerNorm(dim)
        self.text_ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(dim * 4, dim), nn.Dropout(dropout)
        )
        self.image_ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(dim * 4, dim), nn.Dropout(dropout)
        )
        self.ffn_norm_text = nn.LayerNorm(dim)
        self.ffn_norm_image = nn.LayerNorm(dim)
    
    def forward(self, text_emb, image_emb):
        text_seq = text_emb.unsqueeze(1)
        image_seq = image_emb.unsqueeze(1)
        
        text_attended, _ = self.text_to_image_attn(text_seq, image_seq, image_seq)
        text_out = self.text_norm(text_emb + text_attended.squeeze(1))
        
        image_attended, _ = self.image_to_text_attn(image_seq, text_seq, text_seq)
        image_out = self.image_norm(image_emb + image_attended.squeeze(1))
        
        text_final = self.ffn_norm_text(text_out + self.text_ffn(text_out))
        image_final = self.ffn_norm_image(image_out + self.image_ffn(image_out))
        
        return text_final, image_final

class EnhancedFusionModel(nn.Module):
    """Multi-modal model with ordinal regression + enhanced ASL"""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        
        self.text_model = AutoModel.from_pretrained(cfg['TEXT_MODEL'])
        clip_model = CLIPModel.from_pretrained(cfg['IMAGE_MODEL'])
        self.image_model = clip_model.vision_model
        
        self._freeze_encoders()
        
        self.text_proj = nn.Linear(cfg['TEXT_DIM'], cfg['FUSION_DIM'])
        self.image_proj = nn.Linear(cfg['IMAGE_DIM'], cfg['FUSION_DIM'])
        
        self.cross_attention = CrossAttentionFusion(
            dim=cfg['FUSION_DIM'],
            num_heads=cfg['CROSS_ATTN_HEADS'],
            dropout=cfg['CROSS_ATTN_DROPOUT']
        )
        
        fusion_input_dim = cfg['FUSION_DIM'] * 2
        self.fusion_norm = nn.LayerNorm(fusion_input_dim)
        self.fusion_mlp = nn.Sequential(
            nn.Linear(fusion_input_dim, 512), nn.GELU(), nn.Dropout(0.2),
            nn.Linear(512, cfg['FUSION_OUT_DIM']), nn.LayerNorm(cfg['FUSION_OUT_DIM'])
        )
        
        self.sentiment_head = OrdinalRegressionHead(cfg['FUSION_OUT_DIM'], cfg['NUM_SENTIMENT_CLASSES'])
        self.emotion_head = nn.Sequential(
            nn.Linear(cfg['FUSION_OUT_DIM'], 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, cfg['NUM_EMOTION_CLASSES'])
        )
        self.intensity_head = nn.Sequential(
            nn.Linear(cfg['FUSION_OUT_DIM'], 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
    
    def _freeze_encoders(self):
        for param in self.text_model.parameters():
            param.requires_grad = False
        for param in self.image_model.parameters():
            param.requires_grad = False
    
    def unfreeze_backbone(self, layers_to_unfreeze=3):
        if hasattr(self.text_model, 'encoder') and hasattr(self.text_model.encoder, 'layer'):
            for layer in list(self.text_model.encoder.layer[-layers_to_unfreeze:]):
                for param in layer.parameters():
                    param.requires_grad = True
        
        if hasattr(self.image_model, 'encoder') and hasattr(self.image_model.encoder, 'layers'):
            for layer in list(self.image_model.encoder.layers[-layers_to_unfreeze:]):
                for param in layer.parameters():
                    param.requires_grad = True
    
    def pool_text(self, model_output, attention_mask):
        last_hidden = model_output.last_hidden_state
        if self.cfg['POOLING'] == 'cls':
            return last_hidden[:, 0]
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        sum_embeddings = torch.sum(last_hidden * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
    
    def forward(self, input_ids, attention_mask, image):
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = self.pool_text(text_output, attention_mask)
        
        image_output = self.image_model(pixel_values=image)
        image_emb = image_output.pooler_output
        
        text_proj = self.text_proj(text_emb)
        image_proj = self.image_proj(image_emb)
        
        text_cross, image_cross = self.cross_attention(text_proj, image_proj)
        
        fused = torch.cat([text_cross, image_cross], dim=1)
        fused = self.fusion_norm(fused)
        fused = self.fusion_mlp(fused)
        
        sentiment_outputs = self.sentiment_head(fused)
        emotion_logits = self.emotion_head(fused)
        intensity = self.intensity_head(fused).squeeze(-1)
        
        return {
            'sentiment': sentiment_outputs,
            'emotion_logits': emotion_logits,
            'intensity': intensity
        }

print("âœ“ Model components defined")

# ==================== PART 4: DATASET ====================
print("\n" + "=" * 80)
print("PART 4: DATASET")
print("=" * 80)

class MemeDataset(Dataset):
    def __init__(self, df, tokenizer, image_transform, image_dir, cfg):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.image_dir = image_dir
        self.cfg = cfg
        self._detect_columns()
    
    def _detect_columns(self):
        cols = self.df.columns.tolist()
        self.image_col = next((c for c in ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0'] if c in cols), cols[0])
        self.text_col = next((c for c in ['text', 'ocr_text', 'caption', 'OCR_extracted_text'] if c in cols), None)
        self.sentiment_col = next((c for c in ['sentiment', 'overall_sentiment', 'overall'] if c in cols), None)
        
        self.sentiment_map = {'very_positive': 0, 'positive': 1, 'neutral': 2, 'negative': 3, 'very_negative': 4}
        self.humor_map = {'not_funny': 0, 'funny': 1, 'very_funny': 1, 'hilarious': 1}
        self.sarcasm_map = {'not_sarcastic': 0, 'general': 1, 'twisted_meaning': 1, 'very_twisted': 1}
        self.offensive_map = {'not_offensive': 0, 'slight': 1, 'very_offensive': 1, 'hateful_offensive': 1}
        self.motivational_map = {'not_motivational': 0, 'motivational': 1}
    
    def _map_label(self, value, mapping, default=0):
        if pd.isna(value):
            return default
        if isinstance(value, str):
            return mapping.get(value.lower().strip(), default)
        return int(value)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        image_name = str(row[self.image_col])
        if not any(image_name.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png']):
            image_name = f"{image_name}.jpg"
        image_path = os.path.join(self.image_dir, image_name)
        
        try:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transform(image)
        except:
            image = torch.zeros(3, self.cfg['IMG_SIZE'], self.cfg['IMG_SIZE'])
        
        text = str(row.get(self.text_col, '')) if self.text_col else 'No text'
        encoding = self.tokenizer(text, max_length=self.cfg['MAX_LEN'], padding='max_length', truncation=True, return_tensors='pt')
        
        sentiment_val = row.get(self.sentiment_col, 'neutral') if self.sentiment_col else 'neutral'
        sentiment_label = self._map_label(sentiment_val, self.sentiment_map, default=2)
        
        emotion_labels = torch.tensor([
            float(self._map_label(row.get('humour', row.get('humor', 0)), self.humor_map, 0)),
            float(self._map_label(row.get('sarcastic', row.get('sarcasm', 0)), self.sarcasm_map, 0)),
            float(self._map_label(row.get('offensive', 0), self.offensive_map, 0)),
            float(self._map_label(row.get('motivational', 0), self.motivational_map, 0))
        ], dtype=torch.float)
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': image,
            'sentiment_label': torch.tensor(sentiment_label, dtype=torch.long),
            'emotion_labels': emotion_labels,
            'intensity': torch.tensor(0.5, dtype=torch.float),
            'motivational_flag': emotion_labels[3]
        }

print("âœ“ Dataset class defined")

# ==================== PART 5: LOSS & METRICS ====================
print("\n" + "=" * 80)
print("PART 5: LOSS FUNCTIONS & METRICS")
print("=" * 80)

# UPDATED: class-weighted ordinal loss, with extra argument
def ordinal_regression_loss(cumulative_logits, labels, class_weights=None):
    batch_size = labels.size(0)
    num_thresholds = cumulative_logits.size(1)
    target_cumulative = torch.zeros_like(cumulative_logits)
    
    for i in range(batch_size):
        y = int(labels[i].item())
        if y < num_thresholds:
            target_cumulative[i, y:] = 1.0
    
    # compute per-sample, per-threshold loss
    loss_matrix = F.binary_cross_entropy_with_logits(
        cumulative_logits, target_cumulative, reduction='none'
    )  # [B, num_thresholds]
    
    if class_weights is not None:
        # class_weights is list[5]; map each sample to its class weight
        cw = torch.tensor(class_weights, device=labels.device, dtype=loss_matrix.dtype)
        sample_weights = cw[labels]  # [B]
        loss_matrix = loss_matrix * sample_weights.unsqueeze(1)
    
    return loss_matrix.mean()

def combined_loss(outputs, batch, cfg, emotion_loss_fn):
    loss_sent = ordinal_regression_loss(
        outputs['sentiment']['cumulative_logits'],
        batch['sentiment_label'],
        cfg.get('SENTIMENT_CLASS_WEIGHTS')
    )
    loss_emotion = emotion_loss_fn(outputs['emotion_logits'], batch['emotion_labels'])
    loss_intensity = F.smooth_l1_loss(outputs['intensity'], batch['intensity'])
    
    total_loss = (
        cfg['LOSS_WEIGHTS']['sentiment'] * loss_sent +
        cfg['LOSS_WEIGHTS']['emotion'] * loss_emotion +
        cfg['LOSS_WEIGHTS']['intensity'] * loss_intensity
    )
    
    return total_loss, loss_sent, loss_emotion, loss_intensity

# UPDATED: now takes both emotion thresholds and sentiment EV thresholds
def compute_metrics(sentiment_outputs, sentiment_labels, emotion_logits, emotion_labels,
                    emotion_thresholds, expected_thresholds):
    # Sentiment
    class_probs = sentiment_outputs['class_probs']
    num_classes = class_probs.size(1)
    
    y_true = sentiment_labels.cpu().numpy()
    class_probs_np = class_probs.cpu().numpy()
    
    # EXPECTED VALUE based decoding instead of plain argmax
    # E[class] = sum_k p_k * k
    ev = np.sum(class_probs_np * np.arange(num_classes), axis=1)
    ev_thresholds = np.array(expected_thresholds)
    
    # np.digitize: returns bin index; with 4 thresholds -> values in {0..4}
    y_pred = np.digitize(ev, bins=ev_thresholds)
    
    sent_acc = accuracy_score(y_true, y_pred)
    _, _, sent_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    sent_mae = mean_absolute_error(y_true, y_pred)
    sent_1off = np.mean(np.abs(y_true - y_pred) <= 1)
    
    # Emotions
    emo_probs = torch.sigmoid(emotion_logits).cpu().numpy()
    emo_true = emotion_labels.cpu().numpy()
    emo_thresholds = np.array(emotion_thresholds)
    emo_pred = (emo_probs >= emo_thresholds).astype(float)
    
    _, _, emo_f1, _ = precision_recall_fscore_support(emo_true, emo_pred, average='samples', zero_division=0)
    
    return {
        'sentiment_accuracy': sent_acc,
        'sentiment_f1': sent_f1,
        'sentiment_mae': sent_mae,
        'sentiment_1off_accuracy': sent_1off,
        'emotion_f1': emo_f1
    }

print("âœ“ Loss functions and metrics defined")

# ==================== PART 6: TRAINER ====================
print("\n" + "=" * 80)
print("PART 6: TRAINER")
print("=" * 80)

class Trainer:
    def __init__(self, model, cfg, train_loader, val_loader, device, emotion_loss_fn):
        self.model = model
        self.cfg = cfg
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.emotion_loss_fn = emotion_loss_fn
        
        self.optimizer = self.make_optimizer()
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=cfg['EPOCHS'])
        self.scaler = GradScaler() if cfg['USE_AMP'] else None
        self.best_metric = -float('inf')
    
    def make_optimizer(self):
        head_params = []
        backbone_params = []
        
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if 'text_model' in name or 'image_model' in name:
                    backbone_params.append(param)
                else:
                    head_params.append(param)
        
        param_groups = [{'params': head_params, 'lr': self.cfg['LR_HEADS']}]
        if backbone_params:
            param_groups.append({'params': backbone_params, 'lr': self.cfg['LR_BACKBONE']})
        
        return torch.optim.AdamW(param_groups, weight_decay=self.cfg['WEIGHT_DECAY'])
    
    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0.0
        
        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.cfg['EPOCHS']} [Train]")
        self.optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(pbar):
            batch_device = {k: v.to(self.device) for k, v in batch.items() if k != 'motivational_flag'}
            
            if self.cfg['USE_AMP']:
                with autocast():
                    outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                    loss, l_sent, l_emo, l_int = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                    loss = loss / self.cfg['GRADIENT_ACCUMULATION_STEPS']
                
                self.scaler.scale(loss).backward()
                
                if (batch_idx + 1) % self.cfg['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if self.cfg['GRADIENT_CLIP'] > 0:
                        self.scaler.unscale_(self.optimizer)
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg['GRADIENT_CLIP'])
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                    self.optimizer.zero_grad()
            else:
                outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                loss, l_sent, l_emo, l_int = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                loss = loss / self.cfg['GRADIENT_ACCUMULATION_STEPS']
                loss.backward()
                
                if (batch_idx + 1) % self.cfg['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if self.cfg['GRADIENT_CLIP'] > 0:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg['GRADIENT_CLIP'])
                    self.optimizer.step()
                    self.optimizer.zero_grad()
            
            total_loss += loss.item() * self.cfg['GRADIENT_ACCUMULATION_STEPS']
            pbar.set_postfix({'loss': f"{loss.item() * self.cfg['GRADIENT_ACCUMULATION_STEPS']:.4f}"})
        
        return total_loss / len(self.train_loader)
    
    def validate(self, epoch):
        self.model.eval()
        total_loss = 0.0
        
        all_sentiment_labels = []
        all_sentiment_outputs = []
        all_emotion_labels = []
        all_emotion_logits = []
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc=f"Epoch {epoch+1} [Val]"):
                batch_device = {k: v.to(self.device) for k, v in batch.items() if k != 'motivational_flag'}
                
                outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                loss, _, _, _ = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                total_loss += loss.item()
                
                all_sentiment_labels.append(batch_device['sentiment_label'].cpu())
                all_sentiment_outputs.append({
                    'cumulative_logits': outputs['sentiment']['cumulative_logits'].cpu(),
                    'class_probs': outputs['sentiment']['class_probs'].cpu()
                })
                all_emotion_labels.append(batch_device['emotion_labels'].cpu())
                all_emotion_logits.append(outputs['emotion_logits'].cpu())
        
        all_sentiment_labels = torch.cat(all_sentiment_labels)
        combined_sentiment = {
            'cumulative_logits': torch.cat([o['cumulative_logits'] for o in all_sentiment_outputs]),
            'class_probs': torch.cat([o['class_probs'] for o in all_sentiment_outputs])
        }
        all_emotion_labels = torch.cat(all_emotion_labels)
        all_emotion_logits = torch.cat(all_emotion_logits)
        
        metrics = compute_metrics(
            combined_sentiment,
            all_sentiment_labels,
            all_emotion_logits, 
            all_emotion_labels,
            self.cfg['EMO_THRESHOLDS'],
            self.cfg['SENTIMENT_EXPECTED_THRESHOLDS']
        )
        
        avg_loss = total_loss / len(self.val_loader)
        
        print(f"\n{'='*70}")
        print(f"Validation Results (Epoch {epoch+1}):")
        print(f"  Loss: {avg_loss:.4f}")
        print(f"  Sentiment Accuracy: {metrics['sentiment_accuracy']:.4f}")
        print(f"  Sentiment F1: {metrics['sentiment_f1']:.4f}")
        print(f"  Sentiment MAE: {metrics['sentiment_mae']:.4f}")
        print(f"  Sentiment 1-off Acc: {metrics['sentiment_1off_accuracy']:.4f}")
        print(f"  Emotion F1: {metrics['emotion_f1']:.4f}")
        print(f"{'='*70}\n")
        
        return {**metrics, 'val_loss': avg_loss}
    
    def fit(self):
        print(f"\n{'='*70}")
        print(f"STARTING TRAINING: {self.cfg['EPOCHS']} EPOCHS")
        print(f"{'='*70}\n")
        
        for epoch in range(self.cfg['EPOCHS']):
            # Early backbone unfreezing
            if epoch == self.cfg['UNFREEZE_BACKBONE_EPOCH']:
                print(f"\n{'='*70}")
                print(f"ðŸ”“ UNFREEZING BACKBONE at epoch {epoch+1}")
                print(f"{'='*70}\n")
                self.model.unfreeze_backbone(layers_to_unfreeze=self.cfg['UNFREEZE_LAYERS'])
                self.optimizer = self.make_optimizer()
                self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.cfg['EPOCHS'])
            
            train_loss = self.train_epoch(epoch)
            print(f"\nTrain Loss: {train_loss:.4f}")
            
            val_metrics = self.validate(epoch)
            
            if self.scheduler:
                self.scheduler.step()
            
            # Composite metric (emphasis on emotion F1)
            composite = (
                val_metrics['sentiment_f1'] +
                val_metrics['sentiment_1off_accuracy'] -
                val_metrics['sentiment_mae'] +
                val_metrics['emotion_f1'] * 1.5
            )
            
            if composite > self.best_metric:
                self.best_metric = composite
                os.makedirs(self.cfg['CHECKPOINT_PATH'], exist_ok=True)
                checkpoint_path = os.path.join(self.cfg['CHECKPOINT_PATH'], 'best_model_enhanced.pt')
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'best_metric': self.best_metric,
                    'metrics': val_metrics,
                    'config': self.cfg
                }, checkpoint_path)
                print(f"âœ“ Saved best model (composite: {composite:.4f})")
        
        print("\nâœ… TRAINING COMPLETED!")
        return self.best_metric

print("âœ“ Trainer class defined")

# ==================== PART 7: DATA LOADING ====================
print("\n" + "=" * 80)
print("PART 7: DATA LOADING & PREPARATION")
print("=" * 80)

# Initialize tokenizer and transforms
tokenizer = AutoTokenizer.from_pretrained(cfg['TEXT_MODEL'])

train_transform = transforms.Compose([
    transforms.Resize((cfg['IMG_SIZE'], cfg['IMG_SIZE'])),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                       std=[0.26862954, 0.26130258, 0.27577711])
])

val_transform = transforms.Compose([
    transforms.Resize((cfg['IMG_SIZE'], cfg['IMG_SIZE'])),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                       std=[0.26862954, 0.26130258, 0.27577711])
])

print("âœ“ Tokenizer and transforms initialized")

# Create datasets
train_dataset = MemeDataset(train_df, tokenizer, train_transform, NEW_TRAIN_DIR, cfg)
val_dataset = MemeDataset(val_df, tokenizer, val_transform, NEW_VAL_DIR, cfg)

print(f"âœ“ Train dataset: {len(train_dataset)} samples")
print(f"âœ“ Val dataset: {len(val_dataset)} samples")

# UPDATED: Create weighted sampler for motivational + extreme sentiment oversampling
print("\nCreating weighted sampler...")
sample_weights = []
motivational_count = 0
extreme_count = 0  # very_positive (0) + very_negative (4)

for idx in range(len(train_dataset)):
    item = train_dataset[idx]
    is_motivational = int(item['motivational_flag'].item())
    sent_label = int(item['sentiment_label'].item())
    
    weight = 1.0
    
    # Motivational oversampling
    if is_motivational:
        weight *= cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']
        motivational_count += 1
    
    # Extreme sentiment oversampling (very_positive=0, very_negative=4)
    if sent_label in [0, 4]:
        weight *= cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']
        extreme_count += 1
    
    sample_weights.append(weight)

motivational_pct = motivational_count / len(train_dataset) * 100 if len(train_dataset) > 0 else 0
extreme_pct = extreme_count / len(train_dataset) * 100 if len(train_dataset) > 0 else 0
effective_motivational = motivational_count * cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR'] / len(train_dataset) * 100 if len(train_dataset) > 0 else 0
effective_extreme = extreme_count * cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR'] / len(train_dataset) * 100 if len(train_dataset) > 0 else 0

print(f"  Motivational samples: {motivational_count} ({motivational_pct:.2f}%)")
print(f"  Oversampling factor (motivational): {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x")
print(f"  Effective motivational representation: {effective_motivational:.1f}%")

print(f"  Extreme sentiment samples (very_pos/very_neg): {extreme_count} ({extreme_pct:.2f}%)")
print(f"  Oversampling factor (extremes): {cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']}x")
print(f"  Effective extreme representation: {effective_extreme:.1f}%")

sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

# Create dataloaders
train_loader = DataLoader(
    train_dataset, batch_size=cfg['BATCH_SIZE'], sampler=sampler,
    num_workers=2, pin_memory=True, drop_last=True
)

val_loader = DataLoader(
    val_dataset, batch_size=cfg['BATCH_SIZE'], shuffle=False,
    num_workers=2, pin_memory=True
)

print(f"âœ“ Train batches: {len(train_loader)}")
print(f"âœ“ Val batches: {len(val_loader)}")

# ==================== PART 8: MODEL INITIALIZATION & TRAINING ====================
print("\n" + "=" * 80)
print("PART 8: MODEL INITIALIZATION")
print("=" * 80)

model = EnhancedFusionModel(cfg).to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Frozen parameters: {total_params - trainable_params:,}")

# Initialize enhanced emotion loss
emotion_loss_fn = EnhancedAsymmetricLoss(
    gamma_neg=cfg['ASL_GAMMA_NEG'],
    gamma_pos=cfg['ASL_GAMMA_POS'],
    clip=cfg['ASL_CLIP'],
    priors=cfg['EMO_PRIORS'],
    prior_tau=cfg['ASL_PRIOR_TAU']
)

print(f"\nâœ“ Enhanced ASL initialized:")
print(f"  Î³_neg={cfg['ASL_GAMMA_NEG']}, Î³_pos={cfg['ASL_GAMMA_POS']}")
print(f"  Prior adjustment: Ï„={cfg['ASL_PRIOR_TAU']}")
print(f"  Priors: {[f'{p:.3f}' for p in cfg['EMO_PRIORS']]}")

# Initialize trainer
trainer = Trainer(model, cfg, train_loader, val_loader, device, emotion_loss_fn)

print("\nâœ“ Trainer initialized")

# ==================== START TRAINING ====================
print("\n" + "=" * 80)
print("STARTING TRAINING")
print("=" * 80)

best_metric = trainer.fit()

print(f"\n{'='*80}")
print(f"âœ… TRAINING COMPLETED!")
print(f"{'='*80}")
print(f"Best composite metric: {best_metric:.4f}")
print(f"Model saved to: {cfg['CHECKPOINT_PATH']}/best_model_enhanced.pt")

# Generate model card
model_card = f"""# Enhanced Multi-modal Meme Analysis Model

## Overview
This model uses a hybrid loss strategy combining ordinal regression for sentiment 
and enhanced asymmetric loss (ASL) with prior adjustment for emotions.

## Key Improvements

### 1. Hybrid Loss Strategy
- **Sentiment**: Ordinal regression respects natural class ordering, with class-weighting
  to emphasize extreme sentiments.
- **Emotions**: Enhanced ASL with positive focusing (Î³_pos={cfg['ASL_GAMMA_POS']}) 
  and prior adjustment (Ï„={cfg['ASL_PRIOR_TAU']})
- **Intensity**: Smooth L1 loss

### 2. Oversampling Strategy
- Motivational oversampling factor: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x
- Extreme sentiment oversampling factor (very_positive / very_negative): {cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']}x
- Original motivational representation: {motivational_pct:.2f}%
- Effective motivational representation: {effective_motivational:.1f}%
- Original extreme representation: {extreme_pct:.2f}%
- Effective extreme representation: {effective_extreme:.1f}%

### 3. Early Backbone Unfreezing
- Unfreezes at epoch {cfg['UNFREEZE_BACKBONE_EPOCH']}
- Layers unfrozen: {cfg['UNFREEZE_LAYERS']}

## Architecture
- **Text**: {cfg['TEXT_MODEL']}
- **Image**: {cfg['IMAGE_MODEL']}
- **Fusion**: Bidirectional cross-attention
- **Params**: {total_params:,} total, {trainable_params:,} trainable

## Training Details
- Epochs: {cfg['EPOCHS']}
- Batch size: {cfg['BATCH_SIZE']}
- LR (heads): {cfg['LR_HEADS']}
- LR (backbone): {cfg['LR_BACKBONE']}
- Loss weights: Sentiment={cfg['LOSS_WEIGHTS']['sentiment']}, 
  Emotion={cfg['LOSS_WEIGHTS']['emotion']}, Intensity={cfg['LOSS_WEIGHTS']['intensity']}

## Performance
- Best composite metric: {best_metric:.4f}

## Dataset
- Training samples: {len(train_df):,}
- Validation samples: {len(val_df):,}

## Usage

```python
checkpoint = torch.load('best_model_enhanced.pt')
model = EnhancedFusionModel(checkpoint['config']).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

with torch.no_grad():
    outputs = model(input_ids, attention_mask, image)
    sentiment_probs = outputs['sentiment']['class_probs']
    # Expected-value based decoding (same as validation):
    num_classes = sentiment_probs.size(1)
    ev = (sentiment_probs * torch.arange(num_classes, device=sentiment_probs.device)).sum(dim=1)
    # Apply thresholds from config['SENTIMENT_EXPECTED_THRESHOLDS'] as in compute_metrics
    emotions = torch.sigmoid(outputs['emotion_logits'])


SyntaxError: incomplete input (2831241693.py, line 997)

In [8]:
# ==================== COMPLETE ENHANCED MEME ANALYSIS PIPELINE ====================
# This code includes BOTH data preparation AND enhanced training
# Run this entire script from start to finish

# ==================== PART 0: SETUP & DEPENDENCIES ====================
import sys, subprocess, os, json, zipfile, shutil, random, warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.cuda.amp import GradScaler, autocast
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm
import yaml

warnings.filterwarnings('ignore')

print("=" * 80)
print("INSTALLING DEPENDENCIES...")
print("=" * 80)

# Install required packages
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers>=4.40.0", "accelerate", "torch", "timm",
    "scikit-learn", "pandas", "matplotlib", "seaborn",
    "huggingface_hub>=0.18.0", "gdown", "iterative-stratification"
])

# Import additional packages
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    mean_squared_error, mean_absolute_error, f1_score
)
from transformers import AutoModel, AutoTokenizer, CLIPModel
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

print("âœ“ All dependencies installed\n")

# ==================== PART 1: DATA PREPARATION ====================
print("=" * 80)
print("PART 1: DATA PREPARATION")
print("=" * 80)

# Download and extract dataset
print("\nDownloading dataset...")
subprocess.run(["gdown", "1jEJ2nf5CDJknq80ogzU-Uyz7jbBi-1LZ", "--fuzzy"], 
               check=False, capture_output=True)

print("Extracting dataset...")
zip_files = [f for f in os.listdir('.') if f.endswith('.zip')]
if zip_files:
    subprocess.run(["unzip", "-q", "-o", zip_files[0]], check=False, capture_output=True)

# Download additional files
subprocess.run([
    "gdown", "--folder", "19yaav8ORSVj9DeJUaHKq1H3HtVnkClBw", "--remaining-ok"
], check=False, capture_output=True)

# Extract password-protected archive
print("Extracting protected archive...")
zip_path = '/kaggle/working/Memotion 3/memotion3.zip'
extract_to = '/kaggle/working/'
password = b'memotion3taskaaai@22'

if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to, pwd=password)
    print(f"âœ“ Extracted to: {extract_to}")

# Setup paths
ORIGINAL_TRAIN_IMG_DIR = '/kaggle/working/trainImages/'
ORIGINAL_CSV_PATH = '/kaggle/working/memotion3/train.csv'
VALIDATION_SPLIT_RATIO = 0.15

OUTPUT_BASE_DIR = '/kaggle/working/'
NEW_VAL_DIR = os.path.join(OUTPUT_BASE_DIR, 'validation_images/')
NEW_TRAIN_DIR = os.path.join(OUTPUT_BASE_DIR, 'new_train_images/')

os.makedirs(NEW_VAL_DIR, exist_ok=True)
os.makedirs(NEW_TRAIN_DIR, exist_ok=True)

# Load and process CSV
print("\nLoading CSV file...")
df = pd.read_csv(ORIGINAL_CSV_PATH)

# Detect image column
possible_image_cols = ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0']
IMAGE_FILENAME_COLUMN = next((col for col in possible_image_cols if col in df.columns), df.columns[0])
print(f"âœ“ Image column: {IMAGE_FILENAME_COLUMN}")

# Normalize labels
for col in ['offensive', 'motivational', 'humour', 'humor', 'sarcastic', 'sarcasm', 'overall', 'sentiment']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().str.strip()

# Create binary labels
def create_binary_label(value, positive_values):
    if pd.isna(value) or value in ['nan', 'none', '']:
        return 0
    return 1 if value in positive_values else 0

if 'offensive' in df.columns:
    df['offensive_bin'] = df['offensive'].apply(
        lambda x: create_binary_label(x, ['slight', 'very_offensive', 'hateful_offensive'])
    )
else:
    df['offensive_bin'] = 0

if 'motivational' in df.columns:
    df['motivational_bin'] = df['motivational'].apply(
        lambda x: create_binary_label(x, ['motivational'])
    )
else:
    df['motivational_bin'] = 0

if 'humour' in df.columns or 'humor' in df.columns:
    humor_col = 'humour' if 'humour' in df.columns else 'humor'
    df['humor_bin'] = df[humor_col].apply(
        lambda x: create_binary_label(x, ['funny', 'very_funny', 'hilarious'])
    )
else:
    df['humor_bin'] = 0

if 'sarcastic' in df.columns or 'sarcasm' in df.columns:
    sarcasm_col = 'sarcastic' if 'sarcastic' in df.columns else 'sarcasm'
    df['sarcasm_bin'] = df[sarcasm_col].apply(
        lambda x: create_binary_label(x, ['general', 'twisted_meaning', 'very_twisted'])
    )
else:
    df['sarcasm_bin'] = 0

# Stratified split
stratify_columns = ['offensive_bin', 'motivational_bin', 'humor_bin', 'sarcasm_bin']
y_stratify = df[stratify_columns].values

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=VALIDATION_SPLIT_RATIO, random_state=42)
train_idx, val_idx = next(msss.split(df, y_stratify))

train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

print(f"\nâœ“ Stratified split complete:")
print(f"  Training: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")

# Calculate label priors
label_priors = {
    'offensive_pos_rate': float(train_df['offensive_bin'].sum() / len(train_df)),
    'motivational_pos_rate': float(train_df['motivational_bin'].sum() / len(train_df)),
    'humor_pos_rate': float(train_df['humor_bin'].sum() / len(train_df)),
    'sarcasm_pos_rate': float(train_df['sarcasm_bin'].sum() / len(train_df))
}

priors_path = os.path.join(OUTPUT_BASE_DIR, 'label_priors.json')
with open(priors_path, 'w') as f:
    json.dump(label_priors, f, indent=2)

print(f"\nâœ“ Label priors calculated:")
for key, val in label_priors.items():
    print(f"  {key}: {val:.4f}")

# Copy images
def copy_images(df_subset, dest_dir, source_dir, image_col):
    copied = 0
    missing = 0
    
    for idx in tqdm(df_subset[image_col], desc=f"Copying to {dest_dir}"):
        filename = str(idx)
        if not any(filename.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']):
            for ext in ['.jpg', '.jpeg', '.png']:
                test_path = os.path.join(source_dir, f"{filename}{ext}")
                if os.path.exists(test_path):
                    filename = f"{filename}{ext}"
                    break
            else:
                filename = f"{filename}.jpg"
        
        source_path = os.path.join(source_dir, filename)
        if os.path.exists(source_path):
            shutil.copy(source_path, os.path.join(dest_dir, filename))
            copied += 1
        else:
            missing += 1
    
    return copied, missing

print("\nCopying images...")
copied_val, missing_val = copy_images(val_df, NEW_VAL_DIR, ORIGINAL_TRAIN_IMG_DIR, IMAGE_FILENAME_COLUMN)
copied_train, missing_train = copy_images(train_df, NEW_TRAIN_DIR, ORIGINAL_TRAIN_IMG_DIR, IMAGE_FILENAME_COLUMN)

print(f"âœ“ Validation: {copied_val} copied, {missing_val} missing")
print(f"âœ“ Training: {copied_train} copied, {missing_train} missing")

# Save CSVs
train_csv_path = os.path.join(OUTPUT_BASE_DIR, 'train_split.csv')
val_csv_path = os.path.join(OUTPUT_BASE_DIR, 'validation_split.csv')

train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

print(f"\nâœ“ Saved train CSV: {train_csv_path}")
print(f"âœ“ Saved validation CSV: {val_csv_path}")
print("\nâœ… DATA PREPARATION COMPLETE\n")

# ==================== PART 2: CONFIGURATION ====================
print("=" * 80)
print("PART 2: CONFIGURATION")
print("=" * 80)

CONFIG_YAML = """
TEXT_MODEL: "google/muril-base-cased"
IMAGE_MODEL: "openai/clip-vit-base-patch32"
TEXT_DIM: 768
IMAGE_DIM: 768
FUSION_DIM: 512
FUSION_OUT_DIM: 512

MAX_LEN: 128
IMG_SIZE: 224
BATCH_SIZE: 16
GRADIENT_ACCUMULATION_STEPS: 2
LR_HEADS: 0.001
LR_BACKBONE: 0.00002
WEIGHT_DECAY: 0.01
EPOCHS: 20
SEED: 42
DEVICE: "cuda"
CHECKPOINT_PATH: "/kaggle/working/checkpoints"

NUM_SENTIMENT_CLASSES: 5
NUM_EMOTION_CLASSES: 4

USE_ORDINAL_REGRESSION: true
ORDINAL_LINK: "logit"

LOSS_WEIGHTS:
  sentiment: 2.0
  emotion: 1.5
  intensity: 0.5

ASL_GAMMA_NEG: 6.0
ASL_GAMMA_POS: 0.5
ASL_CLIP: 0.05
ASL_PRIOR_TAU: 1.2

EMOTION_LABELS: ["humor", "sarcasm", "offensive", "motivational"]
EMO_THRESHOLDS: [0.5, 0.5, 0.60, 0.60]

# thresholds for expected-value based sentiment decoding
SENTIMENT_EXPECTED_THRESHOLDS: [0.5, 1.5, 2.5, 3.5]

# class weights for ordinal loss (upweight extremes 0 and 4)
SENTIMENT_CLASS_WEIGHTS: [1.5, 1.0, 1.0, 1.0, 1.5]

POOLING: "mean"
USE_AMP: true
GRADIENT_CLIP: 1.0
SCHEDULER: "cosine"
UNFREEZE_BACKBONE_EPOCH: 2
UNFREEZE_LAYERS: 3

MOTIVATIONAL_OVERSAMPLE_FACTOR: 8.0

# oversampling factor for extreme sentiment classes (very_positive & very_negative)
EXTREME_SENTIMENT_OVERSAMPLE_FACTOR: 5.0

CROSS_ATTN_HEADS: 8
CROSS_ATTN_DROPOUT: 0.1

SENTIMENT_MAP_REV:
  0: "very_positive"
  1: "positive"
  2: "neutral"
  3: "negative"
  4: "very_negative"
"""

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

cfg = yaml.safe_load(CONFIG_YAML)
set_seed(cfg['SEED'])

# Load priors
with open(priors_path, 'r') as f:
    priors = json.load(f)

cfg['EMO_PRIORS'] = [
    priors['humor_pos_rate'],
    priors['sarcasm_pos_rate'],
    priors['offensive_pos_rate'],
    priors['motivational_pos_rate']
]

device = torch.device(cfg['DEVICE'] if torch.cuda.is_available() else 'cpu')

print(f"\nâœ“ Configuration loaded:")
print(f"  Device: {device}")
print(f"  Epochs: {cfg['EPOCHS']}")
print(f"  Batch size: {cfg['BATCH_SIZE']}")
print(f"  Motivational oversampling: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x")
print(f"  Extreme sentiment oversampling: {cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']}x")
print(f"  Emotion priors: {[f'{p:.3f}' for p in cfg['EMO_PRIORS']]}")

# ==================== PART 3: MODEL COMPONENTS ====================
print("\n" + "=" * 80)
print("PART 3: MODEL COMPONENTS")
print("=" * 80)

class EnhancedAsymmetricLoss(nn.Module):
    """Enhanced ASL with prior adjustment"""
    def __init__(self, gamma_neg=6.0, gamma_pos=0.5, clip=0.05, priors=None, prior_tau=1.2, eps=1e-8):
        super().__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps
        self.priors = priors
        self.prior_tau = prior_tau
    
    def forward(self, logits, targets):
        # Prior adjustment
        if self.priors is not None:
            priors_tensor = torch.tensor(self.priors, device=logits.device, dtype=logits.dtype)
            adjustment = self.prior_tau * torch.log(priors_tensor.clamp(min=self.eps))
            logits = logits - adjustment
        
        xs_pos = torch.sigmoid(logits)
        xs_neg = 1 - xs_pos
        
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)
        
        los_pos = targets * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - targets) * torch.log(xs_neg.clamp(min=self.eps))
        
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            pt0 = xs_pos * targets
            pt1 = xs_neg * (1 - targets)
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * targets + self.gamma_neg * (1 - targets)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            loss = one_sided_w * (los_pos + los_neg)
        else:
            loss = los_pos + los_neg
        
        return -loss.mean()

class OrdinalRegressionHead(nn.Module):
    """Ordinal regression using cumulative link model"""
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.num_thresholds = num_classes - 1
        
        self.projection = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
        
        initial_thresholds = torch.linspace(-2, 2, self.num_thresholds)
        self.thresholds = nn.Parameter(initial_thresholds)
    
    def forward(self, x):
        score = self.projection(x).squeeze(-1)
        ordered_thresholds = torch.cumsum(F.softplus(self.thresholds), dim=0)
        cumulative_logits = ordered_thresholds.unsqueeze(0) - score.unsqueeze(1)
        cumulative_probs = torch.sigmoid(cumulative_logits)
        
        batch_size = cumulative_probs.size(0)
        class_probs = torch.zeros(batch_size, self.num_classes, device=x.device)
        
        class_probs[:, 0] = cumulative_probs[:, 0]
        for k in range(1, self.num_thresholds):
            class_probs[:, k] = cumulative_probs[:, k] - cumulative_probs[:, k-1]
        class_probs[:, -1] = 1.0 - cumulative_probs[:, -1]
        class_probs = torch.clamp(class_probs, min=1e-7, max=1.0)
        
        return {'cumulative_logits': cumulative_logits, 'class_probs': class_probs}

class CrossAttentionFusion(nn.Module):
    """Bidirectional cross-attention"""
    def __init__(self, dim, num_heads=8, dropout=0.1):
        super().__init__()
        self.text_to_image_attn = nn.MultiheadAttention(dim, num_heads, dropout, batch_first=True)
        self.image_to_text_attn = nn.MultiheadAttention(dim, num_heads, dropout, batch_first=True)
        self.text_norm = nn.LayerNorm(dim)
        self.image_norm = nn.LayerNorm(dim)
        self.text_ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(dim * 4, dim), nn.Dropout(dropout)
        )
        self.image_ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(dim * 4, dim), nn.Dropout(dropout)
        )
        self.ffn_norm_text = nn.LayerNorm(dim)
        self.ffn_norm_image = nn.LayerNorm(dim)
    
    def forward(self, text_emb, image_emb):
        text_seq = text_emb.unsqueeze(1)
        image_seq = image_emb.unsqueeze(1)
        
        text_attended, _ = self.text_to_image_attn(text_seq, image_seq, image_seq)
        text_out = self.text_norm(text_emb + text_attended.squeeze(1))
        
        image_attended, _ = self.image_to_text_attn(image_seq, text_seq, text_seq)
        image_out = self.image_norm(image_emb + image_attended.squeeze(1))
        
        text_final = self.ffn_norm_text(text_out + self.text_ffn(text_out))
        image_final = self.ffn_norm_image(image_out + self.image_ffn(image_out))
        
        return text_final, image_final

class EnhancedFusionModel(nn.Module):
    """Multi-modal model with ordinal regression + enhanced ASL"""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        
        self.text_model = AutoModel.from_pretrained(cfg['TEXT_MODEL'])
        clip_model = CLIPModel.from_pretrained(cfg['IMAGE_MODEL'])
        self.image_model = clip_model.vision_model
        
        self._freeze_encoders()
        
        self.text_proj = nn.Linear(cfg['TEXT_DIM'], cfg['FUSION_DIM'])
        self.image_proj = nn.Linear(cfg['IMAGE_DIM'], cfg['FUSION_DIM'])
        
        self.cross_attention = CrossAttentionFusion(
            dim=cfg['FUSION_DIM'],
            num_heads=cfg['CROSS_ATTN_HEADS'],
            dropout=cfg['CROSS_ATTN_DROPOUT']
        )
        
        fusion_input_dim = cfg['FUSION_DIM'] * 2
        self.fusion_norm = nn.LayerNorm(fusion_input_dim)
        self.fusion_mlp = nn.Sequential(
            nn.Linear(fusion_input_dim, 512), nn.GELU(), nn.Dropout(0.2),
            nn.Linear(512, cfg['FUSION_OUT_DIM']), nn.LayerNorm(cfg['FUSION_OUT_DIM'])
        )
        
        self.sentiment_head = OrdinalRegressionHead(cfg['FUSION_OUT_DIM'], cfg['NUM_SENTIMENT_CLASSES'])
        self.emotion_head = nn.Sequential(
            nn.Linear(cfg['FUSION_OUT_DIM'], 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, cfg['NUM_EMOTION_CLASSES'])
        )
        self.intensity_head = nn.Sequential(
            nn.Linear(cfg['FUSION_OUT_DIM'], 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
    
    def _freeze_encoders(self):
        for param in self.text_model.parameters():
            param.requires_grad = False
        for param in self.image_model.parameters():
            param.requires_grad = False
    
    def unfreeze_backbone(self, layers_to_unfreeze=3):
        if hasattr(self.text_model, 'encoder') and hasattr(self.text_model.encoder, 'layer'):
            for layer in list(self.text_model.encoder.layer[-layers_to_unfreeze:]):
                for param in layer.parameters():
                    param.requires_grad = True
        
        if hasattr(self.image_model, 'encoder') and hasattr(self.image_model.encoder, 'layers'):
            for layer in list(self.image_model.encoder.layers[-layers_to_unfreeze:]):
                for param in layer.parameters():
                    param.requires_grad = True
    
    def pool_text(self, model_output, attention_mask):
        last_hidden = model_output.last_hidden_state
        if self.cfg['POOLING'] == 'cls':
            return last_hidden[:, 0]
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        sum_embeddings = torch.sum(last_hidden * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
    
    def forward(self, input_ids, attention_mask, image):
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = self.pool_text(text_output, attention_mask)
        
        image_output = self.image_model(pixel_values=image)
        image_emb = image_output.pooler_output
        
        text_proj = self.text_proj(text_emb)
        image_proj = self.image_proj(image_emb)
        
        text_cross, image_cross = self.cross_attention(text_proj, image_proj)
        
        fused = torch.cat([text_cross, image_cross], dim=1)
        fused = self.fusion_norm(fused)
        fused = self.fusion_mlp(fused)
        
        sentiment_outputs = self.sentiment_head(fused)
        emotion_logits = self.emotion_head(fused)
        intensity = self.intensity_head(fused).squeeze(-1)
        
        return {
            'sentiment': sentiment_outputs,
            'emotion_logits': emotion_logits,
            'intensity': intensity
        }

print("âœ“ Model components defined")

# ==================== PART 4: DATASET ====================
print("\n" + "=" * 80)
print("PART 4: DATASET")
print("=" * 80)

class MemeDataset(Dataset):
    def __init__(self, df, tokenizer, image_transform, image_dir, cfg):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.image_dir = image_dir
        self.cfg = cfg
        self._detect_columns()
    
    def _detect_columns(self):
        cols = self.df.columns.tolist()
        self.image_col = next((c for c in ['image_name', 'image', 'img_name', 'filename', 'Unnamed: 0'] if c in cols), cols[0])
        self.text_col = next((c for c in ['text', 'ocr_text', 'caption', 'OCR_extracted_text'] if c in cols), None)
        self.sentiment_col = next((c for c in ['sentiment', 'overall_sentiment', 'overall'] if c in cols), None)
        
        self.sentiment_map = {'very_positive': 0, 'positive': 1, 'neutral': 2, 'negative': 3, 'very_negative': 4}
        self.humor_map = {'not_funny': 0, 'funny': 1, 'very_funny': 1, 'hilarious': 1}
        self.sarcasm_map = {'not_sarcastic': 0, 'general': 1, 'twisted_meaning': 1, 'very_twisted': 1}
        self.offensive_map = {'not_offensive': 0, 'slight': 1, 'very_offensive': 1, 'hateful_offensive': 1}
        self.motivational_map = {'not_motivational': 0, 'motivational': 1}
    
    def _map_label(self, value, mapping, default=0):
        if pd.isna(value):
            return default
        if isinstance(value, str):
            return mapping.get(value.lower().strip(), default)
        return int(value)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        image_name = str(row[self.image_col])
        if not any(image_name.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png']):
            image_name = f"{image_name}.jpg"
        image_path = os.path.join(self.image_dir, image_name)
        
        try:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transform(image)
        except:
            image = torch.zeros(3, self.cfg['IMG_SIZE'], self.cfg['IMG_SIZE'])
        
        text = str(row.get(self.text_col, '')) if self.text_col else 'No text'
        encoding = self.tokenizer(text, max_length=self.cfg['MAX_LEN'], padding='max_length', truncation=True, return_tensors='pt')
        
        sentiment_val = row.get(self.sentiment_col, 'neutral') if self.sentiment_col else 'neutral'
        sentiment_label = self._map_label(sentiment_val, self.sentiment_map, default=2)
        
        emotion_labels = torch.tensor([
            float(self._map_label(row.get('humour', row.get('humor', 0)), self.humor_map, 0)),
            float(self._map_label(row.get('sarcastic', row.get('sarcasm', 0)), self.sarcasm_map, 0)),
            float(self._map_label(row.get('offensive', 0), self.offensive_map, 0)),
            float(self._map_label(row.get('motivational', 0), self.motivational_map, 0))
        ], dtype=torch.float)
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': image,
            'sentiment_label': torch.tensor(sentiment_label, dtype=torch.long),
            'emotion_labels': emotion_labels,
            'intensity': torch.tensor(0.5, dtype=torch.float),
            'motivational_flag': emotion_labels[3]
        }

print("âœ“ Dataset class defined")

# ==================== PART 5: LOSS & METRICS ====================
print("\n" + "=" * 80)
print("PART 5: LOSS FUNCTIONS & METRICS")
print("=" * 80)

def ordinal_regression_loss(cumulative_logits, labels, class_weights=None):
    batch_size = labels.size(0)
    num_thresholds = cumulative_logits.size(1)
    target_cumulative = torch.zeros_like(cumulative_logits)
    
    for i in range(batch_size):
        y = int(labels[i].item())
        if y < num_thresholds:
            target_cumulative[i, y:] = 1.0
    
    loss_matrix = F.binary_cross_entropy_with_logits(
        cumulative_logits, target_cumulative, reduction='none'
    )
    
    if class_weights is not None:
        cw = torch.tensor(class_weights, device=labels.device, dtype=loss_matrix.dtype)
        sample_weights = cw[labels]
        loss_matrix = loss_matrix * sample_weights.unsqueeze(1)
    
    return loss_matrix.mean()

def combined_loss(outputs, batch, cfg, emotion_loss_fn):
    loss_sent = ordinal_regression_loss(
        outputs['sentiment']['cumulative_logits'],
        batch['sentiment_label'],
        cfg.get('SENTIMENT_CLASS_WEIGHTS')
    )
    loss_emotion = emotion_loss_fn(outputs['emotion_logits'], batch['emotion_labels'])
    loss_intensity = F.smooth_l1_loss(outputs['intensity'], batch['intensity'])
    
    total_loss = (
        cfg['LOSS_WEIGHTS']['sentiment'] * loss_sent +
        cfg['LOSS_WEIGHTS']['emotion'] * loss_emotion +
        cfg['LOSS_WEIGHTS']['intensity'] * loss_intensity
    )
    
    return total_loss, loss_sent, loss_emotion, loss_intensity

def compute_metrics(sentiment_outputs, sentiment_labels, emotion_logits, emotion_labels,
                    emotion_thresholds, expected_thresholds):
    class_probs = sentiment_outputs['class_probs']
    num_classes = class_probs.size(1)
    
    y_true = sentiment_labels.cpu().numpy()
    class_probs_np = class_probs.cpu().numpy()
    
    ev = np.sum(class_probs_np * np.arange(num_classes), axis=1)
    ev_thresholds = np.array(expected_thresholds)
    y_pred = np.digitize(ev, bins=ev_thresholds)
    
    sent_acc = accuracy_score(y_true, y_pred)
    _, _, sent_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    sent_mae = mean_absolute_error(y_true, y_pred)
    sent_1off = np.mean(np.abs(y_true - y_pred) <= 1)
    
    emo_probs = torch.sigmoid(emotion_logits).cpu().numpy()
    emo_true = emotion_labels.cpu().numpy()
    emo_thresholds = np.array(emotion_thresholds)
    emo_pred = (emo_probs >= emo_thresholds).astype(float)
    
    _, _, emo_f1, _ = precision_recall_fscore_support(emo_true, emo_pred, average='samples', zero_division=0)
    
    return {
        'sentiment_accuracy': sent_acc,
        'sentiment_f1': sent_f1,
        'sentiment_mae': sent_mae,
        'sentiment_1off_accuracy': sent_1off,
        'emotion_f1': emo_f1
    }

print("âœ“ Loss functions and metrics defined")

# ==================== PART 6: TRAINER ====================
print("\n" + "=" * 80)
print("PART 6: TRAINER")
print("=" * 80)

class Trainer:
    def __init__(self, model, cfg, train_loader, val_loader, device, emotion_loss_fn):
        self.model = model
        self.cfg = cfg
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.emotion_loss_fn = emotion_loss_fn
        
        self.optimizer = self.make_optimizer()
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=cfg['EPOCHS'])
        self.scaler = GradScaler() if cfg['USE_AMP'] else None
        self.best_metric = -float('inf')
    
    def make_optimizer(self):
        head_params = []
        backbone_params = []
        
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if 'text_model' in name or 'image_model' in name:
                    backbone_params.append(param)
                else:
                    head_params.append(param)
        
        param_groups = [{'params': head_params, 'lr': self.cfg['LR_HEADS']}]
        if backbone_params:
            param_groups.append({'params': backbone_params, 'lr': self.cfg['LR_BACKBONE']})
        
        return torch.optim.AdamW(param_groups, weight_decay=self.cfg['WEIGHT_DECAY'])
    
    def train_epoch(self, epoch):
        self.model.train()
        total_loss = 0.0
        
        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.cfg['EPOCHS']} [Train]")
        self.optimizer.zero_grad()
        
        for batch_idx, batch in enumerate(pbar):
            batch_device = {k: v.to(self.device) for k, v in batch.items() if k != 'motivational_flag'}
            
            if self.cfg['USE_AMP']:
                with autocast():
                    outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                    loss, l_sent, l_emo, l_int = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                    loss = loss / self.cfg['GRADIENT_ACCUMULATION_STEPS']
                
                self.scaler.scale(loss).backward()
                
                if (batch_idx + 1) % self.cfg['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if self.cfg['GRADIENT_CLIP'] > 0:
                        self.scaler.unscale_(self.optimizer)
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg['GRADIENT_CLIP'])
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                    self.optimizer.zero_grad()
            else:
                outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                loss, l_sent, l_emo, l_int = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                loss = loss / self.cfg['GRADIENT_ACCUMULATION_STEPS']
                loss.backward()
                
                if (batch_idx + 1) % self.cfg['GRADIENT_ACCUMULATION_STEPS'] == 0:
                    if self.cfg['GRADIENT_CLIP'] > 0:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg['GRADIENT_CLIP'])
                    self.optimizer.step()
                    self.optimizer.zero_grad()
            
            total_loss += loss.item() * self.cfg['GRADIENT_ACCUMULATION_STEPS']
            pbar.set_postfix({'loss': f"{loss.item() * self.cfg['GRADIENT_ACCUMULATION_STEPS']:.4f}"})
        
        return total_loss / len(self.train_loader)
    
    def validate(self, epoch):
        self.model.eval()
        total_loss = 0.0
        
        all_sentiment_labels = []
        all_sentiment_outputs = []
        all_emotion_labels = []
        all_emotion_logits = []
        
        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc=f"Epoch {epoch+1} [Val]"):
                batch_device = {k: v.to(self.device) for k, v in batch.items() if k != 'motivational_flag'}
                
                outputs = self.model(batch_device['input_ids'], batch_device['attention_mask'], batch_device['image'])
                loss, _, _, _ = combined_loss(outputs, batch_device, self.cfg, self.emotion_loss_fn)
                total_loss += loss.item()
                
                all_sentiment_labels.append(batch_device['sentiment_label'].cpu())
                all_sentiment_outputs.append({
                    'cumulative_logits': outputs['sentiment']['cumulative_logits'].cpu(),
                    'class_probs': outputs['sentiment']['class_probs'].cpu()
                })
                all_emotion_labels.append(batch_device['emotion_labels'].cpu())
                all_emotion_logits.append(outputs['emotion_logits'].cpu())
        
        all_sentiment_labels = torch.cat(all_sentiment_labels)
        combined_sentiment = {
            'cumulative_logits': torch.cat([o['cumulative_logits'] for o in all_sentiment_outputs]),
            'class_probs': torch.cat([o['class_probs'] for o in all_sentiment_outputs])
        }
        all_emotion_labels = torch.cat(all_emotion_labels)
        all_emotion_logits = torch.cat(all_emotion_logits)
        
        metrics = compute_metrics(
            combined_sentiment,
            all_sentiment_labels,
            all_emotion_logits, 
            all_emotion_labels,
            self.cfg['EMO_THRESHOLDS'],
            self.cfg['SENTIMENT_EXPECTED_THRESHOLDS']
        )
        
        avg_loss = total_loss / len(self.val_loader)
        
        print(f"\n{'='*70}")
        print(f"Validation Results (Epoch {epoch+1}):")
        print(f"  Loss: {avg_loss:.4f}")
        print(f"  Sentiment Accuracy: {metrics['sentiment_accuracy']:.4f}")
        print(f"  Sentiment F1: {metrics['sentiment_f1']:.4f}")
        print(f"  Sentiment MAE: {metrics['sentiment_mae']:.4f}")
        print(f"  Sentiment 1-off Acc: {metrics['sentiment_1off_accuracy']:.4f}")
        print(f"  Emotion F1: {metrics['emotion_f1']:.4f}")
        print(f"{'='*70}\n")
        
        return {**metrics, 'val_loss': avg_loss}
    
    def fit(self):
        print(f"\n{'='*70}")
        print(f"STARTING TRAINING: {self.cfg['EPOCHS']} EPOCHS")
        print(f"{'='*70}\n")
        
        for epoch in range(self.cfg['EPOCHS']):
            if epoch == self.cfg['UNFREEZE_BACKBONE_EPOCH']:
                print(f"\n{'='*70}")
                print(f"ðŸ”“ UNFREEZING BACKBONE at epoch {epoch+1}")
                print(f"{'='*70}\n")
                self.model.unfreeze_backbone(layers_to_unfreeze=self.cfg['UNFREEZE_LAYERS'])
                self.optimizer = self.make_optimizer()
                self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.cfg['EPOCHS'])
            
            train_loss = self.train_epoch(epoch)
            print(f"\nTrain Loss: {train_loss:.4f}")
            
            val_metrics = self.validate(epoch)
            
            if self.scheduler:
                self.scheduler.step()
            
            composite = (
                val_metrics['sentiment_f1'] +
                val_metrics['sentiment_1off_accuracy'] -
                val_metrics['sentiment_mae'] +
                val_metrics['emotion_f1'] * 1.5
            )
            
            if composite > self.best_metric:
                self.best_metric = composite
                os.makedirs(self.cfg['CHECKPOINT_PATH'], exist_ok=True)
                checkpoint_path = os.path.join(self.cfg['CHECKPOINT_PATH'], 'best_model_enhanced.pt')
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'best_metric': self.best_metric,
                    'metrics': val_metrics,
                    'config': self.cfg
                }, checkpoint_path)
                print(f"âœ“ Saved best model (composite: {composite:.4f})")
        
        print("\nâœ… TRAINING COMPLETED!")
        return self.best_metric

print("âœ“ Trainer class defined")

# ==================== PART 7: DATA LOADING ====================
print("\n" + "=" * 80)
print("PART 7: DATA LOADING & PREPARATION")
print("=" * 80)

tokenizer = AutoTokenizer.from_pretrained(cfg['TEXT_MODEL'])

train_transform = transforms.Compose([
    transforms.Resize((cfg['IMG_SIZE'], cfg['IMG_SIZE'])),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                       std=[0.26862954, 0.26130258, 0.27577711])
])

val_transform = transforms.Compose([
    transforms.Resize((cfg['IMG_SIZE'], cfg['IMG_SIZE'])),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                       std=[0.26862954, 0.26130258, 0.27577711])
])

print("âœ“ Tokenizer and transforms initialized")

train_dataset = MemeDataset(train_df, tokenizer, train_transform, NEW_TRAIN_DIR, cfg)
val_dataset = MemeDataset(val_df, tokenizer, val_transform, NEW_VAL_DIR, cfg)

print(f"âœ“ Train dataset: {len(train_dataset)} samples")
print(f"âœ“ Val dataset: {len(val_dataset)} samples")

print("\nCreating weighted sampler...")
sample_weights = []
motivational_count = 0
extreme_count = 0

for idx in range(len(train_dataset)):
    item = train_dataset[idx]
    is_motivational = int(item['motivational_flag'].item())
    sent_label = int(item['sentiment_label'].item())
    
    weight = 1.0
    
    if is_motivational:
        weight *= cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']
        motivational_count += 1
    
    if sent_label in [0, 4]:
        weight *= cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']
        extreme_count += 1
    
    sample_weights.append(weight)

if len(train_dataset) > 0:
    motivational_pct = motivational_count / len(train_dataset) * 100
    extreme_pct = extreme_count / len(train_dataset) * 100
    effective_motivational = motivational_count * cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR'] / len(train_dataset) * 100
    effective_extreme = extreme_count * cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR'] / len(train_dataset) * 100
else:
    motivational_pct = extreme_pct = effective_motivational = effective_extreme = 0.0

print(f"  Motivational samples: {motivational_count} ({motivational_pct:.2f}%)")
print(f"  Oversampling factor (motivational): {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x")
print(f"  Effective motivational representation: {effective_motivational:.1f}%")
print(f"  Extreme sentiment samples (very_pos/very_neg): {extreme_count} ({extreme_pct:.2f}%)")
print(f"  Oversampling factor (extremes): {cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']}x")
print(f"  Effective extreme representation: {effective_extreme:.1f}%")

sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

train_loader = DataLoader(
    train_dataset, batch_size=cfg['BATCH_SIZE'], sampler=sampler,
    num_workers=2, pin_memory=True, drop_last=True
)

val_loader = DataLoader(
    val_dataset, batch_size=cfg['BATCH_SIZE'], shuffle=False,
    num_workers=2, pin_memory=True
)

print(f"âœ“ Train batches: {len(train_loader)}")
print(f"âœ“ Val batches: {len(val_loader)}")

# ==================== PART 8: MODEL INITIALIZATION & TRAINING ====================
print("\n" + "=" * 80)
print("PART 8: MODEL INITIALIZATION")
print("=" * 80)

model = EnhancedFusionModel(cfg).to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Frozen parameters: {total_params - trainable_params:,}")

emotion_loss_fn = EnhancedAsymmetricLoss(
    gamma_neg=cfg['ASL_GAMMA_NEG'],
    gamma_pos=cfg['ASL_GAMMA_POS'],
    clip=cfg['ASL_CLIP'],
    priors=cfg['EMO_PRIORS'],
    prior_tau=cfg['ASL_PRIOR_TAU']
)

print(f"\nâœ“ Enhanced ASL initialized:")
print(f"  gamma_neg={cfg['ASL_GAMMA_NEG']}, gamma_pos={cfg['ASL_GAMMA_POS']}")
print(f"  Prior adjustment: tau={cfg['ASL_PRIOR_TAU']}")
print(f"  Priors: {[f'{p:.3f}' for p in cfg['EMO_PRIORS']]}")

trainer = Trainer(model, cfg, train_loader, val_loader, device, emotion_loss_fn)

print("\nâœ“ Trainer initialized")

print("\n" + "=" * 80)
print("STARTING TRAINING")
print("=" * 80)

best_metric = trainer.fit()

print(f"\n{'='*80}")
print("âœ… TRAINING COMPLETED!")
print(f"{'='*80}")
print(f"Best composite metric: {best_metric:.4f}")
print(f"Model saved to: {cfg['CHECKPOINT_PATH']}/best_model_enhanced.pt")

model_card_lines = [
    "# Enhanced Multi-modal Meme Analysis Model",
    "",
    "## Overview",
    "This model uses a hybrid loss strategy combining ordinal regression for sentiment",
    "and enhanced asymmetric loss (ASL) with prior adjustment for emotions.",
    "",
    "## Key Improvements",
    "",
    "### 1. Hybrid Loss Strategy",
    "- Sentiment: Ordinal regression respects natural class ordering, with class-weighting",
    "  to emphasize extreme sentiments.",
    f"- Emotions: Enhanced ASL with positive focusing (gamma_pos={cfg['ASL_GAMMA_POS']})",
    f"  and prior adjustment (tau={cfg['ASL_PRIOR_TAU']}).",
    "- Intensity: Smooth L1 loss.",
    "",
    "### 2. Oversampling Strategy",
    f"- Motivational oversampling factor: {cfg['MOTIVATIONAL_OVERSAMPLE_FACTOR']}x",
    f"- Extreme sentiment oversampling factor (very_positive / very_negative): {cfg['EXTREME_SENTIMENT_OVERSAMPLE_FACTOR']}x",
    f"- Original motivational representation: {motivational_pct:.2f}%",
    f"- Effective motivational representation: {effective_motivational:.1f}%",
    f"- Original extreme representation: {extreme_pct:.2f}%",
    f"- Effective extreme representation: {effective_extreme:.1f}%",
    "",
    "### 3. Early Backbone Unfreezing",
    f"- Unfreezes at epoch {cfg['UNFREEZE_BACKBONE_EPOCH']}",
    f"- Layers unfrozen: {cfg['UNFREEZE_LAYERS']}",
    "",
    "## Architecture",
    f"- Text encoder: {cfg['TEXT_MODEL']}",
    f"- Image encoder: {cfg['IMAGE_MODEL']}",
    "- Fusion: Bidirectional cross-attention",
    f"- Parameters: {total_params:,} total, {trainable_params:,} trainable",
    "",
    "## Training Details",
    f"- Epochs: {cfg['EPOCHS']}",
    f"- Batch size: {cfg['BATCH_SIZE']}",
    f"- LR (heads): {cfg['LR_HEADS']}",
    f"- LR (backbone): {cfg['LR_BACKBONE']}",
    f"- Loss weights: Sentiment={cfg['LOSS_WEIGHTS']['sentiment']}, "
    f"Emotion={cfg['LOSS_WEIGHTS']['emotion']}, Intensity={cfg['LOSS_WEIGHTS']['intensity']}",
    "",
    "## Performance",
    f"- Best composite metric: {best_metric:.4f}",
    "",
    "## Dataset",
    f"- Training samples: {len(train_df):,}",
    f"- Validation samples: {len(val_df):,}",
    "",
    "## Inference Hint",
    "Use the same expected-value based decoding as in validation:",
    "",
    "```python",
    "with torch.no_grad():",
    "    outputs = model(input_ids, attention_mask, image)",
    "    sentiment_probs = outputs['sentiment']['class_probs']",
    "    num_classes = sentiment_probs.size(1)",
    "    ev = (sentiment_probs * torch.arange(num_classes, device=sentiment_probs.device)).sum(dim=1)",
    "    # Map ev to class index using cfg['SENTIMENT_EXPECTED_THRESHOLDS']",
    "    emotions = torch.sigmoid(outputs['emotion_logits'])",
    "```",
]

model_card = "\n".join(model_card_lines)

model_card_path = os.path.join(cfg['CHECKPOINT_PATH'], 'model_card.md')
with open(model_card_path, 'w') as f:
    f.write(model_card)

print(f"\nâœ“ Model card saved to: {model_card_path}")
print("\n" + "=" * 80)
print("ALL DONE! ðŸŽ‰")
print("=" * 80 + "\n")


INSTALLING DEPENDENCIES...
âœ“ All dependencies installed

PART 1: DATA PREPARATION

Downloading dataset...
Extracting dataset...
Extracting protected archive...
âœ“ Extracted to: /kaggle/working/

Loading CSV file...
âœ“ Image column: Unnamed: 0

âœ“ Stratified split complete:
  Training: 5950 samples
  Validation: 1050 samples

âœ“ Label priors calculated:
  offensive_pos_rate: 0.3909
  motivational_pos_rate: 0.1187
  humor_pos_rate: 0.8558
  sarcasm_pos_rate: 0.7891

Copying images...


Copying to /kaggle/working/validation_images/: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1050/1050 [00:00<00:00, 5975.73it/s]
Copying to /kaggle/working/new_train_images/: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5950/5950 [00:01<00:00, 5543.27it/s]


âœ“ Validation: 1050 copied, 0 missing
âœ“ Training: 5950 copied, 0 missing

âœ“ Saved train CSV: /kaggle/working/train_split.csv
âœ“ Saved validation CSV: /kaggle/working/validation_split.csv

âœ… DATA PREPARATION COMPLETE

PART 2: CONFIGURATION

âœ“ Configuration loaded:
  Device: cuda
  Epochs: 20
  Batch size: 16
  Motivational oversampling: 8.0x
  Extreme sentiment oversampling: 5.0x
  Emotion priors: ['0.856', '0.789', '0.391', '0.119']

PART 3: MODEL COMPONENTS
âœ“ Model components defined

PART 4: DATASET
âœ“ Dataset class defined

PART 5: LOSS FUNCTIONS & METRICS
âœ“ Loss functions and metrics defined

PART 6: TRAINER
âœ“ Trainer class defined

PART 7: DATA LOADING & PREPARATION
âœ“ Tokenizer and transforms initialized
âœ“ Train dataset: 5950 samples
âœ“ Val dataset: 1050 samples

Creating weighted sampler...
  Motivational samples: 706 (11.87%)
  Oversampling factor (motivational): 8.0x
  Effective motivational representation: 94.9%
  Extreme sentiment samples (very_pos/very_

Epoch 1/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:38<00:00,  9.71it/s, loss=1.5764]



Train Loss: 1.4921


Epoch 1 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.34it/s]



Validation Results (Epoch 1):
  Loss: 1.3405
  Sentiment Accuracy: 0.3362
  Sentiment F1: 0.2126
  Sentiment MAE: 0.8800
  Sentiment 1-off Acc: 0.8067
  Emotion F1: 0.7723

âœ“ Saved best model (composite: 1.2977)


Epoch 2/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 2/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:38<00:00,  9.70it/s, loss=1.3748]



Train Loss: 1.2829


Epoch 2 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 2 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.34it/s]



Validation Results (Epoch 2):
  Loss: 1.4670
  Sentiment Accuracy: 0.3210
  Sentiment F1: 0.2614
  Sentiment MAE: 0.9486
  Sentiment 1-off Acc: 0.7771
  Emotion F1: 0.7725


ðŸ”“ UNFREEZING BACKBONE at epoch 3



Epoch 3/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 3/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.5593]



Train Loss: 1.0888


Epoch 3 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 3 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]



Validation Results (Epoch 3):
  Loss: 1.2462
  Sentiment Accuracy: 0.4010
  Sentiment F1: 0.2265
  Sentiment MAE: 0.7295
  Sentiment 1-off Acc: 0.8838
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 1.5395)


Epoch 4/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 4/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.7047]



Train Loss: 0.8090


Epoch 4 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 4 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.23it/s]



Validation Results (Epoch 4):
  Loss: 1.1457
  Sentiment Accuracy: 0.4314
  Sentiment F1: 0.2309
  Sentiment MAE: 0.6876
  Sentiment 1-off Acc: 0.8867
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 1.5887)


Epoch 5/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 5/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.83it/s, loss=1.1620]



Train Loss: 0.6977


Epoch 5 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 5 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.26it/s]



Validation Results (Epoch 5):
  Loss: 1.2570
  Sentiment Accuracy: 0.3848
  Sentiment F1: 0.2552
  Sentiment MAE: 0.7838
  Sentiment 1-off Acc: 0.8514
  Emotion F1: 0.7725



Epoch 6/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 6/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.6178]



Train Loss: 0.6278


Epoch 6 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 6 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.27it/s]



Validation Results (Epoch 6):
  Loss: 1.1588
  Sentiment Accuracy: 0.4267
  Sentiment F1: 0.2452
  Sentiment MAE: 0.7019
  Sentiment 1-off Acc: 0.8810
  Emotion F1: 0.7725



Epoch 7/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 7/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.83it/s, loss=0.5110]



Train Loss: 0.5580


Epoch 7 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 7 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]



Validation Results (Epoch 7):
  Loss: 1.1383
  Sentiment Accuracy: 0.4152
  Sentiment F1: 0.2173
  Sentiment MAE: 0.7114
  Sentiment 1-off Acc: 0.8800
  Emotion F1: 0.7721



Epoch 8/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 8/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.83it/s, loss=0.4637]



Train Loss: 0.5293


Epoch 8 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 8 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]



Validation Results (Epoch 8):
  Loss: 1.1102
  Sentiment Accuracy: 0.4257
  Sentiment F1: 0.2405
  Sentiment MAE: 0.7010
  Sentiment 1-off Acc: 0.8790
  Emotion F1: 0.7737



Epoch 9/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 9/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.3486]



Train Loss: 0.4957


Epoch 9 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 9 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.28it/s]



Validation Results (Epoch 9):
  Loss: 1.0636
  Sentiment Accuracy: 0.4257
  Sentiment F1: 0.2182
  Sentiment MAE: 0.6838
  Sentiment 1-off Acc: 0.8943
  Emotion F1: 0.7725



Epoch 10/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 10/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.3523]



Train Loss: 0.4777


Epoch 10 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 10 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.28it/s]



Validation Results (Epoch 10):
  Loss: 1.0805
  Sentiment Accuracy: 0.4381
  Sentiment F1: 0.2325
  Sentiment MAE: 0.6829
  Sentiment 1-off Acc: 0.8867
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 1.5951)


Epoch 11/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 11/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.83it/s, loss=0.4972]



Train Loss: 0.4295


Epoch 11 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 11 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]



Validation Results (Epoch 11):
  Loss: 1.0895
  Sentiment Accuracy: 0.4390
  Sentiment F1: 0.2353
  Sentiment MAE: 0.6762
  Sentiment 1-off Acc: 0.8914
  Emotion F1: 0.7725

âœ“ Saved best model (composite: 1.6093)


Epoch 12/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 12/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.4463]



Train Loss: 0.4084


Epoch 12 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 12 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.26it/s]



Validation Results (Epoch 12):
  Loss: 1.1506
  Sentiment Accuracy: 0.4324
  Sentiment F1: 0.2546
  Sentiment MAE: 0.6952
  Sentiment 1-off Acc: 0.8829
  Emotion F1: 0.7725



Epoch 13/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 13/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.4195]



Train Loss: 0.4322


Epoch 13 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 13 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.22it/s]



Validation Results (Epoch 13):
  Loss: 1.2384
  Sentiment Accuracy: 0.4171
  Sentiment F1: 0.2432
  Sentiment MAE: 0.7105
  Sentiment 1-off Acc: 0.8848
  Emotion F1: 0.7727



Epoch 14/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 14/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.5461]



Train Loss: 0.4430


Epoch 14 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 14 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]



Validation Results (Epoch 14):
  Loss: 1.2422
  Sentiment Accuracy: 0.4219
  Sentiment F1: 0.2481
  Sentiment MAE: 0.7114
  Sentiment 1-off Acc: 0.8800
  Emotion F1: 0.7727



Epoch 15/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 15/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.3939]



Train Loss: 0.4418


Epoch 15 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 15 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.29it/s]



Validation Results (Epoch 15):
  Loss: 1.2452
  Sentiment Accuracy: 0.4143
  Sentiment F1: 0.2457
  Sentiment MAE: 0.7210
  Sentiment 1-off Acc: 0.8790
  Emotion F1: 0.7727



Epoch 16/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 16/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.5094]



Train Loss: 0.4505


Epoch 16 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 16 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.21it/s]



Validation Results (Epoch 16):
  Loss: 1.2476
  Sentiment Accuracy: 0.4114
  Sentiment F1: 0.2439
  Sentiment MAE: 0.7248
  Sentiment 1-off Acc: 0.8790
  Emotion F1: 0.7727



Epoch 17/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 17/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.85it/s, loss=0.4361]



Train Loss: 0.4388


Epoch 17 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 17 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.26it/s]



Validation Results (Epoch 17):
  Loss: 1.2494
  Sentiment Accuracy: 0.4124
  Sentiment F1: 0.2454
  Sentiment MAE: 0.7248
  Sentiment 1-off Acc: 0.8781
  Emotion F1: 0.7727



Epoch 18/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 18/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.83it/s, loss=0.5918]



Train Loss: 0.4471


Epoch 18 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 18 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]



Validation Results (Epoch 18):
  Loss: 1.2505
  Sentiment Accuracy: 0.4124
  Sentiment F1: 0.2455
  Sentiment MAE: 0.7267
  Sentiment 1-off Acc: 0.8762
  Emotion F1: 0.7727



Epoch 19/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 19/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.83it/s, loss=0.3884]



Train Loss: 0.4406


Epoch 19 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 19 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.26it/s]



Validation Results (Epoch 19):
  Loss: 1.2513
  Sentiment Accuracy: 0.4124
  Sentiment F1: 0.2455
  Sentiment MAE: 0.7267
  Sentiment 1-off Acc: 0.8762
  Emotion F1: 0.7727



Epoch 20/20 [Train]:   0%|          | 0/371 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 20/20 [Train]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 371/371 [00:54<00:00,  6.84it/s, loss=0.3862]



Train Loss: 0.4328


Epoch 20 [Val]:   0%|          | 0/66 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 20 [Val]: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66/66 [00:05<00:00, 11.25it/s]


Validation Results (Epoch 20):
  Loss: 1.2518
  Sentiment Accuracy: 0.4124
  Sentiment F1: 0.2455
  Sentiment MAE: 0.7267
  Sentiment 1-off Acc: 0.8762
  Emotion F1: 0.7727


âœ… TRAINING COMPLETED!

âœ… TRAINING COMPLETED!
Best composite metric: 1.6093
Model saved to: /kaggle/working/checkpoints/best_model_enhanced.pt

âœ“ Model card saved to: /kaggle/working/checkpoints/model_card.md

ALL DONE! ðŸŽ‰




