# Notebook 1: Data Preprocessing & Super-Resolution

**Goal**: Triage images by resolution, upscale low-resolution images (currently with Lanczos4 interpolation; Real-ESRGAN is installed as a planned replacement), and organize them into train/val/test splits.

**Key Practices**:
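- Resolution Triage (by shorter side): <48px → discard (both paths); on the 336x336 path, 48–111px → Lanczos4 upscale and ≥112px → bicubic resize; the 224x224 path always uses bicubic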
- Target: 336x336 for Vision-LLM, 224x224 for ResNet
- All splits get the same preprocessing for fair comparison

In [1]:
# Install dependencies
!pip install -q torch torchvision==0.17.2 basicsr realesrgan opencv-python-headless tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.0/178.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import cv2
import shutil
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
from collections import Counter

# Pin NumPy's global RNG up front so that any later random draw is repeatable
np.random.seed(seed=42)

In [3]:
# --- Configuration ---
# Where the processed image trees are written
OUTPUT_DIR = "/kaggle/working/processed_dataset"

# RAF-AU inputs: image folder plus the label and partition text files
INPUT_DIR = "/kaggle/input/raf-au/aligned"
EMOLABEL_FILE = "/kaggle/input/raf-au/RAFCE_emolabel.txt"
PARTITION_FILE = "/kaggle/input/raf-au/RAFCE_partition.txt"

# Output resolutions, one per downstream model family
SIZE_VLLM = (336, 336)    # Vision-LLM path (Qwen2-VL / LLaVA)
SIZE_RESNET = (224, 224)  # ResNet50 path

# Resolution triage, applied to the shorter image side (pixels)
SALVAGE_THRESHOLD = 112  # 48 <= side < 112: upscale with a sharper filter (Lanczos, GAN later)
TRASH_THRESHOLD = 48     # side < 48: discard the image

In [4]:
# Compound emotion label IDs (RAF-CE) -> class names used in reports and output folder names.
# NOTE(review): the label file also contains IDs 11, 12 and 13 (they show up as
# "Unknown_*" in the class-distribution printout), so this 11-entry table does not
# cover every class. Confirm the ID -> name order, especially IDs 8-10, against the
# dataset's own documentation before trusting the folder names.
EMOTION_MAP = {
    0: "Happily_Surprised",
    1: "Happily_Disgusted",
    2: "Sadly_Fearful",
    3: "Sadly_Angry",
    4: "Sadly_Surprised",
    5: "Sadly_Disgusted",
    6: "Fearfully_Angry",
    7: "Fearfully_Surprised",
    8: "Angrily_Surprised",
    9: "Angrily_Disgusted",
    10: "Disgustedly_Surprised",
}

# Partition IDs are zero-based here: 0 = train, 1 = test, 2 = val.
# NOTE(review): which ID means "test" vs "val" was taken from an earlier snippet;
# verify against the partition file's documentation.
SPLIT_MAP = {
    0: "train",
    1: "test",
    2: "val",
}

In [5]:
def load_file(path):
    """Parse a whitespace-delimited ``<filename> <integer id>`` text file.

    Args:
        path: Location of the label or partition file.

    Returns:
        dict mapping the first field of each line (the filename) to the
        second field converted to ``int``. Lines with fewer than two fields
        (blank lines included) are skipped. If a filename repeats, the last
        occurrence wins.

    Raises:
        ValueError: If the second field of a line is not an integer. The
            message names the file and the 1-based line number.
    """
    data = {}
    # utf-8-sig transparently drops a leading byte-order mark; with a plain
    # decode the BOM would be glued to the first filename and that key would
    # never match an image on disk.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            # split() with no argument separates on any run of spaces/tabs
            # and ignores leading/trailing whitespace.
            parts = line.split()
            if len(parts) < 2:
                continue
            try:
                # Key is filename, value is the label/partition ID
                data[parts[0]] = int(parts[1])
            except ValueError as exc:
                raise ValueError(
                    f"{path}:{line_no}: expected an integer id, got {parts[1]!r}"
                ) from exc
    return data

# Read both metadata tables: filename -> emotion ID and filename -> partition ID
partitions = load_file(PARTITION_FILE)
labels = load_file(EMOLABEL_FILE)

print(f"Loaded {len(labels)} labels and {len(partitions)} partition entries.")

# Sanity check: how many images carry each emotion ID, in ascending ID order.
# IDs without an EMOTION_MAP entry are reported as "Unknown_<id>".
print("\nClass Distribution:")
label_counts = Counter(labels.values())
for lbl_id in sorted(label_counts):
    print(f"{EMOTION_MAP.get(lbl_id, f'Unknown_{lbl_id}')}: {label_counts[lbl_id]}")

Loaded 4549 labels and 4549 partition entries.

Class Distribution:
Happily_Surprised: 676
Happily_Disgusted: 279
Sadly_Fearful: 171
Sadly_Angry: 230
Sadly_Surprised: 120
Sadly_Disgusted: 835
Fearfully_Angry: 195
Fearfully_Surprised: 603
Angrily_Surprised: 36
Angrily_Disgusted: 210
Disgustedly_Surprised: 977
Unknown_11: 177
Unknown_12: 11
Unknown_13: 29


In [6]:
# Per-outcome counters. Every input image lands in exactly one of
# processed / trash / missing_meta / unknown_partition / unreadable / write_failed,
# so the totals can be reconciled against the number of files found.
stats = {
    'processed': 0,
    'trash': 0,
    'missing_meta': 0,
    'salvage_lanczos': 0,    # subset of 'processed' that was upscaled with Lanczos4
    'unknown_partition': 0,  # partition ID not present in SPLIT_MAP
    'unreadable': 0,         # cv2.imread returned None
    'write_failed': 0,       # cv2.imwrite reported failure for either output
}

input_path = Path(INPUT_DIR)

# Define separate output paths
out_resnet = Path(OUTPUT_DIR) / "resnet_224"
out_vllm = Path(OUTPUT_DIR) / "vllm_336"

# Iterate (sorted so the processing order is the same on every run)
print("Starting Dual-Path Preprocessing...")
all_images = sorted(input_path.glob("*.jpg"))

for img_path in tqdm(all_images, desc="Preprocessing"):
    # 1. Clean filename to match text file keys
    # (drop the "_aligned" suffix if present, keep the .jpg extension)
    base_name = img_path.stem.replace("_aligned", "") + ".jpg"

    # 2. Validation: the image needs both a label and a partition entry
    if base_name not in labels or base_name not in partitions:
        stats['missing_meta'] += 1
        continue

    partition_id = partitions[base_name]
    label_id = labels[base_name]

    # Safety: skip (but count) images whose partition ID is unknown
    split = SPLIT_MAP.get(partition_id)
    if split is None:
        stats['unknown_partition'] += 1
        continue

    # Labels absent from EMOTION_MAP fall back to the bare numeric ID as the
    # folder name (e.g. "11"); note that the class-distribution printout calls
    # the same labels "Unknown_<id>".
    emotion = EMOTION_MAP.get(label_id, str(label_id))

    # 3. Load image (cv2.imread signals failure by returning None, not by raising)
    img = cv2.imread(str(img_path))
    if img is None:
        stats['unreadable'] += 1
        continue

    h, w = img.shape[:2]
    min_dim = min(h, w)

    # 4. Resolution triage: drop images whose shorter side is below the floor
    if min_dim < TRASH_THRESHOLD:
        stats['trash'] += 1
        continue

    # 5. Create directories
    # Structure: processed/resnet_224/train/Happily_Surprised/image.jpg
    target_dir_res = out_resnet / split / emotion
    target_dir_vllm = out_vllm / split / emotion

    target_dir_res.mkdir(parents=True, exist_ok=True)
    target_dir_vllm.mkdir(parents=True, exist_ok=True)

    # --- PATH A: ResNet (224x224), always bicubic ---
    img_res = cv2.resize(img, SIZE_RESNET, interpolation=cv2.INTER_CUBIC)

    # --- PATH B: Vision-LLM (336x336) ---
    # Small images (shorter side < SALVAGE_THRESHOLD) get Lanczos4, which is
    # sharper than bicubic; a super-resolution GAN could replace it later.
    needs_salvage = min_dim < SALVAGE_THRESHOLD
    vllm_interp = cv2.INTER_LANCZOS4 if needs_salvage else cv2.INTER_CUBIC
    img_vllm = cv2.resize(img, SIZE_VLLM, interpolation=vllm_interp)

    # 6. Write both outputs. imwrite returns False on failure instead of
    # raising; if either write fails, remove whatever was written so the two
    # trees never contain an image the other one lacks.
    dst_res = target_dir_res / img_path.name
    dst_vllm = target_dir_vllm / img_path.name
    if not (cv2.imwrite(str(dst_res), img_res) and cv2.imwrite(str(dst_vllm), img_vllm)):
        dst_res.unlink(missing_ok=True)
        dst_vllm.unlink(missing_ok=True)
        stats['write_failed'] += 1
        continue

    if needs_salvage:
        stats['salvage_lanczos'] += 1
    stats['processed'] += 1

# Reconcile: every discovered file must be accounted for by exactly one outcome.
accounted = sum(
    stats[key]
    for key in ('processed', 'trash', 'missing_meta',
                'unknown_partition', 'unreadable', 'write_failed')
)
assert accounted == len(all_images), (
    f"Accounted for {accounted} of {len(all_images)} images"
)

print(f"\nPreprocessing Complete. Stats: {stats}")

Starting Dual-Path Preprocessing...


  0%|          | 0/4908 [00:00<?, ?it/s]


Preprocessing Complete. Stats: {'processed': 4549, 'trash': 0, 'missing_meta': 359, 'salvage_lanczos': 4549}


In [7]:
# Verify that both datasets align: report per-split image counts, then check
# that the two trees contain exactly the same split/class/filename entries.
print("Verifying Dataset Integrity...")

relative_files = {}
for ds_name, path in [("ResNet", out_resnet), ("V-LLM", out_vllm)]:
    print(f"\n--- {ds_name} Dataset ({path.name}) ---")
    relative_files[ds_name] = set()
    for split in ["train", "val", "test"]:
        split_path = path / split
        if split_path.exists():
            # Paths are taken relative to the dataset root so the two trees
            # can be compared entry by entry.
            split_files = {
                os.path.relpath(os.path.join(dirpath, fname), path)
                for dirpath, _, files in os.walk(split_path)
                for fname in files
            }
            count = len(split_files)
            relative_files[ds_name] |= split_files
            print(f"  {split}: {count} images")
        else:
            print(f"  {split}: MISSING!")

# Matching counts alone would not catch a file present in one tree but
# replaced by a different one in the other, so compare the actual entries.
only_resnet = relative_files["ResNet"] - relative_files["V-LLM"]
only_vllm = relative_files["V-LLM"] - relative_files["ResNet"]
assert not only_resnet and not only_vllm, (
    f"Datasets differ: {len(only_resnet)} file(s) only in ResNet, "
    f"{len(only_vllm)} file(s) only in V-LLM"
)
print("\nBoth datasets contain the same files.")

Verifying Dataset Integrity...

--- ResNet Dataset (resnet_224) ---
  train: 2709 images
  val: 931 images
  test: 909 images

--- V-LLM Dataset (vllm_336) ---
  train: 2709 images
  val: 931 images
  test: 909 images


In [8]:
# Archive each dataset tree into the parent folder of OUTPUT_DIR.
# shutil.make_archive follows OUTPUT_DIR instead of a hardcoded path and raises
# on failure, so the "Zipped" messages are only printed after a successful write.
# An existing archive with the same name is replaced rather than updated.
archive_root = Path(OUTPUT_DIR)

# Zip ResNet dataset -> <parent of OUTPUT_DIR>/dataset_resnet_224.zip
shutil.make_archive(
    str(archive_root.parent / "dataset_resnet_224"),
    "zip",
    root_dir=str(archive_root),
    base_dir="resnet_224",
)
print("Zipped ResNet dataset.")

# Zip Vision-LLM dataset -> <parent of OUTPUT_DIR>/dataset_vllm_336.zip
shutil.make_archive(
    str(archive_root.parent / "dataset_vllm_336"),
    "zip",
    root_dir=str(archive_root),
    base_dir="vllm_336",
)
print("Zipped Vision-LLM dataset.")

Zipped ResNet dataset.
Zipped Vision-LLM dataset.
