In [1]:
# Cell 1: Install with Version Pinning
# Installs Unsloth from the GitHub repository, then installs trl / peft /
# accelerate / bitsandbytes / triton / xformers without resolving their
# dependencies (--no-deps).
# trl is constrained to <0.13 to avoid the
# "ValueError: dataset appears to be vision-related" bug.
# NOTE(review): only trl carries a version bound; the Unsloth install tracks the
# repository head, so a fresh run may resolve to different versions. Consider
# pinning a tag/commit, and using `%pip` so the install targets the kernel's
# own environment.
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps "trl<0.13" peft accelerate bitsandbytes triton xformers

import torch
# Sanity check: later cells move model inputs to "cuda", so a GPU must be visible.
print(f"GPU Available: {torch.cuda.is_available()}")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
GPU Available: True


In [4]:
import os
import pandas as pd
import json
import torch
from pathlib import Path
from PIL import Image
from unsloth import FastVisionModel, UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from datasets import Dataset

# --- DATASET PATHS (Kaggle input mount) ---
# INPUT_DIR holds the image files. The three .txt files are whitespace-separated
# tables keyed by image filename; they are read in prepare_raw_data().
INPUT_DIR = "/kaggle/input/raf-au/aligned"
EMOLABEL_FILE = "/kaggle/input/raf-au/RAFCE_emolabel.txt"
PARTITION_FILE = "/kaggle/input/raf-au/RAFCE_partition.txt"
AU_FILE = "/kaggle/input/raf-au/RAFCE_AUlabel.txt"

# Output Directory (Working Directory): training checkpoints and the final
# model are written below this path. Created eagerly so later cells can assume
# it exists.
OUTPUT_DIR = "/kaggle/working/qwen_fer_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- MODEL CONFIG ---
# Using 7B as per the "AI_note_1.pdf" recommendation for micro-expressions
MODEL_ID = "unsloth/Qwen2-VL-7B-Instruct" 
# NOTE(review): MAX_SEQ_LENGTH is not referenced by any other cell visible in
# this notebook — confirm whether it should be passed to the model/trainer.
MAX_SEQ_LENGTH = 2048
# LoRA hyper-parameters, passed as `r` / `lora_alpha` to
# FastVisionModel.get_peft_model().
LORA_RANK = 32
LORA_ALPHA = 64

AU Logic (The "Reasoning" Engine)

In [9]:
# ==========================================
# CELL 3: METADATA & EXPLANATION GENERATOR
# ==========================================
# 1. Lookup tables
# Compound-emotion label id -> display name. Ids run from 1 to 14 in the order
# listed below; prepare_raw_data() renders any id outside this table as
# "Unknown".
# NOTE(review): confirm that the ids in the emotion label file are 1-based — a
# 0-based file would shift every label by one and turn id 0 into "Unknown".
EMOTION_MAP = dict(enumerate(
    (
        'Happily Surprised',
        'Happily Disgusted',
        'Sadly Fearful',
        'Sadly Angry',
        'Sadly Surprised',
        'Sadly Disgusted',
        'Fearfully Angry',
        'Fearfully Surprised',
        'Fearfully Disgusted',
        'Angrily Surprised',
        'Angrily Disgusted',
        'Disgustedly Surprised',
        'Happily Fearful',
        'Happily Sad',
    ),
    start=1,
))

# Facial Action Unit code (as a string) -> human-readable name.
AU_MAP = {
    '1': 'Inner Brow Raiser', '2': 'Outer Brow Raiser', '4': 'Brow Lowerer',
    '5': 'Upper Lid Raiser', '6': 'Cheek Raiser', '7': 'Lid Tightener',
    '9': 'Nose Wrinkler', '10': 'Upper Lip Raiser', '12': 'Lip Corner Puller',
    '15': 'Lip Corner Depressor', '16': 'Lower Lip Depressor', '20': 'Lip Stretcher',
    '23': 'Lip Tightener', '25': 'Lips Part', '26': 'Jaw Drop', '27': 'Mouth Stretch'
}

def decode_aus(au_string):
    """Turn a raw Action Unit annotation into a readable, comma-separated list.

    Args:
        au_string: The AU field for one image, e.g. ``"1 4 25"``. Codes may be
            separated by whitespace, ``+`` or ``,`` (any mix). Non-string values
            (``None``, or the NaN pandas produces for a missing entry) and the
            literal ``"null"`` mean "no AUs".

    Returns:
        A string such as ``"Inner Brow Raiser (AU1), Brow Lowerer (AU4)"``, in
        the order the codes appear, or ``""`` when nothing could be decoded.
        Tokens that are not keys of ``AU_MAP`` are dropped silently.
    """
    if not isinstance(au_string, str):
        return ""

    cleaned = au_string.strip()
    if not cleaned or cleaned.lower() == "null":
        return ""

    # Normalise the accepted delimiters to whitespace before splitting, so a
    # "1+4+25"-style field decodes the same way as "1 4 25" instead of
    # producing a single unknown token (and therefore an empty description).
    codes = cleaned.replace("+", " ").replace(",", " ").split()
    return ", ".join(f"{AU_MAP[code]} (AU{code})" for code in codes if code in AU_MAP)

def prepare_raw_data():
    """Build the list of training records (image path + target explanation).

    Reads the emotion-label and partition tables (whitespace-separated, no
    header, keyed by filename), inner-joins them on filename, attaches the AU
    annotation for each file when ``AU_FILE`` exists, keeps only the training
    split and resolves every filename to an existing image under ``INPUT_DIR``.

    Returns:
        list[dict]: one ``{"img_path": str, "explanation": str}`` entry per
        training image that was found on disk. Images that cannot be found are
        left out, and their count is printed.
    """
    print("üìñ Reading Metadata...")
    df_label = pd.read_csv(EMOLABEL_FILE, sep=r'\s+', header=None, names=['filename', 'label_id'])
    df_part = pd.read_csv(PARTITION_FILE, sep=r'\s+', header=None, names=['filename', 'split_id'])
    df = pd.merge(df_label, df_part, on='filename')

    if os.path.exists(AU_FILE):
        au_dict = {}
        with open(AU_FILE, 'r') as f:
            for line in f:
                parts = line.split()
                if not parts:
                    # Blank (e.g. trailing) line: nothing to index, and
                    # parts[0] would raise IndexError.
                    continue
                au_dict[parts[0]] = " ".join(parts[1:])
        # Files without an AU entry become NaN, which decode_aus treats as "no AUs".
        df['aus'] = df['filename'].map(au_dict)
    else:
        df['aus'] = "null"

    # Filter for Train: the split code is 0 when the partition file contains a
    # 0, otherwise 1.
    # NOTE(review): this is a heuristic — confirm which code marks the training
    # split in the partition file.
    train_code = 0 if 0 in df['split_id'].unique() else 1
    train_df = df[df['split_id'] == train_code]
    print(f"‚öôÔ∏è Processing {len(train_df)} images...")

    raw_list = []
    missing_count = 0
    for filename, label_id, aus in zip(train_df['filename'], train_df['label_id'], train_df['aus']):
        # Resolve Path: try the name as listed, then the "_aligned" variant.
        img_path = os.path.join(INPUT_DIR, filename)
        if not os.path.exists(img_path):
            img_path = os.path.join(INPUT_DIR, filename.replace(".jpg", "_aligned.jpg"))
            if not os.path.exists(img_path):
                missing_count += 1
                continue

        # Build Explanation Text
        emo_text = EMOTION_MAP.get(label_id, "Unknown")
        au_desc = decode_aus(aus)
        explanation = f"The expression is {emo_text}. I observe: {au_desc}." if au_desc else f"The expression is {emo_text}."

        # Just store raw info, don't format messages yet
        raw_list.append({
            "img_path": img_path,
            "explanation": explanation
        })

    if missing_count:
        # Surface dropped samples instead of skipping them silently.
        print(f"Skipped {missing_count} samples whose image file was not found under {INPUT_DIR}.")

    return raw_list

# Build the training records once; later cells turn this list into a Dataset
# and also draw the smoke-test sample from it.
raw_data_list = prepare_raw_data()
print(f"‚úÖ Found {len(raw_data_list)} valid samples.")

üìñ Reading Metadata...
‚öôÔ∏è Processing 2709 images...
‚úÖ Found 2709 valid samples.


In [10]:
# ==========================================
# CELL 4: LOAD MODEL & FORMAT DATASET
# ==========================================
# NOTE(review): Dataset is already imported in the imports cell; this repeat is
# redundant but harmless.
from datasets import Dataset

# 1. Load Model
# Loads MODEL_ID with 4-bit weights. The second return value is named
# `tokenizer`, but predict_emotion() calls it with an image as well as text, so
# it handles both modalities.
print("ü§ñ Loading Qwen2-VL-7B...")
model, tokenizer = FastVisionModel.from_pretrained(
    MODEL_ID,
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth", 
)

# 2. Apply LoRA
# Wraps the model with LoRA adapters. Going by the flag names, adapters are
# requested on the vision and language layers, for both attention and MLP
# modules. Rank and alpha come from the config cell.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True, 
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,
    bias="none",
    # Fixed seed for reproducibility (presumably adapter initialisation — confirm).
    random_state=3407,
    use_rslora=False,
)

# 3. Format function that embeds the decoded image object in the chat message
def format_for_unsloth(example):
    """Convert one raw record into the chat-style ``messages`` structure.

    Args:
        example: mapping with ``"img_path"`` (path to the image file) and
            ``"explanation"`` (the target assistant reply).

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}``. The user turn
        holds the decoded RGB image as ``{"type": "image", "image": <PIL image>}``
        followed by the fixed instruction text; the assistant turn holds the
        explanation.

    Raises:
        RuntimeError: if the image cannot be opened or decoded. Returning
            ``None`` from a function passed to ``Dataset.map`` does not drop the
            row, so a broken image is reported with its path rather than
            silently producing a row without ``messages``.
    """
    img_path = example['img_path']
    try:
        # The context manager closes the underlying file; convert() decodes the
        # pixels into a new in-memory image that outlives the handle.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
    except OSError as exc:
        # Covers missing files and files PIL cannot identify or decode.
        raise RuntimeError(f"Could not load image '{img_path}'") from exc

    # Structure EXACTLY how Unsloth Qwen2-VL expects it
    # It needs: {"type": "image", "image": <PIL_Image_Object>}
    return {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Classify the compound emotion and explain the facial cues."}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": example['explanation']}
                ]
            }
        ]
    }

# 4. Create and Map Dataset
print("üîÑ Formatting Dataset (Loading images into cache)...")
train_dataset = Dataset.from_list(raw_data_list)
# Map the formatter to build the 'messages' column (with the decoded images)
# and drop the two source columns it has already consumed.
train_dataset = train_dataset.map(format_for_unsloth, remove_columns=["img_path", "explanation"])

print(f"‚úÖ Dataset Ready! Sample keys: {train_dataset[0]['messages'][0]['content'][0].keys()}")
# The recorded run printed dict_keys(['image', 'text', 'type']): the image entry
# also carries a 'text' key, not only ['type', 'image'].
# NOTE(review): presumably the dataset stores every content entry with the same
# set of fields and fills the unused ones with None — confirm the collator
# handles those None fields as intended.

ü§ñ Loading Qwen2-VL-7B...
==((====))==  Unsloth 2026.1.3: Fast Qwen2_Vl patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


Unsloth: Making `model.base_model.model.model.visual` require gradients
üîÑ Formatting Dataset (Loading images into cache)...


Map:   0%|          | 0/2709 [00:00<?, ? examples/s]

‚úÖ Dataset Ready! Sample keys: dict_keys(['image', 'text', 'type'])


In [11]:
# Cell 5: Training (Updated config to bypass VLM check)
# Put the model in training mode explicitly. A later cell switches it to
# inference mode with FastVisionModel.for_inference(model); without this call,
# re-running this cell afterwards would train a model left in inference mode.
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # The vision collator builds the batches from the 'messages' column.
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    train_dataset=train_dataset,
    args=SFTConfig(
        # Effective batch size = 2 x 4 = 8 samples per optimizer step.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=1500,
        warmup_steps=50,
        learning_rate=2e-4,
        # fp16 rather than bf16: the recorded run reports "Bfloat16 = FALSE"
        # for the Tesla T4 it used.
        fp16=True,
        bf16=False,
        gradient_checkpointing=True,
        optim="adamw_8bit",
        logging_steps=10,
        output_dir=OUTPUT_DIR,
        save_strategy="steps",
        save_steps=200,
        report_to="none",

        # --- THE FIX FOR "ValueError: dataset appears vision-related" ---
        # Keep the 'messages' column for the collator and stop TRL from trying
        # to prepare the dataset as plain text.
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},  # Stops TRL from analyzing columns
    ),
)

print("üöÄ Starting Training...")
trainer_stats = trainer.train()

# Save Result: model and tokenizer/processor go to the same folder.
model.save_pretrained(f"{OUTPUT_DIR}/final_model")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")
print(f"üíæ Model Saved to {OUTPUT_DIR}/final_model")

Unsloth: Model does not have a default image size - using 512


The model is already on multiple devices. Skipping the move to device specified in `args`.


üöÄ Starting Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,709 | Num Epochs = 5 | Total steps = 1,500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 101,711,872 of 8,393,087,488 (1.21% trained)


Step,Training Loss
10,3.8613
20,1.5957
30,0.193
40,0.1937
50,0.0934
60,0.081
70,0.083
80,0.0709
90,0.07
100,0.0703


üíæ Model Saved to /kaggle/working/qwen_fer_output/final_model


In [13]:
# Enable Inference Mode
# Switches the fine-tuned model to Unsloth's inference mode before generation.
# NOTE(review): this changes the state of the shared `model` object — re-enable
# training mode before running the training cell again.
FastVisionModel.for_inference(model)

def predict_emotion(image_path):
    """Generate the model's emotion classification/explanation for one image.

    Args:
        image_path: path to an image file readable by PIL.

    Returns:
        str: the generated assistant reply only (prompt excluded), with special
        tokens removed and surrounding whitespace stripped.
    """
    # Decode to RGB inside a context manager so the file handle is released.
    with Image.open(image_path) as img_file:
        image = img_file.convert("RGB")

    # Same instruction text as used for training; the image placeholder is
    # filled by passing `image` to the processor call below.
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Classify the compound emotion and explain the facial cues."}
        ]}
    ]

    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            use_cache=True,
            # Greedy decoding: the same image always yields the same answer.
            # Sampling at a low temperature is only *nearly* deterministic.
            do_sample=False,
        )

    # The generated sequence starts with the prompt tokens; keep only what
    # follows them. Splitting the decoded text on the word "assistant" would
    # truncate any reply that happens to contain that word.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Smoke-test the fine-tuned model on one random sample.
import random

# raw_data_list is built by prepare_raw_data(), which keeps only the training
# split, so this image was seen during training. Treat the result as a sanity
# check, not as a held-out evaluation.
# NOTE(review): no random seed is set, so the chosen sample differs between runs.
test_sample = random.choice(raw_data_list)

print(f"üñºÔ∏è Testing Image: {test_sample['img_path']}")
print(f"üìù Ground Truth: {test_sample['explanation']}")

# Run prediction and show it next to the ground truth printed above.
prediction = predict_emotion(test_sample['img_path'])
print(f"ü§ñ Prediction: {prediction}")

üñºÔ∏è Testing Image: /kaggle/input/raf-au/aligned/0453_aligned.jpg
üìù Ground Truth: The expression is Angrily Surprised.
ü§ñ Prediction: The expression is Angrily Surprised.


In [15]:
# ==========================================
# DOWNLOAD LINK FOR THE ZIPPED MODEL
# ==========================================
import os
from IPython.display import FileLink

# Archive expected in the current working directory.
# NOTE(review): no cell visible in this notebook creates this archive; the
# message below refers to a 'shutil.make_archive' cell that is not present, so
# this cell depends on state produced outside the notebook as shown.
zip_filename = "fer_model_zip.zip"

if not os.path.exists(zip_filename):
    # Nothing to offer: tell the user how the archive is expected to be built.
    print("‚ùå Zip file not found! Did you run the 'shutil.make_archive' code above?")
else:
    # Report the archive size in MB, then render a clickable download link.
    print(f"‚úÖ Found {zip_filename} ({os.path.getsize(zip_filename)/1024/1024:.2f} MB)")
    print("üëá Click the link below to download:")
    display(FileLink(zip_filename))

‚úÖ Found fer_model_zip.zip (363.39 MB)
üëá Click the link below to download:
