# 🚀 Vision-LLM "Zero to Hero": The Ultimate Workflow (QLoRA)

## Introduction
Ce notebook est la solution **définitive** pour Meow-AI. Il implémente un pipeline complet :
1.  **Data Loading** : Compatible avec votre structure Drive.
2.  **Model** : Qwen-VL-Chat optimisé en 4-bit (QLoRA).
3.  **Training** : Early Stopping, Logging, Optimisation VRAM.
4.  **Validation** : Métriques réelles (Accuracy, F1), Courbes de Loss, Matrice de Confusion.
5.  **Sauvegarde** : Export automatique vers Google Drive.

---

In [None]:
# --- CELL 1: Imports & Environment Setup ---
# üõ†Ô∏è INSTALLATION AUTOMATIQUE (ZERO CONFIG)

print("‚ö° Installing optimized libraries for QLoRA...")
!pip install -q -U torch torchvision torchaudio
!pip install -q -U transformers>=4.37.0 peft bitsandbytes accelerate datasets pillow scikit-learn scipy tensorboard einops tiktoken
print("‚úÖ Libraries installed! Loading imports...")

import os
import time
import copy
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm import tqdm
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

# Vision-LLM & QLoRA libraries
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoProcessor,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    TrainerCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)

# Mount Google Drive when running inside Colab.
# The import lives inside the `try` so the notebook also runs outside Colab,
# where the `google.colab` package does not exist; mounting stays best-effort.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception as exc:  # ImportError locally, mount failures inside Colab
    print("‚ÑπÔ∏è Drive already mounted or local environment.")
    print(f"   ({type(exc).__name__}: {exc})")

# Pick the compute device once; later cells move inputs onto it.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"‚úÖ Using device: {device}")

In [None]:
# --- CELL 2: Configuration & Constants ---

# --- PATHS & DRIVE SAVING ---
# Drive folder receiving the final model artifacts.
OUTPUT_DRIVE_PATH = '/content/drive/MyDrive/Meow_VisionLLM_Results/best_model'
# Local directory the image archive is extracted into.
IMAGE_EXTRACT_PATH = '/content/raf-ce-images'
# Drive folder searched for the image archive and the label file.
DATASET_PATH = '/content/drive/MyDrive/Colab Datasets'

# Create the output folder up front (no error if it is already there).
os.makedirs(name=OUTPUT_DRIVE_PATH, exist_ok=True)

# --- HYPERPARAMETERS ---
MAX_LENGTH = 256        # Fixed token length every example is padded/truncated to
LEARNING_RATE = 2e-4    # Optimizer learning rate
NUM_EPOCHS = 10         # Upper bound on epochs; early stopping may end sooner
GRAD_ACCUMULATION = 4   # Batches accumulated per optimizer step
BATCH_SIZE = 8          # Per-device batch (effective batch = 8 * 4 = 32)

# Compound-emotion names, listed in label-id order (0-14).
_EMOTION_NAMES = (
    'Happily surprised',
    'Happily disgusted',
    'Sadly fearful',
    'Sadly angry',
    'Sadly surprised',
    'Sadly disgusted',
    'Fearfully angry',
    'Fearfully surprised',
    'Fearfully disgusted',
    'Angrily surprised',
    'Angrily disgusted',
    'Disgustedly surprised',
    'Happily fearful',
    'Happily angry',
    'Happily sad',
)

# Integer label -> emotion name.
emotion_map = dict(enumerate(_EMOTION_NAMES))

# Confirm the configuration cell ran and show where the results will be written.
print(f"‚úÖ Config Loaded. Results will be saved to: {OUTPUT_DRIVE_PATH}")

In [None]:
# --- CELL 3: Data Loading Pipeline ---

def _aligned_image_path(img_root, filename):
    """Return the on-disk path for *filename*; ``X.jpg`` maps to ``X_aligned.jpg``."""
    if filename.endswith('.jpg'):
        # Rewrite only the trailing extension, never an inner ".jpg".
        filename = filename[:-len('.jpg')] + '_aligned.jpg'
    return os.path.join(img_root, filename)


def prepare_data(dataset_path, extract_to, label_map=None):
    """Extract the image archive and build the labelled image table.

    Args:
        dataset_path: Directory expected to contain ``aligned.zip`` and,
            optionally, ``RAFCE_emolabel.txt``.
        extract_to: Directory the archive is extracted into. Extraction is
            skipped when this directory already exists.
        label_map: Optional mapping from integer label to label text.
            Defaults to the module-level ``emotion_map``.

    Returns:
        ``(df, img_root)``. ``df`` has the columns ``filename``, ``label``,
        ``label_text`` and ``path`` and keeps only rows whose label is in the
        mapping and whose image exists on disk; ``img_root`` is the directory
        holding the images. ``(None, None)`` when the archive or the label
        file cannot be found.
    """
    # 1. Locate the archive: dataset folder first, then the working directory.
    zip_file = os.path.join(dataset_path, 'aligned.zip')
    if not os.path.exists(zip_file):
        print(f"‚ö†Ô∏è Zip not found at {zip_file}. Checking current dir...")
        if not os.path.exists('aligned.zip'):
            print("‚ùå No aligned.zip found!")
            return None, None
        zip_file = 'aligned.zip'

    if not os.path.exists(extract_to):
        print(f"üìÇ Unzipping to {extract_to}...")
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_to)

    # 2. Find the image root: descend when the archive holds one top-level folder.
    extracted_items = os.listdir(extract_to)
    img_root = extract_to
    if len(extracted_items) == 1:
        only_item = os.path.join(extract_to, extracted_items[0])
        if os.path.isdir(only_item):
            img_root = only_item

    # 3. Locate the label file: dataset folder, next to the image root, then cwd.
    candidates = (
        os.path.join(dataset_path, 'RAFCE_emolabel.txt'),
        os.path.join(img_root, '../RAFCE_emolabel.txt'),
        'RAFCE_emolabel.txt',
    )
    emo_path = next((p for p in candidates if os.path.exists(p)), None)
    if emo_path is None:
        print("‚ùå Labels not found!")
        return None, None

    print(f"üìñ Loading labels from {emo_path}")
    # Force string filenames: all-numeric names would otherwise be parsed as
    # integers and break the suffix handling below.
    df = pd.read_csv(
        emo_path,
        sep=r'\s+',
        header=None,
        names=['filename', 'label'],
        dtype={'filename': str},
    )

    # 4. Derive label text and image path for every row.
    if label_map is None:
        label_map = emotion_map
    df['label_text'] = df['label'].map(label_map)
    df['path'] = df['filename'].apply(lambda name: _aligned_image_path(img_root, name))

    # Rows with an unmapped label would otherwise reach the prompts as "nan".
    unmapped = df['label_text'].isna()
    if unmapped.any():
        print(f"Dropping {int(unmapped.sum())} row(s) whose label is not in the label map.")

    # Keep only mapped rows whose image is actually present on disk.
    image_present = df['path'].apply(os.path.exists).astype(bool)
    df = df[~unmapped & image_present]

    return df, img_root

# Build the image table from the configured locations and report its size
# (0 when loading failed and prepare_data returned None).
df, img_root = prepare_data(DATASET_PATH, IMAGE_EXTRACT_PATH)
_available_images = 0 if df is None else len(df)
print(f"‚úÖ Total Images available: {_available_images}")

In [None]:
# --- CELL 4: QLoRA Model Setup ---

# NOTE(review): "Qwen/Qwen-VL-Chat-Int4" looks like a pre-quantized (GPTQ)
# checkpoint, while get_model() requests bitsandbytes NF4 quantization.
# Confirm the two are compatible, or switch to the unquantized base model.
MODEL_ID = "Qwen/Qwen-VL-Chat-Int4" # Optimized Base Model


def _ensure_pad_token(tok):
    """Give *tok* a padding token when it has none but exposes ``eod_id``.

    The dataset pads every example to a fixed length, which needs a pad token.
    Presumably Qwen tokenizers ship without one and reuse the end-of-document
    id for padding — TODO confirm against the model's fine-tuning script.
    Objects without ``eod_id`` (or that already have a pad token) are untouched.
    """
    if tok is None:
        return
    eod_id = getattr(tok, "eod_id", None)
    if eod_id is not None and getattr(tok, "pad_token_id", None) is None:
        tok.pad_token_id = eod_id


def get_model():
    """Load the tokenizer, processor and a 4-bit LoRA-wrapped model.

    Returns:
        ``(model, processor, tokenizer)`` where ``model`` is the PEFT-wrapped
        causal LM loaded from ``MODEL_ID``.
    """
    print(f"üîÑ Loading {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Make sure fixed-length padding works on whichever object does the
    # tokenizing (the processor may itself be, or wrap, a tokenizer).
    for tok in (tokenizer, processor, getattr(processor, "tokenizer", None)):
        _ensure_pad_token(tok)

    # 4-bit Quantization Config: NF4 weights, fp16 compute, double quantization.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Load Model, letting `device_map="auto"` place the weights.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    model = prepare_model_for_kbit_training(model)

    # LoRA Config: rank-16 adapters on the listed projection modules.
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["c_attn", "attn.c_proj", "w1", "w2"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, peft_config)
    return model, processor, tokenizer

model, processor, tokenizer = get_model()

In [None]:
# --- CELL 5: Custom Dataset & Prompt Engineering ---

class RAFCE_QwenDataset(Dataset):
    """Torch dataset turning image-table rows into causal-LM training examples.

    Each item is a chat-style prompt that embeds the image path between
    ``<img>`` tags, followed by the expected answer, tokenized to a fixed
    length.
    """

    def __init__(self, data, processor, max_length=None):
        """Store the table and the tokenizing callable.

        Args:
            data: DataFrame with ``path`` and ``label_text`` columns.
            processor: Callable returning ``input_ids`` and ``attention_mask``
                tensors with a leading batch axis.
            max_length: Fixed sequence length. Defaults to the module-level
                ``MAX_LENGTH`` (resolved when an item is fetched).
        """
        self.data = data
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row *idx* and return ``input_ids``, ``attention_mask`` and ``labels``."""
        item = self.data.iloc[idx]
        max_length = MAX_LENGTH if self.max_length is None else self.max_length

        # Optimized Prompt: Short & Direct
        prompt = f"User: <img>{item['path']}</img> Analyze facial cues. What is the compound emotion?\nAssistant: The emotion is {item['label_text']}.<|endoftext|>"

        inputs = self.processor(
            text=[prompt],
            images=None,
            return_tensors="pt",
            padding="max_length",
            max_length=max_length,
            truncation=True
        )

        # Drop only the batch axis; a bare squeeze() would also drop the
        # sequence axis when the sequence length is 1.
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Causal LM: labels mirror the inputs, but padded positions get -100
        # (the loss ignore index) so padding does not contribute to the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Stratified 85/15 train/validation split (fixed seed), then one dataset
# wrapper per partition.
train_df, val_df = train_test_split(
    df,
    stratify=df['label'],
    random_state=42,
    test_size=0.15,
)
train_dataset, val_dataset = (
    RAFCE_QwenDataset(partition, processor) for partition in (train_df, val_df)
)

print(f"üìä Train Set: {len(train_df)} | Val Set: {len(val_df)}")

In [None]:
# --- CELL 6: Training Setup with Early Stopping ---

import inspect

# Mixed precision must match the hardware: bf16 needs an Ampere-or-newer GPU
# (compute capability >= 8.0). Older cards such as the T4 fall back to fp16,
# and CPU-only runs use neither.
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.get_device_capability()[0] >= 8
_use_fp16 = _cuda_available and not _use_bf16

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    bf16=_use_bf16,
    fp16=_use_fp16,
    logging_steps=10,
    eval_steps=100,                      # Evaluate every 100 steps
    save_steps=100,                      # Checkpoint on the same cadence
    save_total_limit=1,                  # Cap the number of checkpoints kept
    load_best_model_at_end=True,         # Restore the best checkpoint after training
    metric_for_best_model="eval_loss",
    greater_is_better=False,             # Lower eval loss is better
    report_to=["tensorboard"],
    **{_eval_strategy_key: "steps"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Stop if no improvement for 3 evals
)

print("üî• Starting Optimized Training...")
trainer.train()
print("‚úÖ Training Finished (Best Model Loaded).")

In [None]:
# --- CELL 7: Visualisation (Loss Curves) ---

# Turn the trainer's log into a table and separate the two kinds of entries:
# rows carrying a training loss and rows carrying an evaluation loss.
log_table = pd.DataFrame(trainer.state.log_history)
train_curve = log_table.loc[log_table['loss'].notna(), ['step', 'loss']]
eval_curve = log_table.loc[log_table['eval_loss'].notna(), ['step', 'eval_loss']]

# Draw both curves against the global step on a single axis.
_, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(train_curve['step'], train_curve['loss'], label='Training Loss')
loss_ax.plot(eval_curve['step'], eval_curve['eval_loss'], label='Validation Loss', marker='o')
loss_ax.set_title('Training vs Validation Loss')
loss_ax.set_xlabel('Steps')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)
plt.show()

In [None]:
# --- CELL 8: FULL EVALUATION (Confusion Matrix & Metrics) ---
# We run inference on the Validation Set to get real Accuracy/F1

def _match_emotion(prediction, labels, default="Unknown"):
    """Return the first label contained (case-insensitively) in *prediction*, else *default*."""
    lowered = prediction.lower()
    return next((label for label in labels if label.lower() in lowered), default)


def evaluate_performance(model, val_df, processor, tokenizer):
    """Generate an answer for every validation row and report classification metrics.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        val_df: DataFrame with ``path`` and ``label_text`` columns.
        processor: Callable that tokenizes the prompt into model inputs.
        tokenizer: Object whose ``batch_decode`` turns generated ids into text.

    Returns:
        Dict with ``accuracy`` and ``macro_f1``. Also prints a classification
        report and shows a confusion-matrix heatmap.
    """
    print("üïµÔ∏è Running Full Evaluation on Validation Set...")
    class_names = list(emotion_map.values())
    true_labels = []
    pred_labels = []

    model.eval()
    with torch.no_grad():
        for _, row in tqdm(val_df.iterrows(), total=len(val_df)):
            # Prompt for inference (without answer)
            prompt = f"User: <img>{row['path']}</img> Analyze facial cues. What is the compound emotion?\nAssistant:"

            inputs = processor(text=[prompt], return_tensors="pt").to(device)

            # Greedy decoding keeps the evaluation deterministic.
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=False
            )
            output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Keep the text after the last "Assistant:" up to the first period.
            prediction = output_text.split("Assistant:")[-1].strip().split(".")[0]

            true_labels.append(row['label_text'])
            # Substring match against the known labels; "Unknown" when none fits.
            pred_labels.append(_match_emotion(prediction, class_names))

    # Score only the real classes present in the ground truth. The synthetic
    # "Unknown" bucket still counts as an error for its true class, but it is
    # not averaged into macro F1 as a class of its own.
    present = set(true_labels)
    scored_classes = [name for name in class_names if name in present]

    acc = accuracy_score(true_labels, pred_labels)
    f1 = f1_score(
        true_labels,
        pred_labels,
        labels=scored_classes,
        average='macro',
        zero_division=0,
    )

    print(f"\nüèÜ Final Accuracy: {acc:.4f}")
    print(f"‚≠ê Final Macro F1: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels, labels=scored_classes, zero_division=0))

    # Confusion Matrix Plot: add an "Unknown" column when some outputs matched
    # no label, so those predictions are visible instead of silently dropped.
    if "Unknown" in pred_labels:
        matrix_labels = class_names + ["Unknown"]
    else:
        matrix_labels = class_names
    plt.figure(figsize=(12, 10))
    cm = confusion_matrix(true_labels, pred_labels, labels=matrix_labels)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=matrix_labels, yticklabels=matrix_labels, cmap='Blues')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Confusion Matrix')
    plt.show()

    return {"accuracy": acc, "macro_f1": f1}

# Run Evaluation (Takes some time but worth it)
eval_metrics = evaluate_performance(model, val_df, processor, tokenizer)

In [None]:
# --- CELL 9: Save Best Model to Drive ---
# Write the model, tokenizer and processor (in that order) into the Drive
# output folder.
print(f"üíæ Saving Adapter to {OUTPUT_DRIVE_PATH}...")
for _artifact in (model, tokenizer, processor):
    _artifact.save_pretrained(OUTPUT_DRIVE_PATH)
print("‚úÖ Saved successfully! You can load it later using PeftModel.")