# In [None]:
from datasets import load_dataset
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Load the FairFace dataset ("0.25" configuration).
dataset = load_dataset("HuggingFaceM4/FairFace", "0.25")
print(dataset)

# A fixed shuffle seed keeps the 300-example subset reproducible across runs.
fairface_300 = dataset["train"].shuffle(seed=42).select(range(300))
print(len(fairface_300))

sample = fairface_300[0]
print(sample.keys())


def _label_text(feature, value):
    """Return a readable name for *value*.

    Label columns may be stored as class-label integers; in that case the
    feature's ``int2str`` mapping is used so the generated text contains
    the class name instead of a bare index. Any other value is formatted
    with ``str``.
    """
    if isinstance(value, int) and hasattr(feature, "int2str"):
        return feature.int2str(value)
    return str(value)


prompt = "Describe the person in the image in a neutral and factual way."
gender_feature = fairface_300.features["gender"]
race_feature = fairface_300.features["race"]

# Only the two label columns are read here, so building the text pairs does
# not have to load the image of every row.
ernie_data = [
    {
        "instruction": prompt,
        "output": (
            f"The image shows a {_label_text(gender_feature, gender)} person "
            f"of {_label_text(race_feature, race)} ethnicity."
        ),
    }
    for gender, race in zip(fairface_300["gender"], fairface_300["race"])
]

# Persist the instruction/output pairs as JSON for the fine-tuning step.
with open("fairvis_ernie_sft.json", "w", encoding="utf-8") as f:
    json.dump(ernie_data, f, indent=2)

print("✅ ERNIE fine-tuning dataset ready:", len(ernie_data))

model_name = "baidu/ERNIE-4.5-0.3B-PT"

# Load tokenizer & model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fixed-length padding requires a pad token; fall back to EOS when the
# checkpoint's tokenizer does not define one (no-op otherwise).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)

# Quick inference test: generate a short continuation to confirm that the
# tokenizer and model work together before any training is attempted.
inputs = tokenizer("Hello, describe fairness:", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Load the instruction/output pairs from disk.
with open("fairvis_ernie_sft.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to Hugging Face Dataset (this rebinds `dataset`).
dataset = Dataset.from_list(data)
print(dataset[0])

def tokenize_function(example, max_length=512, text_tokenizer=None):
    """Tokenize one instruction/output pair for causal-LM fine-tuning.

    Args:
        example: Mapping with "instruction" and "output" entries. Each is
            a string or a list of strings; lists are joined with spaces.
        max_length: Length every sequence is truncated or padded to.
        text_tokenizer: Tokenizer callable to use. Defaults to the
            module-level ``tokenizer``.

    Returns:
        The tokenizer's encoding with an added "labels" entry. Labels
        mirror "input_ids", except that padded positions are set to -100
        so the loss ignores them.
    """
    instruction = example["instruction"]
    output = example["output"]

    if isinstance(instruction, list):
        instruction = " ".join(instruction)
    if isinstance(output, list):
        output = " ".join(output)

    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
    encodings = active_tokenizer(
        instruction + " " + output,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    input_ids = encodings["input_ids"]
    attention_mask = encodings.get("attention_mask")
    if attention_mask is None:
        # Without a mask the padded positions cannot be identified; keep
        # the plain copy of the input ids.
        encodings["labels"] = list(input_ids)
    else:
        # Masking by attention (rather than by pad-token id) stays correct
        # even when the pad token shares its id with a real token.
        encodings["labels"] = [
            token_id if attended else -100
            for token_id, attended in zip(input_ids, attention_mask)
        ]
    return encodings

# Tokenize the records one example per call (no batching).
tokenized_dataset = dataset.map(tokenize_function, batched=False)

# Hyperparameters for the run, gathered in one place.
training_config = {
    "output_dir": "./ernie_fairvis",
    # One example per device per step, gradients accumulated over 2 steps.
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 2,
    "num_train_epochs": 1,
    "learning_rate": 5e-5,
    # Mixed precision is switched off explicitly for both half formats.
    "fp16": False,
    "bf16": False,
    "logging_steps": 10,
    # Checkpoint every 50 steps, capped at 2 checkpoints on disk.
    "save_strategy": "steps",
    "save_steps": 50,
    "save_total_limit": 2,
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

trainer.train()

# Inspect the first record of the 300-example subset.
sample = fairface_300[0]
print(sample.keys())

# Pull out its image and display it.
image = sample["image"]
image.show()

# Processor and captioning model are loaded from the same BLIP checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

# Fetch the same record again as the captioning input.
sample = fairface_300[0]
image = sample["image"]

# Preprocess the image, generate token ids, and decode them to text.
inputs = processor(images=image, return_tensors="pt")
out = blip_model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True)

print("BLIP caption:", caption)