In [None]:
# Sanity check: show which versions of the COCO dataset kagglehub has cached.
# Listing the parent "versions" folder avoids hard-coding a version number
# (the download in this notebook landed in versions/2, not versions/1), and the
# fallback message keeps the cell from erroring on a fresh runtime.
!ls /root/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions 2>/dev/null || echo "COCO dataset not downloaded yet"


ls: cannot access '/root/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions/1': No such file or directory


In [None]:
# Install the training stack into the kernel's own environment.
# %pip (rather than !pip) targets the interpreter this notebook is running on.
%pip install -q --upgrade pip

# Kept on a single line on purpose: the previous command contained stray "\ "
# sequences left over from collapsed line continuations, which the shell reads
# as escaped spaces glued onto the following package name. The extras spec is
# quoted so the shell never treats the square brackets as a glob pattern, and
# the duplicate plain "transformers" entry is folded into "transformers[torch]".
%pip install -q torch torchvision accelerate "transformers[torch]" datasets bitsandbytes peft sentencepiece safetensors ftfy regex pillow evaluate


In [None]:
# Dataset-download helper plus the model/training libraries.
# %pip installs into the running kernel's environment; -q keeps the log short.
%pip install -q kagglehub transformers accelerate bitsandbytes datasets peft safetensors pillow

# Print the GPU attached to this runtime (model, memory, driver).
!nvidia-smi


In [None]:
# Kaggle command-line client, used by the `!kaggle datasets download` cell.
# %pip installs into the running kernel's environment; -q keeps the log short.
%pip install -q kaggle


In [None]:
from google.colab import files

# Opens Colab's file picker; choose your Kaggle API token (kaggle.json).
uploaded = files.upload()  # Upload kaggle.json

# Fail here with a clear message instead of letting the later `mv kaggle.json`
# step fail. The upload result is expected to be keyed by file name.
if "kaggle.json" not in uploaded:
    raise FileNotFoundError(
        "Expected an upload named 'kaggle.json' but got: "
        + (", ".join(uploaded) or "nothing")
    )


Saving kaggle.json to kaggle.json


In [None]:
# Move the uploaded API token into ~/.kaggle.
# -p makes the cell re-runnable: a plain mkdir fails once ~/.kaggle exists.
!mkdir -p ~/.kaggle
# Only move the token if it is still in the working directory, so running the
# cell a second time (after the file was already moved) is not an error.
!if [ -f kaggle.json ]; then mv kaggle.json ~/.kaggle/; fi
# Restrict the token to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Fetch the COCO 2017 archive with the Kaggle CLI and unpack it into ./coco2017.
# The two steps are chained with && so unzip is skipped when the download is
# cancelled or fails (previously it ran anyway and complained about a missing
# zip). -n leaves already-extracted files untouched when the cell is re-run.
# NOTE: the kagglehub cell in this notebook downloads the same dataset; only
# one of the two download paths is needed.
!kaggle datasets download -d awsaf49/coco-2017-dataset && unzip -q -n coco-2017-dataset.zip -d coco2017


Dataset URL: https://www.kaggle.com/datasets/awsaf49/coco-2017-dataset
License(s): CC-BY-SA-4.0
User cancelled operation
unzip:  cannot find or open coco-2017-dataset.zip, coco-2017-dataset.zip.zip or coco-2017-dataset.zip.ZIP.


In [None]:
import kagglehub

# Download the COCO 2017 dataset through kagglehub; the call returns the local
# directory the dataset was stored in, which is echoed for reference.
COCO_DATASET_HANDLE = "awsaf49/coco-2017-dataset"
path = kagglehub.dataset_download(COCO_DATASET_HANDLE)
print("Dataset downloaded to:", path)


Downloading from https://www.kaggle.com/api/v1/datasets/download/awsaf49/coco-2017-dataset?dataset_version_number=2...


100%|██████████| 25.0G/25.0G [03:17<00:00, 136MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions/2


In [None]:
import os

# Use the directory kagglehub actually reported instead of a hard-coded
# ".../versions/1": the download above landed in ".../versions/2", so the old
# constant pointed at a folder that does not exist.
DATASET_PATH = path

# NOTE(review): the archive may nest its contents under a "coco2017/" folder —
# confirm against the extracted tree. If "annotations" is not directly under
# the download root, look one level down before giving up.
if not os.path.isdir(os.path.join(DATASET_PATH, "annotations")):
    _nested_root = os.path.join(DATASET_PATH, "coco2017")
    if os.path.isdir(os.path.join(_nested_root, "annotations")):
        DATASET_PATH = _nested_root

IMAGES_DIR = os.path.join(DATASET_PATH, "train2017")
ANN_FILE = os.path.join(DATASET_PATH, "annotations", "captions_train2017.json")

print(IMAGES_DIR)
print(ANN_FILE)

# Fail fast, right where the paths are defined, rather than later while the
# dataset object is being built.
if not os.path.isfile(ANN_FILE):
    raise FileNotFoundError(f"Caption annotations not found at: {ANN_FILE}")


/root/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions/1/train2017
/root/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions/1/annotations/captions_train2017.json


In [None]:
import json, os, random
from PIL import Image
from torch.utils.data import Dataset

class LocalCocoCaptionDataset(Dataset):
    """Caption/image pairs read from a local COCO-style annotation file.

    Each item is a dict ``{"image": <RGB PIL image>, "target": "<caption> Explanation:"}``.

    Images are decoded lazily in ``__getitem__``. The constructor only records
    file paths, so building the dataset no longer holds every decoded image in
    memory, and an image referenced by several captions is not decoded once per
    caption up front. ``self.samples`` therefore holds
    ``{"image_path": ..., "target": ...}`` records rather than decoded images.

    Args:
        images_dir: Directory containing the image files named in the annotations.
        ann_file: Path to the JSON annotation file; must contain ``"images"``
            (entries with ``id`` and ``file_name``) and ``"annotations"``
            (entries with ``image_id`` and ``caption``).
        max_samples: Maximum number of samples to keep, taken in annotation
            order. ``None`` keeps every usable annotation.
        img_size: Stored on the instance as ``self.img_size``; not applied to
            the images by this class.

    Raises:
        FileNotFoundError: If ``ann_file`` does not exist (from ``open``).
    """

    def __init__(self, images_dir, ann_file, max_samples=10000, img_size=224):
        with open(ann_file, 'r', encoding='utf-8') as f:
            ann = json.load(f)
        id2file = {x['id']: x['file_name'] for x in ann['images']}

        samples = []
        openable = {}  # image path -> bool, so each file is probed only once
        for a in ann['annotations']:
            if max_samples is not None and len(samples) >= max_samples:
                break
            file_name = id2file.get(a['image_id'])
            if not file_name:
                continue
            # `or ''` tolerates an explicit null caption in the JSON.
            caption = (a.get('caption') or '').strip()
            if not caption:
                continue
            img_path = os.path.join(images_dir, file_name)
            if img_path not in openable:
                openable[img_path] = self._can_open(img_path)
            if not openable[img_path]:
                continue
            samples.append({"image_path": img_path, "target": f"{caption} Explanation:"})

        self.samples = samples
        self.img_size = img_size
        print(f"[DATA] Loaded {len(self.samples)} samples")

    @staticmethod
    def _can_open(img_path):
        """Return True if the file exists and PIL can open it (no full decode)."""
        if not os.path.exists(img_path):
            return False
        try:
            with Image.open(img_path):
                return True
        except Exception:
            # Unreadable/unsupported files are skipped, as before.
            return False

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        record = self.samples[idx]
        # Decode on demand; the context manager releases the file handle.
        with Image.open(record["image_path"]) as raw:
            image = raw.convert('RGB')
        return {"image": image, "target": record["target"]}


In [None]:
# ---- Data ----
NUM_SAMPLES = 10_000  # cap on caption/image pairs passed to the dataset as max_samples
IMG_SIZE = 224        # images are resized to IMG_SIZE x IMG_SIZE when batches are collated

# ---- Models ----
BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
CLIP_VISION = "openai/clip-vit-base-patch32"
N_VIRTUAL_TOKENS = 32  # default length of the image prefix built by MENModel

# ---- Optimisation (presumably consumed by the training loop; not shown here) ----
BATCH_SIZE = 2
GRAD_ACCUM = 8
EPOCHS = 3
LR = 2e-4

# ---- Output ----
OUTPUT_DIR = "/content/men_coco_run"

print(
    "Config:", NUM_SAMPLES, "samples |",
    EPOCHS, "epochs | batch", BATCH_SIZE,
)


Config: 10000 samples | 3 epochs | batch 2


In [None]:
import torch, torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPVisionModel, CLIPProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# CLIP vision tower and its image preprocessor; the tower is put in eval mode.
clip_proc = CLIPProcessor.from_pretrained(CLIP_VISION)
clip_vision = CLIPVisionModel.from_pretrained(CLIP_VISION).to(device)
clip_vision.eval()

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
if tokenizer.pad_token is None:
    # Reuse an existing token for padding. Registering a brand-new "<pad>"
    # token grows the tokenizer's vocabulary without growing the LM's embedding
    # matrix (no resize_token_embeddings call follows), so padded batches would
    # look up an id past the end of the embedding table.
    tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token


def _load_quantized_lm(quant_config):
    """Load BASE_MODEL as a causal LM with the given bitsandbytes config."""
    # NOTE(review): trust_remote_code=True executes code shipped with the model
    # repo; confirm it is actually required for BASE_MODEL.
    return AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        quantization_config=quant_config,
        trust_remote_code=True,
    )


# Prefer 4-bit NF4 quantisation; fall back to 8-bit if that load fails.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
try:
    lm = _load_quantized_lm(bnb_config)
    print("[INFO] Loaded LM in 4-bit mode")
except Exception as e:
    print("[WARN] 4-bit load failed:", e)
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    lm = _load_quantized_lm(bnb_config)
    print("[INFO] Loaded LM in 8-bit mode")

class MENModel(nn.Module):
    """Wraps a causal LM with a frozen CLIP vision tower and a trainable projector.

    An image is encoded by CLIP, projected into the LM's input-embedding space
    (Linear + LayerNorm), and that single vector is repeated ``n_virtual``
    times to form a prefix of "virtual tokens".

    Args:
        lm: Causal language model exposing ``get_input_embeddings()``.
        tokenizer: Tokenizer paired with ``lm`` (stored, not used here).
        clip_v: CLIP vision model; its parameters are frozen and it is put in
            eval mode on ``device``.
        clip_proc: Image processor called as
            ``clip_proc(images=..., return_tensors="pt")``.
        n_virtual: Number of prefix positions produced per image.
        device: Device for CLIP, the projector, and the processed image tensors.
    """

    def __init__(self, lm, tokenizer, clip_v, clip_proc, n_virtual=N_VIRTUAL_TOKENS, device=device):
        super().__init__()
        self.lm = lm
        self.tokenizer = tokenizer
        self.clip = clip_v
        self.clip_proc = clip_proc
        self.device = device
        self.n_virtual = n_virtual

        # freeze clip
        for p in self.clip.parameters():
            p.requires_grad = False
        self.clip.to(device).eval()

        self.embed_dim = lm.get_input_embeddings().weight.shape[1]
        clip_hidden = getattr(self.clip.config, "hidden_size", None) or getattr(self.clip.config, "projection_dim", None) or 512
        self.image_proj = nn.Linear(clip_hidden, self.embed_dim)
        nn.init.normal_(self.image_proj.weight, std=0.02)
        self.image_ln = nn.LayerNorm(self.embed_dim)

        # Move only the freshly created layers. Calling self.to(device) on the
        # whole wrapper would also try to relocate `lm`, which is quantized and
        # was already placed by device_map="auto" when it was loaded.
        self.image_proj.to(device)
        self.image_ln.to(device)

    def image_to_embeds(self, pil_images):
        """Encode images into a ``(batch, n_virtual, embed_dim)`` prefix tensor.

        CLIP runs under ``no_grad``; gradients flow only through the
        projection and layer norm.
        """
        proc = self.clip_proc(images=pil_images, return_tensors="pt")
        clip_inputs = {k: v.to(self.device) for k, v in proc.items()}
        with torch.no_grad():
            outputs = self.clip(**clip_inputs)
            if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
                clip_emb = outputs.pooler_output
            else:
                # No pooled vector available: average over the sequence axis.
                clip_emb = outputs.last_hidden_state.mean(dim=1)
        proj = self.image_ln(self.image_proj(clip_emb))
        # Every virtual-token position carries the same projected image vector.
        expanded = proj.unsqueeze(1).expand(-1, self.n_virtual, -1).contiguous()
        return expanded

# Assemble the multimodal wrapper from the already-loaded LM, tokenizer and
# CLIP components (arguments passed in the constructor's declared order).
men = MENModel(lm, tokenizer, clip_vision, clip_proc)
print("MEN ready. LM embed dim:", men.embed_dim)


Device: cuda


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

[INFO] Loaded LM in 4-bit mode
MEN ready. LM embed dim: 4096


In [None]:
from peft import LoraConfig, get_peft_model
from peft import PeftModel

LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
# Attention projections only. The earlier list also named "fc1"/"fc2", but the
# reported trainable-parameter count (6,815,744) is consistent with rank-8
# adapters on q/k/v/o alone, i.e. those two names matched no module here.
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Make the cell safe to re-run: wrapping an already-wrapped model would stack a
# second set of adapters on top of the first.
if isinstance(men.lm, PeftModel):
    print("[INFO] LoRA adapters already attached; skipping re-wrap")
else:
    men.lm = get_peft_model(men.lm, lora_cfg)
men.lm.print_trainable_parameters()


trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940


In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

def build_inputs_from_images_and_texts(batch_texts, batch_images, men, tokenizer):
    """Build LM inputs of the form ``[image prefix | text tokens]`` for a batch.

    Args:
        batch_texts: List of target strings, one per sample.
        batch_images: List of images, passed to ``men.image_to_embeds``.
        men: Model wrapper providing ``device``, ``lm.get_input_embeddings()``
            and ``image_to_embeds``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        Tuple ``(inputs_embeds, attention_mask, labels)``:
        * ``inputs_embeds``: image prefix embeddings followed by token embeddings.
        * ``attention_mask``: 1 over the image prefix and real text tokens,
          0 over text padding.
        * ``labels``: token ids at real text positions, ``-100`` over the image
          prefix and over padding, so neither contributes to the loss.
    """
    device = men.device
    tokenized = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = tokenized["input_ids"].to(device)
    attention_mask_text = tokenized["attention_mask"].to(device)

    # Token embeddings are looked up without tracking gradients.
    with torch.no_grad():
        token_embeds = men.lm.get_input_embeddings()(input_ids)
    image_embeds = men.image_to_embeds(batch_images)
    inputs_embeds = torch.cat([image_embeds, token_embeds], dim=1)

    batch_size = input_ids.size(0)
    # Take the prefix length from the tensor itself rather than a global
    # constant, so a model built with a different n_virtual stays aligned.
    n_image_tokens = image_embeds.size(1)

    # Attend to every image position, but only to non-padding text positions.
    image_mask = torch.ones(
        (batch_size, n_image_tokens), dtype=attention_mask_text.dtype, device=device
    )
    attention_mask = torch.cat([image_mask, attention_mask_text], dim=1)

    # Mask by the tokenizer's attention mask (not by pad id), so padding is
    # ignored by the loss even when the pad token shares an id with a real token.
    text_labels = input_ids.masked_fill(attention_mask_text == 0, -100)
    image_labels = torch.full(
        (batch_size, n_image_tokens), -100, dtype=torch.long, device=device
    )
    labels = torch.cat([image_labels, text_labels], dim=1)
    return inputs_embeds, attention_mask, labels

def collate_fn(batch):
    """Collate dataset items into parallel lists of resized images and texts.

    Each item must provide an ``"image"`` (with a ``resize`` method) and a
    ``"target"`` string. Images are resized to ``IMG_SIZE`` x ``IMG_SIZE``.
    """
    target_size = (IMG_SIZE, IMG_SIZE)
    images, texts = [], []
    for sample in batch:
        images.append(sample["image"].resize(target_size))
        texts.append(sample["target"])
    return {"images": images, "texts": texts}


In [None]:
# Build the dataset, then make a reproducible 95/5 train/validation split.
from torch.utils.data import Subset

SPLIT_SEED = 42  # fixed so the same samples land in train/val on every run

dataset = LocalCocoCaptionDataset(IMAGES_DIR, ANN_FILE, max_samples=NUM_SAMPLES, img_size=IMG_SIZE)
if len(dataset) == 0:
    raise RuntimeError(
        f"No usable samples found; check IMAGES_DIR={IMAGES_DIR!r} and ANN_FILE={ANN_FILE!r}"
    )

# shuffle and split (a private Random instance leaves the global RNG untouched)
indices = list(range(len(dataset)))
random.Random(SPLIT_SEED).shuffle(indices)
cut = int(0.95 * len(indices))
train_idx, val_idx = indices[:cut], indices[cut:]

def subset_list(ds, idxs):
    """Materialise the selected items of ``ds`` into a plain list."""
    return [ds[i] for i in idxs]

# Index-based views over the dataset: items are fetched on demand by the
# loaders instead of being copied into two in-memory lists up front.
train_samples = Subset(dataset, train_idx)
val_samples = Subset(dataset, val_idx)

train_loader = DataLoader(train_samples, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_samples, batch_size=1, shuffle=False, collate_fn=collate_fn)

print("Train size:", len(train_samples), "Val size:", len(val_samples))


FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/kagglehub/datasets/awsaf49/coco-2017-dataset/versions/1/annotations/captions_train2017.json'