In [None]:
# import os
# !wget https://github.com/RUCAIBox/POPE/raw/main/POPEv2/dataset/annotations.json -P /kaggle/working/data/annotations/pope

In [None]:
from pycocotools.coco import COCO

# Load the COCO 2014 train instance annotations and collect the distinct
# category names annotated on one example image.
coco = COCO("/kaggle/input/coco-2014-dataset-for-yolov3/coco2014/annotations/instances_train2014.json")

img_id = 123
ann_ids = coco.getAnnIds(imgIds=img_id)
anns = coco.loadAnns(ann_ids)

# One category lookup per annotation; the set drops repeated object classes.
objects = {
    coco.loadCats(ann["category_id"])[0]["name"]
    for ann in anns
}

In [None]:
import json
import os
from tqdm import tqdm

# Input: LLaVA instruction-tuning conversations (a single JSON file).
LLAVA_JSON = "/kaggle/input/llava-instruct-150k/llava_instruct_150k.json"

# Image folders searched, in this order, when resolving an image file name.
COCO_DIRS = [
    "/kaggle/input/coco-2014-dataset-for-yolov3/coco2014/images/" + split
    for split in ("train2014", "val2014", "test2014")
]

# Output: one JSON record per line, containing only samples whose image exists.
OUTPUT_JSONL = "/kaggle/working/llava_train.jsonl"


In [None]:
def find_img(img_name, search_dirs=None):
    """Resolve an image file name to an existing path in the COCO folders.

    Each directory is probed with every split prefix
    (``COCO_train2014_``, ``COCO_val2014_``, ``COCO_test2014_``) prepended to
    ``img_name``, and finally with ``img_name`` unchanged, so both bare and
    already-prefixed file names are found.

    Args:
        img_name: File name to look for (presumably a bare name such as
            ``000000000042.jpg`` -- confirm against the LLaVA JSON).
        search_dirs: Optional iterable of directories to search. Defaults to
            the notebook-level ``COCO_DIRS``.

    Returns:
        The first matching file path, or ``None`` when ``img_name`` is empty
        or no candidate exists.
    """
    if not img_name:
        # Samples without an image would otherwise probe "<dir>/<prefix>".
        return None

    if search_dirs is None:
        search_dirs = COCO_DIRS

    # Every prefix ends with "_" and is spelled like its split name. The
    # previous "COCO_val2014" / "COCO_tes2014" entries could never match.
    coco_fix = (
        "COCO_train2014_",
        "COCO_val2014_",
        "COCO_test2014_",
        "",  # fallback: the name is stored as-is / already carries a prefix
    )

    for coco_dir in search_dirs:
        for prefix in coco_fix:
            path = os.path.join(coco_dir, prefix + img_name)
            # isfile (not exists) so a directory is never returned as an image.
            if os.path.isfile(path):
                return path
    return None

In [None]:
# Convert the LLaVA JSON into a JSONL training manifest: one record per
# sample, with the image file name replaced by a resolved on-disk path.
# Samples whose image cannot be located are skipped and counted.

# JSON is UTF-8 by specification; pin the encoding instead of relying on the
# platform default.
with open(LLAVA_JSON, "r", encoding="utf-8") as f:
    llava_data = json.load(f)

valid = 0
missing = 0

with open(OUTPUT_JSONL, "w", encoding="utf-8") as out:
    for sample in tqdm(llava_data):
        image_name = sample.get("image", "")
        image_path = find_img(image_name)

        if image_path is None:
            # No matching file in any of the searched folders.
            missing += 1
            continue

        record = {
            "id": sample.get("id"),
            "image": image_path,
            "conversations": sample["conversations"]
        }

        out.write(json.dumps(record) + "\n")
        valid += 1

print(f"Valid samples: {valid}")
print(f"Missing images: {missing}")
print(f"Saved to: {OUTPUT_JSONL}")

In [None]:
import random

# Sanity check: pick one random record from the generated manifest and show
# its image path and the first conversation turn.
with open(OUTPUT_JSONL) as f:
    lines = list(f)

sample = json.loads(random.choice(lines))
print(sample["image"], sample["conversations"][0], sep="\n")


In [None]:
# !pip install --upgrade transformers accelerate bitsandbytes sentencepiece protobuf pillow torch

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    CLIPVisionModel, CLIPImageProcessor,
    LlamaForCausalLM, LlamaTokenizer,
    get_linear_schedule_with_warmup
)
from PIL import Image
import json
from pathlib import Path
from tqdm import tqdm
import os

# Select the compute device: use CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Fetch the training code: clones the VLM repository into ./VLM under the
# current working directory. git refuses to clone into an existing non-empty
# VLM folder, so on a re-run this only prints an error.
!git clone https://github.com/dzungnguyen21/VLM.git

In [None]:
# List the current working directory (the cloned VLM folder should appear).
!ls

In [None]:
# Enter the cloned repo so the relative paths used below (requirements.txt,
# train.py) resolve. The path is relative: running this cell twice in the same
# kernel session looks for VLM/VLM.
%cd VLM

In [None]:
# List the repository contents. `dir` is run as a shell command here
# (presumably the coreutils listing tool, similar to `ls` -- confirm on the
# target image).
!dir

In [None]:
# Install the uv package manager; it is used right after to install the
# repository's requirements.
!pip install uv

In [None]:
# Install the repository's dependencies with uv. By default `uv pip install`
# targets a virtual environment and may abort when none is active (not
# confirmed for this image). `--system` installs into the first Python on
# PATH, i.e. the interpreter that `!python train.py` runs with.
!uv pip install --system -r requirements.txt

In [None]:
# Launch the repository's training script on the JSONL manifest written
# earlier in this notebook (same path as OUTPUT_JSONL); weights go under
# /kaggle/working/weights.
# NOTE(review): flag meanings are inferred from their names -- presumably
# 8-bit model loading, a cap of 10000 training samples, and 5 epochs. Confirm
# against train.py's argument parser.
!python train.py --train_data="/kaggle/working/llava_train.jsonl" --load_in_8bit --limit_sample 10000 --output_dir "/kaggle/working/weights" --epochs 5