# Prepare Dataset

**Run the cells below to connect to Google Drive if you are working on Colab; otherwise skip them and move on to the next section.**

In [None]:
import sys, os
from pathlib import Path

In [None]:
# Colab registers the `google.colab` package in sys.modules, so its presence
# tells us whether this notebook is running there.
IN_COLAB = "google.colab" in sys.modules
ROOT_DIR: Path  # bound in one of the two branches below

if not IN_COLAB:
    # Local run: keep everything under the current working directory.
    ROOT_DIR = Path.cwd()
else:
    # Colab run: mount Google Drive and work inside a project folder on it.
    from google.colab import drive

    drive.mount("/content/drive")
    ROOT_DIR = Path("/content/drive/MyDrive/refcoco_project")  # change the name if you like

# Create the folder (and any missing parents) before anything is written to it.
ROOT_DIR.mkdir(parents=True, exist_ok=True)
print("All data will be saved to:", ROOT_DIR.resolve())


Mounted at /content/drive
All data will be saved to: /content/drive/MyDrive/refcoco_project


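**Run this instead if you are working on a local PC** (set the base folder in the cell below to wherever the project should live).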

In [None]:
DEFAULT_DIR_NAME = "refcoco_project"
BASE_DIR: Path = Path("")           # change: folder that should contain the project directory

# Project folder that holds everything this notebook produces.
PROJ_DIR = BASE_DIR / DEFAULT_DIR_NAME
PROJ_DIR.mkdir(parents=True, exist_ok=True)

# The following cells build their paths from ROOT_DIR, so it has to point at
# the project folder itself (as it does in the Colab branch). Previously
# PROJ_DIR was created but never used, and data landed in the base folder.
ROOT_DIR: Path = PROJ_DIR

print("All data will be saved to:", ROOT_DIR.resolve())

### 1. Installing and PreProcessing

In [None]:
import importlib.util, subprocess

# pip distribution names whose importable module has a different name.
_IMPORT_NAMES = {"pillow": "PIL"}


def _ensure(pkg, module=None):
    """Install *pkg* with pip unless its module is already importable.

    Parameters
    ----------
    pkg : str
        Distribution name handed to ``pip install``.
    module : str, optional
        Name of the module used to test for presence. Defaults to the entry
        for *pkg* in ``_IMPORT_NAMES`` or, failing that, to *pkg* itself.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    if module is None:
        # e.g. "pillow" is imported as "PIL"; probing for "pillow" would
        # never succeed and pip would be re-run on every execution.
        module = _IMPORT_NAMES.get(pkg.lower(), pkg)
    if importlib.util.find_spec(module) is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Let the import system notice the files pip has just written.
        importlib.invalidate_caches()
# Install whichever of this notebook's third-party requirements are missing.
for p in ["datasets", "tqdm", "pillow"]:
    _ensure(p)

import json, random, time, shutil
from urllib.parse import urlparse
import requests
from datasets import load_dataset
from tqdm import tqdm

# Make sure a TEMP folder exists inside the current working directory.
(Path.cwd() / "TEMP").mkdir(parents=True, exist_ok=True)

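Choose how many images you need in each split. The total (`NUM_TRAIN + NUM_VAL + NUM_TEST`) cannot exceed the number of distinct images in the train split. That split has 42,404 samples, and because samples are grouped by image ID the number of distinct images may be lower.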

In [None]:
# Number of images wanted in each split.
NUM_TRAIN = 1500
NUM_VAL = 250
NUM_TEST = 250

OVERSHOOT = 100   # spare ids intended to offset images lost to 404s
TIMEOUT = 30      # per-image request timeout, in seconds

# Working folders, both located under ROOT_DIR.
CACHE_DIR = ROOT_DIR / "TEMP"           # handed to Hugging Face as its cache location
OUT_DIR = ROOT_DIR / "refcoco_2000"     # destination of the exported subset

random.seed(0)    # fixed seed so the shuffled splits are reproducible

The cell below tells Hugging Face where to store its cache (datasets, etc.). If it is not set, the cache goes to the library's default location in your user folder (typically on the `C:` drive on Windows).

In [None]:
import os

# Route the Hugging Face cache into the project's cache folder.
# NOTE(review): `datasets` is imported in an earlier cell; the Hugging Face
# libraries presumably read HF_HOME when first imported, so this may only take
# full effect if it runs before that import — confirm.
os.environ.update(HF_HOME=str(CACHE_DIR.resolve()))

Create the output folders for the train, val and test splits (and the cache folder)

In [None]:
# One output folder per split, followed by the Hugging Face cache folder.
for split in ("train", "val", "test"):
    split_dir = OUT_DIR / split
    split_dir.mkdir(parents=True, exist_ok=True)

CACHE_DIR.mkdir(parents=True, exist_ok=True)

Loading from jxu124/refcoco dataset

In [None]:
print("→ Streaming RefCOCO-UNC 'train' split from Hugging Face …")

# streaming=True makes the split an iterable that is consumed example by
# example; cache_dir keeps whatever Hugging Face stores inside the project.
hf_train = load_dataset("jxu124/refcoco", split="train", streaming=True, cache_dir=str(CACHE_DIR))

→ Streaming RefCOCO-UNC 'train' split from Hugging Face …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Index the samples by image ID, then split the image IDs into train/val/test pools

In [None]:
print("→ Indexing examples by image-id …")

# Group every streamed example under the id of the image it refers to, so all
# samples that share an image end up in the same split.
imgid2samples = {}
for ex in tqdm(hf_train, desc="Indexing"):
    img_id = ex["image_id"]
    imgid2samples.setdefault(img_id, []).append(ex)

ALL_IDS = list(imgid2samples)
total_needed = NUM_TRAIN + NUM_VAL + NUM_TEST
if len(ALL_IDS) < total_needed:
    raise RuntimeError("Dataset unexpectedly small.")

random.shuffle(ALL_IDS)

# Give each split up to OVERSHOOT spare ids so that images which cannot be
# downloaded do not leave the split short of its quota. Previously OVERSHOOT
# was never used and every pool held exactly the quota. The spare count is
# capped by what the dataset can supply, so the pools always stay disjoint.
extra = max(0, min(OVERSHOOT, (len(ALL_IDS) - total_needed) // 3))

train_end = NUM_TRAIN + extra
val_end = train_end + NUM_VAL + extra
test_end = val_end + NUM_TEST + extra

train_ids = set(ALL_IDS[:train_end])
val_ids   = set(ALL_IDS[train_end:val_end])
test_ids  = set(ALL_IDS[val_end:test_end])

→ Indexing examples by image-id …


Indexing: 42404it [00:44, 952.90it/s] 


Due to changes in the dataset, the url needs to be properly created/parsed

In [None]:
COCO_BASE = "http://images.cocodataset.org/"

def canonical_url(p: str) -> str:
    """Return an absolute download URL for an image path.

    Parameters
    ----------
    p : str
        Either an absolute ``http(s)://`` URL, which is returned unchanged,
        or a path relative to ``COCO_BASE``, optionally prefixed with
        ``/`` and/or ``coco/``.

    Returns
    -------
    str
        The URL to request.
    """
    if p.startswith(("http://", "https://")):
        return p

    rel = p.lstrip("/")
    # Strip only a *leading* "coco/". A blanket str.replace would also mangle
    # paths that merely contain that text further in (e.g. ".../mscoco/...").
    prefix = "coco/"
    if rel.startswith(prefix):
        rel = rel[len(prefix):]
    return COCO_BASE + rel

In [None]:
def download_jpeg(rel_or_abs: str, dest: Path, timeout: int = 30) -> bool:
    """
    Stream a JPEG to `dest`.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to `dest` once it is complete and non-empty, so an interrupted or
    empty download never leaves a file at `dest` that a later call would
    mistake for a finished image.

    Parameters
    ----------
    rel_or_abs : str
        Absolute URL or relative image path; resolved with `canonical_url`.
    dest : Path
        Target file. Missing parent directories are created.
    timeout : int
        Timeout in seconds passed to `requests.get`.

    Returns
    -------
    bool
        True  – image is now present on disk (either a non-empty file was
                already there, or it has just been downloaded successfully).
        False – file is still missing due to 404/network failure or an
                empty response body.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)

    # A non-empty file at the destination counts as "already downloaded".
    # An empty one is treated as missing and fetched again.
    if dest.exists() and dest.stat().st_size > 0:
        return True

    url = canonical_url(rel_or_abs)
    partial = dest.with_name(dest.name + ".part")
    try:
        # The context manager releases the streamed connection on every exit
        # path, including the early 404 return.
        with requests.get(url, stream=True, timeout=timeout) as r:
            if r.status_code == 404:
                return False
            r.raise_for_status()

            # Stores the image
            with partial.open("wb") as f_out:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f_out.write(chunk)

        # Nothing was written: report failure and leave no file behind.
        if partial.stat().st_size == 0:
            return False

        partial.replace(dest)
        return True

    except requests.RequestException:
        return False
    finally:
        # Clean up half-written data on any failure, not just request errors.
        # After a successful rename the partial file no longer exists.
        partial.unlink(missing_ok=True)


### 2. Processing the Dataset into final splits and Annotations

In [None]:
# Annotation records collected for each split.
ann_train = []
ann_val = []
ann_test = []

# Number of images successfully written so far, per split.
counters = dict.fromkeys(("train", "val", "test"), 0)

# Sequential number given to the next saved image.
next_new = 1

# One row per split: (name, candidate image ids, annotation list, quota).
splits = [
    ("train", train_ids, ann_train, NUM_TRAIN),
    ("val", val_ids, ann_val, NUM_VAL),
    ("test", test_ids, ann_test, NUM_TEST),
]

This may take up to one hour, depending on the connection speed to the server hosting the images

In [None]:
print("→ Writing JPEGs and JSON annotations …")

for split_name, id_pool, ann_list, need in splits:
    split_dir = OUT_DIR / split_name
    for orig_id in tqdm(id_pool, desc=f"Processing {split_name}"):
        # Once the quota is met, the remaining ids are skipped, not processed.
        if counters[split_name] >= need:
            continue

        samples = imgid2samples[orig_id]
        # All samples of an id share one image; take its path from the first.
        img_url = samples[0]["image_path"]
        dest = split_dir / f"image_{next_new:04d}.jpg"
        if not download_jpeg(img_url, dest, TIMEOUT):
            # Image unavailable: the number stays free for the next candidate.
            continue

        # One annotation record per (sample, caption) pair, all tagged with
        # the new sequential image number.
        ann_list.extend(
            {"image_id": next_new, "phrase": phrase, "bbox": s["bbox"]}
            for s in samples
            for phrase in s["captions"]
        )

        counters[split_name] += 1
        next_new += 1
        if counters[split_name] == need:
            print(f"✓ {split_name} quota reached ({need})")

→ Writing JPEGs and JSON annotations …


Processing train: 100%|██████████| 1500/1500 [27:52<00:00,  1.12s/it]


✓ train quota reached (1500)


Processing val: 100%|██████████| 250/250 [04:38<00:00,  1.11s/it]


✓ val quota reached (250)


Processing test: 100%|██████████| 250/250 [04:43<00:00,  1.13s/it]

✓ test quota reached (250)





Assertions – sanity check only

In [None]:
# Sanity check: every split must hold exactly the requested number of images.
for split, wanted in (("train", NUM_TRAIN), ("val", NUM_VAL), ("test", NUM_TEST)):
    assert counters[split] == wanted, f"{split} short: {counters[split]}"

### 3. Save the Data

In [None]:
print("→ Saving JSON annotation files …")

# Write one pretty-printed JSON file per split next to the image folders.
for filename, annotations in (
    ("anns_train.json", ann_train),
    ("anns_val.json", ann_val),
    ("anns_test.json", ann_test),
):
    target = OUT_DIR / filename
    target.write_text(json.dumps(annotations, indent=2))

print("\n✓ All done — refcoco_2000 subset ready in", OUT_DIR.resolve())

→ Saving JSON annotation files …

✓ All done — refcoco_2000 subset ready in /content/drive/MyDrive/refcoco_project/refcoco_2000
