In [None]:
# Make Google Drive available inside the Colab VM under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# %pip installs into the environment of the running kernel (a plain !pip may
# resolve to a different interpreter); -q keeps the download log out of the notebook.
%pip install -q transformers torch tqdm Pillow

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
import pandas as pd
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
from tqdm import tqdm
import concurrent.futures

# Placeholder locations: the CSV to read and the directory that each row's
# "image_path" value is joined onto later in this cell.
csv_path = "path/to/your/dataset.csv"
base_image_dir = "path/to/images/"

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🚀 Using device: {device}")

# Processor and model come from the same BLIP checkpoint; the model is moved
# to the selected device and switched to eval mode for inference.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
model = model.to(device)
model.eval()

df = pd.read_csv(csv_path)

def generate_caption(image_path):
    """Caption one image file with the module-level BLIP ``processor``/``model``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The decoded caption string. If any step raises, the error is printed
        and an empty string is returned, so the caller gets "" instead of an
        exception.
    """
    try:
        # Open through a context manager so the underlying file handle is
        # released as soon as convert() has produced its own in-memory copy,
        # including when decoding fails part-way.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():  # inference only; no gradients needed
            output = model.generate(**inputs)
        caption = processor.decode(output[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        # Deliberate best-effort: report the failing path and keep going.
        print(f"⚠️ Error processing {image_path}: {e}")
        return ""

def process_images_parallel(image_paths):
    """Caption every path in ``image_paths`` using a pool of 8 worker threads.

    Args:
        image_paths: Sized sequence of image file paths (``len()`` is used for
            the progress-bar total).

    Returns:
        List of captions in the same order as ``image_paths``
        (``Executor.map`` yields results in input order).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
        caption_iter = pool.map(generate_caption, image_paths)
        progress = tqdm(caption_iter, total=len(image_paths), desc="🖼 Generating captions")
        # Drain the iterator inside the `with` so all work finishes before shutdown.
        return list(progress)

chunk_size = 100

# Build the full path for every row, normalising Windows-style backslashes.
image_paths = [
    os.path.join(base_image_dir, rel_path.replace("\\", "/"))
    for rel_path in df["image_path"]
]
captions = []

# Ceiling division: a final partial chunk still counts as a chunk.
total_chunks = (len(image_paths) + chunk_size - 1) // chunk_size
print(f"Total chunks: {total_chunks}")

for chunk_number, start in enumerate(range(0, len(image_paths), chunk_size), start=1):
    batch = image_paths[start:start + chunk_size]
    captions += process_images_parallel(batch)
    remaining_chunks = total_chunks - chunk_number
    print(f"Processed chunk {chunk_number}/{total_chunks}, Remaining chunks: {remaining_chunks}")

# Captions were appended in row order, so they line up with the DataFrame index.
df["generated_caption"] = captions
df.to_csv("path/to/output.csv", index=False)
print("✅ Captions added and CSV saved.")


🚀 Using device: cuda
Total chunks: 185


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 1/185, Remaining chunks: 184


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Processed chunk 2/185, Remaining chunks: 183


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.44it/s]


Processed chunk 3/185, Remaining chunks: 182


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


Processed chunk 4/185, Remaining chunks: 181


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.92it/s]


Processed chunk 5/185, Remaining chunks: 180


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.11it/s]


Processed chunk 6/185, Remaining chunks: 179


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


Processed chunk 7/185, Remaining chunks: 178


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 8/185, Remaining chunks: 177


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.17it/s]


Processed chunk 9/185, Remaining chunks: 176


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Processed chunk 10/185, Remaining chunks: 175


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.07it/s]


Processed chunk 11/185, Remaining chunks: 174


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


Processed chunk 12/185, Remaining chunks: 173


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.05it/s]


Processed chunk 13/185, Remaining chunks: 172


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.18it/s]


Processed chunk 14/185, Remaining chunks: 171


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


Processed chunk 15/185, Remaining chunks: 170


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.30it/s]


Processed chunk 16/185, Remaining chunks: 169


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 17/185, Remaining chunks: 168


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]


Processed chunk 18/185, Remaining chunks: 167


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


Processed chunk 19/185, Remaining chunks: 166


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.05it/s]


Processed chunk 20/185, Remaining chunks: 165


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.97it/s]


Processed chunk 21/185, Remaining chunks: 164


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 22/185, Remaining chunks: 163


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 23/185, Remaining chunks: 162


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.34it/s]


Processed chunk 24/185, Remaining chunks: 161


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.38it/s]


Processed chunk 25/185, Remaining chunks: 160


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]


Processed chunk 26/185, Remaining chunks: 159


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


Processed chunk 27/185, Remaining chunks: 158


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.17it/s]


Processed chunk 28/185, Remaining chunks: 157


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Processed chunk 29/185, Remaining chunks: 156


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 30/185, Remaining chunks: 155


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.20it/s]


Processed chunk 31/185, Remaining chunks: 154


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.34it/s]


Processed chunk 32/185, Remaining chunks: 153


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 33/185, Remaining chunks: 152


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 34/185, Remaining chunks: 151


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


Processed chunk 35/185, Remaining chunks: 150


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 36/185, Remaining chunks: 149


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


Processed chunk 37/185, Remaining chunks: 148


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.48it/s]


Processed chunk 38/185, Remaining chunks: 147


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.40it/s]


Processed chunk 39/185, Remaining chunks: 146


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


Processed chunk 40/185, Remaining chunks: 145


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.56it/s]


Processed chunk 41/185, Remaining chunks: 144


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 42/185, Remaining chunks: 143


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


Processed chunk 43/185, Remaining chunks: 142


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 44/185, Remaining chunks: 141


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.43it/s]


Processed chunk 45/185, Remaining chunks: 140


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


Processed chunk 46/185, Remaining chunks: 139


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


Processed chunk 47/185, Remaining chunks: 138


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.39it/s]


Processed chunk 48/185, Remaining chunks: 137


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Processed chunk 49/185, Remaining chunks: 136


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.47it/s]


Processed chunk 50/185, Remaining chunks: 135


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 51/185, Remaining chunks: 134


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.38it/s]


Processed chunk 52/185, Remaining chunks: 133


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 53/185, Remaining chunks: 132


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Processed chunk 54/185, Remaining chunks: 131


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 55/185, Remaining chunks: 130


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.45it/s]


Processed chunk 56/185, Remaining chunks: 129


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 57/185, Remaining chunks: 128


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 58/185, Remaining chunks: 127


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 59/185, Remaining chunks: 126


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 60/185, Remaining chunks: 125


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.46it/s]


Processed chunk 61/185, Remaining chunks: 124


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.30it/s]


Processed chunk 62/185, Remaining chunks: 123


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 63/185, Remaining chunks: 122


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 64/185, Remaining chunks: 121


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]


Processed chunk 65/185, Remaining chunks: 120


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 66/185, Remaining chunks: 119


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


Processed chunk 67/185, Remaining chunks: 118


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Processed chunk 68/185, Remaining chunks: 117


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.34it/s]


Processed chunk 69/185, Remaining chunks: 116


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 70/185, Remaining chunks: 115


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]


Processed chunk 71/185, Remaining chunks: 114


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Processed chunk 72/185, Remaining chunks: 113


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 73/185, Remaining chunks: 112


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 74/185, Remaining chunks: 111


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


Processed chunk 75/185, Remaining chunks: 110


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 76/185, Remaining chunks: 109


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


Processed chunk 77/185, Remaining chunks: 108


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.20it/s]


Processed chunk 78/185, Remaining chunks: 107


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.44it/s]


Processed chunk 79/185, Remaining chunks: 106


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]


Processed chunk 80/185, Remaining chunks: 105


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Processed chunk 81/185, Remaining chunks: 104


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 82/185, Remaining chunks: 103


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 83/185, Remaining chunks: 102


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 84/185, Remaining chunks: 101


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


Processed chunk 85/185, Remaining chunks: 100


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


Processed chunk 86/185, Remaining chunks: 99


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.19it/s]


Processed chunk 87/185, Remaining chunks: 98


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 88/185, Remaining chunks: 97


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 89/185, Remaining chunks: 96


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


Processed chunk 90/185, Remaining chunks: 95


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.47it/s]


Processed chunk 91/185, Remaining chunks: 94


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 92/185, Remaining chunks: 93


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Processed chunk 93/185, Remaining chunks: 92


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 94/185, Remaining chunks: 91


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 95/185, Remaining chunks: 90


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.98it/s]


Processed chunk 96/185, Remaining chunks: 89


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]


Processed chunk 97/185, Remaining chunks: 88


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 98/185, Remaining chunks: 87


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 99/185, Remaining chunks: 86


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


Processed chunk 100/185, Remaining chunks: 85


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Processed chunk 101/185, Remaining chunks: 84


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.52it/s]


Processed chunk 102/185, Remaining chunks: 83


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


Processed chunk 103/185, Remaining chunks: 82


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Processed chunk 104/185, Remaining chunks: 81


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 105/185, Remaining chunks: 80


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 106/185, Remaining chunks: 79


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.34it/s]


Processed chunk 107/185, Remaining chunks: 78


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.18it/s]


Processed chunk 108/185, Remaining chunks: 77


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


Processed chunk 109/185, Remaining chunks: 76


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.45it/s]


Processed chunk 110/185, Remaining chunks: 75


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.26it/s]


Processed chunk 111/185, Remaining chunks: 74


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 112/185, Remaining chunks: 73


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.39it/s]


Processed chunk 113/185, Remaining chunks: 72


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Processed chunk 114/185, Remaining chunks: 71


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.29it/s]


Processed chunk 115/185, Remaining chunks: 70


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]


Processed chunk 116/185, Remaining chunks: 69


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


Processed chunk 117/185, Remaining chunks: 68


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 118/185, Remaining chunks: 67


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.19it/s]


Processed chunk 119/185, Remaining chunks: 66


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]


Processed chunk 120/185, Remaining chunks: 65


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]


Processed chunk 121/185, Remaining chunks: 64


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 122/185, Remaining chunks: 63


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 123/185, Remaining chunks: 62


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


Processed chunk 124/185, Remaining chunks: 61


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]


Processed chunk 125/185, Remaining chunks: 60


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.17it/s]


Processed chunk 126/185, Remaining chunks: 59


🖼 Generating captions: 100%|██████████| 100/100 [00:40<00:00,  2.49it/s]


Processed chunk 127/185, Remaining chunks: 58


🖼 Generating captions: 100%|██████████| 100/100 [00:41<00:00,  2.44it/s]


Processed chunk 128/185, Remaining chunks: 57


🖼 Generating captions: 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Processed chunk 129/185, Remaining chunks: 56


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.33it/s]


Processed chunk 130/185, Remaining chunks: 55


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 131/185, Remaining chunks: 54


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.32it/s]


Processed chunk 132/185, Remaining chunks: 53


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Processed chunk 133/185, Remaining chunks: 52


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.06it/s]


Processed chunk 134/185, Remaining chunks: 51


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 135/185, Remaining chunks: 50


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Processed chunk 136/185, Remaining chunks: 49


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.25it/s]


Processed chunk 137/185, Remaining chunks: 48


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.26it/s]


Processed chunk 138/185, Remaining chunks: 47


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 139/185, Remaining chunks: 46


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.10it/s]


Processed chunk 140/185, Remaining chunks: 45


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Processed chunk 141/185, Remaining chunks: 44


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]


Processed chunk 142/185, Remaining chunks: 43


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.08it/s]


Processed chunk 143/185, Remaining chunks: 42


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


Processed chunk 144/185, Remaining chunks: 41


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.46it/s]


Processed chunk 145/185, Remaining chunks: 40


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.49it/s]


Processed chunk 146/185, Remaining chunks: 39


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.47it/s]


Processed chunk 147/185, Remaining chunks: 38


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Processed chunk 148/185, Remaining chunks: 37


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.50it/s]


Processed chunk 149/185, Remaining chunks: 36


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


Processed chunk 150/185, Remaining chunks: 35


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.57it/s]


Processed chunk 151/185, Remaining chunks: 34


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.59it/s]


Processed chunk 152/185, Remaining chunks: 33


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.44it/s]


Processed chunk 153/185, Remaining chunks: 32


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.45it/s]


Processed chunk 154/185, Remaining chunks: 31


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.70it/s]


Processed chunk 155/185, Remaining chunks: 30


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.55it/s]


Processed chunk 156/185, Remaining chunks: 29


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.59it/s]


Processed chunk 157/185, Remaining chunks: 28


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.67it/s]


Processed chunk 158/185, Remaining chunks: 27


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.48it/s]


Processed chunk 159/185, Remaining chunks: 26


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.50it/s]


Processed chunk 160/185, Remaining chunks: 25


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.54it/s]


Processed chunk 161/185, Remaining chunks: 24


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.24it/s]


Processed chunk 162/185, Remaining chunks: 23


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.40it/s]


Processed chunk 163/185, Remaining chunks: 22


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  3.02it/s]


Processed chunk 164/185, Remaining chunks: 21


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.07it/s]


Processed chunk 165/185, Remaining chunks: 20


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Processed chunk 166/185, Remaining chunks: 19


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.05it/s]


Processed chunk 167/185, Remaining chunks: 18


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.07it/s]


Processed chunk 168/185, Remaining chunks: 17


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.12it/s]


Processed chunk 169/185, Remaining chunks: 16


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  3.00it/s]


Processed chunk 170/185, Remaining chunks: 15


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.30it/s]


Processed chunk 171/185, Remaining chunks: 14


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]


Processed chunk 172/185, Remaining chunks: 13


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.20it/s]


Processed chunk 173/185, Remaining chunks: 12


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Processed chunk 174/185, Remaining chunks: 11


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Processed chunk 175/185, Remaining chunks: 10


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.15it/s]


Processed chunk 176/185, Remaining chunks: 9


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  3.01it/s]


Processed chunk 177/185, Remaining chunks: 8


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.05it/s]


Processed chunk 178/185, Remaining chunks: 7


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.17it/s]


Processed chunk 179/185, Remaining chunks: 6


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


Processed chunk 180/185, Remaining chunks: 5


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Processed chunk 181/185, Remaining chunks: 4


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.18it/s]


Processed chunk 182/185, Remaining chunks: 3


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.07it/s]


Processed chunk 183/185, Remaining chunks: 2


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Processed chunk 184/185, Remaining chunks: 1


🖼 Generating captions: 100%|██████████| 16/16 [00:04<00:00,  3.22it/s]


Processed chunk 185/185, Remaining chunks: 0
✅ Captions added and CSV saved.


In [None]:
# Debug aid: list the contents of the current working directory.
!ls


drive  sample_data


In [None]:
# Remove stray torch.py / torch.pyc from the working directory (presumably to
# stop a local file shadowing the installed torch package — confirm intent).
# -f makes the cell idempotent: no error when the files are already absent.
!rm -f torch.py torch.pyc


rm: cannot remove 'torch.py': No such file or directory
rm: cannot remove 'torch.pyc': No such file or directory


In [None]:
# Reinstall PyTorch (CUDA 11.8 wheels) and transformers. %pip is used so the
# change lands in the environment of the running kernel rather than whichever
# pip happens to be first on the shell PATH.
# NOTE(review): modules already imported in this session stay loaded — restart
# the runtime after this cell before importing torch/transformers again.
%pip uninstall -y torch torchvision torchaudio transformers

%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
%pip install transformers


Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m109.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [None]:
import os
import pandas as pd
from transformers import AutoProcessor, GitForCausalLM
from PIL import Image
import torch
from tqdm import tqdm
import concurrent.futures

# Placeholder locations: the CSV to read and the directory that each row's
# "image_path" value is joined onto later in this cell.
csv_path = "path/to/your/dataset.csv"
base_image_dir = "path/to/images/"

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🚀 Using device: {device}")

# Processor and model come from the same GIT checkpoint; the model is moved
# to the selected device and switched to eval mode for inference.
git_checkpoint = "microsoft/git-base"
processor = AutoProcessor.from_pretrained(git_checkpoint)
model = GitForCausalLM.from_pretrained(git_checkpoint)
model = model.to(device)
model.eval()

df = pd.read_csv(csv_path)

def generate_caption(image_path):
    """Caption one image file with the module-level GIT ``processor``/``model``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The decoded, whitespace-stripped caption string. If any step raises,
        the error is printed and an empty string is returned, so the caller
        gets "" instead of an exception.
    """
    try:
        # Open through a context manager so the underlying file handle is
        # released as soon as convert() has produced its own in-memory copy.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

        with torch.no_grad():  # inference only; no gradients needed
            # No text prompt is passed: GIT's documented captioning recipe
            # conditions on pixel_values alone, and any prompt tokens supplied
            # as input_ids are returned by generate() and would be echoed at
            # the start of the decoded caption.
            generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        return caption
    except Exception as e:
        # Deliberate best-effort: report the failing path and keep going.
        print(f"⚠️ Error processing {image_path}: {e}")
        return ""

def process_images_parallel(image_paths):
    """Caption a list of images using a pool of eight worker threads.

    Args:
        image_paths: Sequence of image paths; its length sizes the progress bar.

    Returns:
        A list of captions in the same order as ``image_paths``
        (``executor.map`` yields results in input order).
    """
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=8)
    with pool:
        caption_stream = pool.map(generate_caption, image_paths)
        progress = tqdm(caption_stream, total=len(image_paths), desc="🖼 Generating captions")
        # Draining the iterator here waits for every worker before the pool shuts down.
        return list(progress)

# Captions are produced in slices of `chunk_size` paths so progress can be reported per slice.
chunk_size = 100
# Backslashes in the CSV paths are turned into forward slashes before joining onto the image root.
image_paths = [
    os.path.join(base_image_dir, relative_path.replace("\\", "/"))
    for relative_path in df["image_path"].tolist()
]
captions = []

# Ceiling division: a trailing partial slice still counts as one chunk.
total_chunks = (len(image_paths) + chunk_size - 1) // chunk_size
print(f"Total chunks: {total_chunks}")

for chunk_number, start in enumerate(range(0, len(image_paths), chunk_size), start=1):
    captions += process_images_parallel(image_paths[start:start + chunk_size])
    remaining_chunks = total_chunks - chunk_number
    print(f"Processed chunk {chunk_number}/{total_chunks}, Remaining chunks: {remaining_chunks}")

# One caption per row, in row order.
df["generated_caption"] = captions
df.to_csv("path/to/output_git_base.csv", index=False)
print("✅ Captions added and CSV saved.")


🚀 Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Total chunks: 185


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.60it/s]


Processed chunk 1/185, Remaining chunks: 184


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.94it/s]


Processed chunk 2/185, Remaining chunks: 183


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.97it/s]


Processed chunk 3/185, Remaining chunks: 182


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.47it/s]


Processed chunk 4/185, Remaining chunks: 181


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.60it/s]


Processed chunk 5/185, Remaining chunks: 180


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.39it/s]


Processed chunk 6/185, Remaining chunks: 179


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.60it/s]


Processed chunk 7/185, Remaining chunks: 178


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.51it/s]


Processed chunk 8/185, Remaining chunks: 177


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.00it/s]


Processed chunk 9/185, Remaining chunks: 176


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.33it/s]


Processed chunk 10/185, Remaining chunks: 175


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Processed chunk 11/185, Remaining chunks: 174


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.37it/s]


Processed chunk 12/185, Remaining chunks: 173


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.59it/s]


Processed chunk 13/185, Remaining chunks: 172


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.66it/s]


Processed chunk 14/185, Remaining chunks: 171


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.30it/s]


Processed chunk 15/185, Remaining chunks: 170


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.39it/s]


Processed chunk 16/185, Remaining chunks: 169


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.12it/s]


Processed chunk 17/185, Remaining chunks: 168


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.30it/s]


Processed chunk 18/185, Remaining chunks: 167


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.24it/s]


Processed chunk 19/185, Remaining chunks: 166


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.75it/s]


Processed chunk 20/185, Remaining chunks: 165


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.25it/s]


Processed chunk 21/185, Remaining chunks: 164


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.90it/s]


Processed chunk 22/185, Remaining chunks: 163


🖼 Generating captions: 100%|██████████| 100/100 [00:38<00:00,  2.61it/s]


Processed chunk 23/185, Remaining chunks: 162


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Processed chunk 24/185, Remaining chunks: 161


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.29it/s]


Processed chunk 25/185, Remaining chunks: 160


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.39it/s]


Processed chunk 26/185, Remaining chunks: 159


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


Processed chunk 27/185, Remaining chunks: 158


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.13it/s]


Processed chunk 28/185, Remaining chunks: 157


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]


Processed chunk 29/185, Remaining chunks: 156


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]


Processed chunk 30/185, Remaining chunks: 155


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.72it/s]


Processed chunk 31/185, Remaining chunks: 154


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Processed chunk 32/185, Remaining chunks: 153


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]


Processed chunk 33/185, Remaining chunks: 152


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


Processed chunk 34/185, Remaining chunks: 151


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.37it/s]


Processed chunk 35/185, Remaining chunks: 150


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.68it/s]


Processed chunk 36/185, Remaining chunks: 149


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.48it/s]


Processed chunk 37/185, Remaining chunks: 148


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


Processed chunk 38/185, Remaining chunks: 147


🖼 Generating captions: 100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


Processed chunk 39/185, Remaining chunks: 146


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.13it/s]


Processed chunk 40/185, Remaining chunks: 145


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.44it/s]


Processed chunk 41/185, Remaining chunks: 144


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


Processed chunk 42/185, Remaining chunks: 143


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.94it/s]


Processed chunk 43/185, Remaining chunks: 142


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.20it/s]


Processed chunk 44/185, Remaining chunks: 141


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.88it/s]


Processed chunk 45/185, Remaining chunks: 140


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


Processed chunk 46/185, Remaining chunks: 139


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.10it/s]


Processed chunk 47/185, Remaining chunks: 138


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


Processed chunk 48/185, Remaining chunks: 137


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.65it/s]


Processed chunk 49/185, Remaining chunks: 136


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.59it/s]


Processed chunk 50/185, Remaining chunks: 135


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


Processed chunk 51/185, Remaining chunks: 134


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.14it/s]


Processed chunk 52/185, Remaining chunks: 133


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.22it/s]


Processed chunk 53/185, Remaining chunks: 132


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


Processed chunk 54/185, Remaining chunks: 131


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]


Processed chunk 55/185, Remaining chunks: 130


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]


Processed chunk 56/185, Remaining chunks: 129


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.82it/s]


Processed chunk 57/185, Remaining chunks: 128


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


Processed chunk 58/185, Remaining chunks: 127


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s]


Processed chunk 59/185, Remaining chunks: 126


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.37it/s]


Processed chunk 60/185, Remaining chunks: 125


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.86it/s]


Processed chunk 61/185, Remaining chunks: 124


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.06it/s]


Processed chunk 62/185, Remaining chunks: 123


🖼 Generating captions: 100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


Processed chunk 63/185, Remaining chunks: 122


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


Processed chunk 64/185, Remaining chunks: 121


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


Processed chunk 65/185, Remaining chunks: 120


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


Processed chunk 66/185, Remaining chunks: 119


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.76it/s]


Processed chunk 67/185, Remaining chunks: 118


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Processed chunk 68/185, Remaining chunks: 117


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.20it/s]


Processed chunk 69/185, Remaining chunks: 116


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


Processed chunk 70/185, Remaining chunks: 115


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


Processed chunk 71/185, Remaining chunks: 114


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.15it/s]


Processed chunk 72/185, Remaining chunks: 113


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.13it/s]


Processed chunk 73/185, Remaining chunks: 112


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  5.00it/s]


Processed chunk 74/185, Remaining chunks: 111


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


Processed chunk 75/185, Remaining chunks: 110


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  5.00it/s]


Processed chunk 76/185, Remaining chunks: 109


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.78it/s]


Processed chunk 77/185, Remaining chunks: 108


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.17it/s]


Processed chunk 78/185, Remaining chunks: 107


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.19it/s]


Processed chunk 79/185, Remaining chunks: 106


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


Processed chunk 80/185, Remaining chunks: 105


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.73it/s]


Processed chunk 81/185, Remaining chunks: 104


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


Processed chunk 82/185, Remaining chunks: 103


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.16it/s]


Processed chunk 83/185, Remaining chunks: 102


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.33it/s]


Processed chunk 84/185, Remaining chunks: 101


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Processed chunk 85/185, Remaining chunks: 100


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.38it/s]


Processed chunk 86/185, Remaining chunks: 99


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


Processed chunk 87/185, Remaining chunks: 98


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.40it/s]


Processed chunk 88/185, Remaining chunks: 97


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.48it/s]


Processed chunk 89/185, Remaining chunks: 96


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.10it/s]


Processed chunk 90/185, Remaining chunks: 95


🖼 Generating captions: 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Processed chunk 91/185, Remaining chunks: 94


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.21it/s]


Processed chunk 92/185, Remaining chunks: 93


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.30it/s]


Processed chunk 93/185, Remaining chunks: 92


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.39it/s]


Processed chunk 94/185, Remaining chunks: 91


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.11it/s]


Processed chunk 95/185, Remaining chunks: 90


🖼 Generating captions: 100%|██████████| 100/100 [01:13<00:00,  1.37it/s]


Processed chunk 96/185, Remaining chunks: 89


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.19it/s]


Processed chunk 97/185, Remaining chunks: 88


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.35it/s]


Processed chunk 98/185, Remaining chunks: 87


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.12it/s]


Processed chunk 99/185, Remaining chunks: 86


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.31it/s]


Processed chunk 100/185, Remaining chunks: 85


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.36it/s]


Processed chunk 101/185, Remaining chunks: 84


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Processed chunk 102/185, Remaining chunks: 83


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Processed chunk 103/185, Remaining chunks: 82


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


Processed chunk 104/185, Remaining chunks: 81


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.46it/s]


Processed chunk 105/185, Remaining chunks: 80


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.71it/s]


Processed chunk 106/185, Remaining chunks: 79


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.48it/s]


Processed chunk 107/185, Remaining chunks: 78


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.66it/s]


Processed chunk 108/185, Remaining chunks: 77


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.70it/s]


Processed chunk 109/185, Remaining chunks: 76


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.98it/s]


Processed chunk 110/185, Remaining chunks: 75


🖼 Generating captions: 100%|██████████| 100/100 [00:54<00:00,  1.84it/s]


Processed chunk 111/185, Remaining chunks: 74


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


Processed chunk 112/185, Remaining chunks: 73


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Processed chunk 113/185, Remaining chunks: 72


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


Processed chunk 114/185, Remaining chunks: 71


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Processed chunk 115/185, Remaining chunks: 70


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


Processed chunk 116/185, Remaining chunks: 69


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


Processed chunk 117/185, Remaining chunks: 68


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.29it/s]


Processed chunk 118/185, Remaining chunks: 67


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.57it/s]


Processed chunk 119/185, Remaining chunks: 66


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.08it/s]


Processed chunk 120/185, Remaining chunks: 65


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Processed chunk 121/185, Remaining chunks: 64


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


Processed chunk 122/185, Remaining chunks: 63


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.52it/s]


Processed chunk 123/185, Remaining chunks: 62


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.20it/s]


Processed chunk 124/185, Remaining chunks: 61


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.25it/s]


Processed chunk 125/185, Remaining chunks: 60


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Processed chunk 126/185, Remaining chunks: 59


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.74it/s]


Processed chunk 127/185, Remaining chunks: 58


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.89it/s]


Processed chunk 128/185, Remaining chunks: 57


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.36it/s]


Processed chunk 129/185, Remaining chunks: 56


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.67it/s]


Processed chunk 130/185, Remaining chunks: 55


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.76it/s]


Processed chunk 131/185, Remaining chunks: 54


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]


Processed chunk 132/185, Remaining chunks: 53


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.45it/s]


Processed chunk 133/185, Remaining chunks: 52


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Processed chunk 134/185, Remaining chunks: 51


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]


Processed chunk 135/185, Remaining chunks: 50


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.69it/s]


Processed chunk 136/185, Remaining chunks: 49


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.58it/s]


Processed chunk 137/185, Remaining chunks: 48


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.18it/s]


Processed chunk 138/185, Remaining chunks: 47


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.31it/s]


Processed chunk 139/185, Remaining chunks: 46


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.78it/s]


Processed chunk 140/185, Remaining chunks: 45


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.50it/s]


Processed chunk 141/185, Remaining chunks: 44


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.23it/s]


Processed chunk 142/185, Remaining chunks: 43


🖼 Generating captions: 100%|██████████| 100/100 [00:50<00:00,  1.98it/s]


Processed chunk 143/185, Remaining chunks: 42


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]


Processed chunk 144/185, Remaining chunks: 41


🖼 Generating captions: 100%|██████████| 100/100 [00:28<00:00,  3.51it/s]


Processed chunk 145/185, Remaining chunks: 40


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.26it/s]


Processed chunk 146/185, Remaining chunks: 39


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.36it/s]


Processed chunk 147/185, Remaining chunks: 38


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]


Processed chunk 148/185, Remaining chunks: 37


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.47it/s]


Processed chunk 149/185, Remaining chunks: 36


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.92it/s]


Processed chunk 150/185, Remaining chunks: 35


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.10it/s]


Processed chunk 151/185, Remaining chunks: 34


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.94it/s]


Processed chunk 152/185, Remaining chunks: 33


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.16it/s]


Processed chunk 153/185, Remaining chunks: 32


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.04it/s]


Processed chunk 154/185, Remaining chunks: 31


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.69it/s]


Processed chunk 155/185, Remaining chunks: 30


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.18it/s]


Processed chunk 156/185, Remaining chunks: 29


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.37it/s]


Processed chunk 157/185, Remaining chunks: 28


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


Processed chunk 158/185, Remaining chunks: 27


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.86it/s]


Processed chunk 159/185, Remaining chunks: 26


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.37it/s]


Processed chunk 160/185, Remaining chunks: 25


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.39it/s]


Processed chunk 161/185, Remaining chunks: 24


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.20it/s]


Processed chunk 162/185, Remaining chunks: 23


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.64it/s]


Processed chunk 163/185, Remaining chunks: 22


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.10it/s]


Processed chunk 164/185, Remaining chunks: 21


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.43it/s]


Processed chunk 165/185, Remaining chunks: 20


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.47it/s]


Processed chunk 166/185, Remaining chunks: 19


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.59it/s]


Processed chunk 167/185, Remaining chunks: 18


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.65it/s]


Processed chunk 168/185, Remaining chunks: 17


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.57it/s]


Processed chunk 169/185, Remaining chunks: 16


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.60it/s]


Processed chunk 170/185, Remaining chunks: 15


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


Processed chunk 171/185, Remaining chunks: 14


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.40it/s]


Processed chunk 172/185, Remaining chunks: 13


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.26it/s]


Processed chunk 173/185, Remaining chunks: 12


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Processed chunk 174/185, Remaining chunks: 11


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.33it/s]


Processed chunk 175/185, Remaining chunks: 10


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.69it/s]


Processed chunk 176/185, Remaining chunks: 9


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.21it/s]


Processed chunk 177/185, Remaining chunks: 8


🖼 Generating captions: 100%|██████████| 100/100 [00:30<00:00,  3.27it/s]


Processed chunk 178/185, Remaining chunks: 7


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Processed chunk 179/185, Remaining chunks: 6


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.46it/s]


Processed chunk 180/185, Remaining chunks: 5


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.40it/s]


Processed chunk 181/185, Remaining chunks: 4


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.64it/s]


Processed chunk 182/185, Remaining chunks: 3


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.70it/s]


Processed chunk 183/185, Remaining chunks: 2


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.60it/s]


Processed chunk 184/185, Remaining chunks: 1


🖼 Generating captions: 100%|██████████| 16/16 [00:05<00:00,  2.67it/s]


Processed chunk 185/185, Remaining chunks: 0
✅ Captions added and CSV saved.


In [None]:
import os
import pandas as pd
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image, UnidentifiedImageError
import torch
from tqdm import tqdm
import concurrent.futures

# Input CSV; the code below reads its "unique_id" column.
csv_path = "path/to/your/dataset.csv"
# Directory holding the images, looked up as "<unique_id>.jpg".
image_folder = "path/to/images"
# Destination of the captioned CSV written at the end of this cell.
output_path = "path/to/output.csv"

# Load the full dataset; it is narrowed to rows with a valid image further down.
df = pd.read_csv(csv_path)

def is_valid_image(unique_id):
    """Report whether ``<image_folder>/<unique_id>.jpg`` exists and passes Pillow's integrity check.

    Args:
        unique_id: Identifier used to build the file name ``f"{unique_id}.jpg"``.

    Returns:
        ``True`` when the file exists and ``Image.verify()`` raises nothing;
        ``False`` when the file is missing or Pillow rejects it.

    Note:
        ``verify()`` checks the file without decoding the pixel data, so a file
        that passes here can still fail when it is fully loaded later.
    """
    image_path = os.path.join(image_folder, f"{unique_id}.jpg")
    if not os.path.exists(image_path):
        return False
    try:
        # The context manager releases the file handle on every path.
        with Image.open(image_path) as img:
            img.verify()
    except (UnidentifiedImageError, OSError, SyntaxError, Image.DecompressionBombError):
        # SyntaxError: raised by some plugins (e.g. PNG) from verify() on broken data.
        # DecompressionBombError: raised by Image.open for implausibly large images.
        # Either one means "not usable" and must not abort the validation loop.
        return False
    return True

# Collect ids whose image passes validation, stopping once 20000 have been found.
valid_ids = []
for uid in tqdm(df["unique_id"], desc="Checking images"):
    if not is_valid_image(uid):
        continue
    valid_ids.append(uid)
    if len(valid_ids) >= 20000:
        break

# Keep only rows with a validated image and renumber them from zero.
has_valid_image = df["unique_id"].isin(valid_ids)
df = df[has_valid_image].reset_index(drop=True)

# Model, image processor and tokenizer are all loaded from the same checkpoint.
vit_gpt2_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(vit_gpt2_checkpoint)
processor = ViTImageProcessor.from_pretrained(vit_gpt2_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(vit_gpt2_checkpoint)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()

def generate_caption(image_path):
    """Caption one image with the ViT-GPT2 model using 4-beam search.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded first sequence with surrounding whitespace stripped, or
        ``""`` if any step raises (the error is printed rather than re-raised).
    """
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        encoded = processor(images=rgb_image, return_tensors="pt")
        pixel_values = encoded.pixel_values.to(device)
        with torch.no_grad():
            output_ids = model.generate(pixel_values, max_length=50, num_beams=4)
        first_sequence = output_ids[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True).strip()
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return ""

def process_images_parallel(image_paths, max_workers=8):
    """Caption a batch of images using a pool of worker threads.

    Every path is passed to ``generate_caption``; a tqdm bar labelled
    "Generating captions" tracks progress.

    Args:
        image_paths: Sequence of image paths (``len()`` is used for the
            progress-bar total).
        max_workers: Size of the thread pool. Defaults to 8, the value that
            was previously hard-coded.

    Returns:
        A list of captions in the same order as ``image_paths``
        (``executor.map`` yields results in input order).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map iterator while the pool is still alive.
        return list(
            tqdm(
                executor.map(generate_caption, image_paths),
                total=len(image_paths),
                desc="Generating captions",
            )
        )

# One "<image_folder>/<unique_id>.jpg" path per remaining row, in row order.
image_paths = [os.path.join(image_folder, f"{uid}.jpg") for uid in df["unique_id"]]

chunk_size = 100
# Ceiling division: number of chunks needed to cover every path.
total_chunks = -(-len(image_paths) // chunk_size)

# Caption the images chunk by chunk, accumulating results in row order.
captions = []
for i in range(0, len(image_paths), chunk_size):
    captions += process_images_parallel(image_paths[i:i + chunk_size])

# Attach the captions as a new column and write the result to disk.
df["generated_caption"] = captions
df.to_csv(output_path, index=False)
print("Captions saved.")


📥 Loading CSV...
🔎 Validating images...


🔍 Checking images: 100%|██████████| 40945/40945 [57:26<00:00, 11.88it/s]


✅ Found 7736 valid images.
📦 Loading model...


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

🚀 Using device: cuda
🗂 Building image paths...
🧩 Total chunks: 78
🔄 Processing chunk 1 of 78...


🖼 Generating captions:   0%|          | 0/100 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend

✅ Chunk 1 done.
🔄 Processing chunk 2 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:41<00:00,  2.40it/s]


✅ Chunk 2 done.
🔄 Processing chunk 3 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:37<00:00,  2.64it/s]


✅ Chunk 3 done.
🔄 Processing chunk 4 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


✅ Chunk 4 done.
🔄 Processing chunk 5 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.93it/s]


✅ Chunk 5 done.
🔄 Processing chunk 6 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.94it/s]


✅ Chunk 6 done.
🔄 Processing chunk 7 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.80it/s]


✅ Chunk 7 done.
🔄 Processing chunk 8 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


✅ Chunk 8 done.
🔄 Processing chunk 9 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


✅ Chunk 9 done.
🔄 Processing chunk 10 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


✅ Chunk 10 done.
🔄 Processing chunk 11 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.79it/s]


✅ Chunk 11 done.
🔄 Processing chunk 12 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


✅ Chunk 12 done.
🔄 Processing chunk 13 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:39<00:00,  2.56it/s]


✅ Chunk 13 done.
🔄 Processing chunk 14 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:39<00:00,  2.54it/s]


✅ Chunk 14 done.
🔄 Processing chunk 15 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


✅ Chunk 15 done.
🔄 Processing chunk 16 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.81it/s]


✅ Chunk 16 done.
🔄 Processing chunk 17 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s]


✅ Chunk 17 done.
🔄 Processing chunk 18 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.83it/s]


✅ Chunk 18 done.
🔄 Processing chunk 19 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.93it/s]


✅ Chunk 19 done.
🔄 Processing chunk 20 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.90it/s]


✅ Chunk 20 done.
🔄 Processing chunk 21 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s]


✅ Chunk 21 done.
🔄 Processing chunk 22 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.89it/s]


✅ Chunk 22 done.
🔄 Processing chunk 23 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.79it/s]


✅ Chunk 23 done.
🔄 Processing chunk 24 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


✅ Chunk 24 done.
🔄 Processing chunk 25 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.89it/s]


✅ Chunk 25 done.
🔄 Processing chunk 26 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.93it/s]


✅ Chunk 26 done.
🔄 Processing chunk 27 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:32<00:00,  3.03it/s]


✅ Chunk 27 done.
🔄 Processing chunk 28 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.89it/s]


✅ Chunk 28 done.
🔄 Processing chunk 29 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:37<00:00,  2.69it/s]


✅ Chunk 29 done.
🔄 Processing chunk 30 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:36<00:00,  2.71it/s]


✅ Chunk 30 done.
🔄 Processing chunk 31 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:39<00:00,  2.55it/s]


✅ Chunk 31 done.
🔄 Processing chunk 32 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:40<00:00,  2.44it/s]


✅ Chunk 32 done.
🔄 Processing chunk 33 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.83it/s]


✅ Chunk 33 done.
🔄 Processing chunk 34 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.99it/s]


✅ Chunk 34 done.
🔄 Processing chunk 35 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.93it/s]


✅ Chunk 35 done.
🔄 Processing chunk 36 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.95it/s]


✅ Chunk 36 done.
🔄 Processing chunk 37 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


✅ Chunk 37 done.
🔄 Processing chunk 38 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.92it/s]


✅ Chunk 38 done.
🔄 Processing chunk 39 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.97it/s]


✅ Chunk 39 done.
🔄 Processing chunk 40 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.92it/s]


✅ Chunk 40 done.
🔄 Processing chunk 41 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  3.00it/s]


✅ Chunk 41 done.
🔄 Processing chunk 42 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


✅ Chunk 42 done.
🔄 Processing chunk 43 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:36<00:00,  2.76it/s]


✅ Chunk 43 done.
🔄 Processing chunk 44 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


✅ Chunk 44 done.
🔄 Processing chunk 45 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s]


✅ Chunk 45 done.
🔄 Processing chunk 46 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s]


✅ Chunk 46 done.
🔄 Processing chunk 47 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.81it/s]


✅ Chunk 47 done.
🔄 Processing chunk 48 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.88it/s]


✅ Chunk 48 done.
🔄 Processing chunk 49 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


✅ Chunk 49 done.
🔄 Processing chunk 50 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:39<00:00,  2.56it/s]


✅ Chunk 50 done.
🔄 Processing chunk 51 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:37<00:00,  2.63it/s]


✅ Chunk 51 done.
🔄 Processing chunk 52 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.79it/s]


✅ Chunk 52 done.
🔄 Processing chunk 53 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.84it/s]


✅ Chunk 53 done.
🔄 Processing chunk 54 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.96it/s]


✅ Chunk 54 done.
🔄 Processing chunk 55 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


✅ Chunk 55 done.
🔄 Processing chunk 56 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


✅ Chunk 56 done.
🔄 Processing chunk 57 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.92it/s]


✅ Chunk 57 done.
🔄 Processing chunk 58 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


✅ Chunk 58 done.
🔄 Processing chunk 59 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.96it/s]


✅ Chunk 59 done.
🔄 Processing chunk 60 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


✅ Chunk 60 done.
🔄 Processing chunk 61 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.89it/s]


✅ Chunk 61 done.
🔄 Processing chunk 62 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.90it/s]


✅ Chunk 62 done.
🔄 Processing chunk 63 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


✅ Chunk 63 done.
🔄 Processing chunk 64 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:47<00:00,  2.10it/s]


✅ Chunk 64 done.
🔄 Processing chunk 65 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:39<00:00,  2.56it/s]


✅ Chunk 65 done.
🔄 Processing chunk 66 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:38<00:00,  2.63it/s]


✅ Chunk 66 done.
🔄 Processing chunk 67 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.90it/s]


✅ Chunk 67 done.
🔄 Processing chunk 68 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.93it/s]


✅ Chunk 68 done.
🔄 Processing chunk 69 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


✅ Chunk 69 done.
🔄 Processing chunk 70 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:37<00:00,  2.68it/s]


✅ Chunk 70 done.
🔄 Processing chunk 71 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:33<00:00,  2.97it/s]


✅ Chunk 71 done.
🔄 Processing chunk 72 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.83it/s]


✅ Chunk 72 done.
🔄 Processing chunk 73 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:34<00:00,  2.90it/s]


✅ Chunk 73 done.
🔄 Processing chunk 74 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:37<00:00,  2.69it/s]


✅ Chunk 74 done.
🔄 Processing chunk 75 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:37<00:00,  2.67it/s]


✅ Chunk 75 done.
🔄 Processing chunk 76 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.82it/s]


✅ Chunk 76 done.
🔄 Processing chunk 77 of 78...


🖼 Generating captions: 100%|██████████| 100/100 [00:35<00:00,  2.83it/s]


✅ Chunk 77 done.
🔄 Processing chunk 78 of 78...


🖼 Generating captions: 100%|██████████| 36/36 [00:12<00:00,  2.79it/s]


✅ Chunk 78 done.
✅ Captions saved to file: /content/drive/MyDrive/ColabNotebooks_/dataset_thesis/labels/dataset_with_generated_captions_git_20000.csv


In [None]:
# Install OpenAI CLIP straight from its GitHub repository.
# The commented-out installs below are currently disabled.
# !pip install transformers torch torchvision
# !pip install ftfy  # for correct text handling
# !pip install git+https://github.com/rmokady/CLIP_prefix_caption.git
!pip install git+https://github.com/openai/CLIP.git



Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-nmrupw96
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-nmrupw96
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->clip==1.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-

In [None]:
# Install the Hugging Face transformers package (the ftfy install is disabled here).
# !pip install ftfy
!pip install transformers


Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Installing collected packages: transformers
Successfully installed transformers-4.51.3


In [None]:
# Upgrade transformers to the newest release pip can find.
!pip install --upgrade transformers




In [None]:
# NOTE(review): identical to the preceding cell's upgrade command; likely redundant.
!pip install --upgrade transformers




In [None]:
# ftfy is imported by the GIT captioning cell, which passes each caption through ftfy.fix_text.
!pip install ftfy


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
import torch
from PIL import Image, UnidentifiedImageError
import ftfy
from tqdm import tqdm
import concurrent.futures
import pandas as pd
import os

# Placeholder locations: replace them with the real paths before running.
# Input CSV; the code below reads its "unique_id" column.
csv_path = "path/to/your/dataset.csv"
# Directory searched for "<unique_id>.jpg" image files.
image_folder = "path/to/images"
# Destination of the CSV that gains the "generated_caption" column.
output_csv_path = "path/to/output.csv"

# Load the GIT processor and causal language model from the "microsoft/git-base" checkpoint.
git_processor = AutoProcessor.from_pretrained("microsoft/git-base")
git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
git_model.to(device)

print(f"Using device: {device}")

# Load the whole dataset table into memory.
df = pd.read_csv(csv_path)

def is_valid_image(unique_id):
    """Tell whether the JPEG belonging to ``unique_id`` exists and passes verification.

    The file is looked up as ``<image_folder>/<unique_id>.jpg`` (``image_folder``
    is the module-level setting of this cell).

    Args:
        unique_id: Identifier used to build the image file name.

    Returns:
        True when the file exists and ``Image.verify()`` raises nothing;
        False when the file is missing or PIL raises
        ``UnidentifiedImageError`` / ``OSError`` while opening or verifying it.
    """
    image_path = os.path.join(image_folder, f"{unique_id}.jpg")
    if not os.path.exists(image_path):
        return False
    try:
        # Open inside a context manager so the file handle is released right
        # away; this check runs once per CSV row and an unclosed handle per
        # file would otherwise pile up until GC.
        with Image.open(image_path) as img:
            img.verify()
        return True
    except (UnidentifiedImageError, OSError):
        return False

def generate_caption(image_path):
    """Caption one image with the GIT model loaded in this cell.

    Relies on the module-level ``git_processor``, ``git_model`` and ``device``
    objects.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The first decoded caption after ``ftfy.fix_text`` clean-up. If any step
        fails, the error is printed and an empty string is returned so that a
        single bad file does not abort the whole batch.
    """
    try:
        # convert() hands back a separate, fully loaded image, so the source
        # file can be closed immediately instead of staying open until GC.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        inputs = git_processor(images=image, return_tensors="pt").to(device)
        pixel_values = inputs.pixel_values
        # Inference only; the explicit no_grad mirrors the ViT-GPT2 cell.
        with torch.no_grad():
            generated_ids = git_model.generate(pixel_values=pixel_values, max_length=50)
        # A single image goes in, so only the first decoded sequence is used.
        generated_caption = git_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return ftfy.fix_text(generated_caption)
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"Error processing {image_path}: {e}")
        return ""

def process_images_parallel(image_paths, max_workers=8):
    """Caption a batch of images using a pool of worker threads.

    Every path is passed to ``generate_caption``; a tqdm bar labelled
    "Generating captions" tracks progress.

    Args:
        image_paths: Sequence of image paths (``len()`` is used for the
            progress-bar total).
        max_workers: Size of the thread pool. Defaults to 8, the value that
            was previously hard-coded.

    Returns:
        A list of captions in the same order as ``image_paths``
        (``executor.map`` yields results in input order).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map iterator while the pool is still alive.
        return list(
            tqdm(
                executor.map(generate_caption, image_paths),
                total=len(image_paths),
                desc="Generating captions",
            )
        )

# Record the *positions* of rows whose image verifies instead of collecting the
# row objects from iterrows(): iterrows() returns each row as a Series and does
# not preserve per-column dtypes, so rebuilding a frame from those rows can
# change dtypes. Rebuilding from an empty list would also yield a frame with no
# columns, which makes the later valid_df["unique_id"] lookup fail.
valid_positions = []
for pos, uid in enumerate(tqdm(df["unique_id"], total=len(df), desc="Checking valid images")):
    if is_valid_image(uid):
        valid_positions.append(pos)
    # Stop once 20000 usable images have been found.
    if len(valid_positions) >= 20000:
        break

# Positional selection keeps the original columns and dtypes (even when no row
# qualifies); the index is then renumbered from 0.
valid_df = df.iloc[valid_positions].reset_index(drop=True)

# One "<image_folder>/<unique_id>.jpg" path per validated row, in row order.
image_paths = []
for uid in valid_df["unique_id"]:
    image_paths.append(os.path.join(image_folder, f"{uid}.jpg"))

chunk_size = 100
# Ceiling division: number of chunks needed to cover every path.
total_chunks = -(-len(image_paths) // chunk_size)
print(f"Total chunks: {total_chunks}")

# Caption the images chunk by chunk, accumulating results in row order.
captions = []
for i in range(0, len(image_paths), chunk_size):
    captions += process_images_parallel(image_paths[i:i + chunk_size])
    print(f"Processed chunk {i // chunk_size + 1}/{total_chunks}")

# Attach the captions as a new column and write the result to disk.
valid_df["generated_caption"] = captions
valid_df.to_csv(output_csv_path, index=False)
print("Captions generated and saved.")


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

🚀 Using device: cuda


🔍 Checking valid images: 100%|██████████| 40945/40945 [01:39<00:00, 413.00it/s]


Total chunks: 78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.18it/s]


Processed chunk 1/78


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.99it/s]


Processed chunk 2/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Processed chunk 3/78


🖼 Generating captions: 100%|██████████| 100/100 [00:27<00:00,  3.60it/s]


Processed chunk 4/78


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


Processed chunk 5/78


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.97it/s]


Processed chunk 6/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.06it/s]


Processed chunk 7/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.01it/s]


Processed chunk 8/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


Processed chunk 9/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.97it/s]


Processed chunk 10/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.04it/s]


Processed chunk 11/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.01it/s]


Processed chunk 12/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.27it/s]


Processed chunk 13/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.40it/s]


Processed chunk 14/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.49it/s]


Processed chunk 15/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.14it/s]


Processed chunk 16/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.28it/s]


Processed chunk 17/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.19it/s]


Processed chunk 18/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.44it/s]


Processed chunk 19/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


Processed chunk 20/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.93it/s]


Processed chunk 21/78


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


Processed chunk 22/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.18it/s]


Processed chunk 23/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.35it/s]


Processed chunk 24/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.46it/s]


Processed chunk 25/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.15it/s]


Processed chunk 26/78


🖼 Generating captions: 100%|██████████| 100/100 [00:26<00:00,  3.76it/s]


Processed chunk 27/78


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.15it/s]


Processed chunk 28/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.14it/s]


Processed chunk 29/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.06it/s]


Processed chunk 30/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.44it/s]


Processed chunk 31/78


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.99it/s]


Processed chunk 32/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.50it/s]


Processed chunk 33/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.33it/s]


Processed chunk 34/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.35it/s]


Processed chunk 35/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.35it/s]


Processed chunk 36/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.41it/s]


Processed chunk 37/78


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.13it/s]


Processed chunk 38/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.14it/s]


Processed chunk 39/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


Processed chunk 40/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.98it/s]


Processed chunk 41/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.01it/s]


Processed chunk 42/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.49it/s]


Processed chunk 43/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.17it/s]


Processed chunk 44/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.34it/s]


Processed chunk 45/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.28it/s]


Processed chunk 46/78


🖼 Generating captions: 100%|██████████| 100/100 [00:18<00:00,  5.48it/s]


Processed chunk 47/78


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


Processed chunk 48/78


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.15it/s]


Processed chunk 49/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.42it/s]


Processed chunk 50/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.54it/s]


Processed chunk 51/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.45it/s]


Processed chunk 52/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.13it/s]


Processed chunk 53/78


🖼 Generating captions: 100%|██████████| 100/100 [00:40<00:00,  2.50it/s]


Processed chunk 54/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.11it/s]


Processed chunk 55/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


Processed chunk 56/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.95it/s]


Processed chunk 57/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


Processed chunk 58/78


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.70it/s]


Processed chunk 59/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.46it/s]


Processed chunk 60/78


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.90it/s]


Processed chunk 61/78


🖼 Generating captions: 100%|██████████| 100/100 [00:25<00:00,  3.85it/s]


Processed chunk 62/78


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.15it/s]


Processed chunk 63/78


🖼 Generating captions: 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]


Processed chunk 64/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Processed chunk 65/78


🖼 Generating captions: 100%|██████████| 100/100 [00:21<00:00,  4.68it/s]


Processed chunk 66/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]


Processed chunk 67/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.98it/s]


Processed chunk 68/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.28it/s]


Processed chunk 69/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.48it/s]


Processed chunk 70/78


🖼 Generating captions: 100%|██████████| 100/100 [00:24<00:00,  4.06it/s]


Processed chunk 71/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.99it/s]


Processed chunk 72/78


🖼 Generating captions: 100%|██████████| 100/100 [00:19<00:00,  5.11it/s]


Processed chunk 73/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.32it/s]


Processed chunk 74/78


🖼 Generating captions: 100%|██████████| 100/100 [00:22<00:00,  4.49it/s]


Processed chunk 75/78


🖼 Generating captions: 100%|██████████| 100/100 [00:20<00:00,  4.90it/s]


Processed chunk 76/78


🖼 Generating captions: 100%|██████████| 100/100 [00:23<00:00,  4.30it/s]


Processed chunk 77/78


🖼 Generating captions: 100%|██████████| 36/36 [00:08<00:00,  4.10it/s]


Processed chunk 78/78
✅ Captions generated and saved.
