# Download the dataset

In [None]:
import os
import zipfile
import requests

def download_and_extract_flickr8k(destination_folder):
    url = "https://www.kaggle.com/api/v1/datasets/download/adityajn105/flickr8k"
    zip_path = os.path.join(destination_folder, "flickr8k.zip")

    # Download the dataset
    print("Downloading Flickr8k dataset...")
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(zip_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        print("Download complete.")
    else:
        print(f"Failed to download dataset. Status code: {response.status_code}")
        return

    # Extract the dataset
    print("Extracting Flickr8k dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(destination_folder)
    print("Extraction complete.")

    # Remove the zip file
    os.remove(zip_path)
    print("Cleaned up zip file.")

# Example usage
destination = os.path.expanduser("~/data")
download_and_extract_flickr8k(destination)

In [30]:
from PIL import Image
import pandas as pd
import os
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xxl")
model     = Blip2ForConditionalGeneration.from_pretrained("Salesforce/instructblip-flan-t5-xxl")

# Path to the images folder
base_dir = os.getcwd()
data_folder = os.path.join(base_dir, "data")
images_folder = os.path.join(base_dir, "data", "Images")

print(f"Images folder: {images_folder}")

# Ensure the directories exist
if not os.path.exists(images_folder):
    raise FileNotFoundError(f"Images folder not found at {images_folder}")


destination = os.path.join(data_folder, "output")
os.makedirs(destination, exist_ok=True)


# List to store image filenames and their captions
captions_data = []

prompt = "Describe this image in rich detail (≈80 tokens):"


# Generate captions for each image
print("Generating captions for images...")
for image_file in os.listdir(images_folder):
    if image_file.endswith(".jpg"):
        image_path = os.path.join(images_folder, image_file)
        image = Image.open(image_path).convert("RGB")
      
        
        inputs = processor(images=image, text=prompt, return_tensors="pt")
        outputs = model.generate(max_new_tokens=150, do_sample=True, top_p=0.95)
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        captions_data.append({"Image": image_file, "Caption": caption})
        print(f"Processed {image_file}: {caption}")
# Save the captions to a CSV file
captions_csv_path = os.path.join(destination, "captions.csv")
captions_df = pd.DataFrame(captions_data)
captions_df.to_csv(captions_csv_path, index=False)
print(f"Captions saved to {captions_csv_path}")

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

You are using a model of type instructblip to instantiate a model of type blip-2. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json:   0%|          | 0.00/135k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/9.42G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# List to store image filenames and their captions
captions_data = []

prompt = "Describe this image in rich detail (≈80 tokens):"


# Generate captions for each image
print("Generating captions for images...")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

for image_file in os.listdir(images_folder):
    if image_file.endswith(".jpg"):
        image_path = os.path.join(images_folder, image_file)
        image = Image.open(image_path).convert("RGB")
      
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
        outputs = model.generate(pixel_values=inputs.pixel_values, max_new_tokens=150, do_sample=True, top_p=0.95)
        caption = processor.decode(outputs[0], skip_special_tokens=True)
        captions_data.append({"Image": image_file, "Caption": caption})
        print(f"Processed {image_file}: {caption}")
captions_csv_path = os.path.join(destination, "captions.csv")
captions_df = pd.DataFrame(captions_data)
captions_df.to_csv(captions_csv_path, index=False)
print(f"Captions saved to {captions_csv_path}")