In [1]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

## blip-image-captioning-base

In [3]:
# Load the preprocessing pipeline and the captioning model from one checkpoint
# name so the two can never drift apart.
BASE_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BASE_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)

In [5]:
def create_captions(image_path, num_captions=3, num_beams=None):
    """Print beam-search captions for one image.

    Uses the notebook-level ``processor`` and ``model`` objects. They are
    looked up at call time, so whichever checkpoint was loaded most recently
    is the one that produces the captions.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.
        num_captions: Number of candidate captions to generate and print.
        num_beams: Beam width used for generation. Defaults to
            ``num_captions`` and must not be smaller than it.

    Returns:
        None. Each caption is printed as ``Caption <n>: <text>``.

    Raises:
        ValueError: If ``num_captions`` is less than 1, or if ``num_beams``
            is smaller than ``num_captions``.
    """
    if num_beams is None:
        num_beams = num_captions
    # Validate up front so a bad request fails before any image or model work.
    if num_captions < 1:
        raise ValueError(f"num_captions must be >= 1, got {num_captions}")
    if num_beams < num_captions:
        raise ValueError(
            f"num_beams ({num_beams}) must be >= num_captions ({num_captions})"
        )

    # Image.open keeps the underlying file open; convert() returns a new,
    # fully loaded image, so the file can be closed as soon as we leave the
    # `with` block.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert(mode="RGB")

    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(
        **inputs,
        num_beams=num_beams,
        num_return_sequences=num_captions,
    )

    for i, caption in enumerate(out, start=1):
        print(f"Caption {i}: {processor.decode(caption, skip_special_tokens=True)}")

In [7]:
# Caption the first sample image with the processor/model currently loaded.
create_captions(image_path='Image1.png')

Caption 1: a soccer player running on a soccer field
Caption 2: a soccer player is running on the field
Caption 3: a soccer player running on a field with a ball in his hand


In [8]:
# Caption the second sample image with the processor/model currently loaded.
create_captions(image_path='Image2.png')

Caption 1: two horses standing in a field under a cloudy sky
Caption 2: a couple of horses standing in a field
Caption 3: two horses are standing in a field under a cloudy sky


In [9]:
# Caption the third sample image with the processor/model currently loaded.
create_captions(image_path='Image3.png')

Caption 1: an image of a group of people in a circle
Caption 2: an image of a group of people in a circle with the words,'oraly adopt '
Caption 3: an image of a group of people in a circle with the words,'oraly adopts


## blip-image-captioning-large

In [2]:
# Rebind `processor` and `model` to the large checkpoint; both are loaded from
# the same name so they stay in sync.
LARGE_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(LARGE_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(LARGE_CHECKPOINT)

In [3]:
def create_captions(image_path, num_captions=3, num_beams=None):
    """Print beam-search captions for one image.

    Uses the notebook-level ``processor`` and ``model`` objects. They are
    looked up at call time, so whichever checkpoint was loaded most recently
    is the one that produces the captions.

    Args:
        image_path: Location of the image; passed straight to ``Image.open``.
        num_captions: Number of candidate captions to generate and print.
        num_beams: Beam width used for generation. Defaults to
            ``num_captions`` and must not be smaller than it.

    Returns:
        None. Each caption is printed as ``Caption <n>: <text>``.

    Raises:
        ValueError: If ``num_captions`` is less than 1, or if ``num_beams``
            is smaller than ``num_captions``.
    """
    if num_beams is None:
        num_beams = num_captions
    # Validate up front so a bad request fails before any image or model work.
    if num_captions < 1:
        raise ValueError(f"num_captions must be >= 1, got {num_captions}")
    if num_beams < num_captions:
        raise ValueError(
            f"num_beams ({num_beams}) must be >= num_captions ({num_captions})"
        )

    # Image.open keeps the underlying file open; convert() returns a new,
    # fully loaded image, so the file can be closed as soon as we leave the
    # `with` block.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert(mode="RGB")

    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(
        **inputs,
        num_beams=num_beams,
        num_return_sequences=num_captions,
    )

    for i, caption in enumerate(out, start=1):
        print(f"Caption {i}: {processor.decode(caption, skip_special_tokens=True)}")

In [5]:
# Caption the first sample image with the processor/model currently loaded.
create_captions(image_path='Image1.png')

Caption 1: there is a man that is running on the field with a soccer ball
Caption 2: there is a man that is running with a soccer ball in his hand
Caption 3: there is a male soccer player that is running on the field


In [6]:
# Caption the second sample image with the processor/model currently loaded.
create_captions(image_path='Image2.png')

Caption 1: there are two horses that are standing together in the field
Caption 2: there are two horses that are standing in a field together
Caption 3: there are two horses that are standing in the grass together


In [7]:
# NOTE(review): this repeats the earlier 'Image1.png' call in this section and
# prints the same captions. The base-model section captions Image1, Image2 and
# Image3, so 'Image3.png' was probably intended here — confirm and re-run.
create_captions(image_path='Image1.png')

Caption 1: there is a man that is running on the field with a soccer ball
Caption 2: there is a man that is running with a soccer ball in his hand
Caption 3: there is a male soccer player that is running on the field
