In [2]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import requests
from io import BytesIO
import os

## blip-image-captioning-base

In [4]:
# Show the kernel's current working directory (IPython automagic form of %pwd).
pwd

'/workspaces/sample-image-captioning'

In [3]:
# Load the base BLIP captioning checkpoint, downloading it into a local cache
# on first use. The cache directory is relative to the notebook's working
# directory so the notebook is not tied to one machine's absolute path.
BASE_MODEL_ID = "Salesforce/blip-image-captioning-base"
CACHE_DIR = "cache"

processor = BlipProcessor.from_pretrained(BASE_MODEL_ID, cache_dir=CACHE_DIR)
model = BlipForConditionalGeneration.from_pretrained(BASE_MODEL_ID, cache_dir=CACHE_DIR)

Downloading (…)rocessor_config.json: 100%|██████████| 287/287 [00:00<00:00, 687kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 438/438 [00:00<00:00, 1.92MB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 79.6MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.68MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 525kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 4.56k/4.56k [00:00<00:00, 3.84MB/s]
Downloading pytorch_model.bin: 100%|██████████| 990M/990M [00:05<00:00, 192MB/s] 


In [44]:
# Reload the base checkpoint strictly from the local cache (no network access).
# The cached copy is addressed by hub model id + cache_dir: from_pretrained
# locates the snapshot inside the cache itself, and no OS-specific path
# separators or backslash escapes are involved.
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    cache_dir="cache",
    local_files_only=True,
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    cache_dir="cache",
    local_files_only=True,
)

In [54]:
def create_captions(image_path):
    """Print three beam-search captions for an image.

    Args:
        image_path: Path of an existing local image file. Anything that is
            not an existing file is treated as a URL and downloaded.

    Returns:
        None. Captions are printed as ``Caption N: ...``. If the image cannot
        be downloaded or decoded from the URL, a message is printed and the
        function returns without generating captions.
    """
    if os.path.isfile(image_path):
        raw_image = Image.open(image_path)
    else:
        # Not an existing file: fall back to treating the argument as a URL.
        try:
            # Bound the request so an unresponsive host cannot hang the cell,
            # and treat HTTP error statuses as failures rather than trying to
            # decode an error page as an image.
            response = requests.get(image_path, timeout=30)
            response.raise_for_status()
            raw_image = Image.open(BytesIO(response.content))
        except (requests.exceptions.RequestException, OSError) as exc:
            # Without an image there is nothing to caption; stop here instead
            # of falling through with `raw_image` unbound.
            print(f"Invalid url or file path: {exc}")
            return

    # Normalize to RGB before handing the image to the processor.
    if raw_image.mode != "RGB":
        raw_image = raw_image.convert(mode="RGB")

    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs, num_beams=3, num_return_sequences=3, max_new_tokens=40)

    for i, caption in enumerate(out, start=1):
        print(f"Caption {i}: {processor.decode(caption, skip_special_tokens=True)}")

In [48]:
# Caption an image hosted on the web. The string is not an existing local file,
# so the create_captions defined in this section downloads it over HTTP.
url = "https://th.bing.com/th/id/R.c378c27d8ea14c9912d8a3bfa43af8d5?rik=0utUgRzIevYDuQ&riu=http%3a%2f%2fbuzznigeria.com%2fwp-content%2fuploads%2f2015%2f07%2fOld-People-Doing-Sport-Photography_9.jpg&ehk=fjLqLr4hXha1AWFAKFUG4pL70aauzLsXbKzSOfW8xV0%3d&risl=&pid=ImgRaw&r=0"
create_captions(url)

Caption 1: an older woman doing a barbell exercise
Caption 2: an older woman is doing a barbell exercise
Caption 3: an older woman doing a barbell squat exercise


In [55]:
# Caption the first local sample image.
create_captions("Image1.png")

Caption 1: a soccer player running on a soccer field
Caption 2: a soccer player is running on the field
Caption 3: a soccer player running on a field with a ball in his hand


In [56]:
# Caption the second local sample image.
create_captions("Image2.png")

Caption 1: two horses standing in a field under a cloudy sky
Caption 2: a couple of horses standing in a field
Caption 3: two horses are standing in a field under a cloudy sky


In [57]:
# Caption the third local sample image.
create_captions("Image3.png")

Caption 1: an image of a group of people in a circle
Caption 2: an image of a group of people in a circle with the words,'oraly adopt'and '
Caption 3: an image of a group of people in a circle with the words,'oraly adopts'and


## blip-image-captioning-large

In [2]:
# Switch to the larger BLIP captioning checkpoint. This rebinds the notebook's
# `processor` and `model` globals, so every later call uses the large model.
large_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(large_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(large_checkpoint)

In [3]:
def create_captions(image_path):
    """Print three beam-search captions for an image.

    This definition replaces any earlier ``create_captions`` in the kernel, so
    it accepts the same inputs: a local file path or, failing that, a URL.

    Args:
        image_path: Path of an existing local image file. Anything that is
            not an existing file is treated as a URL and downloaded.

    Returns:
        None. Captions are printed as ``Caption N: ...``. If the image cannot
        be downloaded or decoded from the URL, a message is printed and the
        function returns without generating captions.
    """
    if os.path.isfile(image_path):
        raw_image = Image.open(image_path)
    else:
        # Not an existing file: fall back to treating the argument as a URL.
        try:
            # Bound the request so an unresponsive host cannot hang the cell,
            # and treat HTTP error statuses as failures.
            response = requests.get(image_path, timeout=30)
            response.raise_for_status()
            raw_image = Image.open(BytesIO(response.content))
        except (requests.exceptions.RequestException, OSError) as exc:
            print(f"Invalid url or file path: {exc}")
            return

    # Normalize to RGB before handing the image to the processor.
    if raw_image.mode != "RGB":
        raw_image = raw_image.convert(mode="RGB")

    inputs = processor(raw_image, return_tensors="pt")
    # No max_new_tokens is passed here, so the output length is governed by
    # the model's own generation defaults.
    out = model.generate(**inputs, num_beams=3, num_return_sequences=3)

    for i, caption in enumerate(out, start=1):
        print(f"Caption {i}: {processor.decode(caption, skip_special_tokens=True)}")

In [5]:
# Caption the first local sample image.
create_captions("Image1.png")

Caption 1: there is a man that is running on the field with a soccer ball
Caption 2: there is a man that is running with a soccer ball in his hand
Caption 3: there is a male soccer player that is running on the field


In [6]:
# Caption the second local sample image.
create_captions("Image2.png")

Caption 1: there are two horses that are standing together in the field
Caption 2: there are two horses that are standing in a field together
Caption 3: there are two horses that are standing in the grass together


In [7]:
# NOTE(review): this repeats the Image1.png call made earlier in this section;
# the base-model section captioned Image3.png third. Confirm whether
# Image3.png was intended here.
create_captions("Image1.png")

Caption 1: there is a man that is running on the field with a soccer ball
Caption 2: there is a man that is running with a soccer ball in his hand
Caption 3: there is a male soccer player that is running on the field
