In [2]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

In [3]:
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

Downloading (…)lve/main/config.json: 100%|██████████| 4.61k/4.61k [00:00<00:00, 13.1MB/s]
Downloading pytorch_model.bin: 100%|██████████| 982M/982M [00:03<00:00, 276MB/s] 
Downloading (…)rocessor_config.json: 100%|██████████| 228/228 [00:00<00:00, 1.03MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 241/241 [00:00<00:00, 747kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 47.3MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 710kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 134MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 120/120 [00:00<00:00, 518kB/s]


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [5]:
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds

In [10]:
image = Image.open('Image1.png')

In [12]:
image.mode

'RGBA'

In [13]:
image = image.convert(mode="RGB")

In [14]:
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

In [18]:
pixel_values = pixel_values.to(device)

In [24]:
output_ids = model.generate(pixel_values)
output_ids



tensor([[50256,    64,   582,   287,   257, 11783,  8187, 17997,   257, 11783,
          2613,   220, 50256]])

In [28]:
tokenizer.batch_decode(output_ids, skip_special_tokens=True)

['a man in a soccer uniform kicking a soccer ball ']

In [9]:
predict_step(['Image1.png'])

['a man kicking a soccer ball on a field']