In [1]:
import io

import requests
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel
from transformers import pipeline, set_seed

In [2]:
# Load the three components needed for captioning: image processor,
# tokenizer, and the vision encoder-decoder model.
# NOTE(review): an earlier revision of this cell loaded
# ./models/vit-gpt2-japanese-image-captioning_stair-captions/checkpoint-613500/
# instead; the captured output below still names that path, so it was
# presumably produced before the switch -- re-run the cell to refresh it.
image_processor = ViTImageProcessor.from_pretrained(
    "google/vit-base-patch16-224-in21k"
)
tokenizer = AutoTokenizer.from_pretrained(
    "rinna/japanese-gpt2-medium",
    use_fast=False,
)
# Forced manually because of a bug in tokenizer config loading.
tokenizer.do_lower_case = True
model = VisionEncoderDecoderModel.from_pretrained(
    "./models/vit-gpt2-japanese-image-captioning_stair-captions-result"
)

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at ./models/vit-gpt2-japanese-image-captioning_stair-captions/checkpoint-613500/ and are newly initialized: ['decoder.transformer.h.23.attn.masked_bias', 'decoder.transformer.h.10.attn.bias', 'decoder.transformer.h.11.attn.bias', 'decoder.transformer.h.16.attn.masked_bias', 'decoder.transformer.h.7.attn.masked_bias', 'decoder.transformer.h.13.attn.masked_bias', 'decoder.transformer.h.11.crossattention.masked_bias', 'decoder.transformer.h.4.crossattention.bias', 'decoder.transformer.h.1.crossattention.bias', 'decoder.transformer.h.3.crossattention.masked_bias', 'decoder.transformer.h.5.crossattention.bias', 'decoder.transformer.h.21.attn.masked_bias', 'decoder.transformer.h.14.crossattention.masked_bias', 'decoder.transformer.h.5.crossattention.masked_bias', 'decoder.transformer.h.5.attn.bias', 'decoder.transformer.h.4.crossattention.masked_bias', 'decoder.transformer.h.16.attn.bias', 'decoder.trans

In [3]:
# Run inference on a sample image fetched over HTTP.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Download the full payload with a timeout and fail loudly on HTTP errors,
# instead of handing PIL a raw socket stream that is never closed and that
# would surface a 404/500 page as an opaque "cannot identify image" error.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Force RGB so grayscale / RGBA / palette images are handled uniformly
# by the image processor.
image = Image.open(io.BytesIO(response.content)).convert("RGB")
pixel_values = image_processor(image, return_tensors="pt").pixel_values

# do_sample=True makes generation stochastic; fix the seed so that
# Restart & Run All reproduces the same captions.
set_seed(42)

# Autoregressively generate captions. With num_beams > 1 and do_sample=True
# this is beam-search sampling (NOT greedy decoding); the five best
# sequences are returned.
generated_ids = model.generate(
    pixel_values,
    max_new_tokens=30,
    num_beams=5,
    early_stopping=True,
    do_sample=True,
    temperature=1.2,
    top_k=50,
    top_p=0.95,
    no_repeat_ngram_size=3,
    num_return_sequences=5,
)
# Decode token ids back to text, dropping special tokens, and print one caption per line.
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
for text in generated_texts:
    print(text)



ピンクのクッションの上で二匹の猫がじゃれあっている
ピンクのクッションの上で二匹の猫がじゃれている
ピンクの布の上に二匹の猫が寝ている
ピンクのクッションの上で二匹の猫がじゃれている
ピンクのクッションの上で2匹の猫がじゃれている


In [4]:
# Write the in-memory model out to the local result directory
# (the same location the loading cell reads from, apart from the trailing slash).
model.save_pretrained(
    save_directory="./models/vit-gpt2-japanese-image-captioning_stair-captions-result/"
)

In [5]:
# Bundle the already-loaded model, tokenizer and image processor into a
# single image-to-text pipeline object.
pl = pipeline(
    task="image-to-text",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=image_processor,
)

In [6]:
# Caption the image at `url` through the pipeline; the bare expression
# makes the returned list of {'generated_text': ...} dicts the cell output.
pl(url)



[{'generated_text': 'ピンクの布の上に二匹の猫がいる'}]

In [7]:
# Caption a local photo, given as a path relative to the current working directory.
pl("./photos/006.jpg")

[{'generated_text': 'たくさんの人が机に向かって作業をしている'}]