In [1]:
import requests
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel
from transformers import pipeline

In [2]:
# Load the captioning model, its tokenizer and the image processor.
checkpoint_dir = "./models/vit-gpt2-japanese-image-captioning_stair-captions/checkpoint-82500/"
tokenizer_name = "rinna/japanese-gpt2-medium"
image_processor_name = "google/vit-base-patch16-224-in21k"

# Vision encoder-decoder weights come from the local fine-tuning checkpoint.
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_dir)

# Slow (non-fast) tokenizer is requested explicitly via use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False)
# Workaround: do_lower_case is forced on by hand because the tokenizer config
# loading does not set it correctly.
tokenizer.do_lower_case = True

image_processor = ViTImageProcessor.from_pretrained(image_processor_name)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [7]:
# Run caption inference on a single sample image.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Download with a timeout and an explicit status check so that a stalled
# connection or an HTTP error page fails here rather than inside PIL.
# The context manager releases the streamed connection once decoding is done.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert() fully decodes the image while the stream is still open and
    # normalises it to RGB (assumes the image processor expects 3-channel
    # input -- TODO confirm for this checkpoint).
    image = Image.open(response.raw).convert("RGB")

pixel_values = image_processor(image, return_tensors="pt").pixel_values

# Autoregressively generate captions.
# NOTE: this is NOT greedy decoding. num_beams=5 combined with do_sample=True
# selects beam-search multinomial sampling, and num_return_sequences=5 returns
# five candidate captions. No random seed is set in this cell, so the captions
# can differ from run to run.
generated_ids = model.generate(
    pixel_values,
    max_new_tokens=30,
    num_beams=5,
    early_stopping=True,
    do_sample=True,
    temperature=1.2,
    top_k=50,
    top_p=0.95,
    no_repeat_ngram_size=3,
    num_return_sequences=5,
)

# Decode token ids back to text, dropping special tokens, and show one caption per line.
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
for text in generated_texts:
    print(text)

ピンク色のクッションの上に2匹の猫がいる
ピンクのベッドの上に猫が2匹寝ている
ピンクのクッションの上に猫が二匹いる
猫が二匹寄り添って寝転んでいる
ピンクのクッションの上に2匹の猫がいる


In [4]:
# Write the loaded model out to a separate results directory.
model_output_dir = "./models/vit-gpt2-japanese-image-captioning_stair-captions-result/"
model.save_pretrained(model_output_dir)

In [5]:
# Bundle the model, tokenizer and image processor into one image-to-text pipeline.
# The processor is passed through `image_processor`; `feature_extractor` is the
# legacy keyword for image processors and is only kept for backward compatibility.
pipe = pipeline(
    "image-to-text",
    model=model,
    tokenizer=tokenizer,
    image_processor=image_processor,
)

In [11]:
# Save the assembled pipeline to its own directory.
pipeline_output_dir = "./models/vit-gpt2-japanese-image-captioning_stair-captions-result-pipeline/"
pipe.save_pretrained(pipeline_output_dir)