Learning Objectives
- OpenAI GPT-4V 모델을 이용해서 비정형 이미지 데이터로부터 정형 필드를 파싱해보기

In [None]:
%pip install llama-index-multi-modal-llms-openai
%pip install llama-index-multi-modal-llms-replicate

In [None]:
import os

# Keep an API key that is already exported in the environment; only fall back
# to the placeholder (replace it with a real key) when none is configured.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = '<YOUR_API_KEY>'

In [None]:
from pathlib import Path

# Folder that holds the downloaded restaurant image.
input_image_path = Path("restaurant_images")
# exist_ok=True makes re-running the cell safe without a separate,
# race-prone existence check.
input_image_path.mkdir(exist_ok=True)

In [None]:
!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./restaurant_images/fried_chicken.png

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# Open the downloaded picture and convert it to RGB mode.
imageUrl = "./restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")

# Preview the picture on a wide 16x5-inch figure.
fig, ax = plt.subplots(figsize=(16, 5))
ax.imshow(image)

In [None]:
from pydantic import BaseModel

class Restaurant(BaseModel):
    """Data model for a restaurant."""

    # Fields to parse out of the image. None of them has a default, so each
    # one is a required string in the parsed result.
    restaurant: str
    food: str
    discount: str
    price: str
    rating: str
    review: str

In [None]:
!pip install llama-index-readers-file

In [None]:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader

# Load the contents of the restaurant image folder as documents.
restaurant_reader = SimpleDirectoryReader(input_dir="./restaurant_images")
image_documents = restaurant_reader.load_data()

# Multi-modal OpenAI client; the later extraction programs in this notebook
# reuse this same instance.
# NOTE(review): "gpt-4-vision-preview" is a preview model name - confirm it is
# still available before running.
openai_mm_llm = OpenAIMultiModal(
    max_new_tokens=1000,
    model="gpt-4-vision-preview",
)

In [None]:
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser

# Parsing instruction passed to the program as its prompt template.
prompt_template_str = (
    "    can you summarize what is in the image"
    "    and return the answer with json format "
)

# Completion program wired to the restaurant images, with a Pydantic output
# parser targeting the Restaurant model.
restaurant_parser = PydanticOutputParser(Restaurant)
openai_program = MultiModalLLMCompletionProgram.from_defaults(
    multi_modal_llm=openai_mm_llm,
    prompt_template_str=prompt_template_str,
    image_documents=image_documents,
    output_parser=restaurant_parser,
    verbose=True,
)

In [None]:
# Run the extraction and print every item produced by iterating the result
# (presumably (field, value) pairs of a Restaurant instance - verify).
response = openai_program()
for item in response:
    print(item)

In [None]:
# Amazon product: folder that holds the downloaded product image.
input_image_path = Path("amazon_images")
# exist_ok=True makes re-running the cell safe without a separate,
# race-prone existence check.
input_image_path.mkdir(exist_ok=True)

In [None]:
!wget "https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb" -O ./amazon_images/amazon.png

In [None]:
# Open the downloaded picture and convert it to RGB mode.
imageUrl = "./amazon_images/amazon.png"
image = Image.open(imageUrl).convert("RGB")

# Preview the picture on a wide 16x5-inch figure.
fig, ax = plt.subplots(figsize=(16, 5))
ax.imshow(image)

In [None]:
from pydantic import BaseModel

class Product(BaseModel):
    """Data model for an Amazon Product."""

    # Fields to parse out of the image. None of them has a default, so each
    # one is a required string in the parsed result.
    title: str
    category: str
    discount: str
    inventory: str
    description: str

In [None]:
# Load the contents of the Amazon image folder as documents.
amazon_reader = SimpleDirectoryReader(input_dir="./amazon_images")
amazon_image_documents = amazon_reader.load_data()

# Parsing instruction passed to the program as its prompt template.
prompt_template_str = (
    "    can you summarize what is in the image"
    "    and return the answer with json format "
)

# Completion program wired to the Amazon images, with a Pydantic output
# parser targeting the Product model.
product_parser = PydanticOutputParser(Product)
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
    multi_modal_llm=openai_mm_llm,
    prompt_template_str=prompt_template_str,
    image_documents=amazon_image_documents,
    output_parser=product_parser,
    verbose=True,
)

In [None]:
# Run the extraction and print every item produced by iterating the result
# (presumably (field, value) pairs of a Product instance - verify).
response = openai_program_amazon()
for item in response:
    print(item)

In [None]:
# Folder that holds the downloaded Instagram ad image.
input_image_path = Path("instagram_images")
# exist_ok=True makes re-running the cell safe without a separate,
# race-prone existence check.
input_image_path.mkdir(exist_ok=True)

In [None]:
!wget "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww" -O ./instagram_images/jordan.png

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# Open the downloaded picture and convert it to RGB mode.
imageUrl = "./instagram_images/jordan.png"
image = Image.open(imageUrl).convert("RGB")

# Preview the picture on a wide 16x5-inch figure.
fig, ax = plt.subplots(figsize=(16, 5))
ax.imshow(image)

In [None]:
from pydantic import BaseModel


class InsAds(BaseModel):
    """Data model for an Instagram ad."""

    # Fields to parse out of the image. None of them has a default, so each
    # one is a required string in the parsed result.
    account: str
    brand: str
    product: str
    price: str
    comments: str

In [None]:
# Load the contents of the Instagram image folder as documents.
instagram_reader = SimpleDirectoryReader(input_dir="./instagram_images")
ins_image_documents = instagram_reader.load_data()

# Parsing instruction passed to the program as its prompt template.
prompt_template_str = (
    "    can you summarize what is in the image"
    "    and return the answer with json format "
)

# Completion program wired to the Instagram images, with a Pydantic output
# parser targeting the InsAds model.
ins_ads_parser = PydanticOutputParser(InsAds)
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
    multi_modal_llm=openai_mm_llm,
    prompt_template_str=prompt_template_str,
    image_documents=ins_image_documents,
    output_parser=ins_ads_parser,
    verbose=True,
)


# Run the extraction and print every item produced by iterating the result
# (presumably (field, value) pairs of an InsAds instance - verify).
response = openai_program_ins()
for item in response:
    print(item)