In [2]:
from openai import OpenAI, AsyncOpenAI
from anthropic import Anthropic
from google.genai import types
from google import genai
from mistralai import Mistral

from dotenv import load_dotenv

import base64, os

In [3]:
# Path of the image that the request cells below send to each provider.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
image_path = "C:\\test_test\\response_rate_of_image_description\\data\\table_test.png"

from pydantic import BaseModel, Field

class ImageDescription(BaseModel):
    # Structured-output schema used by the parse / response_schema calls in this
    # notebook: one category label plus a free-text description.
    # Documented with comments rather than a docstring so that the schema
    # generated from this model stays exactly as before.
    category: str = Field(
        description="이미지의 카테고리. 4가지 카테고리 : ['figure', 'chart', 'table', 'equation']",
    )
    description: str = Field(
        description="Describe image in detail",
    )

def encode_image(image_path : str):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file content, decoded to a UTF-8 ``str``.
    """
    with open(image_path, mode="rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
    
# Encode the test image once; the request cells below embed this string.
base64_image = encode_image(image_path)


GPT

In [9]:
# Synchronous OpenAI client (no explicit api_key is passed here).
client = OpenAI()

# Image subtype for the data URL, derived from the file extension.
# Lower-cased, and "jpg" is mapped to "jpeg", so the resulting media type is a
# registered one ("image/jpg" does not exist).
ext = os.path.splitext(image_path)[1].lstrip('.').lower()
if ext == "jpg":
    ext = "jpeg"

# Single user turn: the instruction text followed by the inline base64 image.
message_list = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_text",
                    "text": "주어진 이미지를 분석하세요. 이미지 종류('figure', 'chart', 'table', 'equation')와 설명을 반환하세요. 표의 경우 담겨있는 정보를 생략하지 말고 전부 일목요연하게 제공하세요. 설명의 경우 마크다운으로 작성하세요요"
                },
                {
                    "type": "input_image",
                    "image_url": f"data:image/{ext};base64,{base64_image}",
                    "detail": "high"
                }
            ]
        }
    ]

In [10]:
# Structured-output request: ImageDescription is supplied as the text format.
response = client.responses.parse(
    input=message_list,
    model="gpt-4.1",
    text_format=ImageDescription,
)

In [12]:
# Asynchronous OpenAI client; the calls made with it below are awaited.
client = AsyncOpenAI()

# Image subtype for the data URL, derived from the file extension.
# Lower-cased, and "jpg" is mapped to "jpeg", so the resulting media type is a
# registered one ("image/jpg" does not exist).
ext = os.path.splitext(image_path)[1].lstrip('.').lower()
if ext == "jpg":
    ext = "jpeg"

# Single user turn: the instruction text followed by the inline base64 image.
message_list = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_text",
                    "text": "주어진 이미지를 분석하세요. 이미지 종류('figure', 'chart', 'table', 'equation')와 설명을 반환하세요. 표의 경우 담겨있는 정보를 생략하지 말고 전부 일목요연하게 제공하세요. 설명의 경우 마크다운으로 작성하세요요"
                },
                {
                    "type": "input_image",
                    "image_url": f"data:image/{ext};base64,{base64_image}",
                    "detail": "high"
                }
            ]
        }
    ]

In [13]:
response = await client.responses.parse(
        model = "gpt-4.1",
        input = message_list,
        text_format=ImageDescription
    )

In [9]:
response = await client.responses.create(
    model = "gpt-4.1",
    input = message_list,
    stream=True
)

In [11]:
from glom import glom

In [14]:
async for chunk in await client.responses.create(
    model = "gpt-4.1",
    input = message_list,
    stream=True
):
    
    content = glom(chunk, 'delta', default = None)

    if content is None:
        continue

    print(content)

**
이미
지
 종류
**
:
 **
table
**
 (
표
)


---


##
 설명



이
 이미
지는
 **
‘
운
영
위원
’
**
 명
단
을
 열
거
한
 표
입니다
.
 표
는
 다음
 네
 가지
 열
로
 구성
되어
 있습니다
:


1
.
 **
순
서
**

2
.
 **
소
속
/
성
명
**

3
.
 **
사진
**

4
.
 **
주
요
약
력
**


각
 운영
위원
별
 정보
는
 아래
와
 같이
 정
리
되어
 있습니다
.


---


###
 
1
.
 메
가
존
클
라우
드
(
주
)
 이
주
완
 대표
이
사
 (
위원
장
)

-
 **
사진
**
:
 제공
됨


-
 **
주
요
약
력
**

 
 -
 메
가
존
클
라우
드
(
주
)
 대표
이
사


 
 -
 벤
처
기업
협
회
 수
석
부
회
장


 
 -
 코
리아
스타
트
업
포
럼
 설
립
 부
회
장


 
 -
 클
라우
드
산
업
협
회
 협
회
장


 
 -
 한국
소
프트
웨어
산
업
협
회
 이
사


 
 -
 한국
클
라우
드
컴
퓨
팅
협
회
 정책
위원
회
 위
원
장


 
 -
 한국
프
롭
테
크
포
럼
 해외
진
출
위원
회
 위
원
장


 
 -
 대한민국
과
학
기
술
지
주
협
회
 위
원



---


###
 
2
.
 (
주
)
포
티
투
마
루
 김
동
환
 대표
이
사
 (
운
영
위원
)

-
 **
사진
**
:
 제공
됨


-
 **
주
요
약
력
**

 
 -
 데이터
 챌
린
지
(
공
공
 AI
 TF
 전문
위원
)

 
 -
 AI
 윤
리
/
보
안
 네
트
워크
 자
문
위원
(
과
기
부
)

 
 -
 AI
 학
습
용
 데이터
 품
질
 검
증
 주요
 인
사
(
과
기
부
)

 
 -
 AI
 정책
 자
문
(S
OS
!
 Safety
 인
공지
능
 정책
)

 
 -
 과
기
정
통
부
 장
관
 표
창


 
 -
 지
능
정보
산
업
협
회
 부
회
장



---

In [11]:
# Show the parsed result of the structured-output request.
# NOTE(review): the output saved below describes a cartoon hippo, not the table
# image at image_path — it looks stale from an earlier run; confirm and re-run.
print(response.output_parsed)

category='figure' description='이 이미지는 만화 스타일로 그려진, 활짝 웃으며 춤을 추는 듯한 하마를 묘사하고 있습니다. 하마는 밝은 색조와 부드러운 터치로 표현되어 있으며, 배경은 따뜻한 색상의 식물 무늬로 이루어져 부드럽고 아늑한 분위기를 연출합니다. 하마는 한쪽 다리를 들고 두 팔을 크게 벌리며 환하게 웃고 있어, 유쾌하고 생동감 넘치는 모습을 나타냅니다.'


Anthropic

In [145]:
# Anthropic client (no explicit api_key is passed here); rebinds `client`.
client = Anthropic()

In [146]:
# One user turn: the base64 image block (declared as PNG) followed by the text prompt.
message_list = [
    dict(
        role="user",
        content=[
            dict(
                type="image",
                source=dict(
                    type="base64",
                    media_type="image/png",
                    data=base64_image,
                ),
            ),
            dict(type="text", text="Describe this image."),
        ],
    )
]

In [150]:

# Send the image message; max_tokens is set to 1024 for the reply.
message = client.messages.create(
    messages=message_list,
    max_tokens=1024,
    model="claude-sonnet-4-20250514",
)

In [151]:
# Print the text of the first content block of the reply.
print(message.content[0].text)

This image shows an organizational chart or committee roster (운영위원, meaning "Executive Committee Members") with 7 members listed. The table has columns for:

- 순서 (Order/Number)
- 소속/성명 (Affiliation/Name) 
- 사진 (Photo)
- 주요약력 (Major Career/Background)

Each row contains:
1. A numbered entry (1-7)
2. The person's company affiliation and name in Korean
3. A professional headshot photo
4. A detailed list of their career achievements and positions, marked with bullet points

The individuals appear to be executives or leaders from various technology and business companies, with their backgrounds including roles at major Korean companies, educational institutions, and industry organizations. The format suggests this is likely a committee or board composition for a Korean business organization or association.

The layout is professional and standardized, typical of corporate governance or organizational documentation in Korean business contexts.


Gemini

In [17]:
# Async Gemini client, taken from the public `aio` accessor rather than the
# private `_aio` attribute.
client = genai.Client().aio

In [125]:
response = client.models.generate_content(
    model = "gemini-2.5-flash-preview-04-17",
    contents = [
        types.Part.from_bytes(
            data = base64_image,
            mime_type = "image/png"
        ),
        'Describe this image.'
    ]
)

In [21]:
async for chunk in await client.models.generate_content_stream(
    model = "gemini-2.5-flash-preview-04-17",
    contents = [
        types.Part.from_bytes(
            data = base64_image,
            mime_type = "image/png"
        ),
        'Describe this image.'
    ]
):
    candidate = glom(chunk, 'candidates', default = None)

    if candidate is None:
        continue

    content = candidate[0]

    response = content.content.parts[0].text

    finish_reason = glom(content, 'finish_reason', default = None)
    if finish_reason is not None:
        print(finish_reason.STOP)
        reason = finish_reason.STOP.value
        print(f"reason : {reason}")
    print(content)

    

content=Content(parts=[Part(video_metadata=None, thought=None, inline_data=None, file_data=None, thought_signature=None, code_execution_result=None, executable_code=None, function_call=None, function_response=None, text='Here is the extracted information from the')], role='model') citation_metadata=None finish_message=None token_count=None finish_reason=None url_context_metadata=None avg_logprobs=None grounding_metadata=None index=0 logprobs_result=None safety_ratings=None
content=Content(parts=[Part(video_metadata=None, thought=None, inline_data=None, file_data=None, thought_signature=None, code_execution_result=None, executable_code=None, function_call=None, function_response=None, text=' image:\n\n**운영위원 (Operating Committee)**\n\n**1.**\n*   **순서:** 1\n*   **소속/성명:** (주)메가존클라우드 이주완 대표이사\n*   **주')], role='model') citation_metadata=None finish_message=None token_count=None finish_reason=None url_context_metadata=None avg_logprobs=None grounding_metadata=None index=0 logprobs_result=

In [126]:
# Print the text of the Gemini response.
print(response.text)

Here is the text extracted from the image:

**운영위원**

| 순서 | 소속/성명                    | 사진                                      | 주요약력                                                                                                                                                                                                                                                                                          |
| :--- | :--------------------------- | :---------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| 1    | (주) 메가존 클라우드 이주완 대표이사 | (사진)                                    | <의장> <br> o 메가존클라우드(주) 대표이사 <br> o 벤처기업협회 수석부회장 <br> o 한국소프트웨어산업협회 부회장 <br> o 코리아스타트업포럼 부의장 <br> o 디지털플랫폼정부위원회 위원 <br> o 한국소프트웨어산업협

In [127]:
response = client.models.generate_content(
    model = "gemini-2.0-flash",
    contents = [
        types.Part.from_bytes(
            data = base64_image,
            mime_type = "image/png"
        ),
        'Describe this image.'
    ]
)

In [128]:
# Print the text of the gemini-2.0-flash response.
print(response.text)

Here is a description of the image:

The image is a table presenting the members of an operating committee. The table has the following columns: Sequence (순서), Affiliation/Name (소속/성명), Photo (사진), and Major Achievements (주요약력). There are seven rows in the table, each presenting a different member of the operating committee.

The table lists the name, company, and position of each committee member, along with a photo and bullet points highlighting their major achievements and affiliations. The first entry, for example, shows Lee Ju-wan from Megazone Cloud (주) as the chairman (의장) and lists several other positions such as Senior Vice Chairman of the Venture Business Association and Vice President of the Korea Software Industry Association.

The other entries follow a similar format, listing various positions held by each member and highlighting accomplishments in their respective fields, such as AI, startups, technology, and business. The roles and achievements listed range from foundin

GEMINI STRUCTURED OUTPUT

In [129]:
# google-genai client for the structured-output request below; rebinds `client`.
client = genai.Client()

In [130]:
# Ask for a JSON reply that follows the ImageDescription schema.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-04-17",
    contents=[
        types.Part.from_bytes(mime_type="image/png", data=base64_image),
        "Describe this image.",
    ],
    config=dict(
        response_mime_type="application/json",
        response_schema=ImageDescription,
    ),
)

In [131]:
# Print the response text (the saved output below is a JSON object).
print(response.text)

{"category": "table", "description": "A table titled '운영위원' (Steering Committee), listing seven individuals with columns for 순서 (Number), 소속/성명 (Affiliation/Name), 사진 (Photo), and 주요 약력 (Major Career Highlights). Each row represents a person, including their photo and a bulleted list of their achievements and positions. The individuals listed are 이유환, 김동환, 임재원, 이세영, 김태수, 김태준, and 주명규."}


Gemini (GPT 스타일)

In [104]:
# OpenAI SDK client pointed at Google's OpenAI-compatible endpoint, using the
# key from the GOOGLE_API_KEY environment variable.
client = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=os.environ.get("GOOGLE_API_KEY"),
)

In [105]:
# Instruction text plus the inline image, in the same shape as the GPT section.
# NOTE(review): the chat.completions call in the next cell builds its own
# `messages` and does not read this list.
message_list = [
    dict(
        role="user",
        content=[
            dict(
                type="input_text",
                text="주어진 이미지를 분석하세요. 이미지 종류('figure', 'chart', 'table', 'equation')와 설명을 반환하세요. 표의 경우 담겨있는 정보를 생략하지 말고 전부 일목요연하게 제공하세요. 설명의 경우 마크다운으로 작성하세요요",
            ),
            dict(
                type="input_image",
                image_url=f"data:image/{ext};base64,{base64_image}",
                detail="high",
            ),
        ],
    )
]

In [106]:
# Chat-completions request through the OpenAI-compatible Gemini endpoint.
response = client.chat.completions.create(
  model="gemini-2.5-flash-preview-04-17",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What is in this image?",
        },
        {
          "type": "image_url",
          "image_url": {
            # Declare the subtype from the file extension (`ext`, computed in the
            # GPT section) instead of a fixed "jpeg": the test image is a PNG.
            "url":  f"data:image/{ext};base64,{base64_image}"
          },
        },
      ],
    }
  ],
)


In [107]:
# Print the message content of the first choice.
print(response.choices[0].message.content)

The image is a table listing the members of an "운영위원" (Operating Committee). It includes the following information for each member:

1.  **Sequence Number**
2.  **Affiliation/Name:** The company or organization the person is associated with, and their name.
3.  **Photo:** A headshot of the individual.
4.  **Key Bio/Profile:** A summary of their professional background, roles, achievements, and positions.

There are 7 members listed:

1.  **이주완 (Lee Ju-wan)**, CEO of 메가존클라우드(주) (Megazone Cloud Co., Ltd.), listed as the Chairman (의장).
2.  **김동환 (Kim Dong-hwan)**, CEO of (주)포티투마루 (FortyTwo Maru Co., Ltd.), listed as an Operating Committee Member.
3.  **임재원 (Lim Jae-won)**, CEO of (주)고피자 (GoPizza Co., Ltd.), listed as an Operating Committee Member.
4.  **이세영 (Lee Se-young)**, CEO of (주)뤼튼테크놀로지스 (Wrtn Technologies Co., Ltd.), listed as an Operating Committee Member.
5.  **김태수 (Kim Tae-soo)**, CEO of (주)모비젠 (Mobizen Co., Ltd.), listed as an Operating Committee Member.
6.  **김태준 (Kim Tae-joon

이미지 업로드 방식(GEMINI)

이미지 크기가 20MB 이상이면 인라인 방식으로는 분석할 수 없음

그럴 때는 이미지를 먼저 업로드한 뒤 사용하면 가능

In [87]:
# 이미지 파일 업로드
# `client` was last bound to the OpenAI-compatible client in the previous section,
# while the upload below uses the google-genai file API; rebind it so this section
# also works when the notebook is run top to bottom.
client = genai.Client()

# Upload the image file
my_file = client.files.upload(
    file = image_path,
)

In [88]:
# Reference the uploaded file instead of inlining the image bytes.
response = client.models.generate_content(
    contents=[my_file, "Describe this image."],
    model="gemini-2.0-flash",
)

In [89]:
# Print the text of the response for the uploaded file.
# NOTE(review): the output saved below describes an image of Euler's formula, not
# the table image at image_path — it looks stale from a different run; confirm.
print(response.text)

Here's a description of the image:

**Content:**

The image displays two important mathematical concepts related to complex numbers:

*   **Euler's Formula:** This is shown as  `e^(iφ) = cos φ + i sin φ`. This formula connects the exponential function with complex arguments to the trigonometric functions sine and cosine. Here, 'e' represents Euler's number (approximately 2.718), 'i' is the imaginary unit (√-1), and 'φ' (phi) is an angle in radians.

*   **Euler's Identity:** This is presented as `e^(iπ) + 1 = 0`. It's a special case of Euler's formula where φ = π (pi).  It's notable because it links five fundamental mathematical constants: 0, 1, e, i, and π in a single equation.

**Visuals:**

*   The text is written in a clear, easy-to-read font.
*   The mathematical symbols and notation are correctly rendered.
*   The background color is a light beige or cream.

**Overall:**

The image is a straightforward and concise representation of two foundational results in complex number theor

XAI

In [139]:
# xAI key taken from the environment; None when the variable is unset.
XAI_API_KEY = os.environ.get("XAI_API_KEY")

In [140]:
# OpenAI SDK client pointed at the xAI endpoint; rebinds `client`.
client = OpenAI(
    base_url="https://api.x.ai/v1",
    api_key=XAI_API_KEY,
)

In [110]:
# Plain chat-completions request to the Grok vision model.
response = client.chat.completions.create(
  model="grok-2-vision-latest",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What is in this image?",
        },
        {
          "type": "image_url",
          "image_url": {
            # Declare the subtype from the file extension (`ext`, computed in the
            # GPT section) instead of a fixed "jpeg": the test image is a PNG.
            "url":  f"data:image/{ext};base64,{base64_image}"
          },
        },
      ],
    }
  ],
)

In [None]:
# Print the message content of the first choice.
print(response.choices[0].message.content)

In [143]:
# Structured-output request to the Grok vision model; ImageDescription is passed
# as the response format.
response = client.beta.chat.completions.parse(
  model="grok-2-vision-latest",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What is in this image?",
        },
        {
          "type": "image_url",
          "image_url": {
            # Declare the subtype from the file extension (`ext`, computed in the
            # GPT section) instead of a fixed "jpeg": the test image is a PNG.
            "url":  f"data:image/{ext};base64,{base64_image}"
          },
        },
      ],
    }
  ],
  response_format = ImageDescription
)

In [144]:
# Print the parsed object of the first choice.
print(response.choices[0].message.parsed)

category='table' description="The image is a table listing speakers for an event. The table has columns for the speaker's number, name, photo, and description. Here are the details:\n\n1. **Speaker 1**: 김경준, CEO of Hanwha Systems. Description includes his achievements and contributions to the company.\n\n2. **Speaker 2**: 김범수, CEO of Kakao. Description includes his background in IT and AI, and his contributions to the industry.\n\n3. **Speaker 3**: 김상헌, CEO of Kakao Games. Description includes his achievements in the gaming industry.\n\n4. **Speaker 4**: 박지훈, CEO of Tmax Group. Description includes his leadership in software development and his vision for the future.\n\n5. **Speaker 5**: 박찬우, CEO of Naver. Description includes his role in developing search engines and AI.\n\n6. **Speaker 6**: 박현우, CEO of SK Hynix. Description includes his contributions to the semiconductor industry.\n\n7. **Speaker 7**: 박용만, Chairman of Doosan Group. Description includes his leadership in various secto

The image is a table listing information about speakers at a conference or event. Here is the breakdown of the content:

**Columns:**
1. **순번 (Order)**
2. **소속/직함 (Affiliation/Position)**
3. **사진 (Photo)**
4. **연사/멘토 (Speaker/Mentor)**

**Rows:**

1. **Order 1:**
   - **Affiliation/Position:** SK하이닉스 AI융합총괄, AI 사업부문장
   - **Photo:** [Photo of the speaker]
   - **Speaker/Mentor:** 
     - 이름: 박유근
     - 소속/직함: SK하이닉스 AI융합총괄, AI 사업부문장
     - 주요 경력: 
       - 삼성전자 종합기술원 책임연구원
       - SK하이닉스 AI융합센터장
       - SK하이닉스 AI융합총괄
     - 주제: 반도체와 AI의 만남

2. **Order 2:**
   - **Affiliation/Position:** SK하이닉스 AI융합센터
   - **Photo:** [Photo of the speaker]
   - **Speaker/Mentor:** 
     - 이름: 김효준
     - 소속/직함: SK하이닉스 AI융합센터 AI 전략담당
     - 주요 경력: 
       - 삼성전자 종합기술원 책임연구원
       - SK하이닉스 AI융합센터 AI 전략담당
     - 주제: AI 기술의 발전과 활용 방안

3. **Order 3:**
   - **Affiliation/Position:** SK하이닉스 AI융합센터
   - **Photo:** [Photo of the speaker]
   - **Speaker/Mentor:** 
     - 이름: 김영훈
     - 소속/직함: SK하이닉스 AI융합센터 AI 전

Mistral

In [133]:
# Mistral key taken from the environment; None when the variable is unset.
api_key = os.environ.get("MISTRAL_API_KEY")

In [134]:
# Mistral client built with the key read in the previous cell; rebinds `client`.
client = Mistral(api_key = api_key)

In [135]:
# One user turn: the text prompt followed by the inline base64 image.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?"
            },
            {
                "type": "image_url",
                # Declare the subtype from the file extension (`ext`, computed in
                # the GPT section) instead of a fixed "jpeg": the test image is a PNG.
                "image_url": f"data:image/{ext};base64,{base64_image}"
            }
        ]
    }
]

In [None]:
# Plain chat request to Mistral with the image message.
chat_response = client.chat.complete(
    messages=messages,
    model="mistral-medium-latest",
)

In [None]:
async for chunk in await client.chat.complete(
    model = "mistral-medium-latest",
    messages = messages,
    stream = True
): 
    print(chunk)

In [137]:
# Structured-output request; ImageDescription is passed as the response format.
chat_response = client.chat.parse(
    response_format=ImageDescription,
    messages=messages,
    model="mistral-medium-latest",
)



In [138]:
# Print the message content of the first choice (the saved output below is JSON text).
print(chat_response.choices[0].message.content)

{
  "category": "table",
  "description": "The image contains a table listing various individuals along with their photographs, brief introductions, and notable achievements or affiliations. The table is titled '융영위원' which translates to 'Fusion Committee' and includes columns for 순서 (Order), 소속/성명 (Affiliation/Name), and 주요약력 (Main Achievements). Here is a detailed summary of each row in the table: \n\n1. 메가존클라우드(주) 이주완 대표이사 (CEO of Megazone Cloud, Lee Joo-wan)\n   - Notable Achievements: Megazone Cloud CEO, Benchmark Enterprise Co-CEO, Korea Software Industry Association Vice Chairman, Korea Internet Enterprise Association Vice Chairman, Korea Cloud Computing Association Chairman, and more.\n\n2. (주)포디투 아루 김동환 대표이사 (CEO of Pod2, Aru Kim Dong-hwan)\n   - Notable Achievements: Diabloc Inc. Co-founder and CTO, AI Frontier Research Center Director, AI Safety Research Center Director, and more.\n\n3. (주)고피자 임재환 대표이사 (CEO of Gopizza, Lim Jae-hwan)\n   - Notable Achievements: Asia's younges

Deepseek

In [68]:
# DeepSeek key taken from the environment; None when the variable is unset.
api_key = os.environ.get("DEEPSEEK_API_KEY")

In [69]:
# OpenAI SDK client pointed at the DeepSeek endpoint; rebinds `client`.
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")


In [70]:
# Instruction text plus the inline image, in the same shape as the GPT section.
# NOTE(review): the chat.completions call in the next cell builds its own
# `messages` and does not read this list.
message_list = [
    dict(
        role="user",
        content=[
            dict(
                type="input_text",
                text="주어진 이미지를 분석하세요. 이미지 종류('figure', 'chart', 'table', 'equation')와 설명을 반환하세요. 표의 경우 담겨있는 정보를 생략하지 말고 전부 일목요연하게 제공하세요. 설명의 경우 마크다운으로 작성하세요요",
            ),
            dict(
                type="input_image",
                image_url=f"data:image/{ext};base64,{base64_image}",
                detail="high",
            ),
        ],
    )
]

In [72]:
# Chat-completions request to DeepSeek with a text part and an image part.
# NOTE(review): the saved run of this cell ended in UnprocessableEntityError (the
# request body's message content did not deserialize) — confirm whether this
# model accepts image content at all before relying on this cell.
response = client.chat.completions.create(
  model="deepseek-chat",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What is in this image?",
        },
        {
          "type": "image_url",
          "image_url": {
            # Declare the subtype from the file extension (`ext`, computed in the
            # GPT section) instead of a fixed "jpeg": the test image is a PNG.
            "url":  f"data:image/{ext};base64,{base64_image}"
          },
        },
      ],
    }
  ],
)

UnprocessableEntityError: Failed to deserialize the JSON body into the target type: messages[0]: data did not match any variant of untagged enum ChatCompletionRequestContent at line 1 column 2268900