## Imports

In [1]:
import numpy as np
import pickle
import json
from mistralai import Mistral
from tqdm import tqdm
import base64

In [2]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load variables from a local .env file into the process environment.
load_dotenv()

token = os.getenv('HF_TOKEN')
MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')

# Fail fast: os.getenv returns None for a missing variable, and without this
# check the problem would only show up later as an API authentication error.
if not MISTRAL_API_KEY:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; add it to the environment or to a .env file."
    )

# Authenticate against the Hugging Face Hub with the token read above.
login(token=token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Directories

In [3]:
# Folder where the notebook keeps its local images.
images_fpath = "images"
# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
os.makedirs(images_fpath, exist_ok=True)

## Pixtral base example

In [4]:
# Mistral API client, authenticated with the key read from the environment.
client = Mistral(api_key=MISTRAL_API_KEY)

# Identifier of the Pixtral model passed to the chat requests.
model = "pixtral-12b-2409"

In [5]:
# Conversation for the request: a system prompt that fixes the JSON layout of
# the answer, a text instruction, and the image (given by URL) to describe.
messages = [
    {
        "role": "system",
        "content": (
            'Return the answer in a JSON object with the next structure: '
            '{"elements": ['
            '{"element": "some name of element1", '
            '"description": "some description of element 1"}, '
            '{"element": "some name of element2", '
            '"description": "some description of element 2"}'
            ']}'
        ),
    },
    {"role": "user", "content": "Describe the image"},
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": "https://docs.mistral.ai/img/eiffel-tower-paris.jpg",
            },
        ],
    },
]

In [6]:
%%time
# Send the conversation to the model, requesting a JSON-object response format.
chat_response = client.chat.complete(
    messages=messages,
    model=model,
    response_format={"type": "json_object"},
)
# Take the text of the first choice and parse it into Python objects.
content = chat_response.choices[0].message.content
data = json.loads(content)
data

CPU times: user 19.1 ms, sys: 5.42 ms, total: 24.5 ms
Wall time: 24.7 s


{'elements': [{'element': 'Eiffel Tower',
   'description': 'The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.'}]}

Response может приходить очень долго (здесь почти 25 секунд).

In [7]:
# Variant of the request that asks only for element names (no descriptions).
messages = [
    {
        "role": "system",
        # Keep the example structure valid JSON (balanced brackets, no trailing
        # comma) so the model gets an unambiguous template to follow.
        "content": "Return the answer in a JSON object with the next structure: "
                   "{\"elements\": [{\"element\": \"some name of element1\"}, "
                   "{\"element\": \"some name of element2\"}]}"
    },
    {
        "role": "user",
        "content": "Describe the image"
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": "https://docs.mistral.ai/img/eiffel-tower-paris.jpg"
            }
        ]
    }
]

In [8]:
%%time
# Send the current conversation, requesting a JSON-object response format.
chat_response = client.chat.complete(
    messages=messages,
    model=model,
    response_format={"type": "json_object"},
)
# Show the raw text of the first choice (left unparsed in this cell).
content = chat_response.choices[0].message.content
content

CPU times: user 11.1 ms, sys: 2.68 ms, total: 13.8 ms
Wall time: 2.37 s


'{"elements": [{"element": "Eiffel Tower"}]}'

## Local image

Картинка взята отсюда: https://kamnemir.ru/files/363fc737-491d-48bb-918e-20ef23308dd11.jpg

Её нужно сохранить как `images/pamukkale.jpg`.

In [9]:
def load_image_as_base64(path):
    """Read the file at ``path`` and return its contents as base64 text.

    Parameters
    ----------
    path
        Location of the file to read; it is opened in binary mode.

    Returns
    -------
    str
        Base64 encoding of the file's bytes, decoded as UTF-8.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [10]:
# Path of the local picture inside the images folder.
pic_fpath = os.path.join(images_fpath, "pamukkale.jpg")

In [11]:
# The notebook does not download the picture itself, so when the file is
# missing, say where to get it instead of failing with a bare path error.
if not os.path.isfile(pic_fpath):
    raise FileNotFoundError(
        f"{pic_fpath} not found; download it from "
        "https://kamnemir.ru/files/363fc737-491d-48bb-918e-20ef23308dd11.jpg first"
    )

# Base64 text of the picture's bytes.
pic_b64 = load_image_as_base64(pic_fpath)

In [13]:
    {
        # NOTE(review): scratch cell — this literal is only displayed, it is
        # not assigned to any name. Looks like a leftover from editing the
        # prompt; confirm whether it can be removed.
        "role": "system",
        "content": "Return the answer in a JSON object with the next structure: "
                   "{\"elements\": [{\"element\": \"some name of element1\", "
                   "\"description\": \"some description of element 1\"}, "
                   "{\"element\": \"some name of element2\", \"description\": "
                   "\"some description of element 2\"}]}"
    },  # the trailing comma makes the displayed value a 1-tuple, not a dict

({'role': 'system',
  'content': 'Return the answer in a JSON object with the next structure: {"elements": [{"element": "some name of element1", "description": "some description of element 1"}, {"element": "some name of element2", "description": "some description of element 2"}]}'},)

In [14]:
# Request for the local picture: the answer must contain an element list plus
# a short "summary", and the prompt adds guidance on what to focus on.
messages = [
    {
        "role": "system",
        # Adjacent string literals are joined with nothing in between, so each
        # piece has to carry its own separator (", " or ". ") explicitly.
        "content": "Return the answer in a JSON object with the next structure: "
                   "{\"elements\": [{\"element\": \"some name of element1\", "
                   "\"description\": \"some description of element 1\"}, "
                   "{\"element\": \"some name of element2\", \"description\": "
                   "\"some description of element 2\"}], "
                   "\"summary\": \"summarised short description of the whole\"}. "
                   "You should interpret image as a whole entity. "
                   "People on the image do not matter, only the scenery. "
                   "Give very specific explanation, related to the image."
    },

    {
        "role": "user",
        "content": "Describe the image"
    },
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                # The local file is embedded as a base64 data URL.
                "image_url": f"data:image/jpeg;base64,{pic_b64}"
            }
        ]
    }
]

In [16]:
%%time
# Send the conversation with the embedded local picture, requesting a
# JSON-object response format.
chat_response = client.chat.complete(
    messages=messages,
    model=model,
    response_format={"type": "json_object"},
)
# Take the text of the first choice and parse it into Python objects.
content = chat_response.choices[0].message.content
data = json.loads(content)
data

CPU times: user 15 ms, sys: 4.32 ms, total: 19.3 ms
Wall time: 2.76 s


{'elements': [{'element': 'Travertine Terraces',
   'description': 'These are the white, stepped formations made of calcium carbonate deposited by mineral springs.'},
  {'element': 'Pools',
   'description': 'There are numerous natural pools filled with mineral-rich water, where people are seen bathing and relaxing.'}],
 'summary': 'The image depicts the Pamukkale Travertine Terraces in Turkey, featuring white, stepped formations created by mineral-rich water and numerous natural pools used for bathing.'}

Поле `summary` можно использовать; остальные поля потенциально тоже, но стоит поэкспериментировать с промптом.