In [3]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

In [4]:
import torch
print(torch.cuda.is_available()) 

False


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
mod="Qwen/Qwen2-VL-2B-Instruct"

In [7]:
model = Qwen2VLForConditionalGeneration.from_pretrained(mod, torch_dtype="auto", device_map="auto")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|██████████| 2/2 [03:32<00:00, 106.41s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.52s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [8]:
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(mod, min_pixels=min_pixels, max_pixels=max_pixels )

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
import torch
import cv2
import re
import pandas as pd
from PIL import Image

In [10]:
prompt = "Extract invoice number, date, total amount, and vendor name from this invoice."

In [11]:
from torch_snippets import (
    read,
    resize,
    Info,
    in_debug_mode,
    show,
    P,
    np,
    PIL,
    Warn,
    ifnone,
)

In [12]:
from transformers import AutoTokenizer
from qwen_vl_utils import process_vision_info
from torch_snippets.adapters import np_2_b64

In [13]:
def path_2_b64(path, image_size=None):
        if in_debug_mode():
            print(image_type)
            return
        if isinstance(path, (str, P)):
            image = read(path)
            image_type = f"image/{P(path).extn}"
        elif isinstance(path, PIL.Image.Image):
            image = np.array(path)
            image_type = f"image/jpeg"
        else:
            raise NotImplementedError(f"Yet to implement for {type(path)}")
        if image_size:
            if isinstance(image_size, int):
                image_size = (image_size, image_size)
            image = resize(image, ("at-most", image_size))
        if in_debug_mode():
            Info(f"{image.shape=}")
            show(image)
        return np_2_b64(image), image_type

In [16]:
image_path="/DocumentExtraction/Data/invoice2.jpg"

In [17]:
base64_image, image_type = path_2_b64(image_path)

In [19]:
print(base64_image)
print(image_type)

/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wAARCARRA70DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD8l5ru+mi/cQ+VJL7VZhb7JaY+3+bJF/roqy5Lqc2sf2i5/df6urtvfWEFp+/P7zt+5r62pS9keE6ti7plvNBFcarN/rCf9dS/avMtcef/AKr/AF3lVBNc3EUUfnGLy4v+WNR3lvDe2tvOJ/8AprW9NX+M6qVXuXbe4uJzgz/8

In [None]:

def predict(image, prompt, max_new_tokens=1024):
    img_b64_str, image_type = path_2_b64(image)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": f"data:{image_type};base64,{img_b64_str}",
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs =processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Inference: Generation of the output
    generated_ids= model.generate(**inputs, max_new_tokens=max_new_tokens)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]

In [31]:
l=predict(image=image_path, prompt=prompt)

: 