In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

In [2]:
# Load the moondream2 checkpoint once for the whole notebook.
# `revision` pins the exact checkpoint version that gets downloaded, and
# `trust_remote_code=True` lets transformers run the modelling code that is
# shipped inside the model repository.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    trust_remote_code=True,
    revision="2025-01-09",
    device_map={"": "cuda"},  # "" = the whole model goes to the CUDA device
)

In [3]:
from pathlib import Path
import re
from tqdm import tqdm

In [4]:
def extract_number(s: str) -> float:
    """Decode the expected reading encoded at the start of a file stem.

    The stem must begin with ``<digits>_<digits>``; the first group is the
    integer part and the second the fractional part. Anything after that
    (e.g. a ``(2)`` duplicate marker) is ignored.

    >>> extract_number("17_0")
    17.0
    >>> extract_number("12_5")
    12.5
    >>> extract_number("43_0(2)")
    43.0

    Raises:
        ValueError: if *s* does not start with ``<digits>_<digits>``.
    """
    match = re.match(r'(\d+)_(\d+)', s)
    if match is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            f"cannot extract a number from {s!r}: expected '<int>_<frac>' prefix"
        )
    whole, frac = match.groups()
    return float(f"{whole}.{frac}")

In [5]:
def parse_image(p: Path) -> float:
    """Ask the model for the number shown in the image at *p*.

    The prompt requests an integer; the reply is converted to ``float`` and
    divided by 10 before being returned.
    NOTE(review): the division presumably restores a single decimal digit that
    the model is expected to read without the decimal point - confirm against
    the devices in the dataset.

    Raises:
        ValueError: if the model's answer cannot be parsed by ``float()``.
    """
    q = """
your task is to extract number displayed on the device. respond with integer number
"""
    # Close the underlying file as soon as the query is done; the evaluation
    # loop opens hundreds of images and would otherwise leak file handles.
    with Image.open(p) as image:
        answer = model.query(image, q)["answer"]
    return float(answer) / 10

In [6]:
def parse_image_dir(dir: Path):
    """Evaluate the model on every file matching ``<dir>/*/*``.

    For each file the expected value is decoded from the file stem with
    ``extract_number`` and compared with the value returned by
    ``parse_image``. One result line is printed per file, followed by the
    overall error percentage. If nothing matches, a notice is printed instead
    of a percentage.

    Args:
        dir: root directory; files are looked up exactly one level below it.
            (The name shadows the ``dir`` builtin but is kept so existing
            keyword callers keep working.)
    """
    # Materialize and sort the matches: the order becomes reproducible across
    # filesystems and tqdm can show a total instead of a bare counter.
    files = sorted(dir.glob('*/*'))
    if not files:
        # Avoid ZeroDivisionError in the summary when the directory is empty.
        print(f"no files found under {dir}")
        return

    err_count = 0
    for file in tqdm(files):
        num = extract_number(file.stem)
        answer = parse_image(file)
        # Compare the textual float representations, as the values are
        # produced by two different routes (file name vs. model answer / 10).
        if str(num) != str(answer):
            status = "err"
            err_count += 1
        else:
            status = "ok"
        print(f"wanted: {num:4.1f}, got: {answer:4.1f} | {status} | {file}")
    print(f"err %: {err_count / len(files) * 100}")

In [7]:
# Root of the evaluation data, relative to the current working directory.
# parse_image_dir() globs it with '*/*', i.e. files one directory level below.
DATA_PATH = Path("./data")
DATA_PATH

PosixPath('data')

In [8]:
# Run the evaluation over DATA_PATH; prints one line per file and the error rate.
parse_image_dir(DATA_PATH)

2it [00:00,  5.37it/s]

wanted:  0.0, got:  0.0 | ok | data/1_type/0_0.png
wanted: 10.0, got: 10.0 | ok | data/1_type/10_0.png


4it [00:00,  7.65it/s]

wanted: 11.0, got: 11.0 | ok | data/1_type/11_0.png
wanted: 12.0, got: 12.0 | ok | data/1_type/12_0.png


6it [00:00,  8.64it/s]

wanted: 13.0, got: 13.0 | ok | data/1_type/13_0.png
wanted: 14.0, got: 14.0 | ok | data/1_type/14_0.png


8it [00:01,  8.77it/s]

wanted: 15.0, got: 15.0 | ok | data/1_type/15_0.png
wanted: 17.0, got: 71.7 | err | data/1_type/17_0.png


10it [00:01,  9.20it/s]

wanted: 18.0, got: 18.0 | ok | data/1_type/18_0.png
wanted: 19.0, got: 15.0 | err | data/1_type/19_0.png


12it [00:01,  9.35it/s]

wanted:  1.0, got:  1.0 | ok | data/1_type/1_0.png
wanted: 21.0, got: 21.0 | ok | data/1_type/21_0.png


14it [00:01,  9.07it/s]

wanted: 22.0, got: 22.0 | ok | data/1_type/22_0.png
wanted: 23.0, got: 72.8 | err | data/1_type/23_0.png


16it [00:01,  9.27it/s]

wanted: 24.0, got: 24.0 | ok | data/1_type/24_0.png
wanted: 25.0, got: 25.0 | ok | data/1_type/25_0.png


18it [00:02,  9.06it/s]

wanted: 26.0, got: 25.0 | err | data/1_type/26_0.png
wanted: 27.0, got: 71.0 | err | data/1_type/27_0.png


20it [00:02,  9.35it/s]

wanted: 28.0, got: 28.0 | ok | data/1_type/28_0.png
wanted: 29.0, got: 28.0 | err | data/1_type/29_0.png


22it [00:02,  9.44it/s]

wanted:  2.0, got:  2.0 | ok | data/1_type/2_0.png
wanted: 30.0, got: 30.0 | ok | data/1_type/30_0.png


24it [00:02,  9.49it/s]

wanted: 31.0, got:  0.3 | err | data/1_type/31_0.png
wanted: 32.0, got: 32.0 | ok | data/1_type/32_0.png


26it [00:02,  9.52it/s]

wanted: 33.0, got: 33.0 | ok | data/1_type/33_0.png
wanted: 34.0, got: 34.0 | ok | data/1_type/34_0.png


28it [00:03,  9.54it/s]

wanted: 35.0, got: 35.0 | ok | data/1_type/35_0.png
wanted: 36.0, got: 33.0 | err | data/1_type/36_0.png


30it [00:03,  9.52it/s]

wanted: 37.0, got:  0.3 | err | data/1_type/37_0.png
wanted: 38.0, got: 38.0 | ok | data/1_type/38_0.png


32it [00:03,  9.56it/s]

wanted: 39.0, got: 38.0 | err | data/1_type/39_0.png
wanted:  3.0, got:  3.0 | ok | data/1_type/3_0.png


34it [00:03,  9.56it/s]

wanted: 40.0, got: 40.0 | ok | data/1_type/40_0.png
wanted: 41.0, got: 41.0 | ok | data/1_type/41_0.png


36it [00:04,  9.59it/s]

wanted: 42.0, got:  4.0 | err | data/1_type/42_0.png
wanted: 43.0, got: 43.0 | ok | data/1_type/43_0(2).png


38it [00:04,  9.59it/s]

wanted: 43.0, got: 43.0 | ok | data/1_type/43_0.png
wanted: 44.0, got: 44.0 | ok | data/1_type/44_0.png


40it [00:04,  9.50it/s]

wanted: 45.0, got:  4.5 | err | data/1_type/45_0.png
wanted: 46.0, got:  4.6 | err | data/1_type/46_0.png


42it [00:04,  9.50it/s]

wanted: 47.0, got: 47.0 | ok | data/1_type/47_0.png
wanted: 48.0, got: 48.0 | ok | data/1_type/48_0.png


44it [00:04,  9.48it/s]

wanted: 49.0, got:  0.4 | err | data/1_type/49_0.png
wanted:  4.0, got:  4.0 | ok | data/1_type/4_0.png


46it [00:05,  9.45it/s]

wanted: 50.0, got: 50.0 | ok | data/1_type/50_0.png
wanted: 51.0, got: 51.0 | ok | data/1_type/51_0.png


48it [00:05,  9.53it/s]

wanted:  5.0, got:  5.0 | ok | data/1_type/5_0.png
wanted:  6.0, got:  0.3 | err | data/1_type/6_0.png


50it [00:05,  9.50it/s]

wanted:  7.0, got:  1.0 | err | data/1_type/7_0.png
wanted:  8.0, got:  8.0 | ok | data/1_type/8_0.png


52it [00:05,  9.48it/s]

wanted:  9.0, got:  5.0 | err | data/1_type/9_0.png
wanted:  0.0, got:  0.0 | ok | data/2_type/0_0(2).png


54it [00:05,  9.39it/s]

wanted:  0.0, got:  0.0 | ok | data/2_type/0_0.png
wanted: 10.0, got: 10.0 | ok | data/2_type/10_0.png


56it [00:06,  9.39it/s]

wanted: 11.0, got: 11.0 | ok | data/2_type/11_0.png
wanted: 12.0, got:  1.2 | err | data/2_type/12_0.png


58it [00:06,  9.37it/s]

wanted: 13.0, got: 13.0 | ok | data/2_type/13_0.png
wanted: 14.0, got: 14.0 | ok | data/2_type/14_0.png


60it [00:06,  9.44it/s]

wanted: 15.0, got: 15.0 | ok | data/2_type/15_0.png
wanted: 16.0, got: 16.0 | ok | data/2_type/16_0.png


62it [00:06,  9.47it/s]

wanted: 17.0, got:  1.7 | err | data/2_type/17_0.png
wanted: 18.0, got: 18.0 | ok | data/2_type/18_0.png


64it [00:06,  9.50it/s]

wanted: 19.0, got:  1.0 | err | data/2_type/19_0.png
wanted:  1.0, got:  1.0 | ok | data/2_type/1_0.png


66it [00:07,  9.50it/s]

wanted: 20.0, got: 20.0 | ok | data/2_type/20_0.png
wanted: 21.0, got: 21.2 | err | data/2_type/21_0.png


68it [00:07,  9.52it/s]

wanted: 22.0, got:  2.2 | err | data/2_type/22_0.png
wanted: 23.0, got: 23.0 | ok | data/2_type/23_0.png


70it [00:07,  9.52it/s]

wanted: 24.0, got:  0.2 | err | data/2_type/24_0.png
wanted: 25.0, got: 25.0 | ok | data/2_type/25_0.png


72it [00:07,  9.22it/s]

wanted: 26.0, got: 32.0 | err | data/2_type/26_0.png
wanted: 27.0, got: 27.2 | err | data/2_type/27_0.png


74it [00:08,  9.46it/s]

wanted: 28.0, got: 28.0 | ok | data/2_type/28_0.png
wanted: 29.0, got: 23.0 | err | data/2_type/29_0.png


76it [00:08,  9.58it/s]

wanted:  2.0, got:  2.0 | ok | data/2_type/2_0.png
wanted: 30.0, got: 30.0 | ok | data/2_type/30_0.png


78it [00:08,  9.60it/s]

wanted: 31.0, got: 31.3 | err | data/2_type/31_0.png
wanted: 32.0, got: 23.0 | err | data/2_type/32_0.png


80it [00:08,  9.58it/s]

wanted: 33.0, got: 33.0 | ok | data/2_type/33_0.png
wanted: 34.0, got: 34.0 | ok | data/2_type/34_0.png


82it [00:08,  9.53it/s]

wanted: 35.0, got:  0.0 | err | data/2_type/35_0.png
wanted: 36.0, got: 30.0 | err | data/2_type/36_0.png


84it [00:09,  9.58it/s]

wanted: 37.0, got:  0.3 | err | data/2_type/37_0.png
wanted: 38.0, got: 30.0 | err | data/2_type/38_0.png


86it [00:09,  9.59it/s]

wanted: 39.0, got: 38.0 | err | data/2_type/39_0.png
wanted:  3.0, got:  3.0 | ok | data/2_type/3_0.png


88it [00:09,  9.61it/s]

wanted: 40.0, got: 40.0 | ok | data/2_type/40_0.png
wanted: 41.0, got: 41.0 | ok | data/2_type/41_0.png


90it [00:09,  9.63it/s]

wanted: 42.0, got: 72.0 | err | data/2_type/42_0.png
wanted: 43.0, got: 43.0 | ok | data/2_type/43_0.png


92it [00:09,  9.65it/s]

wanted: 44.0, got: 44.0 | ok | data/2_type/44_0.png
wanted: 45.0, got: 45.0 | ok | data/2_type/45_0.png


94it [00:10,  9.34it/s]

wanted: 46.0, got: 45.6 | err | data/2_type/46_0.png
wanted: 47.0, got: 47.0 | ok | data/2_type/47_0.png


96it [00:10,  9.41it/s]

wanted: 49.0, got:  0.4 | err | data/2_type/49_0.png
wanted:  4.0, got:  4.0 | ok | data/2_type/4_0.png


98it [00:10,  9.45it/s]

wanted: 50.0, got: 50.0 | ok | data/2_type/50_0.png
wanted: 51.0, got: 51.0 | ok | data/2_type/51_0.png


100it [00:10,  9.51it/s]

wanted: 52.0, got:  2.3 | err | data/2_type/52_0.png
wanted: 53.0, got: 53.0 | ok | data/2_type/53_0.png


102it [00:10,  9.56it/s]

wanted: 54.0, got: 54.0 | ok | data/2_type/54_0.png
wanted: 55.0, got:  0.5 | err | data/2_type/55_0.png


104it [00:11,  9.27it/s]

wanted: 56.0, got:  0.5 | err | data/2_type/56_0.png
wanted: 57.0, got: 52.7 | err | data/2_type/57_0.png


106it [00:11,  9.13it/s]

wanted: 58.0, got: 58.0 | ok | data/2_type/58_0.png
wanted: 59.0, got: 59.0 | ok | data/2_type/59_0.png


108it [00:11,  9.32it/s]

wanted:  5.0, got:  3.0 | err | data/2_type/5_0.png
wanted: 60.0, got: 60.0 | ok | data/2_type/60_0.png


110it [00:11,  9.38it/s]

wanted: 61.0, got: 61.0 | ok | data/2_type/61_0.png
wanted: 62.0, got:  0.6 | err | data/2_type/62_0.png


112it [00:12,  9.49it/s]

wanted: 63.0, got:  0.6 | err | data/2_type/63_0.png
wanted:  6.0, got:  6.0 | ok | data/2_type/6_0.png


114it [00:12,  9.54it/s]

wanted:  7.0, got:  7.0 | ok | data/2_type/7_0.png
wanted:  8.0, got:  8.0 | ok | data/2_type/8_0.png


116it [00:12,  9.60it/s]

wanted:  9.0, got:  9.0 | ok | data/2_type/9_0.png
wanted:  0.0, got:  0.0 | ok | data/3_type/0_0(2).png


118it [00:12,  9.64it/s]

wanted:  0.0, got:  0.0 | ok | data/3_type/0_0.png
wanted: 10.0, got:  1.0 | err | data/3_type/10_0.png


120it [00:12,  9.64it/s]

wanted: 11.0, got:  1.1 | err | data/3_type/11_0 (2).png
wanted: 11.0, got:  1.1 | err | data/3_type/11_0.png


122it [00:13,  9.59it/s]

wanted: 12.0, got:  1.2 | err | data/3_type/12_0.png
wanted: 13.0, got:  1.3 | err | data/3_type/13_0.png


124it [00:13,  9.54it/s]

wanted: 14.0, got:  1.4 | err | data/3_type/14_0.png
wanted: 15.0, got:  1.5 | err | data/3_type/15_0.png


126it [00:13,  9.49it/s]

wanted: 16.0, got:  1.6 | err | data/3_type/16_0.png
wanted: 17.0, got:  1.7 | err | data/3_type/17_0.png


128it [00:13,  9.48it/s]

wanted: 18.0, got:  1.8 | err | data/3_type/18_0.png
wanted: 19.0, got:  1.9 | err | data/3_type/19_0.png


130it [00:13,  9.45it/s]

wanted:  1.0, got:  0.1 | err | data/3_type/1_0.png
wanted: 20.0, got:  2.0 | err | data/3_type/20_0.png


132it [00:14,  9.53it/s]

wanted: 21.0, got:  2.1 | err | data/3_type/21_0.png
wanted: 22.0, got:  2.2 | err | data/3_type/22_0.png


134it [00:14,  9.55it/s]

wanted: 23.0, got:  2.3 | err | data/3_type/23_0.png
wanted: 24.0, got:  2.4 | err | data/3_type/24_0.png


136it [00:14,  9.56it/s]

wanted: 25.0, got:  2.5 | err | data/3_type/25_0.png
wanted: 26.0, got:  2.6 | err | data/3_type/26_0.png


138it [00:14,  9.59it/s]

wanted: 27.0, got:  2.7 | err | data/3_type/27_0.png
wanted: 28.0, got:  2.8 | err | data/3_type/28_0.png


140it [00:14,  9.58it/s]

wanted: 29.0, got:  2.9 | err | data/3_type/29_0.png
wanted:  2.0, got:  0.2 | err | data/3_type/2_0.png


142it [00:15,  9.58it/s]

wanted: 30.0, got:  3.0 | err | data/3_type/30_0.png
wanted: 31.0, got:  3.1 | err | data/3_type/31_0.png


144it [00:15,  9.56it/s]

wanted: 32.0, got:  3.2 | err | data/3_type/32_0 (2).png
wanted: 32.0, got:  3.2 | err | data/3_type/32_0.png


146it [00:15,  9.53it/s]

wanted: 33.0, got:  3.3 | err | data/3_type/33_0.png
wanted: 34.0, got:  3.4 | err | data/3_type/34_0.png


148it [00:15,  9.54it/s]

wanted: 35.0, got:  3.5 | err | data/3_type/35_0 (2).png
wanted: 35.0, got:  3.5 | err | data/3_type/35_0.png


150it [00:16,  9.59it/s]

wanted: 36.0, got:  3.6 | err | data/3_type/36_0.png
wanted: 37.0, got:  3.7 | err | data/3_type/37_0.png


152it [00:16,  9.56it/s]

wanted: 38.0, got:  3.8 | err | data/3_type/38_0.png
wanted: 39.0, got:  3.9 | err | data/3_type/39_0.png


154it [00:16,  9.57it/s]

wanted:  3.0, got:  0.3 | err | data/3_type/3_0.png
wanted: 40.0, got:  4.0 | err | data/3_type/40_0.png


156it [00:16,  9.55it/s]

wanted: 41.0, got:  4.1 | err | data/3_type/41_0.png
wanted: 42.0, got:  4.2 | err | data/3_type/42_0.png


158it [00:16,  9.53it/s]

wanted: 43.0, got:  4.3 | err | data/3_type/43_0.png
wanted: 44.0, got:  4.4 | err | data/3_type/44_0.png


160it [00:17,  9.45it/s]

wanted: 45.0, got:  4.5 | err | data/3_type/45_0.png
wanted: 46.0, got:  4.6 | err | data/3_type/46_0.png


162it [00:17,  9.53it/s]

wanted: 47.0, got:  4.7 | err | data/3_type/47_0.png
wanted: 48.0, got:  4.8 | err | data/3_type/48_0.png


164it [00:17,  9.43it/s]

wanted: 49.0, got:  4.9 | err | data/3_type/49_0.png
wanted:  4.0, got:  0.4 | err | data/3_type/4_0.png


166it [00:17,  9.40it/s]

wanted: 50.0, got:  5.0 | err | data/3_type/50_0.png
wanted: 51.0, got:  5.1 | err | data/3_type/51_0.png


168it [00:17,  9.45it/s]

wanted: 52.0, got:  5.2 | err | data/3_type/52_0.png
wanted: 53.0, got:  5.3 | err | data/3_type/53_0.png


170it [00:18,  9.47it/s]

wanted: 54.0, got:  5.4 | err | data/3_type/54_0.png
wanted: 55.0, got:  5.5 | err | data/3_type/55_0.png


172it [00:18,  9.49it/s]

wanted: 56.0, got:  5.6 | err | data/3_type/56_0.png
wanted: 57.0, got:  5.7 | err | data/3_type/57_0.png


174it [00:18,  9.48it/s]

wanted: 58.0, got:  5.8 | err | data/3_type/58_0.png
wanted: 59.0, got:  5.9 | err | data/3_type/59_0.png


176it [00:18,  9.47it/s]

wanted:  5.0, got:  0.5 | err | data/3_type/5_0.png
wanted: 60.0, got:  6.0 | err | data/3_type/60_0.png


178it [00:18,  9.44it/s]

wanted: 61.0, got:  6.1 | err | data/3_type/61_0.png
wanted: 62.0, got:  6.2 | err | data/3_type/62_0.png


180it [00:19,  9.35it/s]

wanted: 63.0, got:  6.3 | err | data/3_type/63_0.png
wanted: 64.0, got:  6.4 | err | data/3_type/64_0.png


182it [00:19,  9.43it/s]

wanted: 65.0, got:  6.5 | err | data/3_type/65_0.png
wanted: 66.0, got:  6.6 | err | data/3_type/66_0.png


184it [00:19,  9.45it/s]

wanted: 67.0, got:  6.7 | err | data/3_type/67_0.png
wanted: 68.0, got:  6.8 | err | data/3_type/68_0.png


186it [00:19,  9.53it/s]

wanted: 69.0, got:  6.9 | err | data/3_type/69_0.png
wanted:  6.0, got:  0.6 | err | data/3_type/6_0.png


188it [00:20,  9.51it/s]

wanted: 70.0, got:  7.0 | err | data/3_type/70_0.png
wanted: 71.0, got:  7.1 | err | data/3_type/71_0.png


190it [00:20,  9.48it/s]

wanted: 72.0, got:  7.2 | err | data/3_type/72_0.png
wanted: 73.0, got:  7.3 | err | data/3_type/73_0.png


192it [00:20,  9.44it/s]

wanted: 74.0, got:  7.4 | err | data/3_type/74_0.png
wanted: 75.0, got:  7.5 | err | data/3_type/75_0.png


194it [00:20,  9.40it/s]

wanted: 76.0, got:  7.6 | err | data/3_type/76_0.png
wanted: 77.0, got:  7.7 | err | data/3_type/77_0.png


196it [00:20,  9.43it/s]

wanted: 78.0, got:  7.8 | err | data/3_type/78_0.png
wanted: 79.0, got:  7.9 | err | data/3_type/79_0(2).png


198it [00:21,  9.48it/s]

wanted: 79.0, got:  7.9 | err | data/3_type/79_0.png
wanted:  7.0, got:  0.7 | err | data/3_type/7_0.png


200it [00:21,  9.46it/s]

wanted: 80.0, got:  8.0 | err | data/3_type/80_0.png
wanted: 81.0, got:  8.1 | err | data/3_type/81_0.png


202it [00:21,  9.43it/s]

wanted: 82.0, got:  8.2 | err | data/3_type/82_0.png
wanted: 83.0, got:  8.3 | err | data/3_type/83_0.png


204it [00:21,  9.42it/s]

wanted: 84.0, got:  8.4 | err | data/3_type/84_0.png
wanted: 85.0, got:  8.5 | err | data/3_type/85_0.png


206it [00:21,  9.44it/s]

wanted: 86.0, got:  8.6 | err | data/3_type/86_0.png
wanted: 87.0, got:  8.7 | err | data/3_type/87_0.png


208it [00:22,  9.34it/s]

wanted: 88.0, got:  8.8 | err | data/3_type/88_0.png
wanted: 89.0, got:  8.9 | err | data/3_type/89_0.png


210it [00:22,  9.38it/s]

wanted:  8.0, got:  0.8 | err | data/3_type/8_0.png
wanted: 90.0, got:  9.0 | err | data/3_type/90_0.png


212it [00:22,  9.39it/s]

wanted: 91.0, got:  9.1 | err | data/3_type/91_0.png
wanted: 92.0, got:  9.2 | err | data/3_type/92_0.png


214it [00:22,  9.38it/s]

wanted: 93.0, got:  9.3 | err | data/3_type/93_0.png
wanted: 94.0, got:  9.4 | err | data/3_type/94_0.png


216it [00:22,  9.41it/s]

wanted: 95.0, got:  9.5 | err | data/3_type/95_0.png
wanted: 96.0, got:  9.6 | err | data/3_type/96_0.png


218it [00:23,  9.41it/s]

wanted: 97.0, got:  9.7 | err | data/3_type/97_0.png
wanted: 98.0, got:  9.8 | err | data/3_type/98_0.png


220it [00:23,  9.39it/s]

wanted: 99.0, got:  9.9 | err | data/3_type/99_0.png
wanted:  9.0, got:  0.9 | err | data/3_type/9_0.png


222it [00:23,  9.45it/s]

wanted:  0.0, got:  0.0 | ok | data/4_type/0_0(2).png
wanted:  0.0, got:  0.0 | ok | data/4_type/0_0(3).png


224it [00:23,  9.47it/s]

wanted:  0.0, got:  0.0 | ok | data/4_type/0_0.png
wanted: 10.0, got: 10.0 | ok | data/4_type/10_0.png


226it [00:24,  9.50it/s]

wanted: 11.0, got:  1.0 | err | data/4_type/11_0.png
wanted: 12.0, got: 72.0 | err | data/4_type/12_0.png


228it [00:24,  9.44it/s]

wanted: 13.0, got: 13.0 | ok | data/4_type/13_0.png
wanted: 14.0, got:  0.4 | err | data/4_type/14_0.png


230it [00:24,  9.37it/s]

wanted: 16.0, got:  0.4 | err | data/4_type/16_0.png
wanted: 17.0, got:  1.7 | err | data/4_type/17_0.png


232it [00:24,  9.39it/s]

wanted: 18.0, got:  7.0 | err | data/4_type/18_0.png
wanted: 19.0, got:  0.4 | err | data/4_type/19_0.png


234it [00:24,  9.41it/s]

wanted:  1.0, got:  1.0 | ok | data/4_type/1_0.png
wanted: 21.0, got: 21.0 | ok | data/4_type/21_0.png


235it [00:25,  9.40it/s]

wanted: 22.0, got:  2.2 | err | data/4_type/22_0.png
err %: 65.53191489361701





12.5
