In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.111-py3-none-any.whl.metadata (37 kB)
Collecting py-cpuinfo (from ultralytics)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.111-py3-none-any.whl (978 kB)
   ---------------------------------------- 0.0/978.8 kB ? eta -:--:--
   --------------------------------------- 978.8/978.8 kB 44.8 MB/s eta 0:00:00
Downloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo, ultralytics-thop, ultralytics
Successfully installed py-cpuinfo-9.0.0 ultralytics-8.3.111 ultralytics-thop-2.0.14


In [2]:
from ultralytics import YOLO
from PIL import Image
import torch
from torchvision import transforms
from torchvision import models
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [3]:
yolo_model = YOLO("best.pt") 

In [4]:
mobilenet = models.mobilenet_v2(pretrained=False)
mobilenet.classifier[1] = torch.nn.Linear(mobilenet.last_channel, 26)  

state_dict = torch.load("mobilenet_v2_best.pt", map_location="cpu")
mobilenet.load_state_dict(state_dict)

mobilenet.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mobilenet.to(device)



MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=

In [5]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [6]:
label_to_char = {i: chr(97 + i) for i in range(26)}  # {0: 'a', 1: 'b', ..., 25: 'z'}

In [7]:
image_path = "1.jpg"
image = Image.open(image_path).convert("RGB")
results = yolo_model(image_path)


image 1/1 C:\Users\Yibo Sun\DSBrailleProject\1.jpg: 640x480 1 Y, 198.7ms
Speed: 9.2ms preprocess, 198.7ms inference, 333.5ms postprocess per image at shape (1, 3, 640, 480)


In [8]:
boxes = results[0].boxes.xyxy.cpu().tolist()

In [9]:
boxes.sort(key=lambda b: b[0])

In [10]:
output_text = ""

In [11]:
for i, (x1, y1, x2, y2) in enumerate(boxes):
    cropped = image.crop((int(x1), int(y1), int(x2), int(y2)))
    input_tensor = transform(cropped).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = mobilenet(input_tensor)
        pred_label = torch.argmax(logits, dim=1).item()
        character = label_to_char.get(pred_label, '?')

    output_text += character