In [3]:
import cv2
import pytesseract
from PIL import Image

# Загружаем изображение
image_path = "../resources/images/4.jpg"
image = cv2.imread(image_path)

# (Необязательно) преобразуем в оттенки серого
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# (Необязательно) применим порог для лучшего контраста
_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Преобразуем в формат, совместимый с PIL (pytesseract принимает его)
pil_img = Image.fromarray(thresh)

# Задаем конфигурацию: PSM 6 — для табличного распознавания
custom_config = r'--oem 3 --psm 6'

# Распознаем текст
text = pytesseract.image_to_string(pil_img, config=custom_config)

print("Распознанный текст:")
print(text)


Распознанный текст:
ID Name Age City

[1 [ace | 20 | newvrk_|
[3 [enaie | 35 | tos anges
[a [ore [26 | crag |



In [4]:
import cv2
import pytesseract
from PIL import Image
import pandas as pd

# Загружаем изображение
image_path = "../resources/images/4.jpg"
image = cv2.imread(image_path)

# Преобразуем в оттенки серого и применим порог
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Получаем данные с координацией
custom_config = r'--oem 3 --psm 6'
data = pytesseract.image_to_data(thresh, config=custom_config, output_type=pytesseract.Output.DATAFRAME)

# Удаляем пустые строки
data = data.dropna(subset=["text"])
data = data[data["text"].str.strip() != ""]

# Рисуем прямоугольники и текст
for i, row in data.iterrows():
    x, y, w, h = row["left"], row["top"], row["width"], row["height"]
    text = row["text"]
    conf = row["conf"]
    
    if int(conf) > 50:  # Отсекаем по порогу уверенности
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)

# Сохраняем или отображаем результат
cv2.imwrite("ocr_result.jpg", image)
# или использовать cv2.imshow("OCR", image) и cv2.waitKey(0) на локальной машине


True

In [2]:
! pip install pytesseract pillow opencv-python

Collecting pytesseract
  Obtaining dependency information for pytesseract from https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl.metadata
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [33]:
def extract_cells_and_ocr(image_path: str,
                          min_cell_width: int = 20,
                          min_cell_height: int = 20,
                          lang: str = 'eng') -> pd.DataFrame:
    # Загрузка изображения
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Препроцессинг изображения
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Детекция вертикальных и горизонтальных линий
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 30))
    vertical_lines = cv2.erode(binary, vertical_kernel, iterations=1)
    vertical_lines = cv2.dilate(vertical_lines, vertical_kernel, iterations=1)

    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 1))
    horizontal_lines = cv2.erode(binary, horizontal_kernel, iterations=1)
    horizontal_lines = cv2.dilate(horizontal_lines, horizontal_kernel, iterations=1)

    # Объединение линий
    table_structure = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)

    # Поиск контуров
    contours, _ = cv2.findContours(table_structure, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    cells = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > min_cell_width and h > min_cell_height:
            cells.append((x, y, w, h))

    # Фильтрация слишком больших рамок (внешняя граница таблицы)
    image_area = img.shape[0] * img.shape[1]
    cells = [cell for cell in cells if cell[2] * cell[3] < 0.5 * image_area]

    # Сортировка ячеек
    cells = sorted(cells, key=lambda b: (b[1], b[0]))

    # Группировка по строкам
    rows = []
    current_row = []
    last_y = -100
    tolerance = 10
    for cell in cells:
        x, y, w, h = cell
        if abs(y - last_y) > tolerance:
            if current_row:
                rows.append(sorted(current_row, key=lambda b: b[0]))
            current_row = [cell]
            last_y = y
        else:
            current_row.append(cell)
    if current_row:
        rows.append(sorted(current_row, key=lambda b: b[0]))

    # OCR по ячейкам
    data = []
    for row in rows:
        row_data = []
        for (x, y, w, h) in row:
            margin = 2
            roi = img[y+margin:y+h-margin, x+margin:x+w-margin]
            roi = cv2.resize(roi, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
            roi_gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
            _, roi_bin = cv2.threshold(roi_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            text = pytesseract.image_to_string(roi_bin, config='--psm 6', lang=lang).strip()
            row_data.append(text)
        data.append(row_data)

    return pd.DataFrame(data), cells

In [34]:
df, cells = extract_cells_and_ocr("../resources/images/4.jpg")

df

Unnamed: 0,0,1,2,3
0,ID Name Age City\n\n1] atce [30 [New York\n2\n...,,,
1,1,Alice,30.0,New York
2,2,Bob,25.0,San Francisco
3,3,Charlie,35.0,Los Angeles
4,4,Diana,28.0,Chicago


In [35]:
cells

[(184, 65, 288, 167),
 (185, 111, 36, 29),
 (222, 111, 71, 29),
 (294, 111, 50, 29),
 (345, 111, 125, 29),
 (185, 141, 36, 28),
 (222, 141, 71, 28),
 (294, 141, 50, 28),
 (345, 141, 125, 28),
 (185, 170, 36, 29),
 (222, 170, 71, 29),
 (294, 170, 50, 29),
 (345, 170, 125, 29),
 (185, 200, 36, 29),
 (222, 200, 71, 29),
 (294, 200, 50, 29),
 (345, 200, 125, 29)]

In [36]:
def draw_cell_boundaries(image_path: str,
                          cells: list[tuple[int, int, int, int]],
                          output_path: str = "/mnt/data/cells_detected.jpg") -> str:
    img = cv2.imread(image_path)
    for (x, y, w, h) in cells:
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.imwrite(output_path, img)

    return output_path

In [29]:
draw_cell_boundaries("../resources/images/4.jpg", [cells[0]], 'test.jpg')

'test.jpg'