In [276]:
import pytesseract
from wordfreq import top_n_list, word_frequency
import math
from PIL import Image, ImageDraw, ImageOps, ImageFilter
import cv2
import numpy as np

In [321]:
# OCR the sample image with the Russian language data and print the raw text.
# `config` carries Tesseract's own --oem / --psm flags (see the Tesseract docs
# for what the values select). The recorded output of this cell shows words run
# together ("еслинеможешь"), which is what the segmentation helpers are for.
text = pytesseract.image_to_string('test.jpg', lang="rus", config="--oem 1 --psm 11")
print(text)

еслинеможешь на что-то повлиять

романтизируй это



In [163]:
# Vocabulary used by segment() for membership tests: the 170000 top entries
# that wordfreq's top_n_list returns for 'ru', stored as a set for fast lookup.
russian_words = set(top_n_list('ru', 170000))

# segment() rejects dictionary hits shorter than this many characters ...
MIN_WORD_LEN = 3 
# ... unless the short word is explicitly allowed here.
EXCEPTION_WORDS = {"я", "и", "в", "не", "на", "с"} 


def segment(text, max_word_len=25):
    """Split a run of glued-together text into dictionary words.

    Dynamic programming over prefixes: for every prefix ``text[:i]`` keep the
    segmentation with the highest sum of log word frequencies, then walk the
    back-pointers from the end of the string.

    Args:
        text: String without whitespace (e.g. one OCR token).
        max_word_len: Longest candidate word, in characters, that is tried at
            each position. Defaults to 25.

    Returns:
        List of substrings of ``text`` (original casing preserved) whose
        concatenation equals ``text``. When no full segmentation into known
        words exists, the whole ``text`` is returned as a single item. An
        empty ``text`` yields an empty list.
    """
    n = len(text)
    # best[i]: best log-probability of any segmentation of text[:i].
    best = [-math.inf] * (n + 1)
    # prev[i]: start index of the last word in that best segmentation.
    prev = [0] * (n + 1)
    best[0] = 0.0

    for i in range(1, n + 1):
        for j in range(max(0, i - max_word_len), i):
            if best[j] == -math.inf:
                # text[:j] cannot be segmented, so nothing can extend it.
                continue
            word = text[j:i]
            # Look the candidate up in lower case so capitalised OCR output
            # ("Если") still matches the vocabulary; only the slice is
            # lowered, so indices into `text` stay valid.
            key = word.lower()
            if key not in russian_words:
                continue
            if len(word) < MIN_WORD_LEN and key not in EXCEPTION_WORDS:
                continue
            # Words missing from the frequency list get a tiny floor so that
            # log() stays defined.
            freq = word_frequency(key, 'ru', wordlist='large') or 1e-9
            score = best[j] + math.log(freq)
            if score > best[i]:
                best[i] = score
                prev[i] = j

    if n and best[n] == -math.inf:
        # No chain of known words covers the whole string: keep it intact.
        return [text]

    words = []
    i = n
    while i > 0:
        j = prev[i]
        words.append(text[j:i])
        i = j
    words.reverse()
    return words

def segment_with_hyphen(text):
    """Segment a token that may contain hyphens.

    The token is cut at every '-' character. Each non-empty piece between
    hyphens is passed to ``segment``; every hyphen is emitted as its own
    '-' item, in its original position.

    Args:
        text: A single token without whitespace.

    Returns:
        Flat list of words and '-' markers, left to right.
    """
    tokens = []
    for position, chunk in enumerate(text.split('-')):
        # Every chunk after the first was preceded by a hyphen in the input.
        if position > 0:
            tokens.append('-')
        # Empty chunks come from leading, trailing, or doubled hyphens.
        if chunk:
            tokens.extend(segment(chunk))
    return tokens

def segment_lines(text):
    """Segment every whitespace-separated token of a (multi-line) string.

    Each token is run through ``segment_with_hyphen`` and the hyphens it
    reports are glued back onto their neighbours *within that token only*, so
    "что-то" stays "что-то" while a free-standing dash ("жизнь - это") is not
    fused with the surrounding words. Apart from the spaces inserted between
    segmented words, the characters of each token are preserved.

    Args:
        text: Text that may span several lines.

    Returns:
        All resulting words joined by single spaces (line breaks are not
        kept). Blank input yields an empty string.
    """
    final_words = []
    for line in text.splitlines():
        # split() with no arguments also discards leading/trailing blanks and
        # yields nothing for an empty line.
        for token in line.split():
            glued = []
            join_next = False  # True right after a hyphen was attached
            for piece in segment_with_hyphen(token):
                if piece == '-':
                    if glued:
                        glued[-1] += '-'
                    else:
                        # Token starts with a hyphen: keep it as a prefix.
                        glued.append('-')
                    join_next = True
                elif join_next:
                    glued[-1] += piece
                    join_next = False
                else:
                    glued.append(piece)
            final_words.extend(glued)
    return " ".join(final_words)

In [267]:
def get_boxes(img):
    """Run Tesseract on ``img`` and return one bounding box per recognised word.

    Args:
        img: Image passed straight to ``pytesseract.image_to_data``.

    Returns:
        List of dicts ``{"coords": [x0, y0, x1, y1], "text": word}`` where
        (x0, y0) is the reported left/top and (x1, y1) is left+width /
        top+height. Entries whose text is empty or whitespace are dropped.
    """
    ocr = pytesseract.image_to_data(img, lang="rus", output_type=pytesseract.Output.DICT)
    # The DICT output is column-oriented; zip walks the columns row by row.
    rows = zip(ocr['text'], ocr['left'], ocr['top'], ocr['width'], ocr['height'])
    return [
        {"coords": [left, top, left + width, top + height], "text": token}
        for token, left, top, width, height in rows
        if token.strip() != ""
    ]

def merge_boxes_with_text(boxes, threshold, line_tolerance=10):
    """Group nearby word boxes into text blocks and rebuild each block's text.

    Two boxes belong together when their rectangles, grown by ``threshold``
    pixels, touch or overlap. A block keeps absorbing boxes until none of the
    remaining ones is close to its (growing) bounding rectangle.

    Inside a block, words are grouped into lines by their top coordinate and
    ordered left to right; every word is passed through ``segment_lines`` and
    the lines are joined with newlines.

    Args:
        boxes: List of ``{"coords": [x0, y0, x1, y1], "text": str}`` dicts.
            The list itself is not modified.
        threshold: Maximum gap between rectangles that still counts as
            "nearby".
        line_tolerance: Maximum difference between a word's top and the top of
            the first word of the current line for the word to stay on that
            line. Defaults to 10.

    Returns:
        List of ``{"coords": [x0, y0, x1, y1], "text": str}`` dicts, one per
        merged block, where ``coords`` is the union rectangle.
    """
    pending = list(boxes)  # work on a copy so the caller's list survives
    merged = []

    while pending:
        base = pending.pop(0)
        x0, y0, x1, y1 = base["coords"]
        words_in_block = [base]

        i = 0
        while i < len(pending):
            bx0, by0, bx1, by1 = pending[i]["coords"]
            too_far = (bx1 + threshold < x0 or bx0 - threshold > x1 or
                       by1 + threshold < y0 or by0 - threshold > y1)
            if too_far:
                i += 1
                continue
            x0, y0, x1, y1 = min(x0, bx0), min(y0, by0), max(x1, bx1), max(y1, by1)
            words_in_block.append(pending.pop(i))
            # The rectangle just grew, so boxes skipped earlier may now be
            # within reach: start the scan over.
            i = 0

        # Top-to-bottom pass to cut the block into lines.
        words_in_block.sort(key=lambda w: (w["coords"][1], w["coords"][0]))
        rows = []
        row_top = None
        for w in words_in_block:
            top = w["coords"][1]
            if row_top is None or abs(top - row_top) > line_tolerance:
                rows.append([])
                row_top = top
            rows[-1].append(w)

        lines = []
        for row in rows:
            # Tops inside one line differ slightly, so the sort above does not
            # guarantee reading order; order each line by its left edge.
            row.sort(key=lambda w: w["coords"][0])
            # Every line, including the last one, gets segmented.
            lines.append(" ".join(segment_lines(w["text"]) for w in row))

        merged.append({"coords": [x0, y0, x1, y1], "text": "\n".join(lines)})

    return merged

def get_coords_and_text(img, threshold=500):
    """OCR ``img``, outline each merged text block, show the image, print the blocks.

    Args:
        img: PIL image. The red outlines are drawn onto this image object
            itself (``ImageDraw.Draw(img)``), after the OCR step has run.
        threshold: Distance passed to ``merge_boxes_with_text`` when grouping
            word boxes into blocks. Defaults to 500.

    Returns:
        None. For every block the coordinates, the text and a ``---``
        separator are printed.
    """
    merged_blocks = merge_boxes_with_text(get_boxes(img), threshold=threshold)

    draw = ImageDraw.Draw(img)
    for block in merged_blocks:
        x0, y0, x1, y1 = block["coords"]
        draw.rectangle([x0, y0, x1, y1], outline="red", width=2)

    img.show()

    for block in merged_blocks:
        print(block["coords"])
        print(block["text"])
        print("---")

In [322]:
# Open the sample image in a context manager so its file handle is released
# once the blocks have been detected, drawn and printed.
with Image.open('test.jpg') as img:
    get_coords_and_text(img)

[53, 345, 920, 428]
если не можешь на что-то повлиять
романтизируй это
---
