In [None]:
# Scratch cell inserted to verify a fix: it only prints a marker string and
# defines nothing that the code below reads.
print("Verification cell")

In [3]:
import torch
import cv2
import yaml
import numpy as np
from addict import Dict
from pathlib import Path
from segmentation.models import build_model
from segmentation.post_processing import get_post_processing
import matplotlib.pyplot as plt

class SimpleTextPreprocessor:
    """Convert a BGR image into a binarised tensor for the detection model.

    Pipeline: resize so the shorter side is about ``short_size`` (both sides
    rounded up to a multiple of 32), convert to grayscale, apply a 5x5
    Gaussian blur, binarise with a Gaussian adaptive threshold, then expand
    the result back to three channels.
    """

    def __init__(self):
        """Nothing is stored on the instance; the preprocessor is stateless."""

    def preprocess(self, img_bgr, short_size=736, img_name=None):
        """Resize and binarise ``img_bgr`` and pack it as a float tensor.

        Args:
            img_bgr: Image array whose first two shape entries are height and
                width; it is handed to OpenCV as a BGR image.
            short_size: Target length of the shorter side before rounding.
            img_name: Accepted for call compatibility; not used here.

        Returns:
            A tuple ``(img_tensor, binary_bgr, (new_h, new_w))`` where
            ``img_tensor`` is the binarised image as a float tensor with a
            leading batch axis and channels first, divided by 255,
            ``binary_bgr`` is the same binarised image as a 3-channel array,
            and ``(new_h, new_w)`` is the size the image was resized to.
        """
        src_h, src_w = img_bgr.shape[:2]
        ratio = short_size / min(src_h, src_w)

        # Scale each side, then round it up to the next multiple of 32
        # (presumably to match the network's total stride -- TODO confirm).
        new_h, new_w = (
            (int(side * ratio) + 31) // 32 * 32 for side in (src_h, src_w)
        )

        # cv2.resize takes the target size as (width, height).
        resized = cv2.resize(img_bgr, (new_w, new_h))
        smoothed = cv2.GaussianBlur(
            cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY), (5, 5), 0
        )
        binary = cv2.adaptiveThreshold(
            smoothed,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=15,
            C=10,
        )
        binary_bgr = cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

        # HWC -> CHW, add the batch axis, then scale 0..255 down to 0..1.
        channels_first = torch.from_numpy(binary_bgr).permute(2, 0, 1)
        img_tensor = channels_first.unsqueeze(0).float() / 255.0

        return img_tensor, binary_bgr, (new_h, new_w)

# Single shared preprocessor instance used by the functional wrapper below.
preprocessor = SimpleTextPreprocessor()


def preprocess_intelligent(img_bgr, short_size=736, img_name=None):
    """Function-style entry point that delegates to the shared preprocessor.

    Args:
        img_bgr: Image array forwarded unchanged to ``preprocessor.preprocess``.
        short_size: Target length of the shorter side, forwarded unchanged.
        img_name: Optional image name, forwarded unchanged.

    Returns:
        Whatever ``preprocessor.preprocess`` returns for these arguments.
    """
    return preprocessor.preprocess(
        img_bgr, short_size=short_size, img_name=img_name
    )

def visualize_detection(img, boxes, scores, conf_threshold=0.5):
    """Draw detections on a copy of ``img`` and return the copy.

    Args:
        img: Image array; it is copied, so the input is left untouched.
        boxes: Sequence of boxes, each a sequence of ``(x, y)`` points.
        scores: Sequence of confidences, paired with ``boxes`` by position.
        conf_threshold: Boxes whose score is below this value are skipped.

    Returns:
        The annotated copy, with a green outline and a two-decimal score
        label just above the first point of every box that was kept.
    """
    canvas = img.copy()
    green = (0, 255, 0)

    # `not (s < t)` rather than `s >= t` so the skip condition stays exactly
    # "score is below the threshold".
    kept = (
        (quad, conf)
        for quad, conf in zip(boxes, scores)
        if not (conf < conf_threshold)
    )

    for quad, conf in kept:
        outline = np.array(quad, dtype=np.int32).reshape(-1, 1, 2)
        cv2.polylines(canvas, [outline], True, green, 2)

        label_x = int(quad[0][0])
        label_y = int(quad[0][1])
        cv2.putText(canvas, f'{conf:.2f}', (label_x, label_y - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, green, 1)

    return canvas

# --- Text detection: load config and model, run one image, save the boxes ---

# Read the YAML experiment config and wrap it in an addict Dict.
with open("D:/pbl6_be_v2/app/config/icdar2015_resnet18_FPN_DBhead_polyLR.yaml", "r") as f:
    cfg = Dict(yaml.safe_load(f))

# Default the backbone to 3 input channels when the config does not set it.
if "in_channels" not in cfg["arch"]["backbone"]:
    cfg["arch"]["backbone"]["in_channels"] = 3

model = build_model(cfg["arch"])

# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
checkpoint = torch.load(
    "D:/pbl6_be_v2/app/weights/model_best.pth",
    map_location="cpu"
)
model.load_state_dict(checkpoint["state_dict"])
model.eval()

post_process = get_post_processing(cfg["post_processing"])

# `img_path` and `img_bgr` are also read by main() in the OCR cell.
img_path = "D:/pbl6_be_v2/test_jpg.jpg"
img_bgr = cv2.imread(img_path)

# cv2.imread signals failure by returning None instead of raising.
if img_bgr is None:
    raise FileNotFoundError(f"Không đọc được ảnh: {img_path}")

orig_h, orig_w = img_bgr.shape[:2]

# `preprocessed_bgr`, `target_h` and `target_w` are unpacked here but not used
# again in this cell.
img_tensor, preprocessed_bgr, (target_h, target_w) = preprocess_intelligent(
    img_bgr, 736, Path(img_path).stem
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    preds = model(img_tensor)

# The original image size is passed along with the predictions; presumably
# post_process uses it to map boxes back to original-image coordinates (they
# are drawn on a copy of the original image below) -- TODO confirm.
batch = {"shape": [(orig_h, orig_w)]}
boxes, scores = post_process(batch, preds, is_output_polygon=False)

print(f"Số box detect được: {len(boxes[0])}")

# One line per box: "x,y" for every point, then the score with two decimals.
# NOTE(review): this path is relative to the kernel's working directory, while
# the OCR cell reads 'D:/pbl6_be_v2/result_detect.txt'; the two only refer to
# the same file when the working directory is D:/pbl6_be_v2.
# NOTE(review): every returned box is written without filtering. The recorded
# output shows 313 boxes here but 293 read back by the OCR cell, so some of
# them are presumably degenerate and dropped by that reader -- TODO confirm.
output_txt = "result_detect.txt"
with open(output_txt, "w", encoding="utf-8") as f:
    for i, box in enumerate(boxes[0]):
        score = scores[0][i] if scores is not None else 1.0
        coords = [f"{int(x)},{int(y)}" for (x, y) in box]
        f.write(f"{','.join(coords)},{score:.2f}\n")

print(f"Đã lưu kết quả detect vào {output_txt}")

# Outline every box on a copy of the original image, with no score threshold.
# The visualize_detection() helper defined above is not used here.
img_show = img_bgr.copy()
for i, box in enumerate(boxes[0]):
    pts = np.array(box, np.int32).reshape((-1, 1, 2))
    cv2.polylines(img_show, [pts], True, (0, 255, 0), 2)

cv2.imwrite("result_detect.jpg", img_show)
print("Đã lưu ảnh kết quả: result_detect.jpg")


load from imagenet
Số box detect được: 313
Đã lưu kết quả detect vào result_detect.txt
Đã lưu ảnh kết quả: result_detect.jpg


In [5]:
import cv2
from PIL import Image
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg
import numpy as np
import os
import math

def sort_boxes_reading_order(boxes, line_threshold=11.3):
    """Group axis-aligned boxes into text lines and order them for reading.

    Boxes are visited top to bottom (by vertical centre, then left edge).
    Each box joins the existing line whose mean vertical centre is closest,
    provided that distance is at most ``line_threshold``; otherwise it starts
    a new line. Lines are finally ordered by their mean top edge and the
    boxes inside each line by their left edge.

    Args:
        boxes: Iterable of boxes (list, tuple or numpy array). Each box
            provides at least ``x1, y1, x2, y2``; extra entries are ignored
            and shorter boxes are skipped. ``None`` is treated as empty.
        line_threshold: Largest allowed vertical distance between a box
            centre and a line's mean centre for the box to join that line.
            The default keeps the previously hard-coded value.

    Returns:
        A list of lines; each line is a list of ``[x1, y1, x2, y2]`` boxes.
        An empty list is returned when no usable box remains.
    """
    if boxes is None:
        return []

    # Keep only boxes at least 2 units wide and tall. The explicit x2 > x1 /
    # y2 > y1 checks come first so unsigned coordinates cannot wrap around in
    # the subtraction. All-zero placeholder boxes are rejected here too.
    clean_boxes = []
    for b in list(boxes):
        if len(b) < 4:
            continue
        x1, y1, x2, y2 = b[:4]
        if x2 > x1 and y2 > y1 and (x2 - x1) >= 2 and (y2 - y1) >= 2:
            clean_boxes.append([x1, y1, x2, y2])
    if not clean_boxes:
        return []

    # Reported for diagnostics only; it does not influence the grouping.
    median_h = np.median([b[3] - b[1] for b in clean_boxes])
    print(f"Median Box Height: {median_h:.2f}")

    boxes_with_cy = sorted(
        ((b, (b[1] + b[3]) / 2) for b in clean_boxes),
        key=lambda item: (item[1], item[0][0]),
    )

    lines = []
    # Running sum of vertical centres per line (parallel to `lines`), so a
    # line's mean centre costs O(1) instead of a fresh pass over its boxes
    # for every candidate box.
    cy_sums = []

    for box, cy in boxes_with_cy:
        best_line_idx = -1
        best_distance = line_threshold + 1

        for idx, cy_sum in enumerate(cy_sums):
            distance = abs(cy - cy_sum / len(lines[idx]))
            if distance <= line_threshold and distance < best_distance:
                best_distance = distance
                best_line_idx = idx

        if best_line_idx >= 0:
            lines[best_line_idx].append(box)
            cy_sums[best_line_idx] += cy
        else:
            lines.append([box])
            cy_sums.append(cy)

    # Top-to-bottom by mean top edge, then left-to-right inside each line.
    lines.sort(key=lambda line: np.mean([b[1] for b in line]))
    for line in lines:
        line.sort(key=lambda b: b[0])

    print(f"Đã gom thành {len(lines)} dòng")

    return lines

def enhance_image_for_ocr(image):
    """Return a denoised single-channel version of ``image``.

    Args:
        image: Image array. An array with three shape entries is converted
            from BGR to grayscale first; anything else is used as it is.

    Returns:
        The output of OpenCV's non-local-means denoiser on the grayscale
        image, called with filter strength 10, a template window of 7 and a
        search window of 21.
    """
    has_channel_axis = len(image.shape) == 3
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if has_channel_axis else image
    return cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

def postprocess_text(text):
    """Trim recognised text and replace a few look-alike characters.

    Args:
        text: Recognised string; any falsy value yields an empty string.

    Returns:
        The stripped text with ``|`` replaced by ``I``, ``[`` by ``(`` and
        ``]`` by ``)``.
    """
    if not text:
        return ""
    # One translation pass is equivalent to the chained replacements because
    # none of the substituted characters is itself a key of the table.
    glyph_fixes = str.maketrans({'|': 'I', '[': '(', ']': ')'})
    return text.strip().translate(glyph_fixes)

def main():
    """Recognise the text inside every detected box and save it line by line.

    Steps:
      1. Build the VietOCR predictor from the project config (CPU).
      2. Load the image named by the notebook global ``img_path``; if it
         cannot be read from disk, fall back to the global ``img_bgr``.
      3. Read the detection file, reduce every 4-point box to an
         axis-aligned rectangle clamped to the image, and drop empty ones.
      4. Group the rectangles into reading-order lines and write a debug
         image with one colour and index per line.
      5. OCR every crop, join the texts of a line with spaces, and write one
         line of text per detected line.

    Returns early (after printing a message) when the detection file does
    not exist.

    Raises:
        FileNotFoundError: The image cannot be read from disk and no usable
            ``img_bgr`` global is available.
    """
    config = Cfg.load_config_from_file('D:/pbl6_be_v2/app/config/myconfig.yml')
    config['weights'] = 'D:/pbl6_be_v2/app/weights/myModelOCR.pth'
    config['device'] = 'cpu'
    detector = Predictor(config)

    # `img_path` is a notebook global defined outside this function.
    image_path = img_path
    boxes_path = 'D:/pbl6_be_v2/result_detect.txt'

    img = cv2.imread(image_path)
    if img is None:
        print(f"Warning: Không tìm thấy ảnh {image_path}, thử dùng biến toàn cục nếu có.")
        # Only a missing global is an expected failure here; anything else
        # (including KeyboardInterrupt) must propagate.
        try:
            img = img_bgr
        except NameError:
            img = None
        if img is None:
            raise FileNotFoundError(f"Không tìm thấy ảnh: {image_path}")

    if not os.path.exists(boxes_path):
        print("Không tìm thấy file result_detect.txt")
        return

    H, W = img.shape[:2]
    boxes = []
    with open(boxes_path, "r", encoding="utf-8") as f:
        for line in f:
            # Each row is "x,y" for four points followed by a score; only the
            # first eight numbers are used.
            parts = line.replace(",", " ").split()
            if len(parts) < 8:
                continue
            try:
                coords = [float(p) for p in parts[:8]]
                xs, ys = coords[0::2], coords[1::2]
                # Axis-aligned bounding rectangle, clamped to the image.
                x1 = max(0, int(min(xs)))
                y1 = max(0, int(min(ys)))
                x2 = min(W, int(max(xs)))
                y2 = min(H, int(max(ys)))
            except (ValueError, OverflowError):
                # Non-numeric token, or nan/inf that cannot become an int:
                # skip the malformed row.
                continue
            if x2 > x1 and y2 > y1:
                boxes.append([x1, y1, x2, y2])

    print(f"Đã đọc {len(boxes)} boxes.")

    lines = sort_boxes_reading_order(boxes)

    # Debug overlay: one random colour per line, labelled with the line index.
    debug_img = img.copy()
    for line_idx, line in enumerate(lines):
        # Plain Python ints, since not every OpenCV build accepts numpy
        # integers as a colour scalar.
        color = tuple(int(np.random.randint(0, 255)) for _ in range(3))
        for box in line:
            x1, y1, x2, y2 = box[:4]
            cv2.rectangle(debug_img, (x1, y1), (x2, y2), color, 2)
            cv2.putText(debug_img, str(line_idx), (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
    cv2.imwrite("D:/pbl6_be_v2/sorted_boxes_debug_robust.jpg", debug_img)

    final_results = []
    for line in lines:
        line_text_parts = []
        for box in line:
            x1, y1, x2, y2 = box[:4]
            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            # The crop is converted from BGR to RGB and wrapped as a PIL
            # image before being passed to the predictor.
            crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
            text = postprocess_text(detector.predict(crop_pil))
            line_text_parts.append(text)
        final_results.append(" ".join(line_text_parts))

    out_path = "D:/pbl6_be_v2/ocr_results_optimized.txt"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("\n".join(final_results))
    print(f"Đã lưu kết quả OCR vào: {out_path}")

# Run the OCR pipeline when this code is executed directly (e.g. when the
# cell is run), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Đã đọc 293 boxes.
Median Box Height: 16.00
Đã gom thành 30 dòng
Đã lưu kết quả OCR vào: D:/pbl6_be_v2/ocr_results_optimized.txt
