In [2]:
import torch
import cv2
import yaml
import numpy as np
from addict import Dict
from pathlib import Path
from segmentation.models import build_model
from segmentation.post_processing import get_post_processing
import matplotlib.pyplot as plt

# ===================== ADAPTIVE PREPROCESSOR v5 =====================
class AdaptivePreprocessor:

    
    def __init__(self):
        self.base_strategies = {
            'very_dark': {'clahe_clip': 8.0, 'tile': (8, 8), 'bilateral': (7, 50, 50), 'denoise': (10, 10, 7, 21)},
            'dark': {'clahe_clip': 6.0, 'tile': (8, 8), 'bilateral': (7, 40, 40), 'denoise': (10, 10, 7, 21)},
            'normal': {'clahe_clip': 3.0, 'tile': (16, 16), 'bilateral': (5, 30, 30), 'denoise': (5, 5, 3, 10)},
            'bright': {'clahe_clip': 1.5, 'tile': (16, 16), 'bilateral': (5, 20, 20), 'denoise': (5, 5, 3, 10)},
            'very_bright': {'clahe_clip': 1.0, 'tile': (16, 16), 'bilateral': (3, 15, 15), 'denoise': (5, 5, 3, 10)},
        }
    
    def analyze_image(self, img_bgr):
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        h, w = img_bgr.shape[:2]
        
        mean = np.mean(gray)
        std = np.std(gray)
        percentile_5 = np.percentile(gray, 5)
        percentile_95 = np.percentile(gray, 95)
        contrast = percentile_95 - percentile_5
        laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
        
        return {
            'mean': mean,
            'std': std,
            'percentile_5': percentile_5,
            'percentile_95': percentile_95,
            'contrast': contrast,
            'laplacian_var': laplacian_var,
            'shape': (h, w),
            'gray': gray
        }
    
    def classify_image(self, metrics):
        mean = metrics['mean']
        contrast = metrics['contrast']
        laplacian_var = metrics['laplacian_var']
        
        if mean < 50:
            primary = 'very_dark'
        elif mean < 100:
            primary = 'dark'
        elif mean < 150:
            primary = 'normal'
        elif mean < 200:
            primary = 'bright'
        else:
            primary = 'very_bright'
        
        return primary, {
            'is_high_contrast': contrast > 100,
            'is_low_contrast': contrast < 50,
            'is_blur': laplacian_var < 80,
            'is_clear': laplacian_var > 1000,
        }
    
    def adapt_strategy(self, primary, characteristics):
        strategy = self.base_strategies[primary].copy()
        
        if characteristics['is_high_contrast']:
            strategy['clahe_clip'] = min(strategy['clahe_clip'] * 1.5, 8.0)
            d, s1, s2 = strategy['bilateral']
            strategy['bilateral'] = (d, min(s1 + 10, 70), min(s2 + 10, 70))
        
        if characteristics['is_low_contrast']:
            strategy['clahe_clip'] = min(strategy['clahe_clip'] * 2.0, 8.0)
            strategy['tile'] = (8, 8)
        
        if characteristics['is_blur']:
            strategy['clahe_clip'] = max(strategy['clahe_clip'] * 0.7, 0.5)
            d, s1, s2 = strategy['bilateral']
            strategy['bilateral'] = (d, max(s1 - 10, 5), max(s2 - 10, 5))
        
        if characteristics['is_clear']:
            d, s1, s2 = strategy['bilateral']
            strategy['bilateral'] = (d, min(s1 + 20, 80), min(s2 + 20, 80))
        
        return strategy
    
    def preprocess(self, img_bgr, short_size=736, img_name=None):
        metrics = self.analyze_image(img_bgr)
        primary, characteristics = self.classify_image(metrics)
        strategy = self.adapt_strategy(primary, characteristics)
        
        print(f"{primary.upper()}", end="")
        if characteristics['is_high_contrast']: print(" + high_contrast", end="")
        if characteristics['is_low_contrast']: print(" + low_contrast", end="")
        if characteristics['is_blur']: print(" + blur", end="")
        if characteristics['is_clear']: print(" + clear", end="")
        print(f"Brightness={metrics['mean']:.0f}, Contrast={metrics['contrast']:.0f}")
        
        gray = metrics['gray']
        
        h, w = img_bgr.shape[:2]
        scale = short_size * 1.0 / min(h, w)
        new_h = int(h * scale + 0.5)
        new_w = int(w * scale + 0.5)
        new_h = (new_h + 31) // 32 * 32
        new_w = (new_w + 31) // 32 * 32
        gray = cv2.resize(gray, (new_w, new_h))
        
        # === BƯỚC 1: Normalize ===
        if primary in ['very_dark', 'dark']:
            gray = cv2.normalize(gray, None, alpha=40, beta=220, norm_type=cv2.NORM_MINMAX)
        elif primary in ['very_bright', 'bright']:
            gray = cv2.normalize(gray, None, alpha=50, beta=230, norm_type=cv2.NORM_MINMAX)
        else:
            gray = cv2.normalize(gray, None, alpha=30, beta=220, norm_type=cv2.NORM_MINMAX)
        
        # === BƯỚC 2: CLAHE ===
        clahe = cv2.createCLAHE(clipLimit=strategy['clahe_clip'], tileGridSize=strategy['tile'])
        gray = clahe.apply(gray)
        
        # === BƯỚC 3: Bilateral Filter ===
        d, sigma1, sigma2 = strategy['bilateral']
        gray = cv2.bilateralFilter(gray, d, sigma1, sigma2)
        
        # === BƯỚC 4: Morphological Operations ===
        if primary in ['very_dark', 'dark']:
            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
            gray = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel, iterations=1)
        
        # === BƯỚC 5: Non-local Means Denoising ===
        denoise_h, denoise_template, denoise_search, denoise_strength = strategy['denoise']
        gray_bgr = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
        gray_bgr = cv2.fastNlMeansDenoisingColored(gray_bgr, None, denoise_h, denoise_template, denoise_search, denoise_strength)
        gray = cv2.cvtColor(gray_bgr, cv2.COLOR_BGR2GRAY)
        
        # === BƯỚC 6: Edge Enhancement ===
        if metrics['laplacian_var'] > 80:
            blur_enh = cv2.GaussianBlur(gray, (3, 3), 0)
            gray = cv2.addWeighted(gray, 1.2, blur_enh, -0.2, 0)
            gray = np.clip(gray, 0, 255).astype(np.uint8)
        
        enhanced = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
        img_tensor = torch.from_numpy(enhanced).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        
        return img_tensor, enhanced, (new_h, new_w)


preprocessor = AdaptivePreprocessor()

def preprocess_intelligent(img_bgr, short_size=736, img_name=None):
    return preprocessor.preprocess(img_bgr, short_size, img_name)

def visualize_detection(img, boxes, scores, conf_threshold=0.5):
    result = img.copy()
    
    for box, score in zip(boxes, scores):
        if score < conf_threshold:
            continue
        
        pts = np.array(box, dtype=np.int32).reshape(-1, 1, 2)
        cv2.polylines(result, [pts], True, (0, 255, 0), 2)
        
        x1, y1 = int(box[0][0]), int(box[0][1])
        cv2.putText(result, f'{score:.2f}', (x1, y1-5), 
                   cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
    
    return result


# ===================== LOAD MODEL =====================
with open("D:/pbl6_be_v2/app/config/icdar2015_resnet18_FPN_DBhead_polyLR.yaml", "r") as f:
    cfg = Dict(yaml.safe_load(f))

if "in_channels" not in cfg["arch"]["backbone"]:
    cfg["arch"]["backbone"]["in_channels"] = 3

model = build_model(cfg["arch"])
checkpoint = torch.load(
    "D:/pbl6_be_v2/app/weights/model_best.pth",
    map_location="cpu"
)
model.load_state_dict(checkpoint["state_dict"])
model.eval()

post_process = get_post_processing(cfg["post_processing"])


# ===================== INFERENCE =====================
img_path = "D:/pbl6_be_v2/test_jpg.jpg"
img_bgr = cv2.imread(img_path)
if img_bgr is None:
    raise FileNotFoundError(f"Không đọc được ảnh: {img_path}")

orig_h, orig_w = img_bgr.shape[:2]
img_tensor, preprocessed_bgr, (target_h, target_w) = preprocess_intelligent(img_bgr, 736, Path(img_path).stem)

with torch.no_grad():
    preds = model(img_tensor)
    print("Preds:", preds.shape, preds.min().item(), preds.max().item())

# ===================== POST PROCESS =====================
batch = {"shape": [(orig_h, orig_w)]}
boxes, scores = post_process(batch, preds, is_output_polygon=False)
print(f"Số box detect được: {len(boxes[0])}")

# ===================== SAVE RESULTS =====================
output_txt = "result_detect.txt"
with open(output_txt, "w", encoding="utf-8") as f:
    for i, box in enumerate(boxes[0]):
        score = scores[0][i] if scores is not None else 1.0
        coords = [f"{int(x)},{int(y)}" for (x, y) in box]
        f.write(f"{','.join(coords)},{score:.2f}\n")
print(f"Đã lưu kết quả detect vào {output_txt}")

# ===================== VẼ KẾT QUẢ =====================
img_show = img_bgr.copy()
for i, box in enumerate(boxes[0]):
    pts = np.array(box, np.int32).reshape((-1, 1, 2))
    cv2.polylines(img_show, [pts], isClosed=True, color=(0, 255, 0), thickness=2)

cv2.imwrite("result_detect.jpg", img_show)
print("Đã lưu ảnh kết quả: result_detect.jpg")


load from imagenet
BRIGHT + high_contrast + clearBrightness=166, Contrast=157
Preds: torch.Size([1, 2, 1088, 736]) 0.0004289679345674813 0.9976726174354553
Số box detect được: 311
Đã lưu kết quả detect vào result_detect.txt
Đã lưu ảnh kết quả: result_detect.jpg


In [3]:
import cv2
from PIL import Image
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg
import numpy as np
import os
import math

# ---------------------------
# HÀM TIỆN ÍCH
# ---------------------------
def clamp(val, lo, hi):
    return max(lo, min(hi, val))

# ---------------------------
# NHÓM LINES THEO Y (median-height based)
# ---------------------------
def group_lines_by_median(boxes, med_h_factor=0.6):
    """
    boxes: list of [x1,y1,x2,y2]
    Trả về: list of lines, mỗi line là list of boxes (kept as [x1,y1,x2,y2])
    """
    if not boxes:
        return []

    arr = np.array(boxes, dtype=float)
    y_centers = (arr[:,1] + arr[:,3]) / 2.0
    heights = (arr[:,3] - arr[:,1])
    median_h = float(np.median(heights)) if len(heights)>0 else 0.0
    if median_h <= 0:
        median_h = float(np.mean(heights)) if len(heights)>0 else 10.0

    # sort by y_center
    order = np.argsort(y_centers)
    arr_sorted = arr[order]

    y_thresh = median_h * med_h_factor

    lines = []
    current = [arr_sorted[0].tolist()]
    current_mean_y = y_centers[order[0]]

    for r in arr_sorted[1:]:
        cy = (r[1] + r[3]) / 2.0
        if abs(cy - current_mean_y) <= y_thresh:
            current.append(r.tolist())
            # update mean
            current_mean_y = np.mean([ (b[1]+b[3])/2.0 for b in current ])
        else:
            lines.append(current)
            current = [r.tolist()]
            current_mean_y = cy
    lines.append(current)

    # Merge very close lines (to avoid over-splitting)
    merged = []
    for ln in lines:
        if not merged:
            merged.append(ln)
            continue
        prev = merged[-1]
        prev_y = np.mean([ (b[1]+b[3])/2.0 for b in prev ])
        cur_y = np.mean([ (b[1]+b[3])/2.0 for b in ln ])
        if abs(cur_y - prev_y) < median_h * 0.45:  # merge threshold
            merged[-1].extend(ln)
        else:
            merged.append(ln)

    # inside each line sort boxes by x1 ascending (left -> right)
    final_lines = []
    for ln in merged:
        ln_sorted = sorted(ln, key=lambda b: b[0])
        final_lines.append([ [int(b[0]), int(b[1]), int(b[2]), int(b[3])] for b in ln_sorted ])

    return final_lines

# ---------------------------
# PHÁT HIỆN CỘT BẰNG GAP TRÊN X_CENTER
# ---------------------------
def detect_columns_by_x_gaps(lines, min_gap_factor=1.5):
    """
    lines: list of lines (each is list of boxes [x1,y1,x2,y2])
    Ý tưởng: compute x_center for each line (mean of boxes), sort them; 
    find big gaps -> define column boundaries
    Trả về: list of columns, mỗi column là list of lines
    """
    if not lines:
        return []

    line_centers = [ np.mean([ (b[0]+b[2])/2.0 for b in line ]) for line in lines ]
    sorted_idx = np.argsort(line_centers)
    centers_sorted = [line_centers[i] for i in sorted_idx]

    # nếu ít lines thì 1 cột
    if len(centers_sorted) < 4:
        return [ [lines[i] for i in sorted_idx] ]

    # compute gaps between adjacent centers
    gaps = [ centers_sorted[i+1] - centers_sorted[i] for i in range(len(centers_sorted)-1) ]
    median_gap = np.median(gaps) if gaps else 0.0
    if median_gap <= 0:
        median_gap = np.mean(gaps) if gaps else 50.0

    # find split points where gap is significantly larger than typical
    split_indices = []
    threshold = median_gap * min_gap_factor
    for i,g in enumerate(gaps):
        if g > threshold:
            split_indices.append(i)

    # build boundaries over sorted_idx
    columns = []
    start = 0
    for si in split_indices:
        group_idx = sorted_idx[start:si+1]
        columns.append([ lines[i] for i in group_idx ])
        start = si+1
    # last group
    group_idx = sorted_idx[start: len(sorted_idx)]
    columns.append([ lines[i] for i in group_idx ])

    # sort lines inside each column by y (top -> bottom)
    for col in columns:
        col.sort(key=lambda ln: np.mean([ (b[1]+b[3])/2.0 for b in ln ]))

    # sort columns left -> right by their mean x center
    columns.sort(key=lambda col: np.mean([ np.mean([ (b[0]+b[2])/2.0 for b in ln ]) for ln in col for b in ln ]))
    return columns

# ---------------------------
# TOÀN BỘ PIPELINE SẮP XẾP (LEFT->RIGHT reading order)
# ---------------------------
def sort_boxes_reading_order(boxes):
    """
    boxes: list of [x1,y1,x2,y2]
    Trả về: final_lines_ordered: list of lines (each line list of boxes)
    Reading order implemented: iterate columns left->right, within column top->bottom
    """
    # 1) group into lines
    lines = group_lines_by_median(boxes, med_h_factor=0.6)

    # 2) detect columns using x gaps
    columns = detect_columns_by_x_gaps(lines, min_gap_factor=1.6)

    # 3) final assembly: for each column left->right, append its lines in top->bottom
    final_lines = []
    for col in columns:
        # col already sorted by y
        for ln in col:
            # ensure boxes inside line sorted left->right
            ln_sorted = sorted(ln, key=lambda b: b[0])
            final_lines.append(ln_sorted)

    return final_lines

# ---------------------------
# CONFIG VIETOCR
# ---------------------------
config = Cfg.load_config_from_file('D:/pbl6_be_v2/app/config/myconfig.yml')
config['weights'] = 'D:/pbl6_be_v2/app/weights/mymodelOCR.pth'
config['device'] = 'cpu'
detector = Predictor(config)

# ---------------------------
# INPUT (gán img_path trước khi chạy)
# ---------------------------
image_path = img_path
boxes_path = 'D:/pbl6_be_v2/result_detect.txt'

img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Không tìm thấy ảnh tại: {image_path}")

H, W = img.shape[:2]

# ---------------------------
# ĐỌC BOX TỪ FILE
# ---------------------------
boxes = []
with open(boxes_path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.replace(",", " ").strip().split()
        if not parts:
            continue
        try:
            parts_f = list(map(float, parts))
        except:
            continue
        if len(parts_f) < 8:
            continue
        coords = parts_f[:8]
        xs = coords[0::2]
        ys = coords[1::2]
        x1, y1 = int(min(xs)), int(min(ys))
        x2, y2 = int(max(xs)), int(max(ys))
        # clamp
        x1 = clamp(x1, 0, W-1)
        x2 = clamp(x2, 0, W-1)
        y1 = clamp(y1, 0, H-1)
        y2 = clamp(y2, 0, H-1)
        # ignore degenerate
        if x2 <= x1 or y2 <= y1:
            continue
        boxes.append([x1, y1, x2, y2])

print(f"Đã đọc {len(boxes)} box từ file detect")

# ---------------------------
# SẮP XẾP THEO THỨ TỰ ĐỌC (LEFT -> RIGHT)
# ---------------------------
lines = sort_boxes_reading_order(boxes)
print(f"Gom thành {len(lines)} dòng (theo reading order left->right).")

# ---------------------------
# VẼ DEBUG
# ---------------------------
debug_img = img.copy()
idx = 0
for ln in lines:
    for (x1,y1,x2,y2) in ln:
        cv2.rectangle(debug_img, (x1,y1), (x2,y2), (0,255,0), 2)
        cv2.putText(debug_img, str(idx), (x1, max(0,y1-6)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
        idx += 1

dbg_out = "D:/pbl6_be_v2/sorted_boxes_debug.jpg"
cv2.imwrite(dbg_out, debug_img)
print(f"Đã lưu ảnh debug: {dbg_out}")

# ---------------------------
# OCR THEO THỨ TỰ MỚI
# ---------------------------
final_lines_texts = []
for ln in lines:
    line_texts = []
    for (x1,y1,x2,y2) in ln:
        try:
            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                continue
            crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
            txt = detector.predict(crop_pil)
            if isinstance(txt, str):
                txt = txt.strip()
            else:
                txt = str(txt).strip()
        except Exception as e:
            # nếu OCR lỗi, bỏ box hoặc giữ rỗng
            txt = ""
        if txt:
            line_texts.append(txt)
    # nối các box trong dòng bằng 1 khoảng trắng
    final_lines_texts.append(" ".join(line_texts))

# Ghép các dòng: vì reading order đã left->right (cột trái->phải) và trong cột top->bottom,
# mỗi phần tử final_lines_texts tương ứng 1 line trong thứ tự đọc.
final_text = "\n".join([ln for ln in final_lines_texts if ln.strip() != ""])

# ---------------------------
# LƯU KẾT QUẢ
# ---------------------------
out_path = "D:/pbl6_be_v2/ocr_results.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(final_text)

print(f"✔ Đã hoàn thành OCR và lưu kết quả vào: {out_path}")


Đã đọc 267 box từ file detect
Gom thành 32 dòng (theo reading order left->right).
Đã lưu ảnh debug: D:/pbl6_be_v2/sorted_boxes_debug.jpg
✔ Đã hoàn thành OCR và lưu kết quả vào: D:/pbl6_be_v2/ocr_results.txt
