In [28]:
import os
import zipfile
import shutil
import random
import numpy as np
import cv2
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from math import atan2, degrees

### 데이터 준비

In [2]:
# 초기화 후 CP949로 해제
!rm -rf /content/data && mkdir -p /content/data
!unzip -qq -O CP949 "/content/drive/MyDrive/Colab Notebooks/Python_colab/Web Service/생성형 AI/data/medicine.zip" -d /content/data

In [3]:
random.seed(2025)

IMG_SRC = Path("/content/data/원천데이터/TS1/result/medicine/images")
ANN_SRC = Path("/content/data/라벨링데이터/TL1/result/medicine/annotations")

# TS1 / TL1 바로 아래에 생성
IMG_DST_ROOT = Path("/content/data/원천데이터/TS1")
ANN_DST_ROOT = Path("/content/data/라벨링데이터/TL1")

for p in [IMG_DST_ROOT, ANN_DST_ROOT]:
    for s in ["train", "val", "test"]:
        (p/s).mkdir(parents=True, exist_ok=True)

# 유틸
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}
def list_files(root, exts=None):
    exts = {e.lower() for e in (exts or [])}
    files = []
    for f in root.rglob("*"):
        if f.is_file() and (not exts or f.suffix.lower() in exts):
            files.append(f)
    return files

# 1) 분배 대상 이미지 수집 (이미 분배된 것은 제외)
split_dirs = {IMG_DST_ROOT/"train", IMG_DST_ROOT/"val", IMG_DST_ROOT/"test"}
all_imgs = [p for p in list_files(IMG_SRC, IMG_EXTS) if not any(str(p).startswith(str(sd)) for sd in split_dirs)]

# 2) 섞고 8/1/1 분할
random.shuffle(all_imgs)
n = len(all_imgs)
tr, va = int(n*0.8), int(n*0.9)
splits = {
    "train": all_imgs[:tr],
    "val":   all_imgs[tr:va],
    "test":  all_imgs[va:]
}

# 3) 이미지 이동 + 주석(어노테이션) 동반 이동
def find_annotation(img_path: Path):
    # 이미지 파일명(stem)과 같은 이름의 주석을 annotations 트리에서 탐색
    stem = img_path.stem
    # 주석 확장자 후보 (필요시 추가)
    cand_exts = [".json", ".xml", ".txt"]
    for ext in cand_exts:
        hits = list(ANN_SRC.rglob(stem + ext))
        if hits:
            return hits[0]
    return None

moved = {"train":0, "val":0, "test":0}
moved_ann = {"train":0, "val":0, "test":0}

for split, files in splits.items():
    for img in files:
        dst_img = IMG_DST_ROOT/split/img.name
        if not dst_img.exists():  # 재실행 시 중복 방지
            dst_img.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(img), str(dst_img))
            moved[split] += 1

        ann = find_annotation(img)
        if ann:
            dst_ann = ANN_DST_ROOT/split/ann.name
            if not dst_ann.exists():  # 재실행 시 중복 방지
                dst_ann.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(ann), str(dst_ann))
                moved_ann[split] += 1

print(f"[이미지 이동] train/val/test = {moved['train']}/{moved['val']}/{moved['test']}")
print(f"[주석 이동]  train/val/test = {moved_ann['train']}/{moved_ann['val']}/{moved_ann['test']}")

# 4) 요약
def count_in(dst_root):
    return {s: len(list((dst_root/s).glob("*"))) for s in ["train","val","test"]}

print("\n== 최종 파일 개수 ==")
print("이미지:", count_in(IMG_DST_ROOT))
print("주석 :", count_in(ANN_DST_ROOT))


[이미지 이동] train/val/test = 388/48/49
[주석 이동]  train/val/test = 388/48/49

== 최종 파일 개수 ==
이미지: {'train': 388, 'val': 48, 'test': 49}
주석 : {'train': 388, 'val': 48, 'test': 49}


### 환경세팅

In [4]:
# pip 최신화
!pip install -q --upgrade pip

In [5]:
# 1) PaddlePaddle (GPU, Colab T4 기준)
!pip install -q paddlepaddle-gpu==2.5.2.post117 \
    -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html

# 2) OpenCV - paddleocr 2.7.0.3과 호환되는 버전
!pip uninstall -y opencv-python opencv-contrib-python >/dev/null
!pip install -q "opencv-python<=4.6.0.66" "opencv-contrib-python<=4.6.0.66"

# 3) PaddleOCR 본체 (자동 의존성 설치 막음)
!pip install -q --no-deps paddleocr==2.7.0.3

# 4) 이미지 OCR에 필요한 최소 의존성만 수동 설치
!pip install -q attrdict fire==0.5.0 lmdb shapely pyclipper \
    rapidfuzz scikit-image imgaug tqdm

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
paddleocr 2.7.0.3 requires pdf2docx, which is not installed.
paddleocr 2.7.0.3 requires premailer, which is not installed.
paddleocr 2.7.0.3 requires PyMuPDF<1.21.0, which is not installed.
paddleocr 2.7.0.3 requires python-docx, which is not installed.
paddleocr 2.7.0.3 requires visualdl, which is not installed.[0m[31m
[0m

In [6]:
!pip install -q "numpy<2.0"

### 전처리

In [7]:
# 1. 안전한 이미지 읽기
def safe_imread(path):
    path = str(path)
    data = np.fromfile(path, dtype=np.uint8)
    return cv2.imdecode(data, cv2.IMREAD_COLOR)

# 2. 안전한 이미지 저장
def safe_imsave(path, img):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    ext = path.suffix.lower() or ".jpg"
    ok, buf = cv2.imencode(ext, img)
    if ok:
        buf.tofile(str(path))

# 3. 리사이즈 (긴 변 기준)
def normalize_size(img, max_long=1280):
    h, w = img.shape[:2]
    long_side = max(h, w)
    if long_side <= max_long:
        return img
    scale = max_long / long_side
    new_w, new_h = int(w * scale), int(h * scale)
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

# 4. 그레이스케일 변환
def to_gray(img):
    if len(img.shape) == 2:
        return img
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# 5. 가우시안 블러
def denoise_gaussian(img, ksize=3):
    k = max(3, int(ksize) | 1)  # 홀수 보정
    return cv2.GaussianBlur(img, (k, k), 0)

# 6. CLAHE + 오츠 이진화
def enhance_contrast_and_binarize(img, clip_limit=2.0, tile_grid_size=(8,8)):
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    enhanced = clahe.apply(img)
    _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

In [8]:
def preprocess_image_pipeline(input_path, output_path,
                               max_long=1280,
                               blur_ksize=3,
                               clahe_clip=2.0,
                               clahe_tile=(8,8)):
    # 1. 로드
    img = safe_imread(input_path)
    if img is None:
        raise FileNotFoundError(f"이미지를 불러올 수 없음: {input_path}")

    # 2. 리사이즈
    img = normalize_size(img, max_long=max_long)

    # 3. 그레이스케일
    gray = to_gray(img)

    # 4. 가우시안 블러
    blurred = denoise_gaussian(gray, ksize=blur_ksize)

    # 5. CLAHE + 오츠 이진화
    final = enhance_contrast_and_binarize(blurred,
                                          clip_limit=clahe_clip,
                                          tile_grid_size=clahe_tile)

    # 6. 저장
    safe_imsave(output_path, final)
    return final


### 미학습 PaddleOCR (추론만)

In [20]:
import csv
import subprocess
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image

In [21]:
# GPU/CPU 자동 감지
def detect_gpu():
    try:
        output = subprocess.check_output("nvidia-smi", shell=True, stderr=subprocess.STDOUT)
        return True
    except subprocess.CalledProcessError:
        return False

USE_GPU = detect_gpu()
print(f"[INFO] GPU 사용 여부: {USE_GPU}")

[INFO] GPU 사용 여부: False


In [17]:
# 폰트 다운로드 (한글 폰트)
!wget -q https://github.com/google/fonts/raw/main/ofl/nanumgothic/NanumGothic-Regular.ttf -O /content/NanumGothic.ttf
FONT_PATH = "/content/NanumGothic.ttf"

In [22]:
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='korean',
    rec=True,
    det=True,
    use_gpu=USE_GPU
)

[2025/08/12 11:38:30] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, re

In [23]:
def run_inference_and_visualize(img_path, save_dir):
    img_path = Path(img_path)
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    # 이미지 로드
    image = safe_imread(img_path)
    if image is None:
        raise FileNotFoundError(f"이미지 로드 실패: {img_path}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # PaddleOCR 추론
    result = ocr.ocr(str(img_path), cls=True)

    # 결과 파싱
    boxes, txts, scores = [], [], []
    for line in result[0]:
        box, (txt, score) = line
        boxes.append(box)
        txts.append(txt)
        scores.append(score)

    # 폴리곤 시각화
    im_show = draw_ocr(image_rgb, boxes, txts, scores, font_path=FONT_PATH)
    im_show = Image.fromarray(im_show)

    # 결과 이미지 저장
    save_img_path = save_dir / f"{img_path.stem}_ocr_result.jpg"
    im_show.save(save_img_path)

    # CSV 저장 (평가지표용)
    save_csv_path = save_dir / f"{img_path.stem}_ocr_result.csv"
    with open(save_csv_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4", "text", "score"])
        for box, txt, score in zip(boxes, txts, scores):
            flat_box = [coord for point in box for coord in point]
            writer.writerow(flat_box + [txt, score])

    print(f"[저장 완료] 이미지: {save_img_path}, CSV: {save_csv_path}")
    return save_img_path, save_csv_path

In [30]:
def run_inference_and_visualize(img_path, output_dir):
    # 폴더 분리
    img_save_dir = Path(output_dir) / "images"
    csv_save_dir = Path(output_dir) / "csv"
    img_save_dir.mkdir(parents=True, exist_ok=True)
    csv_save_dir.mkdir(parents=True, exist_ok=True)

    # 이미지 로드
    image = safe_imread(img_path)
    if image is None:
        raise FileNotFoundError(f"이미지 로드 실패: {img_path}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # PaddleOCR 추론
    result = ocr.ocr(str(img_path), cls=True)

    # 결과 파싱
    boxes, txts, scores = [], [], []
    for line in result[0]:
        box, (txt, score) = line
        boxes.append(box)
        txts.append(txt)
        scores.append(score)

    # 폴리곤 시각화
    im_show = draw_ocr(image_rgb, boxes, txts, scores, font_path=FONT_PATH)
    im_show = Image.fromarray(im_show)

    # JPG 저장
    save_img_path = img_save_dir / f"{Path(img_path).stem}_ocr_result.jpg"
    im_show.save(save_img_path)

    # CSV 저장
    save_csv_path = csv_save_dir / f"{Path(img_path).stem}_ocr_result.csv"
    with open(save_csv_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4", "text", "score"])
        for box, txt, score in zip(boxes, txts, scores):
            flat_box = [coord for point in box for coord in point]
            writer.writerow(flat_box + [txt, score])

    return save_img_path, save_csv_path

In [31]:
val_dir = Path("/content/data/원천데이터/TS1/val")
output_dir = Path("/content/pretrained_results")

val_images = list(val_dir.glob("*.jpg"))
for img_path in tqdm(val_images, desc="PaddleOCR Pretrained Inference", unit="img"):
    try:
        run_inference_and_visualize(img_path, output_dir)
    except Exception as e:
        print(f"[ERROR] {img_path} 처리 중 오류: {e}")

PaddleOCR Pretrained Inference:   0%|          | 0/42 [00:00<?, ?img/s]

[2025/08/12 11:50:19] ppocr DEBUG: dt_boxes num : 38, elapsed : 0.40316081047058105
[2025/08/12 11:50:20] ppocr DEBUG: cls num  : 38, elapsed : 0.4670588970184326
[2025/08/12 11:50:38] ppocr DEBUG: rec_res num  : 38, elapsed : 18.29444646835327


PaddleOCR Pretrained Inference:   2%|▏         | 1/42 [00:19<13:30, 19.77s/img]

[2025/08/12 11:50:39] ppocr DEBUG: dt_boxes num : 26, elapsed : 0.34058713912963867
[2025/08/12 11:50:39] ppocr DEBUG: cls num  : 26, elapsed : 0.25953054428100586
[2025/08/12 11:50:57] ppocr DEBUG: rec_res num  : 26, elapsed : 17.79706382751465


PaddleOCR Pretrained Inference:   5%|▍         | 2/42 [00:38<12:45, 19.14s/img]

[2025/08/12 11:50:57] ppocr DEBUG: dt_boxes num : 18, elapsed : 0.5143916606903076
[2025/08/12 11:50:58] ppocr DEBUG: cls num  : 18, elapsed : 0.1735379695892334
[2025/08/12 11:51:08] ppocr DEBUG: rec_res num  : 18, elapsed : 9.843793153762817


PaddleOCR Pretrained Inference:   7%|▋         | 3/42 [00:49<09:57, 15.31s/img]

[2025/08/12 11:51:09] ppocr DEBUG: dt_boxes num : 8, elapsed : 1.1134419441223145
[2025/08/12 11:51:09] ppocr DEBUG: cls num  : 8, elapsed : 0.08049678802490234
[2025/08/12 11:51:14] ppocr DEBUG: rec_res num  : 8, elapsed : 5.419328689575195


PaddleOCR Pretrained Inference:  10%|▉         | 4/42 [00:55<07:33, 11.93s/img]

[2025/08/12 11:51:16] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.9115481376647949
[2025/08/12 11:51:16] ppocr DEBUG: cls num  : 7, elapsed : 0.15390729904174805
[2025/08/12 11:51:21] ppocr DEBUG: rec_res num  : 7, elapsed : 5.626307249069214


PaddleOCR Pretrained Inference:  12%|█▏        | 5/42 [01:03<06:17, 10.19s/img]

[2025/08/12 11:51:22] ppocr DEBUG: dt_boxes num : 16, elapsed : 0.5280826091766357
[2025/08/12 11:51:22] ppocr DEBUG: cls num  : 16, elapsed : 0.16678452491760254
[2025/08/12 11:51:30] ppocr DEBUG: rec_res num  : 16, elapsed : 7.313940763473511


PaddleOCR Pretrained Inference:  14%|█▍        | 6/42 [01:11<05:44,  9.58s/img]

[2025/08/12 11:51:31] ppocr DEBUG: dt_boxes num : 55, elapsed : 0.8973388671875
[2025/08/12 11:51:32] ppocr DEBUG: cls num  : 55, elapsed : 0.868016242980957
[2025/08/12 11:52:02] ppocr DEBUG: rec_res num  : 55, elapsed : 29.506738662719727


PaddleOCR Pretrained Inference:  17%|█▋        | 7/42 [01:43<09:52, 16.93s/img]

[2025/08/12 11:52:03] ppocr DEBUG: dt_boxes num : 123, elapsed : 0.8793344497680664
[2025/08/12 11:52:04] ppocr DEBUG: cls num  : 123, elapsed : 1.1970815658569336
[2025/08/12 11:53:10] ppocr DEBUG: rec_res num  : 123, elapsed : 65.26930642127991


PaddleOCR Pretrained Inference:  19%|█▉        | 8/42 [02:51<18:48, 33.21s/img]

[2025/08/12 11:53:11] ppocr DEBUG: dt_boxes num : 17, elapsed : 0.873753547668457
[2025/08/12 11:53:11] ppocr DEBUG: cls num  : 17, elapsed : 0.16219425201416016
[2025/08/12 11:53:21] ppocr DEBUG: rec_res num  : 17, elapsed : 9.525257587432861


PaddleOCR Pretrained Inference:  21%|██▏       | 9/42 [03:02<14:25, 26.22s/img]

[2025/08/12 11:53:22] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.5783882141113281
[2025/08/12 11:53:22] ppocr DEBUG: cls num  : 6, elapsed : 0.059825897216796875
[2025/08/12 11:53:26] ppocr DEBUG: rec_res num  : 6, elapsed : 4.416195631027222


PaddleOCR Pretrained Inference:  24%|██▍       | 10/42 [03:07<10:33, 19.79s/img]

[2025/08/12 11:53:27] ppocr DEBUG: dt_boxes num : 47, elapsed : 1.1364662647247314
[2025/08/12 11:53:28] ppocr DEBUG: cls num  : 47, elapsed : 0.45607972145080566
[2025/08/12 11:53:53] ppocr DEBUG: rec_res num  : 47, elapsed : 24.554537534713745


PaddleOCR Pretrained Inference:  26%|██▌       | 11/42 [03:34<11:17, 21.86s/img]

[2025/08/12 11:53:53] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.44597840309143066
[2025/08/12 11:53:53] ppocr DEBUG: cls num  : 4, elapsed : 0.038843631744384766
[2025/08/12 11:53:55] ppocr DEBUG: rec_res num  : 4, elapsed : 1.7402517795562744


PaddleOCR Pretrained Inference:  29%|██▊       | 12/42 [03:36<07:57, 15.93s/img]

[2025/08/12 11:53:56] ppocr DEBUG: dt_boxes num : 79, elapsed : 0.8681612014770508
[2025/08/12 11:53:57] ppocr DEBUG: cls num  : 79, elapsed : 0.7496693134307861
[2025/08/12 11:54:37] ppocr DEBUG: rec_res num  : 79, elapsed : 39.55729413032532


PaddleOCR Pretrained Inference:  31%|███       | 13/42 [04:18<11:30, 23.80s/img]

[2025/08/12 11:54:38] ppocr DEBUG: dt_boxes num : 32, elapsed : 0.4482407569885254
[2025/08/12 11:54:38] ppocr DEBUG: cls num  : 32, elapsed : 0.33247876167297363
[2025/08/12 11:54:54] ppocr DEBUG: rec_res num  : 32, elapsed : 15.980324983596802


PaddleOCR Pretrained Inference:  33%|███▎      | 14/42 [04:35<10:08, 21.75s/img]

[2025/08/12 11:54:55] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.3855118751525879
[2025/08/12 11:54:55] ppocr DEBUG: cls num  : 8, elapsed : 0.07809329032897949
[2025/08/12 11:55:01] ppocr DEBUG: rec_res num  : 8, elapsed : 6.13890814781189


PaddleOCR Pretrained Inference:  36%|███▌      | 15/42 [04:42<07:45, 17.25s/img]

[2025/08/12 11:55:02] ppocr DEBUG: dt_boxes num : 6, elapsed : 1.3677701950073242
[2025/08/12 11:55:03] ppocr DEBUG: cls num  : 6, elapsed : 0.07848286628723145
[2025/08/12 11:55:10] ppocr DEBUG: rec_res num  : 6, elapsed : 7.008348703384399


PaddleOCR Pretrained Inference:  38%|███▊      | 16/42 [04:51<06:21, 14.68s/img]

[2025/08/12 11:55:11] ppocr DEBUG: dt_boxes num : 19, elapsed : 0.9538769721984863
[2025/08/12 11:55:11] ppocr DEBUG: cls num  : 19, elapsed : 0.19323086738586426
[2025/08/12 11:55:23] ppocr DEBUG: rec_res num  : 19, elapsed : 12.526387214660645


PaddleOCR Pretrained Inference:  40%|████      | 17/42 [05:05<06:01, 14.46s/img]

[2025/08/12 11:55:24] ppocr DEBUG: dt_boxes num : 84, elapsed : 0.5756452083587646
[2025/08/12 11:55:25] ppocr DEBUG: cls num  : 84, elapsed : 0.8283345699310303
[2025/08/12 11:56:09] ppocr DEBUG: rec_res num  : 84, elapsed : 43.5697546005249


PaddleOCR Pretrained Inference:  43%|████▎     | 18/42 [05:50<09:32, 23.84s/img]

[2025/08/12 11:56:10] ppocr DEBUG: dt_boxes num : 14, elapsed : 0.7390592098236084
[2025/08/12 11:56:10] ppocr DEBUG: cls num  : 14, elapsed : 0.1418290138244629
[2025/08/12 11:56:19] ppocr DEBUG: rec_res num  : 14, elapsed : 8.726786136627197


PaddleOCR Pretrained Inference:  45%|████▌     | 19/42 [06:00<07:31, 19.65s/img]

[2025/08/12 11:56:20] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.8131446838378906
[2025/08/12 11:56:20] ppocr DEBUG: cls num  : 8, elapsed : 0.07674002647399902
[2025/08/12 11:56:24] ppocr DEBUG: rec_res num  : 8, elapsed : 3.9278624057769775


PaddleOCR Pretrained Inference:  48%|████▊     | 20/42 [06:05<05:36, 15.28s/img]

[2025/08/12 11:56:25] ppocr DEBUG: dt_boxes num : 98, elapsed : 0.5033550262451172
[2025/08/12 11:56:26] ppocr DEBUG: cls num  : 98, elapsed : 0.9416356086730957
[2025/08/12 11:57:15] ppocr DEBUG: rec_res num  : 98, elapsed : 48.48081731796265


PaddleOCR Pretrained Inference:  50%|█████     | 21/42 [06:56<09:05, 25.99s/img]

[2025/08/12 11:57:16] ppocr DEBUG: dt_boxes num : 14, elapsed : 0.46349072456359863
[2025/08/12 11:57:16] ppocr DEBUG: cls num  : 14, elapsed : 0.14002466201782227
[2025/08/12 11:57:22] ppocr DEBUG: rec_res num  : 14, elapsed : 6.239164590835571


PaddleOCR Pretrained Inference:  52%|█████▏    | 22/42 [07:03<06:46, 20.32s/img]

[2025/08/12 11:57:23] ppocr DEBUG: dt_boxes num : 40, elapsed : 0.4919764995574951
[2025/08/12 11:57:23] ppocr DEBUG: cls num  : 40, elapsed : 0.3797140121459961
[2025/08/12 11:57:45] ppocr DEBUG: rec_res num  : 40, elapsed : 21.28754734992981


PaddleOCR Pretrained Inference:  55%|█████▍    | 23/42 [07:26<06:38, 20.97s/img]

[2025/08/12 11:57:46] ppocr DEBUG: dt_boxes num : 8, elapsed : 1.2429165840148926
[2025/08/12 11:57:46] ppocr DEBUG: cls num  : 8, elapsed : 0.07853984832763672
[2025/08/12 11:57:50] ppocr DEBUG: rec_res num  : 8, elapsed : 3.970109701156616


PaddleOCR Pretrained Inference:  57%|█████▋    | 24/42 [07:31<04:54, 16.36s/img]

[2025/08/12 11:57:51] ppocr DEBUG: dt_boxes num : 22, elapsed : 0.3467068672180176
[2025/08/12 11:57:51] ppocr DEBUG: cls num  : 22, elapsed : 0.2361140251159668
[2025/08/12 11:58:04] ppocr DEBUG: rec_res num  : 22, elapsed : 13.148208141326904


PaddleOCR Pretrained Inference:  60%|█████▉    | 25/42 [07:46<04:26, 15.66s/img]

[2025/08/12 11:58:05] ppocr DEBUG: dt_boxes num : 74, elapsed : 0.7689299583435059
[2025/08/12 11:58:06] ppocr DEBUG: cls num  : 74, elapsed : 0.7221825122833252
[2025/08/12 11:58:48] ppocr DEBUG: rec_res num  : 74, elapsed : 42.00453853607178


PaddleOCR Pretrained Inference:  62%|██████▏   | 26/42 [08:30<06:28, 24.27s/img]

[2025/08/12 11:58:50] ppocr DEBUG: dt_boxes num : 41, elapsed : 0.7342917919158936
[2025/08/12 11:58:50] ppocr DEBUG: cls num  : 41, elapsed : 0.38485193252563477
[2025/08/12 11:59:11] ppocr DEBUG: rec_res num  : 41, elapsed : 20.776434898376465


PaddleOCR Pretrained Inference:  64%|██████▍   | 27/42 [08:52<05:54, 23.66s/img]

[2025/08/12 11:59:12] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.46523380279541016
[2025/08/12 11:59:12] ppocr DEBUG: cls num  : 12, elapsed : 0.12579584121704102
[2025/08/12 11:59:18] ppocr DEBUG: rec_res num  : 12, elapsed : 6.799831867218018


PaddleOCR Pretrained Inference:  67%|██████▋   | 28/42 [09:00<04:23, 18.83s/img]

[2025/08/12 11:59:19] ppocr DEBUG: dt_boxes num : 40, elapsed : 0.5258862972259521
[2025/08/12 11:59:20] ppocr DEBUG: cls num  : 40, elapsed : 0.39949560165405273
[2025/08/12 11:59:40] ppocr DEBUG: rec_res num  : 40, elapsed : 20.44159746170044


PaddleOCR Pretrained Inference:  69%|██████▉   | 29/42 [09:21<04:16, 19.70s/img]

[2025/08/12 11:59:41] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.4566364288330078
[2025/08/12 11:59:41] ppocr DEBUG: cls num  : 7, elapsed : 0.07532382011413574
[2025/08/12 11:59:45] ppocr DEBUG: rec_res num  : 7, elapsed : 4.148775339126587


PaddleOCR Pretrained Inference:  71%|███████▏  | 30/42 [09:26<03:02, 15.25s/img]

[2025/08/12 11:59:46] ppocr DEBUG: dt_boxes num : 61, elapsed : 0.4663124084472656
[2025/08/12 11:59:46] ppocr DEBUG: cls num  : 61, elapsed : 0.6067459583282471
[2025/08/12 12:00:16] ppocr DEBUG: rec_res num  : 61, elapsed : 29.734556674957275


PaddleOCR Pretrained Inference:  74%|███████▍  | 31/42 [09:58<03:40, 20.08s/img]

[2025/08/12 12:00:17] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.9051344394683838
[2025/08/12 12:00:18] ppocr DEBUG: cls num  : 6, elapsed : 0.055219411849975586
[2025/08/12 12:00:23] ppocr DEBUG: rec_res num  : 6, elapsed : 5.435468435287476


PaddleOCR Pretrained Inference:  76%|███████▌  | 32/42 [10:04<02:40, 16.04s/img]

[2025/08/12 12:00:24] ppocr DEBUG: dt_boxes num : 5, elapsed : 1.0559940338134766
[2025/08/12 12:00:24] ppocr DEBUG: cls num  : 5, elapsed : 0.047104597091674805
[2025/08/12 12:00:26] ppocr DEBUG: rec_res num  : 5, elapsed : 2.1422390937805176


PaddleOCR Pretrained Inference:  79%|███████▊  | 33/42 [10:08<01:50, 12.27s/img]

[2025/08/12 12:00:27] ppocr DEBUG: dt_boxes num : 46, elapsed : 0.4027242660522461
[2025/08/12 12:00:28] ppocr DEBUG: cls num  : 46, elapsed : 0.4626784324645996
[2025/08/12 12:00:52] ppocr DEBUG: rec_res num  : 46, elapsed : 24.35494375228882


PaddleOCR Pretrained Inference:  81%|████████  | 34/42 [10:33<02:10, 16.26s/img]

[2025/08/12 12:00:53] ppocr DEBUG: dt_boxes num : 11, elapsed : 0.5005061626434326
[2025/08/12 12:00:53] ppocr DEBUG: cls num  : 11, elapsed : 0.10927510261535645
[2025/08/12 12:00:58] ppocr DEBUG: rec_res num  : 11, elapsed : 4.941083192825317


PaddleOCR Pretrained Inference:  83%|████████▎ | 35/42 [10:39<01:31, 13.11s/img]

[2025/08/12 12:00:59] ppocr DEBUG: dt_boxes num : 63, elapsed : 0.7656245231628418
[2025/08/12 12:00:59] ppocr DEBUG: cls num  : 63, elapsed : 0.6221575736999512
[2025/08/12 12:01:30] ppocr DEBUG: rec_res num  : 63, elapsed : 30.980906009674072


PaddleOCR Pretrained Inference:  86%|████████▌ | 36/42 [11:12<01:54, 19.03s/img]

[2025/08/12 12:01:32] ppocr DEBUG: dt_boxes num : 60, elapsed : 0.8923327922821045
[2025/08/12 12:01:33] ppocr DEBUG: cls num  : 60, elapsed : 0.9168236255645752
[2025/08/12 12:02:03] ppocr DEBUG: rec_res num  : 60, elapsed : 30.055317401885986


PaddleOCR Pretrained Inference:  88%|████████▊ | 37/42 [11:44<01:55, 23.04s/img]

[2025/08/12 12:02:04] ppocr DEBUG: dt_boxes num : 36, elapsed : 0.43048524856567383
[2025/08/12 12:02:04] ppocr DEBUG: cls num  : 36, elapsed : 0.3625624179840088
[2025/08/12 12:02:26] ppocr DEBUG: rec_res num  : 36, elapsed : 22.21534299850464


PaddleOCR Pretrained Inference:  90%|█████████ | 38/42 [12:08<01:32, 23.13s/img]

[2025/08/12 12:02:28] ppocr DEBUG: dt_boxes num : 10, elapsed : 1.1304240226745605
[2025/08/12 12:02:28] ppocr DEBUG: cls num  : 10, elapsed : 0.15851521492004395
[2025/08/12 12:02:36] ppocr DEBUG: rec_res num  : 10, elapsed : 7.952355146408081


PaddleOCR Pretrained Inference:  93%|█████████▎| 39/42 [12:17<00:57, 19.04s/img]

[2025/08/12 12:02:37] ppocr DEBUG: dt_boxes num : 57, elapsed : 0.4749562740325928
[2025/08/12 12:02:37] ppocr DEBUG: cls num  : 57, elapsed : 0.5657944679260254
[2025/08/12 12:03:05] ppocr DEBUG: rec_res num  : 57, elapsed : 27.723896265029907


PaddleOCR Pretrained Inference:  95%|█████████▌| 40/42 [12:46<00:44, 22.10s/img]

[2025/08/12 12:03:06] ppocr DEBUG: dt_boxes num : 14, elapsed : 0.35507917404174805
[2025/08/12 12:03:06] ppocr DEBUG: cls num  : 14, elapsed : 0.13435673713684082
[2025/08/12 12:03:19] ppocr DEBUG: rec_res num  : 14, elapsed : 13.282071590423584


PaddleOCR Pretrained Inference:  98%|█████████▊| 41/42 [13:00<00:19, 19.67s/img]

[2025/08/12 12:03:21] ppocr DEBUG: dt_boxes num : 28, elapsed : 1.1062588691711426
[2025/08/12 12:03:21] ppocr DEBUG: cls num  : 28, elapsed : 0.5249338150024414
[2025/08/12 12:03:37] ppocr DEBUG: rec_res num  : 28, elapsed : 15.57778263092041


PaddleOCR Pretrained Inference: 100%|██████████| 42/42 [13:18<00:00, 19.01s/img]


### 학습된 PaddleOCR(FT)

In [27]:
IMG_ROOT = Path("/content/data/원천데이터/OCR")
ANN_ROOT = Path("/content/data/라벨링데이터/OCR")
SAVE_ROOT = Path("/content/ocr_label_txt")
SAVE_ROOT.mkdir(parents=True, exist_ok=True)