In [9]:
# ========================================
# Cable Block Diagram 자동 구조화 프로토타입
# ========================================

import cv2
import torch
import numpy as np
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import json

# ========================================
# 1️⃣ PDF → image conversion (pdf2image could be used here)
# ========================================
# This prototype assumes the page has already been exported as an image file.
image_path = "./data/test_image.JPG"
image = cv2.imread(image_path)
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded, so fail here with a clear message instead of letting
# the detector crash on a None input.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# ========================================
# 2️⃣ Text Detection (YOLOv8)
# ========================================
# pip install ultralytics
from ultralytics import YOLO

# NOTE(review): "yolov8n.pt" looks like the generic pretrained YOLOv8-nano
# detection checkpoint rather than a text detector -- replace with weights
# trained for text regions before relying on these boxes.
text_model = YOLO("yolov8n.pt")
text_results = text_model.predict(image)

# Collect every detection of the first (only) image as an integer
# [x1, y1, x2, y2] pixel box.
text_bboxes = [
    [int(coord) for coord in box.tolist()]
    for box in text_results[0].boxes.xyxy  # xyxy format
]


0: 448x640 (no detections), 1.6ms
Speed: 1.1ms preprocess, 1.6ms inference, 0.2ms postprocess per image at shape (1, 3, 448, 640)


In [11]:
from ultralytics import YOLO
import cv2

# 1️⃣ Read the image
image_path = "./data/test_image.JPG"
image = cv2.imread(image_path)
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded; stop early with an explicit error.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# 2️⃣ Load the YOLO model used as the "text" detector
# NOTE(review): the checkpoint loaded here is "yolov8n-seg.pt", not a
# text-detection model; the recorded run of this cell reports generic object
# classes (refrigerator, clock). Swap in text-detection weights before
# treating the resulting boxes as text regions.
text_model = YOLO("yolov8n-seg.pt")

# 3️⃣ Inference
# A low confidence threshold is used so that small characters are kept.
text_results = text_model.predict(source=image, conf=0.1, imgsz=640)

# 4️⃣ Extract bounding boxes as integer [x1, y1, x2, y2] pixel coordinates
text_bboxes = [
    [int(coord) for coord in box.tolist()]
    for box in text_results[0].boxes.xyxy  # xyxy format
]

print("Detected text bboxes:", text_bboxes)


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-seg.pt to 'yolov8n-seg.pt': 100% ━━━━━━━━━━━━ 6.7MB 88.9MB/s 0.1s

0: 448x640 2 refrigerators, 1 clock, 4.4ms
Speed: 1.0ms preprocess, 4.4ms inference, 161.0ms postprocess per image at shape (1, 3, 448, 640)
Detected text bboxes: [[145, 31, 588, 756], [0, 30, 659, 752], [236, 91, 313, 160]]


In [13]:
# Display the detected boxes; each entry is [x1, y1, x2, y2] in integer pixels.
text_bboxes

[[145, 31, 588, 756], [0, 30, 659, 752], [236, 91, 313, 160]]

In [15]:
# ========================================
# 3️⃣ OCR (TrOCR)
# ========================================
# Imported once here instead of on every loop iteration.
from PIL import Image  # the TrOCR processor is fed PIL images below

# NOTE(review): this is the *handwritten* TrOCR checkpoint; if the diagram
# labels are printed text, a printed-text checkpoint may fit better -- confirm.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
ocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# One recognised string per entry of text_bboxes, in the same order.
ocr_texts = []
for x1, y1, x2, y2 in text_bboxes:
    crop = image[y1:y2, x1:x2]
    if crop.size == 0:
        # A zero-area box would make cv2.cvtColor raise. Store a placeholder
        # so ocr_texts stays index-aligned with text_bboxes (later cells zip
        # the two lists together).
        ocr_texts.append("")
        continue

    # OpenCV images are BGR; convert to RGB before wrapping as a PIL image.
    crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

    pixel_values = processor(images=crop_pil, return_tensors="pt").pixel_values
    generated_ids = ocr_model.generate(pixel_values)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    ocr_texts.append(text)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# ========================================
# 4️⃣ Structure Detection (Boxes, Lines, Symbols)
# ========================================
# NOTE(review): the intent is a YOLO model fine-tuned on diagram structures
# (BOX_SOLID, BOX_DASH, LINE, SYMBOL), but the checkpoint loaded here is the
# generic "yolov8n-seg.pt" -- swap in the fine-tuned weights when available.
structure_model = YOLO("yolov8n-seg.pt")

pad = 20  # pixels of context added around each text box

# Detections from all crops, with boxes expressed in full-image coordinates.
structure_results = []
for x1, y1, x2, y2 in text_bboxes:
    # Padded neighbourhood of the text box. The crop origin is clamped to the
    # image; the far edges need no clamp because array slicing stops at the
    # image border.
    left = max(0, x1 - pad)
    top = max(0, y1 - pad)
    crop = image[top:y2 + pad, left:x2 + pad]
    if crop.size == 0:
        # Nothing to run the detector on for a degenerate box.
        continue

    results = structure_model.predict(crop)
    for x_s, y_s, x_e, y_e, conf, cls in results[0].boxes.data.tolist():
        # Map crop-local coordinates back to the full image using the actual
        # crop origin. Using x1 - pad / y1 - pad here would shift boxes
        # whenever the origin was clamped to 0 near the left/top edge.
        structure_results.append({
            "bbox": [int(x_s) + left, int(y_s) + top, int(x_e) + left, int(y_e) + top],
            "class": int(cls),
            "confidence": float(conf)
        })





0: 640x416 1 refrigerator, 27.2ms
Speed: 0.8ms preprocess, 27.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 416)

0: 640x576 (no detections), 11.9ms
Speed: 1.1ms preprocess, 11.9ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 576)

0: 608x640 (no detections), 11.4ms
Speed: 0.6ms preprocess, 11.4ms inference, 0.1ms postprocess per image at shape (1, 3, 608, 640)


In [24]:
# ========================================
# 5️⃣ Embedding Generation (Text + Structure)
# ========================================
# 여기는 placeholder, 실제로는 Transformer encoder 또는 CNN+Geom vector
def generate_embedding(text, structures):
    """
    Build a placeholder two-element feature vector for one text item.

    text: string (only its length is used)
    structures: list of dict with bbox/class (only the count is used)
    return: np.array of the form [len(text), len(structures)]
    """
    # Stand-in features: character count and number of related structures.
    n_chars = len(text)
    n_structs = len(structures)
    return np.array([n_chars, n_structs])

# Margin (pixels) by which a text box is expanded when looking for the
# structures that belong to it.
margin = 20

embeddings = []
for text, bbox in zip(ocr_texts, text_bboxes):
    tx1, ty1, tx2, ty2 = bbox
    # Keep only structures lying inside the margin-expanded text box.
    # Both axes are tested: checking the x-extent alone would also accept
    # structures far above or below the text in the same column.
    related_structs = [
        s for s in structure_results
        if s['bbox'][0] >= tx1 - margin and s['bbox'][2] <= tx2 + margin
        and s['bbox'][1] >= ty1 - margin and s['bbox'][3] <= ty2 + margin
    ]
    embeddings.append(generate_embedding(text, related_structs))


In [25]:
# ========================================
# 6️⃣ Embedding Inversion → Spec List
# ========================================
# Placeholder: a real implementation needs a trained Transformer decoder.
# For now each node simply bundles the OCR text, its box, the structures
# found around it and the embedding vector.
spec_margin = 20  # pixels by which a text box is expanded when matching structures

spec_list = []
for text, bbox, embedding in zip(ocr_texts, text_bboxes, embeddings):
    tx1, ty1, tx2, ty2 = bbox
    # Structures inside the margin-expanded text box. Both axes are tested:
    # checking the x-extent alone would also accept structures far above or
    # below the text in the same column.
    nearby = [
        s for s in structure_results
        if s['bbox'][0] >= tx1 - spec_margin and s['bbox'][2] <= tx2 + spec_margin
        and s['bbox'][1] >= ty1 - spec_margin and s['bbox'][3] <= ty2 + spec_margin
    ]
    spec_list.append({
        "text": text,
        "bbox": bbox,
        "related_structures": nearby,
        "embedding": embedding.tolist()  # plain list so the node is JSON-serialisable
    })


In [None]:
# ========================================
# 7️⃣ JSON output
# ========================================
# NOTE(review): the input image is read from "./data/" while this file is
# written under "../data/" -- confirm the differing base directory is intended.
output = {"nodes": spec_list}

# Serialise all nodes under a single "nodes" key, pretty-printed with a
# two-space indent.
with open("../data/cable_diagram_structured.json", "w") as json_file:
    json.dump(output, json_file, indent=2)

print("✅ 구조화 JSON 출력 완료!")

✅ 구조화 JSON 출력 완료!
