# Faster R-CNN
### 데이터 : coco128
- Faster R-CNN에 최적화된 데이터는 아니지만, 다른 모델과 비교하기 위해서 임의로 사용
- 모델 특징: 작은 객체 탐지에 강력하나, 학습에 자원이 많이 들고 까다로움.
---

1. coco128 기반으로 데이터 어노테이션 JSON 제작 (YOLO 라벨 → COCO 형식 변환)

In [4]:
import os
import json
from tqdm import tqdm
import cv2

def convert_yolo_to_coco(img_dir, label_dir, output_json, categories):
    """Convert one YOLO-format dataset split into a COCO-style annotation JSON.

    Every image in ``img_dir`` that has a same-named ``.txt`` file in
    ``label_dir`` becomes one entry in ``images``; every valid 5-field label
    line (``class x_center y_center width height``, normalised to [0, 1])
    becomes one entry in ``annotations`` with an absolute-pixel
    ``[x, y, width, height]`` box.

    Args:
        img_dir: Directory containing the image files.
        label_dir: Directory containing the YOLO ``.txt`` label files.
        output_json: Path of the JSON file to write. Missing parent
            directories are created.
        categories: Sequence of class names ordered by YOLO class id.
            Category ids in the output start at 1, so YOLO class ``i`` maps
            to category id ``i + 1``.
    """
    coco = {
        "images": [],
        "annotations": [],
        "categories": [
            {"id": cat_id, "name": cat_name}
            for cat_id, cat_name in enumerate(categories, 1)
        ],
    }

    ann_id = 1
    img_id = 1

    for img_file in tqdm(sorted(os.listdir(img_dir))):
        # Match extensions case-insensitively so files such as "IMG.JPG" are kept.
        if not img_file.lower().endswith(('.jpg', '.png', '.jpeg')):
            continue

        img_path = os.path.join(img_dir, img_file)
        label_path = os.path.join(label_dir, os.path.splitext(img_file)[0] + ".txt")

        # Images without a label file are left out of the dataset entirely.
        if not os.path.exists(label_path):
            continue

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising; skip it rather than crash on `.shape`.
            tqdm.write(f"Skipping unreadable image: {img_path}")
            continue
        h, w = img.shape[:2]

        coco["images"].append({
            "file_name": img_file,
            "height": h,
            "width": w,
            "id": img_id
        })

        with open(label_path, "r") as f:
            for line in f:
                parts = list(map(float, line.split()))
                # Blank lines and anything that is not a plain 5-field box
                # label are ignored.
                if len(parts) != 5:
                    continue
                class_id, x_center, y_center, width, height = parts

                # YOLO stores a normalised centre/size box; COCO wants the
                # absolute top-left corner plus size in pixels.
                x = (x_center - width / 2) * w
                y = (y_center - height / 2) * h
                bw = width * w
                bh = height * h

                coco["annotations"].append({
                    "id": ann_id,
                    "image_id": img_id,
                    "category_id": int(class_id) + 1,  # categories are 1-based
                    "bbox": [x, y, bw, bh],
                    "area": bw * bh,
                    "iscrowd": 0
                })
                ann_id += 1

        img_id += 1

    # Make sure the destination directory exists before writing.
    out_dir = os.path.dirname(output_json)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_json, "w") as f:
        json.dump(coco, f, indent=4)

    print(f"Saved COCO JSON to: {output_json}")


if __name__ == "__main__":
    # Configuration: the 80 class names, ordered so that the list index
    # equals the YOLO class id used in the label files.
    categories = [
        "person",
        # vehicles
        "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        # street objects
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
        # animals
        "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
        # accessories
        "backpack", "umbrella", "handbag", "tie", "suitcase",
        # sports
        "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
        # kitchenware
        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
        # food
        "banana", "apple", "sandwich", "orange", "broccoli",
        "carrot", "hot dog", "pizza", "donut", "cake",
        # furniture
        "chair", "couch", "potted plant", "bed", "dining table", "toilet",
        # electronics
        "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        # appliances
        "microwave", "oven", "toaster", "sink", "refrigerator",
        # indoor items
        "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
    ]
    root_dir = "datasets"

    # Convert each split in turn (train first, then val); both share the same
    # directory layout and differ only in the split name and output file.
    for split, json_name in (
        ("train", "instances_train.json"),
        ("val", "instances_val.json"),
    ):
        convert_yolo_to_coco(
            img_dir=os.path.join(root_dir, "images", split),
            label_dir=os.path.join(root_dir, "labels", split),
            output_json=os.path.join(root_dir, "annotations", json_name),
            categories=categories,
        )


100%|██████████| 103/103 [00:00<00:00, 764.70it/s]


Saved COCO JSON to: datasets\annotations\instances_train.json


100%|██████████| 25/25 [00:00<00:00, 605.78it/s]

Saved COCO JSON to: datasets\annotations\instances_val.json





2. 사전학습 모델을 가져와 카테고리(클래스) 값만 교체

In [5]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# One output class per dataset category, plus one extra slot for background.
num_classes = len(categories) + 1  # includes background

# Load torchvision's Faster R-CNN (ResNet-50 + FPN) with pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Read the input width of the existing classification layer, then replace the
# whole box predictor with a fresh one sized for `num_classes`.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)




3. Faster R-CNN 모델 학습용 데이터 가공

In [None]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import json
import torchvision.transforms as T

# Custom Dataset for COCO-style annotation files
class CocoLikeDataset(Dataset):
    """Detection dataset backed by an image directory and a COCO-style JSON.

    Each item is an ``(image, target)`` pair. ``target`` is a dict with the
    keys ``boxes`` (float32, ``[N, 4]`` as ``[x_min, y_min, x_max, y_max]``),
    ``labels`` (int64, ``[N]``), ``image_id`` (``[1]``), ``area`` (float32,
    ``[N]``) and ``iscrowd`` (int64, ``[N]``).

    NOTE: ``transforms`` is applied to the image only; the target is returned
    untouched, so geometric augmentations passed here will not move the boxes.
    """

    def __init__(self, img_dir, ann_file, transforms=None):
        """Load the annotation file and index annotations by image id.

        Args:
            img_dir: Directory containing the image files.
            ann_file: Path to the COCO-style annotation JSON.
            transforms: Optional callable applied to the loaded PIL image.
        """
        self.img_dir = img_dir
        self.transforms = transforms

        # Load the annotation JSON.
        with open(ann_file, 'r', encoding='utf-8') as f:
            self.coco = json.load(f)

        # Map category id -> category name.
        self.categories = {cat['id']: cat['name'] for cat in self.coco['categories']}

        # Group annotations by the image they belong to.
        self.image_id_to_annotations = {}
        for ann in self.coco['annotations']:
            self.image_id_to_annotations.setdefault(ann['image_id'], []).append(ann)

        # List of image info records; defines dataset order and length.
        self.images = self.coco['images']

    def _build_target(self, image_id):
        """Build the target dict for ``image_id`` from its annotations."""
        boxes = []     # [x_min, y_min, x_max, y_max]
        labels = []    # category ids
        areas = []     # precomputed object areas
        iscrowd = []   # crowd flag (0: single object, 1: crowd)

        for ann in self.image_id_to_annotations.get(image_id, []):
            # COCO format: [x, y, width, height]
            x_min, y_min, box_w, box_h = ann['bbox'][:4]
            # torchvision detection models reject boxes without positive
            # width and height, so drop degenerate annotations here.
            if box_w <= 0 or box_h <= 0:
                continue
            boxes.append([x_min, y_min, x_min + box_w, y_min + box_h])
            labels.append(ann['category_id'])
            areas.append(ann['area'])
            iscrowd.append(ann.get('iscrowd', 0))  # default to 0 when missing

        return {
            # reshape keeps the [N, 4] layout even when N == 0; a bare empty
            # list would otherwise become a tensor of shape [0].
            "boxes": torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.as_tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([image_id]),
            "area": torch.as_tensor(areas, dtype=torch.float32),
            "iscrowd": torch.as_tensor(iscrowd, dtype=torch.int64),
        }

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair at position ``idx``."""
        img_info = self.images[idx]
        img_path = os.path.join(self.img_dir, img_info['file_name'])
        img = Image.open(img_path).convert("RGB")

        target = self._build_target(img_info['id'])

        # Apply the image transform (ToTensor, augmentation, ...), if any.
        if self.transforms:
            img = self.transforms(img)

        return img, target

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.images)


4. 모델학습

In [8]:
from torch.utils.data import DataLoader
import torch
import time

# Build the training and validation datasets from the image folders and the
# COCO-style annotation JSON files under datasets/.
# NOTE(review): `get_transform` is not defined in the visible part of this
# file — confirm it is defined before this cell runs.
# NOTE(review): CocoLikeDataset applies `transforms` to the image only, so any
# geometric augmentation returned by get_transform(train=True) would leave the
# boxes unchanged — verify.
train_dataset = CocoLikeDataset(
    img_dir="datasets/images/train",
    ann_file="datasets/annotations/instances_train.json",
    transforms=get_transform(train=True)
)
val_dataset = CocoLikeDataset(
    img_dir="datasets/images/val",
    ann_file="datasets/annotations/instances_val.json",
    transforms=get_transform(train=False)
)

def collate_fn(batch):
    """Transpose a batch of samples into one tuple per sample field.

    A list of ``(image, target)`` pairs becomes ``(images, targets)``, so
    items of differing size are kept as separate elements rather than being
    stacked into a single tensor.
    """
    columns = zip(*batch)
    return tuple(columns)

# Batch both datasets; only the training split is shuffled.
train_loader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Report the number of parameters that require gradients.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Trainable Parameters: {num_params:,}")

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)

num_epochs = 50
start_time = time.time()  # training start time

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, targets in train_loader:
        # Move every image and every target tensor onto the training device.
        images = [image.to(device) for image in images]
        targets = [
            {key: value.to(device) for key, value in target.items()}
            for target in targets
        ]

        # Called with targets, the model returns a dict of losses; their sum
        # is the quantity that is back-propagated.
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        total_loss += losses.item()
    # Mean loss per batch over this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

end_time = time.time()  # training end time
elapsed_time = end_time - start_time
print(f"\nTotal Training Time: {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")


Total Trainable Parameters: 41,481,636
Epoch 1, Loss: 0.8909
Epoch 2, Loss: 0.8434
Epoch 3, Loss: 0.8598
Epoch 4, Loss: 0.8807
Epoch 5, Loss: 0.8425
Epoch 6, Loss: 0.8520
Epoch 7, Loss: 0.7724
Epoch 8, Loss: 0.7576
Epoch 9, Loss: 0.7457
Epoch 10, Loss: 0.6941
Epoch 11, Loss: 0.6681
Epoch 12, Loss: 0.6897
Epoch 13, Loss: 0.6568
Epoch 14, Loss: 0.6153
Epoch 15, Loss: 0.6215
Epoch 16, Loss: 0.6244
Epoch 17, Loss: 0.5546
Epoch 18, Loss: 0.5749
Epoch 19, Loss: 0.5390
Epoch 20, Loss: 0.5056
Epoch 21, Loss: 0.5425
Epoch 22, Loss: 0.5339
Epoch 23, Loss: 0.5183
Epoch 24, Loss: 0.4876
Epoch 25, Loss: 0.4741
Epoch 26, Loss: 0.4683
Epoch 27, Loss: 0.4667
Epoch 28, Loss: 0.4930
Epoch 29, Loss: 0.4273
Epoch 30, Loss: 0.4087
Epoch 31, Loss: 0.4127
Epoch 32, Loss: 0.4239
Epoch 33, Loss: 0.3789
Epoch 34, Loss: 0.4109
Epoch 35, Loss: 0.3917
Epoch 36, Loss: 0.3542
Epoch 37, Loss: 0.3729
Epoch 38, Loss: 0.4004
Epoch 39, Loss: 0.3645
Epoch 40, Loss: 0.3513
Epoch 41, Loss: 0.3438
Epoch 42, Loss: 0.3243
Epoc