# Object Detection in PyTorch

In [60]:
import torch
import torchvision
import torchvision.transforms as T

import numpy as np
import matplotlib.pyplot as plt
import PIL.Image as Image

import os

%matplotlib inline

## 경로 탐색

In [2]:
# Root directory of the Penn-Fudan pedestrian dataset (relative to the notebook);
# list its top-level entries.

data_path = 'data/PennFudanPed'
os.listdir(data_path)

['added-object-list.txt', 'Annotation', 'PedMasks', 'PNGImages', 'readme.txt']

In [3]:
# First five entries of the image directory (PNGImages).
os.listdir(os.path.join(data_path,'PNGImages'))[:5]

['FudanPed00001.png',
 'FudanPed00002.png',
 'FudanPed00003.png',
 'FudanPed00004.png',
 'FudanPed00005.png']

In [4]:
# First five entries of the segmentation-mask directory (PedMasks).
os.listdir(os.path.join(data_path, 'PedMasks'))[:5]

['FudanPed00001_mask.png',
 'FudanPed00002_mask.png',
 'FudanPed00003_mask.png',
 'FudanPed00004_mask.png',
 'FudanPed00005_mask.png']

## Dataset 생성

In [5]:
class PennFudanDataset(object):
    """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

    ``root`` must contain a ``PNGImages`` directory with the images and a
    ``PedMasks`` directory with one mask per image.  In a mask, pixel value 0
    is background and every other distinct value identifies one pedestrian
    instance.

    Args:
        root: Path of the dataset directory.
        transforms: Optional callable invoked as ``transforms(img, target)``
            that returns the transformed ``(img, target)`` pair, or ``None``.
    """

    def __init__(self, root, transforms):
        # Honour the caller-supplied root instead of a notebook-level global.
        self.root = root
        self.transforms = transforms

        # Sort both listings so the i-th image lines up with the i-th mask.
        self.imgs = sorted(os.listdir(os.path.join(root, 'PNGImages')))
        self.masks = sorted(os.listdir(os.path.join(root, 'PedMasks')))

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its detection/segmentation target.

        Returns:
            ``(img, target)`` where ``target`` is a dict with the keys
            ``boxes`` (N x 4 float32, ``[xmin, ymin, xmax, ymax]``),
            ``labels`` (N int64, all 1), ``masks`` (N x H x W uint8),
            ``image_id``, ``area`` and ``iscrowd``.
        """
        img_path = os.path.join(self.root, 'PNGImages', self.imgs[idx])
        mask_path = os.path.join(self.root, 'PedMasks', self.masks[idx])

        img = Image.open(img_path).convert('RGB')

        # The mask is not converted to RGB: its raw pixel values are the
        # instance ids (0 = background).
        mask = np.array(Image.open(mask_path))

        # Distinct instance ids, with the background id filtered out
        # explicitly rather than assuming it is always the first entry.
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # Broadcasting (N, 1, 1) against (H, W) gives an (N, H, W) boolean
        # array: one binary mask per instance.
        masks = mask == obj_ids[:, None, None]

        num_objs = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            # Row indices are y and column indices are x, so rows/cols map to
            # pos[0]/pos[1].  The box is the tightest rectangle around all
            # True pixels of this instance mask.
            pos = np.where(instance_mask)
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        # reshape keeps an (0, 4) shape when the image has no instances.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)

        # Single foreground class (person = 1); class labels must be int64.
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])

        # Box area: (ymax - ymin) * (xmax - xmin).
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # COCO-style crowd flag; all zeros, i.e. no instance is marked as a
        # crowd region.
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target['boxes'] = boxes
        target['labels'] = labels
        target['masks'] = masks
        target['image_id'] = image_id
        target['area'] = area
        target['iscrowd'] = iscrowd

        if self.transforms is not None:
            # The transform receives both image and target and must return
            # both, so geometric changes can be mirrored on boxes and masks.
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.imgs)

## Model 불러오기
- faster-rcnn 이용
- COCO에서 pre-train된 모델 불러오기

### 1. Predictor 부분 FineTuning
- 마지막 box를 예측하는 부분을 수정
- 기존 box predictor(91개 클래스)를 새 `FastRCNNPredictor`(2개 클래스)로 교체

In [6]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [7]:
# Faster R-CNN with a ResNet-50 backbone; pretrained=True requests the
# pre-trained weights instead of a random initialisation.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True) # Feature Pyramid Network

In [8]:
num_classes = 2 # 1 : person, 0 : background

In [9]:
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

In [10]:
model.roi_heads.box_predictor.cls_score.in_features

1024

In [11]:
in_features = model.roi_heads.box_predictor.cls_score.in_features

In [12]:
model.roi_heads.box_predictor

FastRCNNPredictor(
  (cls_score): Linear(in_features=1024, out_features=91, bias=True)
  (bbox_pred): Linear(in_features=1024, out_features=364, bias=True)
)

In [13]:
FastRCNNPredictor(in_features, num_classes)

FastRCNNPredictor(
  (cls_score): Linear(in_features=1024, out_features=2, bias=True)
  (bbox_pred): Linear(in_features=1024, out_features=8, bias=True)
)

In [14]:
# Replace the pre-trained box head with a freshly initialised one that
# predicts num_classes classes from the same in_features-wide input.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

### 2. Backbone 수정하기

In [15]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

In [19]:
backbone = torchvision.models.mobilenet_v2(pretrained=True).features

In [22]:
# FasterRCNN reads the backbone's `out_channels` attribute to size its heads.
# NOTE(review): 1280 is presumably the channel count of the last MobileNetV2
# feature map — confirm against the backbone printout.
backbone.out_channels = 1280

In [42]:
# AnchorGenerator expects one tuple of sizes and one tuple of aspect ratios
# PER FEATURE MAP.  This backbone is used as a single feature map, so each
# outer tuple holds exactly one inner tuple; the trailing commas are what
# make them tuples-of-tuples (without them the parentheses are a no-op).
# Result: 5 sizes x 3 aspect ratios = 15 anchors per spatial location.
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

In [46]:
# RoI pooling over the feature map(s) named in featmap_names; each region is
# resampled to output_size x output_size (7 x 7) with sampling_ratio=2.
# NOTE(review): featmap_names=[0] uses an integer key; newer torchvision
# releases appear to expect string names (['0']) — confirm against the
# installed version.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                                output_size=7,
                                                sampling_ratio=2)

In [48]:
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d()
          )
  

In [49]:
# Assemble a Faster R-CNN from the custom parts defined above: the MobileNetV2
# feature extractor, the anchor generator for the RPN, and the RoI pooler.
# This rebinds `model`, discarding the ResNet-50 model built earlier.
model = FasterRCNN(backbone,
                   num_classes=2,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

In [50]:
model

FasterRCNN(
  (transform): GeneralizedRCNNTransform()
  (backbone): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm

### 3. Mask를 위한 모델 추가

In [53]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

In [57]:
def get_model_instance_segmentation(num_classes):
    """Build a pre-trained Mask R-CNN whose heads predict ``num_classes`` classes.

    Both the box predictor and the mask predictor of the pre-trained
    ResNet-50-FPN Mask R-CNN are replaced by freshly initialised ones sized
    for ``num_classes``; the rest of the network is kept as loaded.

    Args:
        num_classes: Number of output classes (background included).

    Returns:
        The modified Mask R-CNN model.
    """
    mask_hidden_channels = 256

    net = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = net.roi_heads

    # New box head, fed by the same feature width as the old classifier.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # New mask head, fed by the same channel count as the old one.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(
        mask_in_channels, mask_hidden_channels, num_classes
    )

    return net

In [61]:
class _DetectionTransform(object):
    """Transform applied jointly to an image and its detection target.

    ``torchvision.transforms.Compose`` only accepts a single image argument,
    so it cannot be called as ``transforms(img, target)`` and cannot keep
    boxes and masks in sync with a flipped image.  This callable converts the
    image to a tensor and, in training mode, flips image, boxes and masks
    together with probability ``flip_prob``.
    """

    def __init__(self, train, flip_prob=0.5):
        self.train = train
        self.flip_prob = flip_prob
        self.to_tensor = T.ToTensor()

    def __call__(self, img, target=None):
        img = self.to_tensor(img)

        if self.train and torch.rand(1).item() < self.flip_prob:
            # Flip along the last (width) axis.
            img = img.flip(-1)
            if target is not None:
                width = img.shape[-1]
                boxes = target.get('boxes')
                if boxes is not None and boxes.numel() > 0:
                    # Mirror the x coordinates; xmin and xmax swap roles.
                    boxes[:, [0, 2]] = width - boxes[:, [2, 0]]
                    target['boxes'] = boxes
                if 'masks' in target:
                    target['masks'] = target['masks'].flip(-1)

        # Called with an image only: behave like a plain image transform.
        if target is None:
            return img
        return img, target


def get_transform(train):
    """Return the preprocessing callable for the dataset.

    Args:
        train: When true, a random horizontal flip (p=0.5) is applied in
            addition to the tensor conversion.

    Returns:
        A callable usable as ``transform(img, target) -> (img, target)``;
        when called with an image only it returns just the image tensor.
    """
    return _DetectionTransform(train)

## Training and Evaluating

In [None]:
def _collate_fn(batch):
    """Regroup a list of (image, target) samples into (images, targets) tuples.

    Images in a batch may differ in size, so they are kept as a tuple instead
    of being stacked into one tensor.
    """
    return tuple(zip(*batch))


def main(num_epochs=10):
    """Fine-tune the instance-segmentation model on the Penn-Fudan dataset.

    The last 50 samples of a random permutation are held out as the test
    split; the rest is used for training.

    Args:
        num_epochs: Number of passes over the training split.

    Returns:
        ``(model, data_loader_test)``: the trained model and the loader over
        the held-out split, so evaluation can be run by the caller.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    num_classes = 2

    dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
    # The held-out split must not receive training-time augmentation.
    dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False))

    # One shared random permutation keeps the two splits disjoint.
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=2,
                                              shuffle=True, num_workers=1,
                                              collate_fn=_collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=1,
                                                   shuffle=False, num_workers=1,
                                                   collate_fn=_collate_fn)

    model = get_model_instance_segmentation(num_classes)
    model.to(device)

    # Only optimise parameters that are not frozen.
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

    # Divide the learning rate by 10 every 3 epochs.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    for _ in range(num_epochs):
        model.train()
        for images, targets in data_loader:
            # The batch is a tuple of images and a tuple of target dicts;
            # move every tensor individually.
            images = [image.to(device) for image in images]
            targets = [{key: value.to(device) for key, value in target.items()}
                       for target in targets]

            # In training mode the detection model returns a dict of losses.
            loss_dict = model(images, targets)
            loss = sum(loss_dict.values())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        lr_scheduler.step()

    return model, data_loader_test
            

In [84]:
from data.engine import train_one_epoch, evaluate

ModuleNotFoundError: No module named 'coco_utils'

In [76]:
torch.utils.data.Subset(dataset, i[:-50])

<torch.utils.data.dataset.Subset at 0x249d8c60a58>

In [67]:
# Scratch check: build the dataset and a random permutation of its indices
# (`i` is the index list used for the train/test split).
dataset = PennFudanDataset('data/PennFudanPed', get_transform(train=True))
i = torch.randperm(len(dataset)).tolist()

In [325]:
# Step-by-step walkthrough of the dataset's target construction, starting
# from the sorted list of mask file names.
mask_list = sorted(os.listdir(os.path.join(data_path, 'PedMasks')))

In [333]:
# Load the first mask as a 2-D array; its shape is (rows, columns).
mask_path = os.path.join(data_path, 'PedMasks', mask_list[0])
mask = Image.open(mask_path)
mask = np.array(mask)
mask.shape

(536, 559)

In [345]:
# Distinct pixel values in the mask are the instance ids.
object_id = np.unique(mask)

# Drop the first (smallest) value, which is the background id.
object_id = object_id[1:]
object_id

array([1, 2], dtype=uint8)

In [349]:
# Broadcast the ids, shaped (N, 1, 1), against the 2-D mask to get one
# boolean mask per instance.
masks = mask == object_id[:, None, None]
masks

array([[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]],

       [[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]])

In [376]:
# Indices of the True pixels of the second instance mask:
# pos[0] holds row (y) indices, pos[1] holds column (x) indices.
pos = np.where(masks[1])
pos

(array([170, 170, 170, ..., 485, 485, 485], dtype=int64),
 array([452, 453, 454, ..., 514, 515, 516], dtype=int64))

In [373]:
# Tightest bounding box around the instance: min/max over columns give the
# x extent, min/max over rows give the y extent.
xmin = np.min(pos[1])
ymin = np.min(pos[0])
xmax = np.max(pos[1])
ymax = np.max(pos[0])
xmin, ymin, xmax, ymax

(419, 170, 534, 485)

In [374]:
# NOTE(review): `boxes` must already exist as a list here; its initialisation
# is left commented out below, presumably so this cell can be re-run once per
# instance mask and keep accumulating boxes.
#boxes = []
boxes.append([xmin, ymin, xmax, ymax])
boxes

[[159, 181, 301, 430], [419, 170, 534, 485]]

In [378]:
# Convert the list of [xmin, ymin, xmax, ymax] boxes to a float tensor.
boxes = torch.as_tensor(boxes, dtype=torch.float)
boxes

tensor([[159., 181., 301., 430.],
        [419., 170., 534., 485.]])

In [380]:
object_num = len(object_id)
object_num

2

In [413]:
# One int64 class label per instance; every instance gets label 1.
labels = torch.ones((object_num,), dtype=torch.int64)
labels

tensor([1, 1])

In [415]:
# Boolean instance masks as a uint8 tensor (True -> 1, False -> 0).
masks = torch.as_tensor(masks, dtype=torch.uint8)
masks

tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8)

In [423]:
image_id = torch.tensor([0])
image_id

tensor([0])

In [434]:
# Box area per instance: (ymax - ymin) * (xmax - xmin).
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:,2] - boxes[:,0])
area

tensor([35358., 36225.])

In [441]:
# The `iscrowd` field of the target: one int64 zero per instance.
torch.zeros((object_num), dtype=torch.int64)

tensor([0, 0])