In [1]:
import torch
import torchvision.models
import cv2
import torchvision.transforms as transforms
import numpy as np
from PIL import Image
import os, sys

In [2]:
# Category names looked up by integer label id: predict() indexes this list with
# the labels returned by the detector, so the order of the entries matters.
# Index 0 is '__background__'; the 'N/A' entries are presumably placeholders for
# label ids that have no category -- TODO confirm against the model's label map.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [3]:
# One random 3-channel colour (values in [0, 255)) per category name; draw_boxes()
# picks the row matching a detection's class. No seed is set, so the colours
# change every time this cell is re-run.
COLORS = np.random.uniform(0, 255, size=(len(COCO_INSTANCE_CATEGORY_NAMES), 3))

In [4]:
# Pre-processing applied by predict() to each input image before it is sent to
# the model: only a conversion to a tensor, no resizing or normalisation here.
transform = transforms.Compose([
    transforms.ToTensor(),
])

In [5]:
def predict(image, model, device, detection_threshold):
    """
    Run the detector on a single image and keep the confident detections.

    Args:
        image: input accepted by the module-level ``transform`` (ToTensor),
            presumably a PIL image or an HxWxC array -- confirm with callers.
        model: detection model; calling it on a batch must return a list with
            one dict per image holding 'scores', 'boxes' and 'labels' tensors.
        device: torch device the image tensor is moved to before inference.
        detection_threshold: minimum score (inclusive) a detection needs in
            order to be kept.

    Returns:
        A tuple ``(boxes, pred_classes)``:
            boxes: np.int32 array with the kept bounding boxes, one row each.
            pred_classes: list of category names, one per row of ``boxes``
                and in the same order.
    """
    # transform the image to tensor
    image = transform(image).to(device)
    image = image.unsqueeze(0) # add a batch dimension
    with torch.no_grad():
        outputs = model(image) # get the predictions on the image
    detections = outputs[0]  # single-image batch -> first (only) result

    scores = detections['scores'].detach().cpu().numpy()
    # A single boolean mask selects both boxes and labels. Looking indices up
    # with list.index() returned the first match for duplicated scores, and
    # mixing '>' with '>=' could leave boxes and classes with different sizes.
    keep = scores >= detection_threshold

    boxes = detections['boxes'].detach().cpu().numpy()[keep].astype(np.int32)
    labels = detections['labels'].detach().cpu().numpy()[keep]
    pred_classes = [COCO_INSTANCE_CATEGORY_NAMES[label] for label in labels]
    return boxes, pred_classes

In [6]:
def draw_boxes(boxes, classes, image):
    """
    Annotate ``image`` with one rectangle and one text caption per detection.

    Args:
        boxes: sequence of boxes; entries 0-3 of each box are used as
            x1, y1, x2, y2 and are truncated to int before drawing.
        classes: category name of each box, aligned with ``boxes``.
        image: object handed to the cv2 drawing calls.

    Returns:
        The same ``image`` object that was passed in.
    """
    for idx, box in enumerate(boxes):
        label = classes[idx]
        # every category owns one row of COLORS, found through its list position
        colour = COLORS[COCO_INSTANCE_CATEGORY_NAMES.index(label)]

        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        cv2.rectangle(image, top_left, bottom_right, colour, 2)

        # caption is anchored 5 px above the box's top-left corner
        text_origin = (int(box[0]), int(box[1]-5))
        cv2.putText(image, label, text_origin,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, colour, 2,
                    lineType=cv2.LINE_AA)
    return image

def extract_objects(bboxes, classes, image):
    """
    Crop every bounding box out of ``image``.

    Args:
        bboxes: iterable of boxes; entries 0-3 of each box are passed to
            ``image.crop`` as (left, upper, right, lower).
        classes: category names of the boxes (not used here; kept so the
            signature matches the other extraction helper).
        image: object exposing ``crop(box_tuple)``, e.g. a PIL image.

    Returns:
        List with one cropped image per box, in the order of ``bboxes``.
    """
    # Iterate the argument itself: reading a global named `boxes` made the
    # result depend on whatever another notebook cell had last assigned.
    return [image.crop((box[0], box[1], box[2], box[3])) for box in bboxes]

def extract_objects2(bboxes, classes, image, image_name, domain='rainy'):
    """
    Crop every bounding box out of ``image`` and save each crop as a PNG file.

    Crops are written to ``INIT_dataset/ssd/ssd_<domain>_<class>/`` as
    ``<image_name>_o<k>.png`` where ``k`` is the 1-based position of the box.
    The class folder is created when it does not exist yet.

    Args:
        bboxes: iterable of boxes; entries 0-3 of each box are passed to
            ``image.crop`` as (left, upper, right, lower).
        classes: category name of each box, aligned with ``bboxes``.
        image: object exposing ``crop(box_tuple)``, e.g. a PIL image.
        image_name: base name (no extension) used for the output files.
        domain: label inserted in the output folder name.

    Returns:
        None. The function only writes files.
    """
    # Iterate the argument itself: reading a global named `boxes` made the
    # result depend on whatever another notebook cell had last assigned.
    for i, box in enumerate(bboxes):
        img = image.crop((box[0], box[1], box[2], box[3]))

        directory = 'INIT_dataset/ssd/ssd_' + domain + '_' + classes[i]
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(directory, exist_ok=True)

        img.save(directory + '/' + image_name + '_o' + str(i+1) + '.png')

In [7]:
# Parameters that adjust the sensitivity of the RetinaNet object extraction:
#   min_s     - forwarded to the model constructor as `min_size`
#   threshold - minimum detection score, handed to predict() by the extraction loop
min_s = 1200
threshold = 0.5

# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` -- confirm against the installed version before changing it.
model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True, min_size=min_s)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Switch to inference mode and move the model to the device; being the last
# expression of the cell, this also displays the model's layer summary.
model.eval().to(device)

  return torch._C._cuda_getDeviceCount() > 0


RetinaNet(
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(256, eps=0.0)


In [8]:
# Collect the path of every file found under the chosen domain directory.

# Available domain folders: rainy, cloudy, night, sunny
domain = 'INIT_dataset/rainy'

imgs_name = []
# os.walk descends into all sub-directories; files are not filtered by
# extension, so anything stored under `domain` ends up in the list.
for root, dirs, files in os.walk(domain):
    for i in files:
        imgs_name.append(os.path.join(root, i))
#for i in range(0, len(imgs)):
#    imgs[i] = os.path.basename(imgs[i])
# Sanity checks: number of files found, one sample path, and that sample's base
# name with its last four characters (the ".png" extension here) stripped.
print(len(imgs_name))
print(imgs_name[10])
print((os.path.basename(imgs_name[10]))[:-4])

2226
INIT_dataset/rainy/video_data_20180731/jig_KYT01/20180720_KYT/dw_2018_07_20_12-48-39_000000_20fps_bae/camera_2_center_fov60.h264/13_00533.png
13_00533


In [9]:
# change to sunny, cloudy or night to apply on the other domains
# (keep it in sync with the folder the image paths were collected from)
dom = 'rainy'

# Run the detector on every collected image and save each detected object as a
# separate crop, grouped by domain and class.
for i in range(0, len(imgs_name)):
    image = Image.open(imgs_name[i]).convert('RGB')
    boxes, classes = predict(image, model, device, threshold)
    # splitext removes the extension whatever its length; slicing off the last
    # four characters only worked for three-letter extensions such as ".png"
    image_name = os.path.splitext(os.path.basename(imgs_name[i]))[0]
    extract_objects2(boxes, classes, image, image_name, dom)
    # lightweight progress report every 100 images
    if i % 100 == 0:
        print(i)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
