In [1]:
import random
import time
import os

import cv2
import numpy as np

import torch
import torchvision
from PIL import Image
from IPython import display

import torchvision.models
from torchvision.transforms import transforms

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
if (torch.cuda.device_count()>0):
    print(torch.cuda.get_device_name(0))

cuda:0
NVIDIA GeForce GTX 1080


In [3]:
model = torchvision.models.detection.maskrcnn_resnet50_fpn_v2(weights="DEFAULT")
model.to(device).eval()
;

''

In [4]:
img_directory = "./frames/Train/Game"
output_directory = "./output/segmented/Train/Game"
batch_size = 8

In [5]:
file_names = [file_name for file_name in os.listdir(img_directory) if file_name.endswith(".jpg")]
len(file_names)

1270

In [6]:
file_names[:10]

['Train_Game_01_00001.jpg',
 'Train_Game_01_00002.jpg',
 'Train_Game_01_00003.jpg',
 'Train_Game_01_00004.jpg',
 'Train_Game_01_00005.jpg',
 'Train_Game_01_00006.jpg',
 'Train_Game_01_00007.jpg',
 'Train_Game_01_00008.jpg',
 'Train_Game_01_00009.jpg',
 'Train_Game_01_00010.jpg']

In [48]:
transform = transforms.Compose([
    transforms.ToTensor()
])

def batch_loader(directory, file_names, batch_size):
    for i in range(len(file_names) // batch_size + 1):
        selected = []
        start_idx = batch_size * i
        limit_idx = batch_size * (i + 1)
        
        if limit_idx > len(file_names):
            selected = file_names[start_idx:]
        else:
            selected = file_names[start_idx:limit_idx]
            
        if len(selected) == 0:
            break
        
        output_files = []
        raw_frames = []
        usable_frames = []
        
        for file_name in selected:
            output_files.append(f"{output_directory}/Segmented_{file_name.split('.')[0]}")
            input_file = f"{directory}/{file_name}"
            raw_frame = cv2.imread(input_file, cv2.IMREAD_COLOR)
            raw_frames.append(raw_frame)
            
            usable_frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2RGB)
            usable_frame = transform(usable_frame)
            usable_frames.append(usable_frame)
        
        stacked_frames = torch.stack(usable_frames).to(device)
        yield stacked_frames, raw_frames, output_files
            
batch_generator = batch_loader(img_directory, file_names, batch_size)

In [12]:
# The 91 COCO class names
coco_names = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

# Generate a set of color for drawing different classes
COLORS = np.random.uniform(0, 255, size=(len(coco_names), 3))

threshold = 0.965

In [55]:
def process_maskrcnn_output(output, frame, save_loc, threshold):
    scores = list(output['scores'].detach().cpu().numpy()) # Get scores
    thresholded_objects = [scores.index(i) for i in scores if i > threshold]
    thresholded_objects_count = len(thresholded_objects)
    masks = (output['masks']>0.5).squeeze().detach().cpu().numpy()
    masks = masks[:thresholded_objects_count]
    boxes = [[(i[0], i[1]), (i[2], i[3])]  for i in output['boxes'].detach().cpu()]
    boxes = boxes[:thresholded_objects_count]
    labels = [coco_names[i] for i in output['labels']][:thresholded_objects_count]
    
    for i, label in enumerate(labels):
        if label != "person":
            continue
            
        
        out_frame = np.array(frame.copy())
        out_frame = out_frame[int(boxes[i][0][1]):int(boxes[i][1][1]), int(boxes[i][0][0]):int(boxes[i][1][0])]
        
        cv2.imwrite(f"{save_loc}_{i}.jpg", out_frame)
        
    
    return
    
    print(scores) # The scores are sorted from largest to the shortest
    thresholded_objects = [scores.index(i) for i in scores if i > threshold] # Get an index for the objects having the scores > a threshold of 0.965
    thresholded_objects_count = len(thresholded_objects) # Total objects having scores > threshold
    masks = (output['masks']>0.5).squeeze().detach().cpu().numpy() # Get the segmentation masks
    masks = masks[:thresholded_objects_count] # Discard masks for objects that are below threshold by only taking the beginning of the list
    boxes = [[(i[0], i[1]), (i[2], i[3])]  for i in output['boxes'].detach().cpu()] # Get the bounding boxes, in (x1, y1), (x2, y2) format
    boxes = boxes[:thresholded_objects_count] # Discard bounding boxes for objects that are below threshold by only taking the beginning of the list
    labels = [coco_names[i] for i in output['labels']] # Get the classes labels
    
    out_frame = np.array(frame.copy())
    
    # Draw the segmentation masks with the text labels
    for i in range(len(masks)): # For all detected objects with score > threshold
        color = COLORS[random.randrange(0, len(COLORS))] # Pick a random color
        red_map = np.zeros_like(masks[i]).astype(np.uint8) # Initialize an empty mask for each of the RGB channels
        green_map = np.zeros_like(masks[i]).astype(np.uint8)
        blue_map = np.zeros_like(masks[i]).astype(np.uint8)
        red_map[masks[i] == 1], green_map[masks[i] == 1], blue_map[masks[i] == 1] = color # Set the color of the masked pixels
        segmentation_map = np.stack([red_map, green_map, blue_map], axis=2) # Combine the three channels of the mask
        cv2.addWeighted(out_frame, 1.0, segmentation_map, 0.6, 0.0, out_frame) # Apply the mask onto the image
        cv2.rectangle(out_frame, (int(boxes[i][0][0]),int(boxes[i][0][1])), (int(boxes[i][1][0]),int(boxes[i][1][1])), color, 2) # Draw the bounding boxes
        cv2.putText(out_frame , labels[i], (int(boxes[i][0][0]), int(boxes[i][0][1])-10), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA) # Draw the class label as text
    
    cv2.imwrite(save_loc, out_frame)

In [57]:
batch_generator = batch_loader(img_directory, file_names, batch_size)

for i, (batch, raw_frames, output_files) in enumerate(batch_generator):
    with torch.no_grad():
        outputs = model(batch)
    
    for output, frame, output_file in zip(outputs, raw_frames, output_files):
        process_maskrcnn_output(output, frame, output_file, threshold)
        break

RuntimeError: CUDA out of memory. Tried to allocate 504.00 MiB (GPU 0; 8.00 GiB total capacity; 5.56 GiB already allocated; 0 bytes free; 7.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF