In [2]:
# modified from original source by Krishna Patel
# source: https://github.com/leoxiaobin/deep-high-resolution-net.pytorch/blob/master/demo/inference.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import csv
import os
import shutil
import json

from PIL import Image
from pycocotools.coco import COCO

import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision
import cv2
import numpy as np

print("We are using torch version", torch.__version__)
print("We are using torchvision version", torchvision.__version__)

import sys
sys.path.append("./deep-high-resolution-net.pytorch/lib")
import time

!pwd
from models import pose_hrnet
from config import cfg
from config import update_config
from core.inference import get_final_preds
from utils.transforms import get_affine_transform

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import numpy as np
import cv2
import os

from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg


# Device used for every model in this file: the GPU when torch can see one,
# otherwise the CPU.
CTX = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Drawing colours. The tuples are in OpenCV's (blue, green, red) channel order,
# which is why "red" is (0, 0, 255).
_BLACK = (0, 0, 0)
_RED = (0, 0, 255)
_BLUE = (255, 0, 0)
_PURPLE = (204, 0, 153)
_ORANGE = (51, 153, 255)
_LBROWN = (0, 153, 230)

# Runs of consecutive 1-based keypoint numbers that are drawn in one colour.
_KEYPOINT_COLOR_GROUPS = (
    (range(1, 6), _RED),
    (range(6, 10), _ORANGE),
    (range(10, 14), _LBROWN),
    (range(14, 18), _BLUE),
    (range(18, 22), _PURPLE),
)

# Lookup from the keypoint number (as a string, '1' .. '21') to its colour.
keypoint_colors = {
    str(number): color
    for numbers, color in _KEYPOINT_COLOR_GROUPS
    for number in numbers
}

# Class names indexed by the integer label a detector emits.
COCO_INSTANCE_CATEGORY_NAMES = ['__background__', 'hand']


def get_person_detection_boxes(model, img, threshold=0.5):
    """Detect boxes in ``img`` and keep the confident 'hand' detections.

    Despite the name (kept for compatibility with the upstream demo), the
    filter below selects the 'hand' class of ``COCO_INSTANCE_CATEGORY_NAMES``.

    Parameters
    ----------
    model : callable
        Called with a list holding one image tensor; must return a list whose
        first element is a dict with 'labels', 'boxes' and 'scores' tensors.
    img : array
        Image in a form accepted by ``PIL.Image.fromarray``.
    threshold : float
        A detection is kept only when its score is strictly greater than this.

    Returns
    -------
    list
        One ``[(x1, y1), (x2, y2)]`` entry per kept detection, in the order
        the model returned them.
    """
    transformed_img = transforms.ToTensor()(Image.fromarray(img))

    # Inference only: skip autograd bookkeeping so no graph is kept alive.
    with torch.no_grad():
        pred = model([transformed_img.to(CTX)])
    detections = pred[0]

    # Class name of every detection (looked up eagerly, as before, so an
    # unexpected label still fails loudly).
    pred_classes = [COCO_INSTANCE_CATEGORY_NAMES[label]
                    for label in detections['labels'].cpu().numpy()]
    pred_boxes = detections['boxes'].cpu().detach().numpy()
    pred_scores = detections['scores'].cpu().detach().numpy()

    # Keep boxes that are both confident enough and of the 'hand' class.
    return [
        [(box[0], box[1]), (box[2], box[3])]
        for pred_class, box, score in zip(pred_classes, pred_boxes, pred_scores)
        if score > threshold and pred_class == 'hand'
    ]


def get_pose_estimation_prediction(pose_model, image, centers, scales, transform):
    """Run the pose model on one crop per (center, scale) pair.

    Parameters
    ----------
    pose_model : callable
        Model applied to the stacked batch of crops (moved to ``CTX``).
    image : array
        Full frame the crops are warped out of with ``cv2.warpAffine``.
    centers, scales : sequences
        Parallel sequences, one entry per box, e.g. as returned by
        ``box_to_center_scale``. Must be non-empty: ``torch.stack`` raises on
        an empty list.
    transform : callable
        Applied to every crop; must return a tensor so that the crops can be
        stacked into one batch.

    Returns
    -------
    The coordinates returned by ``get_final_preds`` for the batch
    (presumably one set of (x, y) keypoints per box, in frame coordinates --
    confirm against ``core.inference``).
    """
    rotation = 0
    # Loop-invariant size of every crop, as (width, height) for OpenCV.
    crop_size = (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1]))

    # Cut one model-sized crop per box out of the frame.
    model_inputs = []
    for center, scale in zip(centers, scales):
        trans = get_affine_transform(center, scale, rotation, cfg.MODEL.IMAGE_SIZE)
        crop = cv2.warpAffine(image, trans, crop_size, flags=cv2.INTER_LINEAR)
        # transform yields one tensor per crop (no batch dimension added here)
        model_inputs.append(transform(crop))

    # n separate crop tensors -> one batch tensor
    batch = torch.stack(model_inputs)

    # Compute the output heatmaps. Inference only, so autograd is disabled to
    # avoid building (and holding memory for) a graph on every frame.
    with torch.no_grad():
        output = pose_model(batch.to(CTX))

    coords, _ = get_final_preds(
        cfg,
        output.cpu().detach().numpy(),
        np.asarray(centers),
        np.asarray(scales))

    return coords


def box_to_center_scale(box, model_image_width, model_image_height):
    """Convert a box to the center/scale pair required for pose transformation.

    Parameters
    ----------
    box : sequence
        Either four values ``(x1, y1, x2, y2)`` or two corner pairs
        ``[(x1, y1), (x2, y2)]`` (the layout built by
        ``get_person_detection_boxes``). ``(x1, y1)`` and ``(x2, y2)`` are
        opposite corners, with ``x2 - x1`` the width and ``y2 - y1`` the height.
    model_image_width : int
    model_image_height : int
        Input size of the pose model; only their ratio is used.

    Returns
    -------
    (numpy array, numpy array)
        Two float32 arrays of length 2: the center of the box, and the box
        size after padding it to the model's aspect ratio, dividing by
        ``pixel_std`` (200) and enlarging by 1.25.
    """
    # Accept both box layouts used in this file.
    if len(box) == 2:
        (x1, y1), (x2, y2) = box
    else:
        x1, y1, x2, y2 = box

    center = np.zeros((2), dtype=np.float32)
    box_width = x2 - x1
    box_height = y2 - y1
    center[0] = x1 + box_width * 0.5
    center[1] = y1 + box_height * 0.5

    aspect_ratio = model_image_width * 1.0 / model_image_height
    pixel_std = 200

    # Grow the shorter side so the box has the model's aspect ratio.
    if box_width > aspect_ratio * box_height:
        box_height = box_width * 1.0 / aspect_ratio
    elif box_width < aspect_ratio * box_height:
        box_width = box_height * aspect_ratio
    scale = np.array(
        [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std],
        dtype=np.float32)
    # NOTE(review): a center x of -1 looks like an upstream "invalid box"
    # sentinel; every other box gets 25% of extra context around it.
    if center[0] != -1:
        scale = scale * 1.25

    return center, scale

def parse_args():
    """Build the option namespace consumed by ``main``.

    The parser is fed the fixed argument list ``["--produce_vid"]`` rather
    than the process command line, so every other option keeps its default.

    Returns
    -------
    argparse.Namespace
        The parsed options, plus the empty-string attributes ``modelDir``,
        ``logDir``, ``dataDir`` and ``prevModelDir`` that the supporting
        codebase expects to find.
    """
    parser = argparse.ArgumentParser(
        description='Surgery Hand and Keypoint Detection on Video')

    # File-name options and their defaults.
    string_options = (
        ('--cfg', "keypoints.yaml"),
        ('--bb_cfg', "bbox.yaml"),
        ('--video', "slap.mp4"),
    )
    for flag, default_value in string_options:
        parser.add_argument(flag, type=str, default=default_value)

    parser.add_argument('--produce_vid', action='store_true')
    parser.add_argument('--out_json', type=str, default='out.json')
    parser.add_argument('--tracking', action='store_true')
    parser.add_argument(
        'opts',
        nargs=argparse.REMAINDER,
        default=None,
        help='Modify config options using the command-line')

    args = parser.parse_args(["--produce_vid"])

    # args expected by supporting codebase
    for attribute in ('modelDir', 'logDir', 'dataDir', 'prevModelDir'):
        setattr(args, attribute, '')

    return args


def main():
    """Detect hands and their keypoints on every frame of a video.

    For each frame of ``args.video`` the box model proposes boxes, the pose
    model predicts keypoints inside each box, and the results are collected
    per frame. Frames without any box are not recorded.

    Outputs
    -------
    * ``args.out_json``: a JSON object mapping the frame number to a list with
      one entry per box, ``{"keypoints": [(x, y), ...], "bbox": [x1, y1, x2,
      y2]}``; with ``--tracking`` the box score is appended to ``"bbox"``.
    * ``predictions.mp4`` (only with ``--produce_vid``): the input frames with
      boxes and numbered keypoints drawn on them.
    """
    # transformation
    pose_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    args = parse_args()
    update_config(cfg, args)

    # cudnn related setting. Read only after update_config so that whatever
    # it changes in cfg is honoured instead of the built-in defaults.
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    bbox_cfg = get_cfg()
    bbox_cfg.merge_from_file(args.bb_cfg)
    box_model = DefaultPredictor(bbox_cfg)

    print("Getting pose model from", cfg.MODEL.NAME+'.get_pose_net')
    print("Using cfg", cfg)
    # NOTE(review): eval() runs whatever expression the config's MODEL.NAME
    # spells out; only use configuration files you trust.
    pose_model = eval(cfg.MODEL.NAME+'.get_pose_net')(
        cfg, is_train=False
    )

    if cfg.TEST.MODEL_FILE:
        print('=> loading model from {}'.format(cfg.TEST.MODEL_FILE))
        # map_location lets a checkpoint saved on a GPU load when CTX is the CPU.
        pose_model.load_state_dict(
            torch.load(cfg.TEST.MODEL_FILE, map_location=CTX), strict=False)
    else:
        print('expected model defined in config at TEST.MODEL_FILE')

    pose_model.to(CTX)
    pose_model.eval()

    print("Opening", args.video)
    video = cv2.VideoCapture(args.video)
    width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
    height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
    fps = video.get(cv2.CAP_PROP_FPS)
    data = {}
    video_tracked = None

    # try/finally: an interrupted run (e.g. Ctrl-C in the middle of a long
    # video) must still release the capture and finalize the written video.
    try:
        print("We are going to produce the video", args.produce_vid)
        if args.produce_vid:
            fourcc = cv2.VideoWriter_fourcc('F', 'M', 'P', '4')
            video_tracked = cv2.VideoWriter('predictions.mp4', fourcc, fps, (int(width), int(height)))

        frame_num = 0
        while video.isOpened():
            print("Performing Inference on Frame Number ", frame_num, end='\r')
            _, frame = video.read()
            if frame is None or frame.size == 0:
                break

            img = frame
            if args.produce_vid:
                image_debug = img.copy()
            # OpenCV decodes frames as BGR; hand the pose model RGB when the
            # config says that is the channel order it works with.
            if cfg.DATASET.COLOR_RGB:
                image_pose = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            else:
                image_pose = img.copy()
            predictions = box_model(img)['instances']

            print("Passed box_model an geimg, received predictions", predictions)
            # Boxes come as (x1, y1, x2, y2) with x measured from the left and
            # y measured from the top, alongside scores and class annotations.
            pred_boxes = predictions.pred_boxes
            print("Corresponding pred_boxes are", pred_boxes)

            # Nothing detected: pass the frame through untouched.
            if len(pred_boxes) == 0:
                frame_num += 1
                if args.produce_vid:
                    video_tracked.write(image_debug)
                continue

            centers = []
            scales = []
            for box in pred_boxes:
                # for each box, get its center and its scale for the pose model
                print("Analyzing box", box, "with config parms", cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[1])
                center, scale = box_to_center_scale(box, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[1])
                print("Centers are {} and scales are {}".format(center, scale))
                centers.append(center)
                scales.append(scale)

            print("SENDING IN TO POSES", centers, scales)
            pose_preds = get_pose_estimation_prediction(pose_model, image_pose, centers, scales, transform=pose_transform)

            preds = []
            for coords in pose_preds:
                keypoints = []
                for i, coord in enumerate(coords):
                    # Clamp negative coordinates to the frame edge.
                    x_coord = float(max(0, coord[0]))
                    y_coord = float(max(0, coord[1]))
                    keypoints.append((x_coord, y_coord))

                    # A keypoint at exactly (0, 0) is recorded but not drawn.
                    if args.produce_vid and not (x_coord == 0 and y_coord == 0):
                        pixel = (int(x_coord), int(y_coord))
                        cv2.circle(image_debug, pixel, 4, keypoint_colors[str(i + 1)], -1)
                        cv2.putText(image_debug, str(i + 1), (pixel[0] - 4, pixel[1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)
                preds.append({"keypoints": keypoints})

            for it, box in enumerate(pred_boxes):
                # One transfer per box instead of one per coordinate.
                x1, y1, x2, y2 = box.tolist()
                preds[it]["bbox"] = [x1, y1, x2, y2]
                if args.tracking:
                    preds[it]["bbox"].append(float(predictions.scores[it]))
                if args.produce_vid:
                    cv2.rectangle(image_debug, (int(x1), int(y1)), (int(x2), int(y2)), color=(0, 255, 0),
                                  thickness=3)

            if args.produce_vid:
                video_tracked.write(image_debug)
            data[frame_num] = preds
            frame_num += 1
    finally:
        video.release()
        if video_tracked is not None:
            video_tracked.release()

    # Context manager so the JSON file is flushed and closed deterministically.
    with open(args.out_json, "w") as out_file:
        json.dump(data, out_file)

!pwd
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()


We are using torch version 1.8.0
We are using torchvision version 0.9.0
/home/egoodman/keypoints/surgery-hand-detection
/home/egoodman/keypoints/surgery-hand-detection
Getting pose model from pose_hrnet.get_pose_net
Using cfg AUTO_RESUME: True
CUDNN:
  BENCHMARK: True
  DETERMINISTIC: False
  ENABLED: True
DATASET:
  COLOR_RGB: True
  DATASET: coco
  DATA_FORMAT: jpg
  FLIP: False
  HYBRID_JOINTS_TYPE: 
  NUM_JOINTS_HALF_BODY: 10
  PROB_HALF_BODY: 0.3
  ROOT: data/coco/
  ROT_FACTOR: 45
  SCALE_FACTOR: 0.35
  SELECT_DATA: False
  TEST_SET: val
  TRAIN_SET: train_boot_aug_2
DATA_DIR: 
DEBUG:
  DEBUG: True
  SAVE_BATCH_IMAGES_GT: True
  SAVE_BATCH_IMAGES_PRED: True
  SAVE_HEATMAPS_GT: True
  SAVE_HEATMAPS_PRED: True
GPUS: (0,)
LOG_DIR: log
LOSS:
  TOPK: 8
  USE_DIFFERENT_JOINTS_WEIGHT: False
  USE_OHKM: False
  USE_TARGET_WEIGHT: True
MODEL:
  EXTRA:
    FINAL_CONV_KERNEL: 1
    PRETRAINED_LAYERS: ['conv1', 'bn1', 'conv2', 'bn2', 'layer1', 'transition1', 'stage2', 'transition2', 'stage3'

Passed box_model an geimg, received predictions Instances(num_instances=4, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[330.4837, 266.9525, 358.9575, 325.1373],
        [ 52.2834,  73.6707, 144.5942, 141.8535],
        [130.1968,  73.7037, 239.6924, 201.7101],
        [ 91.4286, 281.4868, 298.2621, 355.5309]], device='cuda:0')), scores: tensor([0.9792, 0.9663, 0.9009, 0.6132], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[330.4837, 266.9525, 358.9575, 325.1373],
        [ 52.2834,  73.6707, 144.5942, 141.8535],
        [130.1968,  73.7037, 239.6924, 201.7101],
        [ 91.4286, 281.4868, 298.2621, 355.5309]], device='cuda:0'))
Analyzing box tensor([330.4837, 266.9525, 358.9575, 325.1373], device='cuda:0') with config parms 288 384
Centers are [344.7206 296.0449] and scales are [0.2727416  0.36365545]
Analyzing box tensor([ 52.2834,  73.6707, 144.5942, 141.8535], device='cuda:0') with config 

Passed box_model an geimg, received predictions Instances(num_instances=5, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[323.9984, 273.1866, 359.6343, 333.4800],
        [  1.2180,   1.2785, 115.2580, 135.5789],
        [ 92.2434, 288.5818, 287.7645, 355.7087],
        [104.8096,  76.4691, 234.3820, 209.5261],
        [ 94.6254,  79.9210, 149.8288, 115.4408]], device='cuda:0')), scores: tensor([0.9723, 0.8408, 0.7954, 0.7289, 0.5638], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[323.9984, 273.1866, 359.6343, 333.4800],
        [  1.2180,   1.2785, 115.2580, 135.5789],
        [ 92.2434, 288.5818, 287.7645, 355.7087],
        [104.8096,  76.4691, 234.3820, 209.5261],
        [ 94.6254,  79.9210, 149.8288, 115.4408]], device='cuda:0'))
Analyzing box tensor([323.9984, 273.1866, 359.6343, 333.4800], device='cuda:0') with config parms 288 384
Centers are [341.81635 303.3333 ] and scales are [0.

Passed box_model an geimg, received predictions Instances(num_instances=4, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[326.5625, 283.8093, 359.8702, 334.9345],
        [ 53.2204,  86.7575, 148.4649, 156.8876],
        [106.2571,  82.9174, 233.0136, 216.2184],
        [  0.7834,   2.1402, 112.2184, 144.8908]], device='cuda:0')), scores: tensor([0.9761, 0.9120, 0.8060, 0.6642], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[326.5625, 283.8093, 359.8702, 334.9345],
        [ 53.2204,  86.7575, 148.4649, 156.8876],
        [106.2571,  82.9174, 233.0136, 216.2184],
        [  0.7834,   2.1402, 112.2184, 144.8908]], device='cuda:0'))
Analyzing box tensor([326.5625, 283.8093, 359.8702, 334.9345], device='cuda:0') with config parms 288 384
Centers are [343.21634 309.37195] and scales are [0.23964943 0.31953257]
Analyzing box tensor([ 53.2204,  86.7575, 148.4649, 156.8876], device='cuda:0') with confi

Passed box_model an geimg, received predictions Instances(num_instances=6, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[ 53.8095, 100.2418, 148.1327, 173.1719],
        [320.4056, 296.0067, 359.6488, 351.4620],
        [115.2979,  96.4737, 231.6451, 228.9924],
        [251.6316, 264.5906, 351.3847, 349.7454],
        [  1.0279,  17.7474, 108.0523, 146.0761],
        [ 64.5841,  97.2411, 285.2202, 342.9411]], device='cuda:0')), scores: tensor([0.8863, 0.7550, 0.7494, 0.7098, 0.6619, 0.6265], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[ 53.8095, 100.2418, 148.1327, 173.1719],
        [320.4056, 296.0067, 359.6488, 351.4620],
        [115.2979,  96.4737, 231.6451, 228.9924],
        [251.6316, 264.5906, 351.3847, 349.7454],
        [  1.0279,  17.7474, 108.0523, 146.0761],
        [ 64.5841,  97.2411, 285.2202, 342.9411]], device='cuda:0'))
Analyzing box tensor([ 53.8095, 100.2418, 148.1

Passed box_model an geimg, received predictions Instances(num_instances=5, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[ 54.1567, 109.4471, 135.5761, 177.8814],
        [114.2683, 105.3116, 232.2652, 235.4489],
        [247.1149, 249.7515, 353.9951, 356.8291],
        [  1.0576,   3.4291, 111.8285, 154.9855],
        [ 50.8365, 117.7035, 307.5353, 353.2213]], device='cuda:0')), scores: tensor([0.7653, 0.7545, 0.6233, 0.6044, 0.5868], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[ 54.1567, 109.4471, 135.5761, 177.8814],
        [114.2683, 105.3116, 232.2652, 235.4489],
        [247.1149, 249.7515, 353.9951, 356.8291],
        [  1.0576,   3.4291, 111.8285, 154.9855],
        [ 50.8365, 117.7035, 307.5353, 353.2213]], device='cuda:0'))
Analyzing box tensor([ 54.1567, 109.4471, 135.5761, 177.8814], device='cuda:0') with config parms 288 384
Centers are [ 94.86636 143.66425] and scales are [0.

Passed box_model an geimg, received predictions Instances(num_instances=5, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[ 46.6760, 114.1131, 150.1513, 184.3135],
        [116.2874, 106.8290, 229.9433, 240.2035],
        [243.6368, 261.1933, 354.2955, 355.1373],
        [  1.3179,   6.4200, 111.7981, 181.1491],
        [ 47.4753, 108.3192, 253.1618, 353.2555]], device='cuda:0')), scores: tensor([0.8816, 0.7946, 0.7450, 0.7432, 0.6209], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[ 46.6760, 114.1131, 150.1513, 184.3135],
        [116.2874, 106.8290, 229.9433, 240.2035],
        [243.6368, 261.1933, 354.2955, 355.1373],
        [  1.3179,   6.4200, 111.7981, 181.1491],
        [ 47.4753, 108.3192, 253.1618, 353.2555]], device='cuda:0'))
Analyzing box tensor([ 46.6760, 114.1131, 150.1513, 184.3135], device='cuda:0') with config parms 288 384
Centers are [ 98.41367 149.21329] and scales are [0.

Passed box_model an geimg, received predictions Instances(num_instances=4, image_height=360, image_width=360, fields=[pred_boxes: Boxes(tensor([[235.7392, 253.2817, 355.1985, 357.1808],
        [ 41.3350, 107.3162, 236.0205, 353.2734],
        [109.1304, 105.7099, 224.6971, 245.4709],
        [  1.0924,  12.0035, 132.8654, 205.7899]], device='cuda:0')), scores: tensor([0.8397, 0.7497, 0.7291, 0.6843], device='cuda:0'), pred_classes: tensor([0, 0, 0, 0], device='cuda:0')])
Corresponding pred_boxes are Boxes(tensor([[235.7392, 253.2817, 355.1985, 357.1808],
        [ 41.3350, 107.3162, 236.0205, 353.2734],
        [109.1304, 105.7099, 224.6971, 245.4709],
        [  1.0924,  12.0035, 132.8654, 205.7899]], device='cuda:0'))
Analyzing box tensor([235.7392, 253.2817, 355.1985, 357.1808], device='cuda:0') with config parms 288 384
Centers are [295.46884 305.23126] and scales are [0.74662083 0.9954944 ]
Analyzing box tensor([ 41.3350, 107.3162, 236.0205, 353.2734], device='cuda:0') with confi

Performing Inference on Frame Number  32

KeyboardInterrupt: 