# Detectron2 Beginner's Tutorial

<img src="https://dl.fbaipublicfiles.com/detectron2/Detectron2-Logo-Horz.png" width="500">

Welcome to detectron2! This is the official colab tutorial of detectron2. Here, we will go through some basic usage of detectron2, including the following:
* Run inference on images or videos, with an existing detectron2 model
* Train a detectron2 model on a new dataset

You can make a copy of this tutorial by "File -> Open in playground mode" and play with it yourself. __DO NOT__ request access to this tutorial.


# Install detectron2

In [1]:
# install dependencies: 
!pip install pyyaml==5.1 'pycocotools>=2.0.1'
#!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

import torch, torchvision
print(torch.__version__, torch.cuda.is_available())
#!gcc --version
# opencv is pre-installed on colab

1.7.0+cu101 True


In [2]:
# install detectron2: (Colab has CUDA 10.1 + torch 1.6)
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
#assert torch.__version__.startswith("1.6")
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.6/index.html

Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.6/index.html


In [7]:
# Some basic setup:
# Set up the detectron2 logger (done once, right after importing the package)
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import os, json, cv2, random
from google.colab.patches import cv2_imshow

# import some common detectron2 utilities
# (get_cfg / model_zoo / DefaultPredictor are used by the training and model-loading cells below)
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

In [8]:
import time
from detectron2.utils.visualizer import ColorMode

import matplotlib.pyplot as plt
import matplotlib as mpl
import logging
import threading
import time

from detectron2.data.datasets import register_coco_instances

In [2]:

# Mount Google Drive at /content/drive; the dataset, the saved model weights and the
# test videos referenced by the cells below are all read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): PathManager and `logger` are not used by any cell shown in this notebook.
from fvcore.common.file_io import PathManager
import logging
logger = logging.getLogger(__name__)

# Earlier experiment: load the labels as a semantic-segmentation dataset (disabled).
#datalabels=load_sem_seg(annotations, images_dir, gt_ext="png", image_ext="png")


# Alternative dataset locations (KITTI), kept for reference and currently disabled.
##images_dir = "/content/drive/My Drive/CIS581/FinalProject/Kitti/train_images/image_2"
#annotations="/content/drive/My Drive/CIS581/FinalProject/Kitti/trainingmask/"

# Active dataset locations (BDD100K segmentation split on Drive).
# images_dir: folder of training images, passed to register_coco_instances in the next cell.
# annotations: folder of label files; only referenced by the disabled load_sem_seg line above.
# json_fileloc: COCO-format annotation JSON for the training images.
images_dir = "/content/drive/My Drive/CIS581/ImageSegmentation/bdd/bdd100k/seg/images/train/"
annotations="/content/drive/My Drive/CIS581/ImageSegmentation/bdd/bdd100k/seg/labels/train/"
json_fileloc="/content/drive/My Drive/CIS581/ImageSegmentation/bdd/training_corrected_1000v2.json"

In [None]:
from detectron2.data import DatasetCatalog
from detectron2.data.datasets import register_coco_instances

# Register the COCO-format training set under the name that cfg.DATASETS.TRAIN refers to.
# The annotation file comes from `json_fileloc` (defined in the paths cell above) instead of
# repeating the literal path. Registering an already-registered name raises, so the guard
# keeps this cell re-runnable without restarting the kernel.
if "my_dataset_train5" not in DatasetCatalog.list():
    register_coco_instances("my_dataset_train5", {}, json_fileloc, images_dir)



In [None]:
from detectron2.engine import DefaultTrainer

# Base architecture: COCO Mask R-CNN (R50-FPN, 3x schedule), initialised from the
# matching model-zoo checkpoint so training fine-tunes rather than starts from scratch.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")

# --- data ---
cfg.DATASETS.TRAIN = ("my_dataset_train5",)
cfg.DATASETS.TEST = ()  # no validation set is used during training
cfg.DATALOADER.NUM_WORKERS = 4

# --- ROI heads ---
# 14 foreground classes in this dataset (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 14
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128  # smaller than the default (512) for faster training

# --- solver ---
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.00075
cfg.SOLVER.WARMUP_ITERS = 500
cfg.SOLVER.MAX_ITER = 1000  # short schedule; train longer for a practical dataset

# Optional knobs that were tried and are currently disabled:
#cfg.SOLVER.STEPS=(10,15)
#cfg.SOLVER.GAMMA=0.05
#cfg.TEST.EVAL_PERIOD=500

# Checkpoints and logs are written to cfg.OUTPUT_DIR.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)  # load cfg.MODEL.WEIGHTS instead of resuming a previous run
trainer.train()

[32m[11/24 19:16:01 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (15, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (15,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (56, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (56,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.mask_head.predictor.weight' to the model due to incompatible shapes: (80, 256, 1, 1) in the checkpoint but (14, 256, 1, 

[32m[11/24 19:16:03 d2.engine.train_loop]: [0mStarting training from iteration 0


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  item = item.nonzero().squeeze(1).cpu().numpy().tolist()
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  item = item.nonzero().squeeze(1).cpu().numpy().tolist()
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  item = item.nonzero().squeeze(1).cpu().numpy().tolist()
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  item = item.nonzero().squeeze(1).cpu().numpy().tolist()
	nonzero()
Consider using one of the following signatures instead:
	nonz

[32m[11/24 19:16:22 d2.utils.events]: [0m eta: 0:12:52  iter: 19  total_loss: 6.4  loss_cls: 2.731  loss_box_reg: 0.6493  loss_mask: 0.6937  loss_rpn_cls: 1.479  loss_rpn_loc: 0.8201  time: 0.8951  data_time: 0.5667  lr: 2.9221e-05  max_mem: 4990M
[32m[11/24 19:16:38 d2.utils.events]: [0m eta: 0:12:24  iter: 39  total_loss: 5.273  loss_cls: 2.222  loss_box_reg: 0.5749  loss_mask: 0.6862  loss_rpn_cls: 0.786  loss_rpn_loc: 0.8682  time: 0.8335  data_time: 0.3667  lr: 5.9192e-05  max_mem: 4990M
[32m[11/24 19:16:56 d2.utils.events]: [0m eta: 0:12:09  iter: 59  total_loss: 3.799  loss_cls: 1.31  loss_box_reg: 0.5219  loss_mask: 0.6776  loss_rpn_cls: 0.4443  loss_rpn_loc: 0.7878  time: 0.8645  data_time: 0.5444  lr: 8.9161e-05  max_mem: 4990M
[32m[11/24 19:17:16 d2.utils.events]: [0m eta: 0:11:55  iter: 79  total_loss: 3.436  loss_cls: 1.134  loss_box_reg: 0.5358  loss_mask: 0.6651  loss_rpn_cls: 0.3054  loss_rpn_loc: 0.7985  time: 0.8938  data_time: 0.5879  lr: 0.00011913  max_mem:

In [None]:
# Copy the final checkpoint from the training output folder to Drive so it survives the runtime.
# Done in Python rather than with `!cp` so that a missing source file or destination folder
# raises here instead of failing silently and surfacing later when the weights are loaded.
import shutil

saved_model_dir = "/content/drive/My Drive/CIS581/ImageSegmentation/model"
os.makedirs(saved_model_dir, exist_ok=True)
shutil.copy(os.path.join(cfg.OUTPUT_DIR, "model_final.pth"),
            os.path.join(saved_model_dir, "outputfile.pth"))

In [9]:
# Build an inference-only predictor from the weights saved on Drive.
# The config starts from the same base architecture that was used for training.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = "/content/drive/My Drive/CIS581/ImageSegmentation/model/outputfile.pth"  # the model we trained
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # custom score threshold applied at test time
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 14  # same value as in the training cell
predictor = DefaultPredictor(cfg)

In [None]:
# Inference should use the config with parameters that are used in training
# cfg now already contains everything we've set previously. We changed it a little bit for inference:
#cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # path to the model we just trained
#cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7   # set a custom testing threshold
#predictor = DefaultPredictor(cfg)

In [3]:
# Switch the working directory to the project folder on Drive.
# NOTE(review): presumably this is where optical_flow.py (imported by the next cell) lives — confirm.
%cd /content/drive/My Drive/CIS581/ImageSegmentation

/content/drive/My Drive/CIS581/ImageSegmentation


In [12]:
#optical flow for mask
import cv2
import numpy as np
import matplotlib.pyplot as plt
import imageio
from skimage import img_as_ubyte
import os
import matplotlib as mpl
from optical_flow import *

def _blend_mask(vis, mask, color):
    """Tint `vis` in place: wherever `mask` is non-zero, mix each of the first three
    channels 50/50 with the matching component of `color`."""
    region = mask != 0
    for n in range(3):
        vis[:, :, n] = np.where(region, (vis[:, :, n] * 0.5 + 0.5 * color[n]), vis[:, :, n])


def objectTrackingNew(rawVideo, trackVideo='/content/drive/My Drive/CIS581/ImageSegmentation/optresults/Output1.mov'):
    """
        Description: Segment every frame of a video and save the result as a tracking video.

        Each frame is rotated by 90 degrees, passed to the global `predictor`, and every
        predicted instance mask is tinted with a per-class colour. For instances of class id
        3, 9 or 12 that were known in the previous frame but are not matched in the current
        one (class missing, or no same-class mask overlapping at least half of the previous
        mask), `transformMask` is called with the previous and current grayscale frames and
        the mask it returns is drawn instead.

        Side effects: writes `trackVideo`, plus `<frame>_1.jpg` (detector masks only) and
        `<frame>_2.jpg` (detector + propagated masks) for every frame into the same folder.
        Colours are drawn from np.random without a seed, so they differ between runs.

        Relies on names defined elsewhere: `predictor`, and `getFeatures`, `transformMask`,
        `generateAllCoordinates`, `generateMaskWithCoordinates` (star-imported from optical_flow).

        Input:
        rawVideo: Raw video file name, String
        trackVideo: Output video file name, String (defaults to the project's optresults folder)
        Returns: None
    """
    cap = cv2.VideoCapture(rawVideo)
    frame_cnt = 0

    #random colors for 14 classes
    colors = [tuple(np.random.randint(256, size=3)) for _ in range(14)]
    print(len(colors))

    # Initialize video writer for tracking video
    out_dir = os.path.dirname(trackVideo) or '.'
    os.makedirs(out_dir, exist_ok=True)
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    fps = cap.get(cv2.CAP_PROP_FPS)
    # (height, width) of the source is intentional: frames are rotated by 90 degrees before
    # being written, so the written frame's width equals the source height and vice versa.
    size = (int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)), int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
    writer = cv2.VideoWriter(trackVideo, fourcc, fps, size)

    #max number of features you will extract
    N = 5

    # State carried from one frame to the next (filled in on the first frame).
    old_classes = np.array([])
    old_count = 0
    old_coords = np.array([])
    initNF = []

    def propagate_with_flow(k):
        """Run transformMask for previous-frame object k, draw the mask it returns on the
        current `vis`, and fold its outputs into the all_* accumulators."""
        nonlocal all_featNum, all_features, all_bboxes, all_coords, all_classes, all_count
        (all_featNum, all_features, all_bboxes, coord,
         all_coords, all_classes, eraseObject) = transformMask(
            initNF[k], frame, frame_old, all_featNum, all_features, all_bboxes,
            all_coords, all_classes, features[k], old_bboxes[k], old_coords[k],
            old_classes[k], H, W, N)
        # Kept as an explicit comparison: the return type of transformMask is not visible here.
        if eraseObject == False:
            tmp_coord = coord.reshape(H * W, 2)
            #generate mask on the frame with the transformed coordinates
            flow_mask = generateMaskWithCoordinates(tmp_coord, W, H)
            _blend_mask(vis, flow_mask, colors[int(old_classes[k])])
            all_count += 1

    # try/finally so the capture and the writer are released (and the partial video is
    # finalised) even when processing raises or the cell is interrupted.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            #rotate the video frame
            frame = np.rot90(frame)
            #writing video on vis
            vis = frame.copy()
            #frames used for feature translation
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255
            frame_cnt += 1
            H, W = vis.shape[0], vis.shape[1]

            # Run the detector once and move the predictions to the CPU a single time.
            outputs = predictor(vis)
            instances = outputs["instances"].to("cpu")
            new_classes = instances.pred_classes.numpy()
            masks = instances.pred_masks.numpy()
            count = new_classes.shape

            #put the detector's masks on the frame, coloured by class id
            for i in range(count[0]):
                _blend_mask(vis, masks[i], colors[int(new_classes[i])])

            cv2.imwrite(os.path.join(out_dir, '{}_1.jpg'.format(frame_cnt)), img_as_ubyte(vis))

            if frame_cnt == 1:
                #save first frame's features, bboxes, mask coordinates, count, and classes
                bboxes = instances.pred_boxes.tensor.numpy()
                num = bboxes.shape[0]
                bboxes = bboxes.reshape(num, 2, 2)
                initNF, features = getFeatures(frame, bboxes, N)
                old_classes = new_classes
                old_bboxes = bboxes
                old_coords = generateAllCoordinates(masks, W, H)
                old_count = count[0]
            else:
                # Accumulators for objects carried over from the previous frame via transformMask.
                all_count = 0
                all_bboxes, all_features, all_coords, all_classes = np.array([]), np.array([]), np.array([]), []
                all_featNum = []
                #current frame's class dictionary -> key: class, value: number of class
                unique, counts = np.unique(new_classes, return_counts=True)
                dictC = dict(zip(unique, counts))
                print("ALL classes",all_classes)
                print("old count",old_count," and count",count)
                print("old classes",old_classes)
                for k in range(old_count):
                    print(count[0],"running:",k)
                    cnt = 0
                    if old_classes[k] not in (9, 3, 12):
                        # Only these class ids are propagated (original note: "human and cars";
                        # the id-to-name mapping is not visible in this notebook).
                        continue
                    if old_classes[k] not in new_classes:
                        #class from previous frames are not detected in current frame -> optical flow to generate mask
                        print("class",old_classes[k]," not found")
                        propagate_with_flow(k)
                        continue
                    for j in range(count[0]):
                        class_num = new_classes[j]
                        print("DICT",dictC )
                        print("Class Num:",class_num,type(class_num))
                        print("get",dictC.get(class_num.item())," K",k)
                        if old_classes[k] != class_num:
                            continue
                        # Compare the previous mask of object k with current detection j
                        # (indexed by j; the original reused a stale index from an earlier loop).
                        old_mask = generateMaskWithCoordinates(old_coords[k], W, H)
                        intersect = np.logical_and(old_mask, masks[j])
                        interNum = np.count_nonzero(intersect)
                        maskNum = np.count_nonzero(old_mask)
                        print("InterNum ",interNum," maskNum ",maskNum)
                        if interNum >= 0.5 * maskNum:
                            print("masks match")
                            break
                        cnt += 1
                        if dictC.get(class_num.item()) > cnt:
                            # more detections of this class are still to be compared
                            print("masks do not match but will have to look more : cnt",cnt)
                        else:
                            print("masks do not match : optical flow to put mask on pic")
                            propagate_with_flow(k)
                print("Count",all_count)
                print("########save frame",frame_cnt,"###########")
                bboxes = instances.pred_boxes.tensor.numpy()
                num = bboxes.shape[0]
                bboxes = bboxes.reshape(num, 2, 2)
                if all_count > 0:
                    #optical flow was used at least once, save feature points, classes, mask coordinates, bboxes from previous frames and current frame
                    old_count = all_count + count[0]
                    old_classes = np.append(np.array(all_classes), new_classes)
                    tmp_coords = generateAllCoordinates(masks, W, H)
                    old_coords = np.append(all_coords, tmp_coords)
                    old_bboxes = np.append(all_bboxes, bboxes)
                    numF, features = getFeatures(frame, bboxes, N)
                    initNF = np.append(all_featNum, numF)
                    features = np.append(all_features, features)
                else:
                    #no optical flow was used, save only current frame's feature points, bboxes, masks, count, and classes
                    numF, features = getFeatures(frame, bboxes, N)
                    old_classes = new_classes
                    initNF = numF
                    old_bboxes = bboxes
                    tmp_coords = generateAllCoordinates(masks, W, H)
                    old_coords = np.append(all_coords, tmp_coords)
                    old_count = count[0]
                #reshaping coordinates, bboxes, features (np.append flattens its inputs)
                old_coords = old_coords.reshape((old_count, H * W, 2))
                old_bboxes = old_bboxes.reshape((old_count, 2, 2))
                features = features.reshape((old_count, N, 2))

            #keep the grayscale frame for the next iteration's transformMask calls
            frame_old = frame.copy()

            # save image
            cv2.imwrite(os.path.join(out_dir, '{}_2.jpg'.format(frame_cnt)), img_as_ubyte(vis))

            # Save video
            writer.write(vis)
    finally:
        # Release video reader and video writer
        cap.release()
        writer.release()

    return


In [13]:
rawVideo = "/content/drive/My Drive/CIS581/ImageSegmentation/test videos/cabc30fc-eb673c5a.mov"
# Make sure the results folder exists; exist_ok avoids the separate existence check and
# missing parent folders are created as well.
os.makedirs("/content/drive/My Drive/CIS581/ImageSegmentation/optresults", exist_ok=True)
objectTrackingNew(rawVideo)

14
ALL classes []
old count 9  and count (8,)
old classes [4 2 9 9 1 9 1 8 1]
8 running: 0
8 running: 1
8 running: 2
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 4 <class 'numpy.int64'>
get 1  K 2
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 2 <class 'numpy.int64'>
get 1  K 2
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 9 <class 'numpy.int64'>
get 2  K 2
InterNum  24469  maskNum  26096
masks match
8 running: 3
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 4 <class 'numpy.int64'>
get 1  K 3
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 2 <class 'numpy.int64'>
get 1  K 3
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 9 <class 'numpy.int64'>
get 2  K 3
InterNum  734  maskNum  11057
masks do not match but will have to look more : cnt 1
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 1 <class 'numpy.int64'>
get 2  K 3
DICT {1: 2, 2: 1, 4: 1, 8: 2, 9: 2}
Class Num: 9 <class 'numpy.int64'>
get 2  K 3
InterNum  0  maskNum  11057
masks do not match : optical flow to put mask on pic
Trans

KeyboardInterrupt: ignored