In [1]:
import torch
import cv2

# Import pipeline from sample script

In [2]:
# import the necessary packages
import numpy as np
import argparse
import sys
import cv2

In [3]:
# ONNX file holding the network weights (handed to cv2.dnn.readNet further down).
model = "/nvdli-nano/data/resnet-34_kinetics.onnx"
# Plain-text file with the class labels, one per line.
names = "/nvdli-nano/data/action_recognition_kinetics.txt"

In [4]:
# Read the whole label file, drop surrounding whitespace, and keep one list
# entry per line; the list index is what the network's argmax is mapped through.
with open(names) as label_file:
    raw_labels = label_file.read()
CLASSES = raw_labels.strip().split("\n")

In [5]:
# Sampling configuration for the activity-recognition loop.

# Spatial size (width and height, in pixels) each frame is reduced to when the
# input blob is built.
SAMPLE_SIZE = 112
# Number of frames to sample from one second of video.
SAMPLE_FRAME_RATE = 30
# Number of sampled frames that make up one clip passed to the classifier.
SAMPLE_DURATION = 16

In [6]:
# Load the human activity recognition model.
# The progress message is printed first; `net` is then created by OpenCV's DNN
# module from the file at `model` and is reused for every prediction below.
print("[INFO] loading human activity recognition model...")
net = cv2.dnn.readNet(model)

[INFO] loading human activity recognition model...


In [7]:
# Counter bumped once per pass of the recognition loop.
n = 0
# One dict per processed frame (image, label, position, group) is appended here
# by the recognition loop.
frame_and_metadatas = list()

In [8]:
from jetcam.csi_camera import CSICamera
# Frame rate requested from the CSI sensor; the recognition loop uses it to
# derive how many camera frames to skip between samples.
fps_origin = 30

# Capture settings are 1280x720 at `fps_origin`; the width/height arguments ask
# for 400x225 output frames.
camera = CSICamera(
    width=400,
    height=225,
    capture_width=1280,
    capture_height=720,
    capture_fps=fps_origin,
)

In [None]:
# Run the model on the live camera stream.
#
# Each pass of the outer loop collects SAMPLE_DURATION frames, classifies them
# as one clip, then labels, displays, records and archives every frame of that
# clip. The loop ends when `q` is pressed, the kernel is interrupted, or the
# camera stops delivering frames. The output video is always finalized.

# `cv2_imshow` only exists on Google Colab. In a plain notebook fall back to a
# regular OpenCV window instead of failing on the import.
try:
    from google.colab.patches import cv2_imshow
    on_colab = True
except ImportError:
    on_colab = False

    def cv2_imshow(image):
        """Show `image` in an OpenCV window (stand-in for Colab's helper)."""
        cv2.imshow("activity recognition", image)

assert SAMPLE_FRAME_RATE <= fps_origin, "sample frame rate should be less than or equal to the original video fps"
# Keep one frame out of every `sample_period` frames read from the camera.
sample_period = int(fps_origin / SAMPLE_FRAME_RATE)

# Frame size (width, height) shared by the resize step and the video writer.
size = (400, 225)

# Optional label -> group mapping. It is expected to come from an earlier cell
# as `category2group`; when it is absent every label is filed under "others".
label_groups = globals().get("category2group", {})

result = cv2.VideoWriter('video_with_label.mp4',
                         cv2.VideoWriter_fourcc(*'MP4V'),
                         10, size)
processed = 0
stop_requested = False

try:
    # loop until a stop is requested
    while not stop_requested:
        # initialize the batch of frames that will be passed through the model
        frames = []

        n += 1

        # loop over the number of required sample frames
        for i in range(SAMPLE_DURATION * sample_period):
            # read a frame from the camera
            # NOTE(review): assumes camera.read() returns the image array (or
            # None on failure) rather than a (grabbed, frame) pair - confirm
            # against the jetcam version in use.
            frame = camera.read()
            if frame is None:
                print("[INFO] no frame read from stream - exiting")
                stop_requested = True
                break

            # We only sample a portion of the frames to match the desired fps.
            if i % sample_period != 0:
                continue

            # Resize so every frame matches the size the VideoWriter was
            # opened with, then add it to the batch.
            frames.append(cv2.resize(frame, size))

        if stop_requested:
            break

        # now that our frames array is filled we can construct our blob
        blob = cv2.dnn.blobFromImages(frames, 1.0, (SAMPLE_SIZE, SAMPLE_SIZE),
                                      (114.7748, 107.7354, 99.4750),
                                      swapRB=True, crop=True)
        # swap the first two axes, then add a leading axis of length 1
        blob = np.transpose(blob, (1, 0, 2, 3))
        blob = np.expand_dims(blob, axis=0)
        print(blob.shape)

        # pass the blob through the network to obtain our human activity
        # recognition predictions
        net.setInput(blob)
        outputs = net.forward()
        label = CLASSES[np.argmax(outputs)]

        # group used when the frames are archived for the future database
        # search engine
        group = label_groups.get(label.strip(), "others")

        # loop over our frames
        for frame in frames:
            # draw the predicted activity on the frame
            cv2.rectangle(frame, (0, 0), (300, 40), (0, 0, 0), -1)
            cv2.putText(frame, label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)
            # display the frame, then append it to the output video
            cv2_imshow(frame)
            result.write(frame)
            meta_dict = {"image": frame, "label": label, "pos": processed, "group": group}
            frame_and_metadatas.append(meta_dict)

            processed += 1

            key = cv2.waitKey(1) & 0xFF
            # if the `q` key was pressed, stop the whole loop, not just the
            # pass over this clip
            if key == ord("q"):
                stop_requested = True
                break
except KeyboardInterrupt:
    # Interrupting the kernel is the only way to stop when no OpenCV window
    # has keyboard focus (e.g. on Colab).
    print("[INFO] interrupted - stopping")
finally:
    # Finalize the video file even if the loop ended with an error.
    result.release()
    if not on_colab:
        cv2.destroyAllWindows()