In [2]:
import torch
import cv2

# Import pipeline from sample script

In [3]:
# import the necessary packages
import numpy as np
import argparse
import sys
import cv2

In [4]:
names = "/nvdli-nano/data/action_recognition_kinetics.txt"
model = "/nvdli-nano/data/resnet-34_kinetics.onnx"

In [5]:
with open(names) as l:
    CLASSES = l.read().strip().split("\n")

In [6]:
# load the contents of the class labels file, then define the sample
# duration (i.e., # of frames for classification) and sample size
# (i.e., the spatial dimensions of the frame)
# The number of frame to sample from one second
SAMPLE_DURATION = 16
SAMPLE_FRAME_RATE = 30
SAMPLE_SIZE = 112

In [7]:
# load the human activity recognition model
print("[INFO] loading human activity recognition model...")
net = cv2.dnn.readNet(model)

[INFO] loading human activity recognition model...


In [8]:
# initialize the list / dictionaries to captured classified frames
frame_and_metadatas = []
n = 0

In [9]:
from jetcam.csi_camera import CSICamera
fps_origin = 30
#camera = CSICamera(width=224, height=224)

camera = CSICamera(width=400, height=225, capture_width=1280, capture_height=720, capture_fps=fps_origin)

In [10]:
# Run the model
# loop until we explicitly break from it

assert SAMPLE_FRAME_RATE <= fps_origin, "sample frame rate should be less than or equal to the original video fps"
sample_period = int(fps_origin / SAMPLE_FRAME_RATE)
   
size = (400, 225)

result = cv2.VideoWriter('video_with_label.mp4', 
                         cv2.VideoWriter_fourcc(*'MP4V'),
                         10, size)
processed = 0
while True:
  # initialize the batch of frames that will be passed through the model
  frames = []

  n += 1
    
  # loop over the number of required sample frames
  for i in range(0, SAMPLE_DURATION*sample_period):
    # read a frame from the video stream
    frame = camera.read()
    # We only sample portion of frames to match the desired fps.
    if i % sample_period != 0:
        continue

    #Identiy the frame number
    # if the frame was not grabbed then we've reached the end of
    # the video stream so exit the script
    # otherwise, the frame was read so resize it and add it to
    # our frames list
    frames.append(frame)
  
  # now that our frames array is filled we can construct our blob
  blob = cv2.dnn.blobFromImages(frames, 1.0, (SAMPLE_SIZE, SAMPLE_SIZE), (114.7748, 107.7354, 99.4750), 
                                swapRB=True, crop=True)
  blob = np.transpose(blob, (1, 0, 2, 3))
  blob = np.expand_dims(blob, axis=0)
  print(blob.shape)
  # pass the blob through the network to obtain our human activity
  # recognition predictions
  net.setInput(blob)
  outputs = net.forward()
  label = CLASSES[np.argmax(outputs)]
  print(f"label is: {label}")

  # loop over our frames
  for frame in frames:
    # draw the predicted activity on the frame
    cv2.rectangle(frame, (0, 0), (300, 40), (0, 0, 0), -1) #300, 40
    cv2.putText(frame, label, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

    # display the frame to our screen
    #cv2.imshow("Activity Recognition", frame) # colab이 아닌 그냥 Ipynb에서는 cv.imshow() 쓰면 돼요


    result.write(frame) # Unf
    meta_dict = {"image":frame, "label":label, "pos":processed}
    frame_and_metadatas.append(meta_dict)

    processed +=1
    print(f"{processed} frame processed!")
    key = cv2.waitKey(1) & 0xFF
    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
      break

(1, 3, 16, 112, 112)
label is: dunking basketball
1 frame processed!
2 frame processed!
3 frame processed!
4 frame processed!
5 frame processed!
6 frame processed!
7 frame processed!
8 frame processed!
9 frame processed!
10 frame processed!
11 frame processed!
12 frame processed!
13 frame processed!
14 frame processed!
15 frame processed!
16 frame processed!
(1, 3, 16, 112, 112)
label is: playing squash or racquetball
17 frame processed!
18 frame processed!
19 frame processed!
20 frame processed!
21 frame processed!
22 frame processed!
23 frame processed!
24 frame processed!
25 frame processed!
26 frame processed!
27 frame processed!
28 frame processed!
29 frame processed!
30 frame processed!
31 frame processed!
32 frame processed!
(1, 3, 16, 112, 112)
label is: playing squash or racquetball
33 frame processed!
34 frame processed!
35 frame processed!
36 frame processed!
37 frame processed!
38 frame processed!
39 frame processed!
40 frame processed!
41 frame processed!
42 frame processe

KeyboardInterrupt: 