In [1]:
import argparse
import logging
import time
import sys
import cv2
import numpy as np
import os

from tf_pose.estimator import BodyPart
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path, model_wh

In [2]:
logger = logging.getLogger('TfPoseEstimator-Video')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

fps_time = 0

In [3]:
def process_human_data(humans):
    
    if (len(humans)==0):
        return np.zeros(shape=(18,2))
                        
    feature = np.zeros(shape=(18,2))
    for i in range(18):
        if i not in humans[0].body_parts:
            feature[i] = [0, 0]
        else:
            feature[i] = [humans[0].body_parts[i].x, humans[0].body_parts[i].y]
    
    return feature
        

In [5]:
model_path='mobilenet_thin'
resolution = '320x240'
showBG=True
    
logger.debug('initialization %s : %s' % (model_path, get_graph_path(model_path)))
w, h = model_wh(resolution)
e = TfPoseEstimator(get_graph_path(model_path), target_size=(w, h))


video = '../UCF-101/PlayingPiano/v_PlayingPiano_g02_c01.avi'
cap = cv2.VideoCapture(video)

#---------------modified----------------#
num_frames = float(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print ("All Frames: " ,num_frames)
cur_frames = 0.0
step = (num_frames / 20.0) 
#---------------modified----------------#

fourcc = cv2.VideoWriter_fourcc(*'XVID')
width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
resize_out_ratio = 8.0
print("Image Size: %d x %d" % (width, height)) 

single_video_features = np.array([])
if cap.isOpened() is False:
    print("Error opening video stream or file")

while (cap.isOpened()):   
    if(cur_frames >= num_frames):
        break

    frame_no = (cur_frames/num_frames)
    cap.set(1,frame_no)
    ret_val, image = cap.read()
    
    print("Frame no: ", frame_no)
    print ("Count: ", cur_frames)
    
    if ret_val == True:
        humans = e.inference(image, resize_to_default=(w > 0 and h > 0), upsample_size=resize_out_ratio)
        #print ("Frame numbers: ", cur_frames, humans)
        frame_feature = process_human_data(humans) 
        single_video_features = np.append(single_video_features, frame_feature) 
        
    cur_frames+=step
    if cv2.waitKey(1) == 27:
        break
print (single_video_features)
cv2.destroyAllWindows()
logger.debug('finished+')

[2018-07-25 15:20:37,462] [TfPoseEstimator-Video] [DEBUG] initialization mobilenet_thin : /home/MPLab/mplab006/tf_action_recognition/models/graph/mobilenet_thin/graph_opt.pb
[2018-07-25 15:20:37,463] [TfPoseEstimator] [INFO] loading graph from /home/MPLab/mplab006/tf_action_recognition/models/graph/mobilenet_thin/graph_opt.pb(default size=320x240)


All Frames:  225.0
Image Size: 320 x 240
Frame no:  0.0
Count:  0.0
human_id= 0 , BodyPart:0-(0.71, 0.54) score=0.60
human_id= 0 , BodyPart:1-(0.73, 0.60) score=0.53
human_id= 0 , BodyPart:2-(0.72, 0.60) score=0.45
human_id= 0 , BodyPart:3-(0.69, 0.66) score=0.16
human_id= 0 , BodyPart:5-(0.76, 0.59) score=0.66
human_id= 0 , BodyPart:6-(0.75, 0.68) score=0.50
human_id= 0 , BodyPart:7-(0.69, 0.72) score=0.58
human_id= 0 , BodyPart:8-(0.71, 0.74) score=0.26
human_id= 0 , BodyPart:9-(0.69, 0.79) score=0.07
human_id= 0 , BodyPart:11-(0.75, 0.74) score=0.33
human_id= 0 , BodyPart:12-(0.71, 0.77) score=0.12
human_id= 0 , BodyPart:14-(0.71, 0.53) score=0.40
human_id= 0 , BodyPart:15-(0.72, 0.53) score=0.59
human_id= 0 , BodyPart:17-(0.73, 0.53) score=0.55
Frame no:  0.05
Count:  11.25
human_id= 0 , BodyPart:0-(0.71, 0.54) score=0.60
human_id= 0 , BodyPart:1-(0.73, 0.60) score=0.53
human_id= 0 , BodyPart:2-(0.72, 0.60) score=0.45
human_id= 0 , BodyPart:3-(0.69, 0.66) score=0.16
human_id= 0 , B

human_id= 0 , BodyPart:0-(0.71, 0.54) score=0.60
human_id= 0 , BodyPart:1-(0.73, 0.60) score=0.53
human_id= 0 , BodyPart:2-(0.72, 0.60) score=0.45
human_id= 0 , BodyPart:3-(0.69, 0.66) score=0.16
human_id= 0 , BodyPart:5-(0.76, 0.59) score=0.66
human_id= 0 , BodyPart:6-(0.75, 0.68) score=0.50
human_id= 0 , BodyPart:7-(0.69, 0.72) score=0.58
human_id= 0 , BodyPart:8-(0.71, 0.74) score=0.26
human_id= 0 , BodyPart:9-(0.69, 0.79) score=0.07
human_id= 0 , BodyPart:11-(0.75, 0.74) score=0.33
human_id= 0 , BodyPart:12-(0.71, 0.77) score=0.12
human_id= 0 , BodyPart:14-(0.71, 0.53) score=0.40
human_id= 0 , BodyPart:15-(0.72, 0.53) score=0.59
human_id= 0 , BodyPart:17-(0.73, 0.53) score=0.55
Frame no:  0.65
Count:  146.25
human_id= 0 , BodyPart:0-(0.71, 0.54) score=0.60
human_id= 0 , BodyPart:1-(0.73, 0.60) score=0.53
human_id= 0 , BodyPart:2-(0.72, 0.60) score=0.45
human_id= 0 , BodyPart:3-(0.69, 0.66) score=0.16
human_id= 0 , BodyPart:5-(0.76, 0.59) score=0.66
human_id= 0 , BodyPart:6-(0.75, 0

[2018-07-25 15:21:01,194] [TfPoseEstimator-Video] [DEBUG] finished+


human_id= 0 , BodyPart:0-(0.71, 0.54) score=0.60
human_id= 0 , BodyPart:1-(0.73, 0.60) score=0.53
human_id= 0 , BodyPart:2-(0.72, 0.60) score=0.45
human_id= 0 , BodyPart:3-(0.69, 0.66) score=0.16
human_id= 0 , BodyPart:5-(0.76, 0.59) score=0.66
human_id= 0 , BodyPart:6-(0.75, 0.68) score=0.50
human_id= 0 , BodyPart:7-(0.69, 0.72) score=0.58
human_id= 0 , BodyPart:8-(0.71, 0.74) score=0.26
human_id= 0 , BodyPart:9-(0.69, 0.79) score=0.07
human_id= 0 , BodyPart:11-(0.75, 0.74) score=0.33
human_id= 0 , BodyPart:12-(0.71, 0.77) score=0.12
human_id= 0 , BodyPart:14-(0.71, 0.53) score=0.40
human_id= 0 , BodyPart:15-(0.72, 0.53) score=0.59
human_id= 0 , BodyPart:17-(0.73, 0.53) score=0.55
(720,)


In [82]:


feature = np.zeros(shape=(18,2))
for i in range(18):
    if i not in humans[0].body_parts:
        feature[i] = [0, 0]
    else:
        feature[i] = [bp[i].x, bp[i].y]
feature = list(feature)

print(feature[0])


[0.475      0.22916667]


In [6]:
import os
import pickle
video_dict = { 'PlayingCello':0,'PlayingDaf':1,'PlayingDhol':2,'PlayingFlute':3,'PlayingGuitar':4,'PlayingPiano':5, 'PlayingSitar':6,'PlayingTabla':7,'PlayingViolin':8}
rootdir = '/home/MPLab/mplab006/UCF-101/'
def iterate_interest_dir():
    feature_set=[]
    for subdir, dirs, files in os.walk(rootdir):
        for dirss in dirs:
            if (dirss in video_dict):
                for filename in os.listdir(os.path.join(subdir,dirss)):
                    abs_path =os.path.join(subdir,dirss,filename)
                    feature =inference_video(abs_path)
                    classification = get_classification(dirss)
                    feature =list(feature)
                    print(feature)
                    feature_set.append([feature,classification])
                    #print(feature_set)
    with open('feature_set.pickle','wb') as file:
        pickle.dump(feature_set,file)
iterate_interest_dir()

[2018-07-25 17:04:56,175] [TfPoseEstimator-Video] [DEBUG] initialization mobilenet_thin : /home/MPLab/mplab006/tf_action_recognition/models/graph/mobilenet_thin/graph_opt.pb
[2018-07-25 17:04:56,176] [TfPoseEstimator] [INFO] loading graph from /home/MPLab/mplab006/tf_action_recognition/models/graph/mobilenet_thin/graph_opt.pb(default size=320x240)


human_id= 0 , BodyPart:0-(0.48, 0.20) score=0.86
human_id= 0 , BodyPart:1-(0.54, 0.28) score=0.76
human_id= 0 , BodyPart:2-(0.48, 0.27) score=0.70
human_id= 0 , BodyPart:3-(0.41, 0.37) score=0.56
human_id= 0 , BodyPart:4-(0.36, 0.41) score=0.67
human_id= 0 , BodyPart:5-(0.62, 0.28) score=0.63
human_id= 0 , BodyPart:6-(0.68, 0.43) score=0.77
human_id= 0 , BodyPart:7-(0.62, 0.33) score=0.50
human_id= 0 , BodyPart:8-(0.48, 0.58) score=0.43
human_id= 0 , BodyPart:9-(0.46, 0.74) score=0.10
human_id= 0 , BodyPart:10-(0.44, 0.85) score=0.12
human_id= 0 , BodyPart:11-(0.60, 0.58) score=0.65
human_id= 0 , BodyPart:12-(0.54, 0.75) score=0.81
human_id= 0 , BodyPart:13-(0.57, 0.95) score=0.19
human_id= 0 , BodyPart:14-(0.48, 0.17) score=0.73
human_id= 0 , BodyPart:15-(0.50, 0.18) score=0.85
human_id= 0 , BodyPart:17-(0.55, 0.17) score=0.81
human_id= 0 , BodyPart:0-(0.48, 0.20) score=0.86
human_id= 0 , BodyPart:1-(0.54, 0.28) score=0.76
human_id= 0 , BodyPart:2-(0.48, 0.27) score=0.70
human_id= 0 ,

human_id= 0 , BodyPart:0-(0.48, 0.20) score=0.86
human_id= 0 , BodyPart:1-(0.54, 0.28) score=0.76
human_id= 0 , BodyPart:2-(0.48, 0.27) score=0.70
human_id= 0 , BodyPart:3-(0.41, 0.37) score=0.56
human_id= 0 , BodyPart:4-(0.36, 0.41) score=0.67
human_id= 0 , BodyPart:5-(0.62, 0.28) score=0.63
human_id= 0 , BodyPart:6-(0.68, 0.43) score=0.77
human_id= 0 , BodyPart:7-(0.62, 0.33) score=0.50
human_id= 0 , BodyPart:8-(0.48, 0.58) score=0.43
human_id= 0 , BodyPart:9-(0.46, 0.74) score=0.10
human_id= 0 , BodyPart:10-(0.44, 0.85) score=0.12
human_id= 0 , BodyPart:11-(0.60, 0.58) score=0.65
human_id= 0 , BodyPart:12-(0.54, 0.75) score=0.81
human_id= 0 , BodyPart:13-(0.57, 0.95) score=0.19
human_id= 0 , BodyPart:14-(0.48, 0.17) score=0.73
human_id= 0 , BodyPart:15-(0.50, 0.18) score=0.85
human_id= 0 , BodyPart:17-(0.55, 0.17) score=0.81


KeyboardInterrupt: 

In [4]:
def get_classification(filename):
    label=np.zeros(shape=(9))
    label[video_dict[filename]]=1
    return label

In [5]:
def inference_video(path):
    model_path='mobilenet_thin'
    resolution = '320x240'
    showBG=True

    logger.debug('initialization %s : %s' % (model_path, get_graph_path(model_path)))
    w, h = model_wh(resolution)
    e = TfPoseEstimator(get_graph_path(model_path), target_size=(w, h))


    cap = cv2.VideoCapture(path)

    #---------------modified----------------#
    num_frames = float(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    #print ("All Frames: " ,num_frames)
    cur_frames = 0.0
    step = (num_frames / 20.0) 
    #---------------modified----------------#

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    resize_out_ratio = 8.0
    #print("Image Size: %d x %d" % (width, height)) 

    single_video_features = np.array([])
    if cap.isOpened() is False:
        print("Error opening video stream or file")

    while (cap.isOpened()):   
        if(cur_frames >= num_frames):
            break

        frame_no = (cur_frames/num_frames)
        cap.set(1,frame_no)
        ret_val, image = cap.read()

        #print("Frame no: ", frame_no)
        #print ("Count: ", cur_frames)

        if ret_val == True:
            humans = e.inference(image, resize_to_default=(w > 0 and h > 0), upsample_size=resize_out_ratio)
            #print ("Frame numbers: ", cur_frames, humans)
            frame_feature = process_human_data(humans) 
            single_video_features = np.append(single_video_features, frame_feature) 

        cur_frames+=step
        if cv2.waitKey(1) == 27:
            break
    #print (single_video_features)
    cv2.destroyAllWindows()
    #logger.debug('finished+')
    return single_video_features
    