### Required packages
h5py
Keras
numpy
opencv-python
scikit-learn


In [1]:
%matplotlib widget
#load library
import ipywidgets as widgets
import cv2
import time
import numpy as np
import pandas as pd
from ipywebrtc import CameraStream, ImageRecorder, VideoRecorder
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras 
import sklearn as sk
from sklearn.datasets import make_classification
import scikitplot as skplt
from sklearn.model_selection import train_test_split # Helps with organizing data for training
from sklearn.metrics import confusion_matrix # Helps present results as a confusion-matrix

In [2]:
# ---- Parameters ----
# Excel workbook with the per-frame landmark annotations.
video_landmark_data_path = 'Video Data.xlsx'

# Folder holding the videos, and the sub-folders inside it that are scanned.
video_subdirecory_name = "videos"
directory_names = [
    "102", "159", "294", "441", "564",
    "576", "609", "666", "711", "723",
]

# Video base names looked up in every sub-folder, tried with each extension.
conditions = [
    "open_palm",
    "open_dorsal",
    "fist_palm",
    "fist_dorsal",
    "three_fingers_palm",
    "three_fingers_dorsal",
]
video_file_extensions = [".mp4", ".webm"]

# Column layout of the generated annotation table.
columns = ['filepath', 'x1', 'y1', 'x2', 'y2', 'class_name']

# Margin added to the width and height of the landmark bounding box.
hand_detect_tune_param = 30

# Note: the training input is the full set of landmarks; the output column is
# named output_label and holds a class from 0 to 5.

# Template for the path of each extracted training image.
train_image_path = "train_images_hand_specialization/{}.png"

In [3]:
#process data function.
def processedData(data):
    """Encode the text labels of the landmark table as integer classes.

    The pair of values found in the ``camera_facing_side`` and ``gesture``
    columns is mapped to a single class id (0 to 5) instead of a one-hot
    encoding, and ``gesture`` itself is encoded as 0 (``'palm'``) or
    1 (``'dorsal'``).

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing at least the ``camera_facing_side`` and ``gesture``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``camera_facing_side`` has been replaced by the
        int64 column ``output_label`` and ``gesture`` is int64.

    Raises
    ------
    ValueError
        Raised by ``astype`` when a row is left with a value that cannot be
        converted to an integer (i.e. an unrecognised label).
    """
    # Work on a copy: the original code aliased `data`, so the caller's frame
    # was overwritten and a second call on the same frame failed.
    processing_data = data.copy()
    # Integer labels are written into columns that initially hold text; object
    # dtype accepts both (presumably required for pandas' dedicated string
    # dtypes, harmless otherwise).
    processing_data = processing_data.astype({"camera_facing_side": object, "gesture": object})

    # Snapshot of the text labels so every mask is built from the raw values.
    first_label = processing_data['camera_facing_side'].copy()
    second_label = processing_data['gesture'].copy()

    class_by_label_pair = {
        ('open', 'palm'): 0,
        ('fist', 'palm'): 1,
        ('three_fingers', 'palm'): 2,
        ('open', 'dorsal'): 3,
        ('fist', 'dorsal'): 4,
        ('three_fingers', 'dorsal'): 5,
    }
    for (first_value, second_value), class_id in class_by_label_pair.items():
        matches = first_label.isin([first_value]) & second_label.isin([second_value])
        processing_data.loc[matches, 'camera_facing_side'] = class_id

    # This is used later to classify whether the palm or the dorsal side of
    # the hand is shown in a raw image.
    processing_data.loc[second_label == 'palm', 'gesture'] = 0
    processing_data.loc[second_label == 'dorsal', 'gesture'] = 1

    # Convert both label columns to integer type.
    processing_data = processing_data.astype({"camera_facing_side": np.int64, "gesture": np.int64})
    # Rename the class column to the name used as the training target.
    processing_data = processing_data.rename(columns={'camera_facing_side': 'output_label'})
    return processing_data



In [4]:
# Read the landmark workbook, discard incomplete rows and move the index
# column back into the table as a regular column.
test_data = (
    pd.read_excel(video_landmark_data_path, index_col=0, comment='#')
    .dropna()
    .reset_index()
)
# Encode the labels as integers; the 'ID' column is not needed afterwards.
processed_test_data = processedData(test_data).drop(columns=['ID'])

**Hand-region extraction.** The function below derives the hand area used to train our model from the provided video. For each frame it takes xmin, xmax, ymin and ymax of the landmarks supplied in `Video Data.xlsx` and enlarges the resulting rectangle by a tuning margin so that it encloses the exact hand. (This note originally suggested a margin of 20; the parameters cell currently sets 30.)
Parameter name: `hand_detect_tune_param`

In [5]:
#Function to compute the padded hand bounding box of a frame from its landmarks
def Extract_hand_from_train_image(image_frame, img_x_test_input_array, hand_tune_param):
    """Return the frame together with a padded bounding box around its landmarks.

    The first 80 columns of the first row of ``img_x_test_input_array`` are
    read as 40 consecutive (x, y) pairs. A pair is ignored when neither of its
    coordinates is greater than 0. The box spans the remaining landmarks and
    is widened and heightened by ``hand_tune_param`` in total, centred on the
    (integer-truncated) middle of the landmarks.

    Parameters
    ----------
    image_frame : array-like with ``.shape``
        Frame the landmarks belong to; ``shape[0]`` is used as the height and
        ``shape[1]`` as the width. The frame is returned unchanged.
    img_x_test_input_array : pandas.DataFrame
        Landmark table; only row 0 and columns 0..79 are read.
    hand_tune_param : number
        Margin added to both the width and the height of the box.

    Returns
    -------
    tuple
        ``(image_frame, X1, Y1, X2, Y2)`` with integer corner coordinates.
        All four coordinates are ``-1`` when the box touches or crosses the
        frame border, or when no usable landmark was found.
    """
    xpos_list = []
    ypos_list = []
    for indexval in range(0, 80, 2):
        xpos = img_x_test_input_array.iloc[0, indexval]
        ypos = img_x_test_input_array.iloc[0, indexval + 1]
        if xpos > 0 or ypos > 0:
            xpos_list.append(xpos)
            ypos_list.append(ypos)

    # Without any usable landmark min()/max() would raise ValueError; report
    # the frame as unusable with the same sentinel as an out-of-bounds box.
    if not xpos_list:
        return image_frame, -1, -1, -1, -1

    X_min, X_max = min(xpos_list), max(xpos_list)
    Y_min, Y_max = min(ypos_list), max(ypos_list)

    hh = Y_max - Y_min + hand_tune_param
    ww = X_max - X_min + hand_tune_param
    centerx = int((X_min + X_max) / 2)
    centery = int((Y_min + Y_max) / 2)

    X1 = int(centerx - ww / 2)
    Y1 = int(centery - hh / 2)
    X2 = int(centerx + ww / 2)
    Y2 = int(centery + hh / 2)

    # Reject boxes that reach or leave the frame boundary.
    im_height = image_frame.shape[0]
    im_width = image_frame.shape[1]
    if X1 <= 0 or Y1 <= 0 or X2 >= im_width or Y2 >= im_height:
        X1 = X2 = Y1 = Y2 = -1

    return image_frame, X1, Y1, X2, Y2

#Save all images from the videos to the train_images path and generate a text file with columns=['filepath', 'x1',  'y1', 'x2','y2','class_name']. Here we have only one class, called hand; anything else is not identified, so only that one class has to be written to the text file.

In [None]:
# Do not run this section if you have already saved the images to the train folder and generated the text file

In [6]:

import os

# Set to True to also write every annotated frame to train_image_path.
save_train_images = False

# Rows are collected in plain lists and turned into DataFrames once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every call.
annotation_records = []
bad_annotation_records = []

current_working_directory = os.getcwd()

# Settings for an optional annotated output video (no writer is created here).
backend = cv2.CAP_ANY
fourcc_code = cv2.VideoWriter_fourcc(*"H264")
fps = 24
frame_size = (640, 480)

if save_train_images:
    os.makedirs(os.path.dirname(train_image_path), exist_ok=True)

for directory_name in directory_names:
    for video_name in conditions:
        for extension in video_file_extensions:
            # os.path.join keeps the path valid on every operating system
            # (the separator was previously hard-coded as a backslash).
            video_path = os.path.join(directory_name, video_name)
            input_video = cv2.VideoCapture(os.path.join(video_subdirecory_name, video_path + extension))

            # Annotations of this video only, selected once instead of
            # scanning the full table for every frame. The 'source' column
            # uses a forward slash regardless of the operating system.
            source_key = directory_name + "/" + video_name + extension
            video_rows = processed_test_data.loc[processed_test_data.source.isin([source_key])]

            counter = 0
            try:
                while True:
                    # Read-then-process: the frame obtained here is the one
                    # with index `counter`. (Previously the first frame was
                    # read and discarded before the loop, so every frame was
                    # paired with the annotation of the preceding index.)
                    ret, frame = input_video.read()
                    if not ret:
                        break

                    data_frame = video_rows.loc[video_rows.frame.isin([counter])]
                    # Input landmarks only (label and bookkeeping columns removed).
                    X_test_input = data_frame.drop(columns=['output_label', 'gesture', 'source', 'frame'])
                    if not X_test_input.empty:
                        # Class of the frame, taken from the annotation data.
                        X_test_output = data_frame['output_label'].iloc[0]
                        output_frame, x_min, y_min, x_max, y_max = Extract_hand_from_train_image(
                            frame, X_test_input, hand_detect_tune_param)
                        fileName = directory_name + "_" + video_name + "_" + str(counter)
                        # All four coordinates are -1 when no valid box was found.
                        if x_min > 0 or y_min > 0 or x_max > 0 or y_max > 0:
                            if save_train_images:
                                cv2.imwrite(train_image_path.format(fileName), frame)
                            annotation_records.append({
                                columns[0]: train_image_path.format(fileName),
                                columns[1]: x_min,
                                columns[2]: y_min,
                                columns[3]: x_max,
                                columns[4]: y_max,
                                columns[5]: str(X_test_output),
                            })
                        else:
                            bad_annotation_records.append({'file_name': fileName})
                    counter += 1
            finally:
                # Release the capture even if processing a frame fails.
                input_video.release()

output_data_frame = pd.DataFrame(annotation_records, columns=columns)
bad_annotated_data_frame = pd.DataFrame(bad_annotation_records, columns=['file_name'])

bad_annotated_data_frame.to_csv('bad_annotated_data-fame.csv', index=False)

# Generate the annotation text file

In [7]:
# Build one "filepath,xmin,ymin,xmax,ymax,class" line per annotated frame.
# The lines are assembled in a single pass: the previous element-wise
# `data['format'][i] = ...` was chained assignment, which emits
# SettingWithCopyWarning and does not update `data` at all when pandas
# copy-on-write is active.
annotation_lines = [
    ','.join([filepath, str(x1), str(y1), str(x2), str(y2), class_name])
    for filepath, x1, y1, x2, y2, class_name in zip(
        output_data_frame['filepath'],
        output_data_frame['x1'],
        output_data_frame['y1'],
        output_data_frame['x2'],
        output_data_frame['y2'],
        output_data_frame['class_name'],
    )
]
data = pd.DataFrame({'format': annotation_lines})

# Single column, no header and no index: each row is written as one line.
data.to_csv('open_three_closed_annotate.txt', header=None, index=None, sep=' ')

# Train the frcnn hand detection and specialization model using a terminal
The parser can be pascal_voc or simple; we used simple. Go to the location of train_frcnn.py and run the command below.


In [None]:
# Terminal command (not Python): run it from the directory containing train_frcnn.py.
python train_frcnn.py -o simple -p open_three_closed_annotate.txt