In [1]:
# Import and install dependencies.
!pip install tensorflow==2.4.1 tensorflow-gpu==2.4.1 opencv-python mediapipe sklearn matplotlib



In [2]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [3]:
# We will setup Media Pipe Holistic

#This is mediapipe holistic model which makes the detections.
mp_holistic = mp.solutions.holistic

#This is mediapipe holistic drawing utilities which will draw the detection.
mp_drawing = mp.solutions.drawing_utils

In [15]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) #This is color coversion from BGR to RGB.
    image.flags.writeable = False  #image is no longer writeable.
    results = model.process(image) #Make prediction.
    image.flags.writeable = True  #Image is now writeable.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) #This is color conversion from RGB to BGR.
    return image, results

In [16]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION)
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [22]:
def enhanced_landmarks(image, results):
    #Draw Face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                             mp_drawing.DrawingSpec(color=(80, 110, 10), thickness=1, circle_radius=1),
                             mp_drawing.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             )
    #Draw pose Connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    #Draw left Hand Connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    #Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245, 117, 76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [23]:
#first we will access webcam using cv2

#This is accessing the webcam
cap = cv2.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    while cap.isOpened():
        
        #capturig frames from the webcam
        ret, frame = cap.read()

        #between read and rendering we want to make our prediction.
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        #Draw ladmarks in real time
        enhanced_landmarks(image, results)

        #Showing frames on the screen.
        cv2.imshow('OpenCV Feed', image)

        #Breaking gracefully from the loop.
        #This line specifies that when we want to break out of loop we need to press 'q' on the keyboard.
        if cv2.waitKey(10) & 0XFFF == ord('q'):
            break;   
    cap.release()
    #This line of code closes alll the windows.
    cv2.destroyAllWindows()

#Then on above that layer we will apply mediapipe

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [21]:
cap.release()
#This line of code closes alll the windows.
cv2.destroyAllWindows()

In [24]:
print(results.pose_landmarks.landmark[0].visibility)
print(len(results.pose_landmarks.landmark))

0.9998953342437744
33


In [25]:
pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
#print(len(pose))  ------>   33
#print(pose.shape)  ------>   (33, 4)
#pose = pose.flatten()
#print(pose.shape)   ------>  (33, 4)
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)

lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
print(len(lh)) #----->  21
#print(lh.shape)  -----> (21, 3)
#lh = lh.flatten()
#print(lh.shape)   -----> (63,)

rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
#print(len(rh)) ------>  21
#print(rh.shape)  -------> (21, 3)
#rh = rh.flatten()  
#print(rh.shape)  -------> (63,)

63


In [26]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

In [27]:
extract_keypoints(results).shape

(1662,)

In [28]:
result_test = extract_keypoints(results)

In [29]:
result_test

array([ 0.57515895,  0.30823773, -0.85163993, ...,  0.        ,
        0.        ,  0.        ])

In [None]:
#To save each frame as np array we will use this method
#np.save('0', result_test)

In [None]:
#To load back that frame as np array
#np.load('0.npy')

# Video classification methods

In [None]:
# no_frame = 15
# Height = 255
# Width = 255

# #Function to load video from the folder
# def load_video_names(path):
#     videos = []  #list to store videos names only
#     labels = []  #list to store labels of the video which action is performed in the video
#     for category in os.listdir(path):  #This will go throgh all folders 
#         for video in os.listdir(path+"/"+ category):  #This will go throughh all videos in the folder
#             videos.append(path+"/"+category+"/"+video)  #This is complete path of video
#             labels.append(category) #This is the label
#     return np.array(videos), np.array(labels)


# #Function to convert each frame pixels in range between -1 to +1
# def preprocess(frame):
#     frame = cv2.resize(frame, (Width, Height)) # This will resize the frame
#     frame = frame-127.5
#     frame = frame/127.5
#     return frame


# #This function will load from its path, this will be used only when training. It will return a single video
# def load_video(video_path):
#     video_frames = [] #frame of a single video
#     cap = cv2.VideoCapture(video_path) #path of video, to load video using opencv
#     while True:
#         ret, frame = cap.read()
#         if ret == True:
#             video_frames.append(preprocess(frame)) #store all frames
#         else:
#             break
#     cap.release()
#     video_frames = select_frames(video_frames) #This is used to choose specific frames
#     if len(video_frames) != frames: #whether we have desired number of frames or not
#         print('short_video', video_path, len(video_frames))
        
#     return np.array(video_frames)




# #Function to choose desired number of frames, since some video have more frames
# def select_frames(video_frames):
#     selected_frames = []
#     if len(video_frames) > frames:
#         fn = len(video_frames)//frames #This will selecst every nth frame
#         f_num = 0
#         for f in video_frames:
#             if len(selected_frames) < frames:
#                 if f_num % fn == 0:
#                     selected_frames.append(f)
#             f_num += 1
#     else:
#         selected_frames = video_frames
#     return selected_frames


# def create_dataset(videos, labels, index):
#     '''
#     this function will be used to load batch of video by using previous function
#     which load single video.
#     parameters:
#     videos : path of all videos
#     labels : output labels of videos
#     index : index of videos to be used in batch
#     '''
    
#     x = []
#     y = []
#     for video, label in zip(videos[index], labels[index]):
#         x.append(load_video(video))
#         y.append(label)
        
#     return np.arrays(x), np.array(y)



# videos, labels = load_video_names('./folder name') # This will load all labels and video paths
# samples = len(videos)

# We will provide training data through webcam

In [30]:
#path for exprted data, numpy arrays
DATA_PATH = os.path.join('MP-Data')

#Actions that we try to detect
#First we will train our model for alphabet
actions = np.array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'])

#Thirty videos worth of data
no_sequences = 30

#videos are going to be 30 frames in length
sequence_length = 30

In [31]:
#This traverse through every actions or labels and inside that action there will be thirty videos, each of thirty frames.
for action in actions:
    for sequence in range(no_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

In [38]:
cap = cv2.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    #Loop through actions
    for action in actions:
        #Loop through each videos
        for sequence in range(no_sequences):
            #Loop through video length
            for frame_num in range(sequence_length):
                #capturig frames from the webcam
                ret, frame = cap.read()

                #between read and rendering we want to make our prediction.
                image, results = mediapipe_detection(frame, holistic)
                print(results)

                #Draw ladmarks in real time
                draw_landmarks(image, results)

                #Apply wait Logic.
                if frame_num == 0:
                    cv2.putText(image, 'STARTING COLLECTION', (120,200),
                               cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 1, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} video Number {}'.format(action, sequence), (15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

                #Now we will extract keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)


                #Showing frames on the screen.
                cv2.imshow('OpenCV Feed', image)

                #Breaking gracefully from the loop.
                #This line specifies that when we want to break out of loop we need to press 'q' on the keyboard.
                if cv2.waitKey(10) & 0XFFF == ord('q'):
                    break;  
                    
    cap.release()
    #This line of code closes alll the windows.
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

KeyboardInterrupt: 

In [39]:
cap.release()
cv2.destroyAllWindows()