In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Conv1D,MaxPooling1D, Flatten
import cv2 as cv
import numpy as np
import os
import mediapipe as mp
import json
import copy
import math

In [None]:
# Vocabulary of signs the classifier distinguishes. The position of each word
# is its class index, so the order must not change.
features = [
    "eat",
    "fish",
    "milk",
    "cousin",
    "want",
    "nice",
]

In [None]:
# Short aliases for the MediaPipe sub-modules used in this notebook:
# `mp_holistic` provides the Holistic landmark detector used by the live loop;
# `mp_drawing` is the landmark-drawing helper module (not referenced by the
# cells visible here).
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [None]:
# Rebuild the network architecture so that saved weights can be loaded into it.
# Input: a window of 35 frames, each described by 1086 values
# (543 landmarks x 2 normalised coordinates, as produced by the live loop).
model = Sequential()

# CNN layers: a temporal convolution followed by pooling over the frame axis.
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(35, 1086)))
model.add(MaxPooling1D(pool_size=2))


# LSTM layers: only the last one collapses the sequence to a single vector.
model.add(LSTM(64, return_sequences=True, activation='relu'))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))

# Dense layers
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
# One output unit per sign. `features` is a plain Python list, so its length
# is taken with len(); a list has no `.shape` attribute.
model.add(Dense(len(features), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Load the trained weights into the architecture built above. The path is
# relative, so the .h5 file must be in the notebook's working directory.
model.load_weights("model_data_17(idle_gest_iso_reduMax2).h5")

In [None]:
# Class labels as a NumPy array; the live loop indexes it with the predicted
# class index (actions[np.argmax(res)]) to obtain the recognised word.
actions = np.array(features)

In [None]:
# Detects human landmarks in a frame and returns them as results.
def mediapipe_detection(image,model):
    """Run a MediaPipe model on one OpenCV frame.

    Args:
        image: frame in OpenCV's BGR channel order.
        model: object exposing ``process(rgb_image)`` (e.g. a Holistic instance).

    Returns:
        ``(image, results)`` where ``image`` is a BGR copy of the input frame
        and ``results`` is whatever ``model.process`` returned.
    """
    # The model is fed an RGB image, while OpenCV frames are BGR.
    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    # Mark the buffer read-only while the model runs, then restore it.
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    # Convert back for OpenCV. The image is RGB at this point, so the matching
    # code is RGB2BGR (the same channel swap, but named for what it does).
    image = cv.cvtColor(image, cv.COLOR_RGB2BGR)
    return image, results

In [None]:
# Collects the landmarks in `results` into one flat np.array of x, y, z values.
def keypoints_extraction(results):
    """Flatten the landmark groups of ``results`` into a single 1-D array.

    Each group contributes ``x, y, z`` for every landmark it holds. A group
    that is missing (falsy) contributes a block of zeros sized for its usual
    landmark count (468 face, 33 pose, 21 per hand).

    Returns:
        1-D array laid out as ``[pose, face, left hand, right hand]``.
    """
    def _flat_xyz(group, expected_count):
        # Zero block when the detector produced nothing for this group.
        if not group:
            return np.zeros(expected_count * 3)
        coords = [[lm.x, lm.y, lm.z] for lm in group.landmark]
        return np.array(coords).flatten()

    face_vec = _flat_xyz(results.face_landmarks, 468)
    pose_vec = _flat_xyz(results.pose_landmarks, 33)  # visibility is intentionally left out
    left_vec = _flat_xyz(results.left_hand_landmarks, 21)
    right_vec = _flat_xyz(results.right_hand_landmarks, 21)

    return np.concatenate([pose_vec, face_vec, left_vec, right_vec])

In [None]:
# Converts normalised coordinates into pixel coordinates.
def locate_it(norm_val, w, h):
    """Map a normalised ``(x, y)`` position onto a ``w`` x ``h`` pixel grid.

    ``norm_val`` may be an object with ``.x`` / ``.y`` attributes or an
    indexable pair. Each coordinate is truncated to an int and capped at the
    last valid index (``w - 1`` / ``h - 1``); values below zero are returned
    as they are.
    """
    try:
        nx, ny = norm_val.x, norm_val.y
    except AttributeError:
        # Not a landmark-like object: treat it as an (x, y) sequence.
        nx, ny = norm_val[0], norm_val[1]

    px = int(nx * w)
    py = int(ny * h)

    return (min(px, w - 1), min(py, h - 1))


In [None]:
def get_rect_cords(frame,results):
    """Compute a head-to-hip crop rectangle around the detected person.

    Args:
        frame: image providing the pixel dimensions; normally an array with a
            ``shape`` attribute.
        results: detection results exposing ``pose_landmarks`` and
            ``face_landmarks``.

    Returns:
        ``([top_left, bottom_right], centre_point, st_norm_val)`` where the
        rectangle corners are pixel ``(x, y)`` tuples, ``centre_point`` is the
        normalised midpoint between the shoulders and ``st_norm_val`` is the
        normalised head reference point.

    Raises:
        AttributeError: when ``results.pose_landmarks`` is missing. The live
            loop relies on this to fall back to the previous crop.
    """
    try:
        # Slice so that both colour (H, W, C) and grayscale (H, W) frames work.
        height, width = frame.shape[:2]
    except AttributeError:
        # No `shape`: presumably a PIL-style image, whose `size` is
        # (width, height) -- NOTE(review): confirm against the actual caller.
        width, height = frame.size

    top_head_index_in_face_landmarks = 10
    hip_index_in_pose_landmarks = (24, 23)

    pose = results.pose_landmarks.landmark
    left_shoulder = pose[11]
    right_shoulder = pose[12]

    centre_point = ((left_shoulder.x + right_shoulder.x) / 2, (left_shoulder.y + right_shoulder.y) / 2)

    if results.face_landmarks:
        # Face mesh available: take the head reference point directly from it.
        face_loc = results.face_landmarks.landmark[top_head_index_in_face_landmarks]
        st_norm_val = (face_loc.x, face_loc.y)
        head_point = locate_it(face_loc, width, height)
    else:
        # No face mesh: estimate the top of the head from the pose landmarks by
        # going up from the nose by three times the nose-to-inner-eye distance.
        eye_inner = pose[4]
        nose = pose[0]

        norm_dist = math.hypot(eye_inner.x - nose.x, eye_inner.y - nose.y)
        st_norm_val = (nose.x, nose.y - norm_dist * 3)

        nose_px = locate_it(nose, width, height)
        eye_px = locate_it(eye_inner, width, height)
        dist = math.hypot(eye_px[0] - nose_px[0], eye_px[1] - nose_px[1])
        head_point = (nose_px[0], nose_px[1] - int(dist) * 3)

    # Midpoint between the two hips, in pixels.
    hip_a = locate_it(pose[hip_index_in_pose_landmarks[0]], width, height)
    hip_b = locate_it(pose[hip_index_in_pose_landmarks[1]], width, height)
    pose_point = (int((hip_a[0] + hip_b[0]) / 2), int((hip_a[1] + hip_b[1]) / 2))
    distance = int(math.hypot(pose_point[0] - head_point[0], pose_point[1] - head_point[1]))

    # Widen the crop by half the head-to-hip distance on each side, plus the
    # largest margin (30, 20, 10 or 0 px) that still keeps the left edge
    # inside the frame.
    half = int(distance / 2)
    left_edge = head_point[0] - half
    margin = next((m for m in (30, 20, 10) if left_edge - m + 1 > 0), 0)
    st_pt1 = left_edge - margin
    st_pt2 = pose_point[0] + half + margin

    # Clamp to the frame and leave 10 px of head-room above the head point.
    rects_st_pt = (max(st_pt1, 0), max(head_point[1] - 10, 0))
    rects_sp_pt = (min(st_pt2, width - 1), min(pose_point[1], height - 1))

    return [rects_st_pt, rects_sp_pt],centre_point,st_norm_val

In [None]:
# Live recognition loop: read webcam frames, crop around the signer, turn the
# landmarks into crop-relative coordinates, and classify a sliding window of
# frames while a gesture is in progress. Press "q" in a preview window to quit.
cap = cv.VideoCapture(0)
sequence = []                  # crop-normalised keypoint vectors of the current gesture
threshold = 0.30               # minimum softmax score for a prediction to be shown
predictions = []               # history of predicted class indices
current_frame = 0              # number of loop iterations so far
can_record = False             # True from the first hand sighting until the gesture ends
mode = "Loading..."
idle_gest_reg = np.array([])   # recent per-frame hand presence: 1 = hand seen, None = no hand

can_do_live_text = False       # True while the predicted word is being shown
live_text_frame_number = 0
predicted_text = ""
alpha = 0.0                    # blend weight of the text overlay
last_crop = [(0,0),(0+300, 0+300)]
frame_number = 1

# Side lengths of the square, fixed-size view the landmarks are normalised to.
s_height = 300
s_width = 300

SEQUENCE_LENGTH = 35   # frames per model input; must match the model's input_shape
IDLE_WINDOW = 7        # this many consecutive hand-less frames end a gesture
STABLE_WINDOW = 10     # recent predictions that must agree before text is shown
FADE_FRAMES = 20       # frames over which the predicted text fades out


def _crop_view(frame, rect):
    """Return ``frame`` cut to ``rect`` ([(x0, y0), (x1, y1)]), or None if the cut is empty."""
    (x0, y0), (x1, y1) = rect
    if min(x0, y0, x1, y1) < 0:
        # Negative indices would silently wrap around in NumPy slicing.
        return None
    view = frame[y0:y1, x0:x1]
    return view if view.size else None


try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        while True:

            ret, ori_img = cap.read()
            current_frame += 1

            if not ret:
                # Without a frame there is nothing to analyse or display.
                print("Camera frame could not be read. Stopping...")
                break

            frame_h, frame_w = ori_img.shape[:2]
            _, results = mediapipe_detection(ori_img, holistic)
            keypoints = keypoints_extraction(results)

            try:
                crop, _, _ = get_rect_cords(ori_img, results)
            except AttributeError:
                # Required landmarks are missing in this frame: reuse the last crop.
                crop = last_crop

            # Guard against degenerate rectangles (e.g. signer partly out of
            # frame), which would otherwise cause a division by zero below.
            crop_frame = _crop_view(ori_img, crop)
            if crop_frame is None:
                crop = last_crop
                crop_frame = _crop_view(ori_img, crop)
            if crop_frame is None:
                crop = [(0, 0), (frame_w, frame_h)]
                crop_frame = ori_img
            last_crop = crop

            width_scale = s_width / crop_frame.shape[1]
            height_scale = s_height / crop_frame.shape[0]

            # cv.resize takes (width, height) and returns a new image, so
            # drawing on it leaves the camera frame untouched.
            standard_frame = cv.resize(crop_frame, (s_width, s_height))

            # --- gesture start / end detection --------------------------------
            hand_visible = bool(results.left_hand_landmarks or results.right_hand_landmarks)
            idle_gest_reg = np.append(idle_gest_reg, 1 if hand_visible else None)[-IDLE_WINDOW:]
            if hand_visible:
                can_record = True

            # Element-wise comparison against None on purpose (array, not identity).
            is_idle = bool(np.all(idle_gest_reg == None))

            if can_record and len(idle_gest_reg) == IDLE_WINDOW:
                if is_idle:
                    print("Gesture Ended! Stopping...")
                    can_record = False
                    frame_number = 1
                    sequence = []
                else:
                    frame_number += 1

            # --- landmarks relative to the crop, normalised to the standard view
            st_pt = (crop[0][0] / frame_w, crop[0][1] / frame_h)
            frame_keypoints = []

            for i in range(0, len(keypoints), 3):

                loc_ = (keypoints[i] - st_pt[0], keypoints[i+1] - st_pt[1])
                point = locate_it(loc_, frame_w, frame_h)
                standard_point = (int(point[0] * width_scale), int(point[1] * height_scale))

                if 0 <= standard_point[0] <= s_width and 0 <= standard_point[1] <= s_height:
                    cv.circle(standard_frame, standard_point, 1, (255,255, 0), 1)
                    frame_keypoints.extend((standard_point[0] / s_width, standard_point[1] / s_height))
                else:
                    # Point lies outside the standard view (including landmarks
                    # that were not detected): encode it as the origin.
                    # NOTE(review): confirm the training data was preprocessed
                    # with the same rule.
                    frame_keypoints.extend((0.0, 0.0))

            custom_keypoints = np.array(frame_keypoints, dtype=np.float64)

            # --- classification -------------------------------------------------
            if can_record:
                sequence.append(custom_keypoints)
                sequence = sequence[-SEQUENCE_LENGTH:]

            if len(sequence) == SEQUENCE_LENGTH:
                res = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best = int(np.argmax(res))
                predictions.append(best)

                # Only react when the recent predictions all agree with the
                # current one, so a single noisy frame does not flip the text.
                if all(p == best for p in predictions[-STABLE_WINDOW:]):
                    if res[best] > threshold:
                        predicted_text = actions[best]
                        can_do_live_text = True
                        live_text_frame_number = 0
                        alpha = 1.0
                    else:
                        predicted_text = "Low Accuracy"

            # Fade the text out linearly; alpha never drops below zero.
            if can_do_live_text:
                if live_text_frame_number < FADE_FRAMES:
                    live_text_frame_number += 1
                    alpha = max(0.0, alpha - 1.0 / FADE_FRAMES)
                else:
                    can_do_live_text = False

            mode = "idle" if is_idle else "gest"

            # --- display ----------------------------------------------------------
            cv.putText(ori_img, mode, (3, 60), cv.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 0, cv.LINE_AA)
            cv.putText(standard_frame, "{}FN:{}".format(mode, frame_number), (int(s_height/2), 20), cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 0, cv.LINE_AA)

            if can_do_live_text:
                overlay = standard_frame.copy()
                cv.putText(overlay, predicted_text,(125, 280), cv.FONT_HERSHEY_SIMPLEX, 0.9, (255, 120, 50), 2)
                cv.addWeighted(overlay, alpha, standard_frame, 1 - alpha, 0, standard_frame)

            cv.imshow('standard_Frame', standard_frame)
            cv.imshow('full_screen', ori_img)

            if (cv.waitKey(1) & 0xFF == ord('q')):
                break
finally:
    # Always free the camera and close the windows, even after an error.
    cap.release()
    cv.destroyAllWindows()