# 1. Import and Install Dependencies

In [None]:
# !pip install tensorflow==2.12.0 tensorflow-gpu==2.12.0 opencv-python mediapipe sklearn matplotlib
# install dlib

In [1]:
# chay 1
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import math 

# 2. Keypoints using MP Holistic

In [2]:
# chay 2
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
# chay 3
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [4]:
# chay 4
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACE_CONNECTIONS) # Draw face connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS) # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections

In [5]:
# chay 5
def draw_styled_landmarks(image, results):
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             ) 
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

# 3. Extract Keypoint Values

In [6]:
# chay 6
def extract_keypoints_reduce(results):   
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark[:25]]).flatten() if results.pose_landmarks else np.zeros(25*4)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, lh, rh])

# 4. Setup Folders for Collection

# 5. Collect Keypoint Values for Training and Testing

In [34]:
# chay 7
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data_reduce') 
VIDEO_PATH = os.path.join('Videos') 
    
# Actions that we try to detect 'Correct!', 'hi'
actions = np.array(['anhhuong','camcum','camthu','congnhan','dau','daubung','daulung','hello','hoabinh','howAre','kham','langnghe','lanh','love','mask','mau','met','metmoi','moico'
'no','please','sorry','thanks','wear','you'])

# Thirty videos worth of data
no_sequences = 75

# Videos are going to be 30 frames in length
sequence_length = 30

for action in actions: 
    for sequence in range(no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    
    # NEW LOOP
    # Loop through actions
    for action in actions:
        # Loop through sequences aka videos
        for sequence in range(no_sequences):
            # video 2s => 30 frame 
            # Đường dẫn tới file video
            video_path = os.path.join(VIDEO_PATH, action, action + '_' + str(sequence))

            # Mở file video
            cap = cv2.VideoCapture(f'{video_path}.avi')

            # Số frame của video
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            print('Tổng số frame: ', total_frames)
            
            step=1
            # Loop through video length aka sequence length
            for frame_num, i in zip(range(sequence_length), range(0, total_frames, step)):
                
                # Đọc frame
                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                ret, frame = cap.read()

                # Make detections
                image, results = mediapipe_detection(frame, holistic)
                
                # NEW Export keypoints
                keypoints = extract_keypoints_reduce(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

    cap.release()
    cv2.destroyAllWindows()

Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  29
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  32
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  32
Tổng số frame:  32
Tổng số frame:  30
Tổng số frame:  29
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  29
Tổng số frame:  30
Tổng số frame:  31
Tổng số frame:  32
Tổng số frame:  32
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  31
Tổng số frame:  29
Tổng số frame:  30
Tổng số frame:  31
Tổng số frame:  30
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  32
Tổng số frame:  30
Tổng số frame:  30
Tổng số frame:  32
Tổng số frame:  31
Tổng số frame:  30
Tổng số frame:  32
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  30
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  30
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  31
Tổng số frame:  31
Tổng số fram

# 6. Preprocess Data and Create Labels and Features

In [35]:
# chay 8
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data_reduce') 

root_dir = 'MP_Data_reduce'
actions = np.array([os.path.basename(dir_name) for dir_name in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, dir_name))])

# Thirty videos worth of data
no_sequences = 75

# Videos are going to be 30 frames in length
sequence_length = 30
print(actions)

['hello' 'langnghe' 'no']


In [36]:
# chay 9
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [37]:
# chay 10
label_map = {label:num for num, label in enumerate(actions)}
label_map

{'hello': 0, 'langnghe': 1, 'no': 2}

In [39]:
# chay 11
print("no_sequences:", no_sequences)
sequences, labels = [], []
for action in actions:
    for sequence in range(no_sequences):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

no_sequences: 75


In [40]:
# chay 12
X = np.array(sequences)

In [41]:
# chay 13
y = to_categorical(labels).astype(int)

In [42]:
# chay 14
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print(X_train.shape)    
print(X_test.shape)    

(180, 30, 226)
(45, 30, 226)


# 7. Build and Train LSTM Neural Network

In [43]:
# chay 15
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, GRU
from tensorflow.keras.callbacks import TensorBoard

In [44]:
# chay 16
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [45]:
# chay 17
model = Sequential()
model.add(GRU(64, return_sequences=True, activation='relu', input_shape=(30, 226)))
model.add(GRU(128, return_sequences=True, activation='relu'))
model.add(GRU(64, return_sequences=False, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))

In [46]:
# chay 18
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [47]:
# chay 19
model.fit(X_train, y_train, epochs=1000, callbacks=[tb_callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
1

KeyboardInterrupt: 

In [48]:
# chay 20
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_3 (GRU)                 (None, 30, 64)            56064     
                                                                 
 gru_4 (GRU)                 (None, 30, 128)           74496     
                                                                 
 gru_5 (GRU)                 (None, 64)                37248     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 174,147
Trainable params: 174,147
Non-tr

# 8. Make Predictions

# 9. Save Weights

In [49]:
# chay 21
model.save('actionGRUThreeLayersReduce.h5')

In [50]:
# chay 22
from tensorflow.keras.models import load_model
model = load_model('actionGRUThreeLayersReduce.h5')

# 10. Evaluation using Confusion Matrix and Accuracy

# 11. Test in Real Time

In [51]:
# chay 23
# 1. New detection variables
sequence = []
sentence = []
threshold = 0.9

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()
    
        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints_reduce(results)
#         sequence.insert(0,keypoints)
#         sequence = sequence[:30]
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])
#             if (actions[np.argmax(res)]!='nothing'):
#                 sequence = sequence[-15:]
            
            
        #3. Viz logic
            if res[np.argmax(res)] > threshold: 
                if len(sentence) > 0: 
                    if actions[np.argmax(res)] != sentence[-1]:
                        sentence.append(actions[np.argmax(res)])
                else:
                    sentence.append(actions[np.argmax(res)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

            # Viz probabilities
#             image = prob_viz(res, actions, image, colors)
            
#         cv2.rectangle(image, (0,0), (600, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2, cv2.LINE_AA)
        
        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [27]:
cap.release()
cv2.destroyAllWindows()