# 손 동작 인식
- 01. LSTM
- 02. KNN
- 03. RNN

In [2]:
import mediapipe as mp
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers, activations, initializers, losses, optimizers, metrics
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

2023-12-14 15:55:53.391875: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-14 15:55:53.391909: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-14 15:55:53.392865: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-14 15:55:53.398118: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# 파일 확인
# npy_data = np.load('./dataset/seq_go_1700819427.npy')
# npy_data = np.load('./dataset/seq_back_1700819496.npy')
# npy_data = np.load('./dataset/seq_go_1700819427.npy')
# npy_data = np.load('./dataset/seq_go_1700819427.npy')
# npy_data = np.load('./dataset/seq_go_1700819427.npy')

# print(npy_data.shape)

In [None]:
# npy_data = np.load('../data/dataset_opensource_bbang/raw_away_1627646273.npy')
# Load one `seq_away` file from the open-source dataset and dump its contents
# as a quick sanity check that the file reads correctly.
seq_path = "../data/dataset_opensource_bbang/seq_away_1627646273.npy"
npy_data = np.load(seq_path)

print(npy_data)

---

# 01. LSTM

## 01-1. LSTM - 데이터셋 구축

In [None]:
# Record webcam hand-landmark features for every gesture in `actions`, then
# save the raw per-frame rows and the sliding windows cut from them.
max_num_hands = 1
actions = ['back']
# actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']

# Window length in frames, and recording time per gesture in seconds.
seq_length = 40
secs_for_action = 2

# MediaPipe objects for hand detection and for drawing the landmarks.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands = max_num_hands,
    min_detection_confidence = 0.5,
    min_tracking_confidence = 0.5)

cap = cv2.VideoCapture(0)

# Epoch seconds keep the file names unique across days and free of the colons
# and spaces that the locale-dependent '%X' format puts into them.
created_time = str(int(time.time()))
# Without exist_ok=True an already existing directory raises an exception.
# Creating this nested directory also guarantees that '../data', where the
# arrays below are saved, exists.
os.makedirs('../data/dataset', exist_ok=True)

try:
    if cap.isOpened():

        for idx, action in enumerate(actions):

            data = []
            stop_requested = False

            ret, img = cap.read()
            if not ret:
                print("카메라 연결 실패")
                continue

            # Show a "get ready" banner for three seconds before recording.
            img = cv2.flip(img, 1)
            cv2.putText(img, f'Waiting for collecting {action.upper()} action...', org=(10, 30), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)
            cv2.imshow('Dataset', img)
            cv2.waitKey(3000)

            start_time = time.time()

            while time.time() - start_time < secs_for_action:

                ret, img = cap.read()
                if not ret:
                    # Dropped frame: skip it instead of passing None to
                    # cv2.flip. The loop stays bounded by secs_for_action.
                    continue

                img = cv2.flip(img, 1)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = hands.process(img)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

                if result.multi_hand_landmarks is not None:

                    for res in result.multi_hand_landmarks:

                        # One row per landmark: x, y, z, visibility.
                        joint = np.zeros((21, 4))

                        for j, lm in enumerate(res.landmark):
                            joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

                        # Unit vectors between the 20 landmark pairs below.
                        v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
                        v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
                        v = v2 - v1

                        v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                        # Angles (degrees) between 15 pairs of those vectors.
                        # Clip first: rounding can push a dot product of unit
                        # vectors slightly outside [-1, 1], where arccos
                        # returns NaN.
                        cos_angle = np.einsum('nt,nt->n',
                            v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                            v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])
                        angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

                        angle = np.degrees(angle)

                        # Append the class label. It is the gesture's position
                        # in `actions`, so record with the full list (see the
                        # commented line above) to get labels that match the
                        # training cells.
                        angle_label = np.array([angle], dtype=np.float32)
                        angle_label = np.append(angle_label, idx)

                        # Feature row: 21*4 landmark values, 15 angles, label.
                        d = np.concatenate([joint.flatten(), angle_label])

                        data.append(d)

                        mp_drawing.draw_landmarks(img,
                                res,
                                mp_hands.HAND_CONNECTIONS
                        )

                cv2.imshow('Dataset', img)

                key = cv2.waitKey(5) & 0xFF

                if key == 27:  # ESC pressed: stop recording
                    stop_requested = True
                    break

            data = np.array(data)
            print(action, data.shape)
            np.save(os.path.join('../data', f'raw_{action}_{created_time}'), data)

            # Save the sliding-window sequences. The + 1 keeps the window
            # that ends on the last recorded frame.
            full_seq_data = []
            for seq in range(len(data) - seq_length + 1):
                full_seq_data.append(data[seq:seq + seq_length])

            full_seq_data = np.array(full_seq_data)
            print(action, full_seq_data.shape)
            np.save(os.path.join('../data', f'seq_{action}_{created_time}'), full_seq_data)

            if stop_requested:
                # What was captured so far is saved above; skip the
                # remaining gestures.
                break
finally:
    # Always free the camera and close the preview window, including when
    # recording finishes normally or an exception is raised.
    cap.release()
    cv2.destroyAllWindows()

## 01-2. LSTM - 학습

In [None]:
# # 데이터셋_10
# data_go = np.loadtxt('../data/dataset_10_231211/go_04:54:34 PM.csv', delimiter=',')
# data_back= np.loadtxt('../data/dataset_10_231211/back_04:55:04 PM.csv', delimiter=',')
# data_stop= np.loadtxt('../data/dataset_10_231211/stop_04:55:31 PM.csv', delimiter=',')
# data_left= np.loadtxt('../data/dataset_10_231211/left_spin_04:55:49 PM.csv', delimiter=',')
# data_right= np.loadtxt('../data/dataset_10_231211/right_spin_04:56:08 PM.csv', delimiter=',')
# data_up= np.loadtxt('../data/dataset_10_231211/speed_up_04:57:54 PM.csv', delimiter=',')
# data_down= np.loadtxt('../data/dataset_10_231211/speed_down_04:58:18 PM.csv', delimiter=',')
# data_bad= np.loadtxt('../data/dataset_10_231211/bad_gesture_05:00:03 PM.csv', delimiter=',')

In [None]:
# seq_length = 4
# full_seq_data = []

# for seq in range(len(data) - seq_length):
#     full_seq_data.append(data[seq:seq + seq_length])

# full_seq_data = np.array(full_seq_data)
# np.save('../data/dataset_10_231211/go_04:54:34 PM.npy', full_seq_data)

In [None]:
# List the CSV files of the 100-sample dataset, leaving out every file whose
# name contains 'total'.
path = "../data/dataset_100_231211/"
file_list = os.listdir(path)
# Filter in a single pass. Removing entries with pop() while enumerating the
# same list shifts the remaining items, so the entry right after each removed
# one was never checked.
file_list_csv = [
    file for file in file_list
    if file.endswith(".csv") and 'total' not in file
]

print ("file_list_csv: {}".format(file_list_csv))

In [None]:
# Turn every CSV in `file_list_csv` into overlapping windows of `seq_length`
# consecutive rows and save them next to the CSV as '<name>.npy'.
seq_length = 4
path = "../data/dataset_100_231211/"

for i in file_list_csv:

    data = np.loadtxt(os.path.join(path, i), delimiter=',')

    # The + 1 keeps the window that ends on the last row; without it a file
    # with exactly seq_length rows would produce no window at all.
    full_seq_data = np.array([
        data[seq:seq + seq_length]
        for seq in range(len(data) - seq_length + 1)
    ])

    # Swap the extension without relying on it being three characters long.
    file_name = os.path.splitext(i)[0] + '.npy'

    np.save(os.path.join(path, file_name), full_seq_data)

In [None]:
# Reload one of the window files written above and dump it for inspection.
created_path = '../data/dataset_100_231211/go_05:04:05 PM.npy'
npy_data_create = np.load(created_path)
print(npy_data_create)

In [None]:
# Show the dimensions of the reloaded window array.
print(npy_data_create.shape)

In [None]:
# GPU settings for training: expose only CUDA device index 1 and let
# TensorFlow grow GPU memory on demand.
# NOTE(review): tensorflow is imported earlier in this notebook; confirm these
# variables are still picked up when set after the import.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [None]:
actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']

# 10-sample dataset: one window file per gesture, stacked along axis 0.
dataset_files = [
    '../data/dataset_10_231211/go_04:54:34 PM.npy',
    '../data/dataset_10_231211/back_04:55:04 PM.npy',
    '../data/dataset_10_231211/stop_04:55:31 PM.npy',
    '../data/dataset_10_231211/left_spin_04:55:49 PM.npy',
    '../data/dataset_10_231211/right_spin_04:56:08 PM.npy',
    '../data/dataset_10_231211/speed_up_04:57:54 PM.npy',
    '../data/dataset_10_231211/speed_down_04:58:18 PM.npy',
    '../data/dataset_10_231211/bad_gesture_05:00:03 PM.npy',
]

# 100-sample dataset (swap in to train on it instead)
# dataset_files = [
#     '../data/dataset_100_231211/go_05:04:05 PM.npy',
#     '../data/dataset_100_231211/back_05:06:55 PM.npy',
#     '../data/dataset_100_231211/stop_05:09:41 PM.npy',
#     '../data/dataset_100_231211/left_spin_05:12:50 PM.npy',
#     '../data/dataset_100_231211/right_spin_05:15:07 PM.npy',
#     '../data/dataset_100_231211/speed_up_05:18:02 PM.npy',
#     '../data/dataset_100_231211/speed_down_05:20:26 PM.npy',
#     '../data/dataset_100_231211/bad_gesture_05:23:23 PM.npy',
# ]

data = np.concatenate([np.load(npy_file) for npy_file in dataset_files], axis=0)

data.shape

In [None]:
# The last column of every frame holds the label; each window takes its label
# from its first frame, and the remaining columns are the features.
labels = data[:, 0, -1]
x_data = data[:, :, :-1]

print(x_data.shape, labels.shape, sep='\n')

In [None]:
# One-hot encode the labels. The width is pinned to len(actions) so it always
# matches the model's Dense(len(actions)) output; with num_classes=None the
# width is inferred as max(label) + 1, which is too small whenever the
# highest-numbered gesture is missing from the loaded data.
y_data = to_categorical(labels, num_classes=len(actions))
y_data.shape

In [None]:
# Cast features and targets to float32, then hold out 20% of the windows for
# validation (stratified on the targets, fixed seed for reproducibility).
x_data, y_data = (arr.astype(np.float32) for arr in (x_data, y_data))

split = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2023,
    stratify=y_data,
)
x_train, x_val, y_train, y_val = split

for features, targets in ((x_train, y_train), (x_val, y_val)):
    print(features.shape, targets.shape)

In [None]:
# LSTM classifier: one recurrent layer over the window, a small dense layer,
# and a softmax over the gesture classes.
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=x_train.shape[1:3]))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(actions), activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)
model.summary()

In [None]:
# Train for 500 epochs, keeping the weights with the best validation accuracy
# and halving the learning rate after 50 epochs without improvement.

# The checkpoint is written into '../model'; create it up front so the first
# save does not fail on a missing directory.
os.makedirs('../model', exist_ok=True)

checkpoint = ModelCheckpoint('../model/model_.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=50, verbose=1, mode='auto')

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=500,
    callbacks=[checkpoint, reduce_lr],
)

In [None]:
# Score the trained model on the validation split; evaluate() returns the
# loss followed by the compiled metric.
result = model.evaluate(x_val, y_val)
eval_loss, eval_acc = result[0], result[1]

print('loss (cross-entropy) :', eval_loss)
print('test accuracy :', eval_acc)

In [None]:
# Training curves on one figure: loss on the left axis, accuracy on a twin
# right axis.
fig, loss_ax = plt.subplots(figsize=(16, 10))
acc_ax = loss_ax.twinx()

hist = history.history

for key, fmt, label in (('loss', 'y', 'train loss'), ('val_loss', 'r', 'val loss')):
    loss_ax.plot(hist[key], fmt, label=label)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(loc='upper left')

for key, fmt, label in (('acc', 'b', 'train acc'), ('val_acc', 'g', 'val acc')):
    acc_ax.plot(hist[key], fmt, label=label)
acc_ax.set_ylabel('accuracy')
acc_ax.legend(loc='upper left')

plt.show()

In [None]:
# Same training curves with seaborn, side by side: loss on the left panel,
# accuracy on the right panel.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

hist = history.history
panels = (
    (axes[0], "loss", "val_loss", 'Training Loss', 'Validation Loss'),
    (axes[1], "acc", "val_acc", 'Training Accuracy', 'Validation Accuracy'),
)

for ax, train_key, val_key, train_label, val_label in panels:
    # Both curves of a panel share the epoch axis of the training metric.
    epochs = range(len(hist[train_key]))
    sns.lineplot(x=epochs, y=hist[train_key], ax=ax, label=train_label)
    sns.lineplot(x=epochs, y=hist[val_key], ax=ax, label=val_label)

axes[0].set_title("Loss")
axes[1].set_title("Accuracy")

sns.despine()
plt.show()

In [None]:
# Reload the best checkpoint and compute one confusion matrix per class on
# the validation split.
model = load_model('../model/model_.h5')

y_pred = model.predict(x_val)

true_idx = np.argmax(y_val, axis=1)
pred_idx = np.argmax(y_pred, axis=1)

multilabel_confusion_matrix(true_idx, pred_idx)

## 01-3. LSTM - 성능 검증

In [None]:
# Scratch check: a negative-start slice returns the last two items.
a = list(range(1, 6))
a[-2:]

In [None]:
# # 8가지 제스처 인식 Test
# max_num_hands = 1
# actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
# gestures = {
#     0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
#     6:'speed_down', 7:'bad_gesture'}

# # 시퀀스 길이 지정
# seq_length = 4

# # 미디어파이프 패키지에서 손 인식을 위한 객체 생성
# mp_hands = mp.solutions.hands
# mp_drawing = mp.solutions.drawing_utils

# hands = mp_hands.Hands(
#     max_num_hands = max_num_hands,
#     min_detection_confidence = 0.5,
#     min_tracking_confidence = 0.5)

# model = keras.models.load_model("../model/LSTM_model_dataset100_epoch500.h5")

# cap = cv2.VideoCapture(0)
# seq = []
# action_seq = []

# while cap.isOpened():

#     ret, img = cap.read()
#     if not ret:
#         print("카메라 연결 실패")
#         # break
#         continue
    
#     img = cv2.flip(img, 1)
#     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

#     result = hands.process(img)

#     img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

#     # 손 인식 여부 확인
#     if result.multi_hand_landmarks is not None:

#         for res in result.multi_hand_landmarks:
        
#             joint = np.zeros((21, 4))
        
#             for j, lm in enumerate(res.landmark):
#                 joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

#             v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
#             v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
#             v = v2 - v1 # [20, 3]

#             v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

#             angle = np.arccos(np.einsum('nt,nt->n',
#                 v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:], 
#                 v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])) # [15,]

#             angle = np.degrees(angle)

#             data = angle

#             # data = np.array([angle], dtype=np.float32)

#             seq.append(data)

#             mp_drawing.draw_landmarks(img,
#                                       res,
#                                       mp_hands.HAND_CONNECTIONS
#                                     #   mp_hands.get_default_hand_landmarks_style(),
#                                     #   mp_hands.get_default_hand_connections_style()
#             )
            
#             if len(seq) < seq_length:
#                 continue

#             # print(seq[-seq_length:])

#             input_data = np.expand_dims(np.array(seq[-seq_length:], dtype=np.float32), axis=0)

#             # print(input_data)
#             y_pred = model.predict(input_data).squeeze()

#             i_pred = int(np.argmax(y_pred))
#             conf = y_pred[i_pred]

#             if conf < 0.9:
#                 continue

#             action = actions[i_pred]
#             action_seq.append(action)

#             if len(action_seq) < 8:
#                 continue

#             this_action = '?'
#             if action_seq[-1] == action_seq[-2] == action_seq[-3]:
#                 this_action = action

#             # idx = encoder.inverse_transform(model.predict([data]))[0,0]

#             # if idx in gestures.keys():
#             #     cv2.putText(img, text=gestures[idx].upper(), org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)
            
#             cv2.putText(img, f'{this_action.upper()}', org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)

#     cv2.imshow('Test', img)

#     key = cv2.waitKey(5) & 0xFF

#     if key == 27:
#         cv2.destroyAllWindows()
#         cap.release()
#         break

In [None]:
from collections import deque

# Live webcam test of the 8-gesture classifier.
# Tracks a single hand only.
max_num_hands = 1
actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
gestures = {
    0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
    6:'speed_down', 7:'bad_gesture'}

# Number of consecutive frames fed to the model per prediction.
seq_length = 4

# MediaPipe objects for hand detection and for drawing the landmarks.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands = max_num_hands,
    min_detection_confidence = 0.5,
    min_tracking_confidence = 0.5)

model = keras.models.load_model("../model/LSTM_model_dataset10_epoch500.h5")

cap = cv2.VideoCapture(0)
# Bounded buffers: only the newest seq_length feature rows and the newest 8
# predictions are ever read, so older entries are dropped instead of piling
# up for as long as the camera runs.
seq = deque(maxlen=seq_length)
action_seq = deque(maxlen=8)

fps = 0
frame_count = 0
start_time = time.time()

prev_action = '?'
current_action = '?'

try:
    while cap.isOpened():

        ret, img = cap.read()
        if not ret:
            # Skip the failed read and try again on the next iteration.
            print("카메라 연결 실패")
            continue

        frame_count += 1
        elapsed_time = time.time() - start_time
        if elapsed_time > 1.0:  # refresh the FPS figure once per second
            fps = frame_count / elapsed_time
            frame_count = 0
            start_time = time.time()

        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = hands.process(img)

        cv2.putText(img, f'FPS: {int(fps)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Only run the classifier when a hand was detected.
        if result.multi_hand_landmarks is not None:

            for res in result.multi_hand_landmarks:

                # One row per landmark: x, y, z, visibility.
                joint = np.zeros((21, 4))

                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

                # Unit vectors between the 20 landmark pairs below.
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
                v = v2 - v1 # [20, 3]

                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Angles between 15 pairs of those vectors. Clip first:
                # rounding can push a dot product of unit vectors slightly
                # outside [-1, 1], where arccos returns NaN.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:]) # [15,]
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

                angle = np.degrees(angle)

                data = angle

                seq.append(data)

                mp_drawing.draw_landmarks(img,
                                          res,
                                          mp_hands.HAND_CONNECTIONS
                )

                # Wait until a full window of frames has been collected.
                if len(seq) < seq_length:
                    continue

                input_data = np.expand_dims(np.array(list(seq), dtype=np.float32), axis=0)

                # verbose=0 keeps predict() from printing a progress bar on
                # every frame.
                y_pred = model.predict(input_data, verbose=0).squeeze()

                i_pred = int(np.argmax(y_pred))
                conf = y_pred[i_pred]

                # Ignore low-confidence predictions.
                if conf < 0.9:
                    continue

                action = actions[i_pred]
                action_seq.append(action)

                if len(action_seq) < 8:
                    continue

                # Oldest and newest of the last 8 confident predictions.
                prev_action = action_seq[-8]
                current_action = action_seq[-1]

                # Report a gesture only when the last three predictions agree.
                this_action = '?'
                if action_seq[-1] == action_seq[-2] == action_seq[-3]:
                    this_action = action

                cv2.putText(img, f'{this_action.upper()}', org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 255, 0), thickness=2)

            cv2.putText(img, f'Prev Action: {prev_action.upper()}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(img, f'Current Action: {current_action.upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow('Test', img)

        key = cv2.waitKey(5) & 0xFF

        if key == 27:  # ESC pressed: quit
            break
finally:
    # Free the camera and close the window on ESC and on any error.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
from collections import deque

# Live webcam test of the 8-gesture classifier (version used for video
# recording). Tracks a single hand only.
max_num_hands = 1
actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
gestures = {
    0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
    6:'speed_down', 7:'bad_gesture'}

# Number of consecutive frames fed to the model per prediction.
seq_length = 4

# MediaPipe objects for hand detection and for drawing the landmarks.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands = max_num_hands,
    min_detection_confidence = 0.5,
    min_tracking_confidence = 0.5)

model = keras.models.load_model("../model/LSTM_model_dataset100_epoch500.h5")

cap = cv2.VideoCapture(0)
# Bounded buffers: only the newest seq_length feature rows and the newest 8
# predictions are ever read, so older entries are dropped instead of piling
# up for as long as the camera runs.
seq = deque(maxlen=seq_length)
action_seq = deque(maxlen=8)

fps = 0
frame_count = 0
start_time = time.time()

# Defined before the loop so the overlay never reads an unset name:
# `this_num` is only updated when three predictions agree, yet it is drawn
# on every frame that reaches the overlay.
prev_action = '?'
current_action = '?'
this_num = '?'

try:
    while cap.isOpened():

        ret, img = cap.read()
        if not ret:
            # Skip the failed read and try again on the next iteration.
            print("카메라 연결 실패")
            continue

        frame_count += 1
        elapsed_time = time.time() - start_time
        if elapsed_time > 1.0:  # refresh the FPS figure once per second
            fps = frame_count / elapsed_time
            frame_count = 0
            start_time = time.time()

        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = hands.process(img)

        cv2.putText(img, 'LSTM', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.putText(img, f'FPS: {int(fps)}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Only run the classifier when a hand was detected.
        if result.multi_hand_landmarks is not None:

            for res in result.multi_hand_landmarks:

                # One row per landmark: x, y, z, visibility.
                joint = np.zeros((21, 4))

                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

                # Unit vectors between the 20 landmark pairs below.
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
                v = v2 - v1 # [20, 3]

                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Angles between 15 pairs of those vectors. Clip first:
                # rounding can push a dot product of unit vectors slightly
                # outside [-1, 1], where arccos returns NaN.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:]) # [15,]
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

                angle = np.degrees(angle)

                data = angle

                seq.append(data)

                mp_drawing.draw_landmarks(img,
                                          res,
                                          mp_hands.HAND_CONNECTIONS
                )

                # Wait until a full window of frames has been collected.
                if len(seq) < seq_length:
                    continue

                input_data = np.expand_dims(np.array(list(seq), dtype=np.float32), axis=0)

                # verbose=0 keeps predict() from printing a progress bar on
                # every frame.
                y_pred = model.predict(input_data, verbose=0).squeeze()

                i_pred = int(np.argmax(y_pred))
                conf = y_pred[i_pred]

                # Ignore low-confidence predictions.
                if conf < 0.9:
                    continue

                action = actions[i_pred]
                action_seq.append(action)

                if len(action_seq) < 8:
                    continue

                # Oldest and newest of the last 8 confident predictions.
                prev_action = action_seq[-8]
                current_action = action_seq[-1]

                # Report a gesture only when the last three predictions agree;
                # `this_num` keeps the class index of the last agreed gesture.
                this_action = '?'
                if action_seq[-1] == action_seq[-2] == action_seq[-3]:
                    this_action = action
                    this_num = i_pred

                cv2.putText(img, f'{this_action.upper()}', org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=2, color=(0, 255, 0), thickness=7)
                cv2.putText(img, f'Action: {this_num}. {prev_action.upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        cv2.imshow('Test', img)

        key = cv2.waitKey(5) & 0xFF

        if key == 27:  # ESC pressed: quit
            break
finally:
    # Free the camera and close the window on ESC and on any error.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
# # 8가지 제스처 인식 Test
# # 양손 인식
# max_num_hands = 1
# actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
# gestures = {
#     0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
#     6:'speed_down', 7:'bad_gesture'}

# # 시퀀스 길이 지정
# seq_length = 4

# # 미디어파이프 패키지에서 손 인식을 위한 객체 생성
# mp_hands = mp.solutions.hands
# mp_drawing = mp.solutions.drawing_utils

# hands = mp_hands.Hands(
#     max_num_hands = max_num_hands,
#     min_detection_confidence = 0.5,
#     min_tracking_confidence = 0.5)

# model = keras.models.load_model("../model/LSTM_model_dataset100_epoch500.h5")

# cap = cv2.VideoCapture(0)

# left_seq = []
# left_action_seq = []
# right_seq = []
# right_action_seq = []

# fps = 0
# frame_count = 0
# start_time = time.time()

# prev_action = '?'
# current_action = '?'

# while cap.isOpened():

#     ret, img = cap.read()
#     if not ret:
#         print("카메라 연결 실패")
#         # break
#         continue

#     frame_count += 1
#     elapsed_time = time.time() - start_time
#     if elapsed_time > 1.0:  # 1초마다 FPS 갱신
#         fps = frame_count / elapsed_time
#         frame_count = 0
#         start_time = time.time()
    
#     img = cv2.flip(img, 1)
#     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

#     result = hands.process(img)

#     cv2.putText(img, f'FPS: {int(fps)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

#     img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

#     # 손 인식 여부 확인
#     if result.multi_hand_landmarks is not None:

#         for res in result.multi_hand_landmarks:
        
#             joint = np.zeros((21, 4))
        
#             for j, lm in enumerate(res.landmark):
#                 joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

#             v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
#             v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
#             v = v2 - v1 # [20, 3]

#             v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

#             angle = np.arccos(np.einsum('nt,nt->n',
#                 v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:], 
#                 v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])) # [15,]

#             angle = np.degrees(angle)

#             data = angle

#             # data = np.array([angle], dtype=np.float32)

#             seq.append(data)

#             mp_drawing.draw_landmarks(img,
#                                       res,
#                                       mp_hands.HAND_CONNECTIONS
#                                     #   mp_hands.get_default_hand_landmarks_style(),
#                                     #   mp_hands.get_default_hand_connections_style()
#             )
            
#             if len(seq) < seq_length:
#                 continue

#             # print(seq[-seq_length:])

#             input_data = np.expand_dims(np.array(seq[-seq_length:], dtype=np.float32), axis=0)

#             # print(input_data)
#             y_pred = model.predict(input_data).squeeze()

#             i_pred = int(np.argmax(y_pred))
#             conf = y_pred[i_pred]

#             if conf < 0.9:
#                 continue

#             action = actions[i_pred]
#             action_seq.append(action)

#             if len(action_seq) < 8:
#                 continue

#             if len(action_seq) >= 8:
#                 prev_action = action_seq[-8]
#                 current_action = action_seq[-1]

#             this_action = '?'
#             if action_seq[-1] == action_seq[-2] == action_seq[-3]:
#                 this_action = action

#             # idx = encoder.inverse_transform(model.predict([data]))[0,0]

#             # if idx in gestures.keys():
#             #     cv2.putText(img, text=gestures[idx].upper(), org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)
            
#             cv2.putText(img, f'{this_action.upper()}', org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 255, 0), thickness=2)

#         cv2.putText(img, f'Prev Action: {prev_action.upper()}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
#         cv2.putText(img, f'Current Action: {current_action.upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

#     cv2.imshow('Test', img)

#     key = cv2.waitKey(5) & 0xFF

#     if key == 27:
#         cv2.destroyAllWindows()
#         cap.release()
#         break

---

## 01-4. 최종 코드


In [None]:
import mediapipe as mp
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import time

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers, activations, initializers, losses, optimizers, metrics
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Install pyautogui, which the next cell imports.
# NOTE(review): a bare `pip` line is not Python syntax and presumably relies
# on IPython automagic; `%pip install pyautogui` is the explicit form.
pip install pyautogui

In [None]:
from tensorflow import keras
from tensorflow.keras import models
import mediapipe as mp
import numpy as np
import cv2
import pyautogui
import time

# Live webcam test of the 8-gesture LSTM classifier.
# Intended for video demos; only a single hand is tracked.
max_num_hands = 1
actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
gestures = {
    0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
    6:'speed_down', 7:'bad_gesture'}

# Number of consecutive frames fed to the model per prediction
seq_length = 4
# Number of confident predictions collected before the Prev/Current overlay is updated
history_length = 8

# MediaPipe objects for hand detection and landmark drawing
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands = max_num_hands,
    min_detection_confidence = 0.5,
    min_tracking_confidence = 0.5)

model = keras.models.load_model("../model/LSTM_model_dataset100_epoch500.h5")

cap = cv2.VideoCapture(0)
seq = []          # most recent per-frame angle vectors (at most seq_length kept)
action_seq = []   # most recent confident predictions (at most history_length kept)

fps = 0
frame_count = 0
start_time = time.time()

prev_action = '?'
current_action = '?'

try:
    while cap.isOpened():

        ret, img = cap.read()
        if not ret:
            print("카메라 연결 실패")
            # Retry on a failed read instead of leaving the loop
            continue

        # Refresh the FPS estimate about once per second
        frame_count += 1
        elapsed_time = time.time() - start_time
        if elapsed_time > 1.0:
            fps = frame_count / elapsed_time
            frame_count = 0
            start_time = time.time()

        # Mirror the frame and convert to RGB before handing it to MediaPipe
        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = hands.process(img)

        cv2.putText(img, 'Hand Gesture Detecting System', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
        cv2.putText(img, f'FPS: {int(fps)}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Only run recognition when a hand was detected
        if result.multi_hand_landmarks is not None:

            for res in result.multi_hand_landmarks:

                joint = np.zeros((21, 4))
                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

                # Vectors between connected landmark pairs (x, y, z only)
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
                v = v2 - v1 # [20, 3]

                # Normalise every vector to unit length
                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Dot products of adjacent unit vectors; rounding can push them
                # slightly outside [-1, 1], which would make arccos return NaN,
                # so clip before taking the angle.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0)) # [15,]
                angle = np.degrees(angle)

                data = angle

                seq.append(data)
                # Only the last seq_length frames are ever read; drop older ones
                # so the list does not grow for the whole session.
                del seq[:-seq_length]

                mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

                if len(seq) < seq_length:
                    continue

                input_data = np.expand_dims(np.array(seq[-seq_length:], dtype=np.float32), axis=0)

                # verbose=0 keeps Keras from printing a progress bar on every frame
                y_pred = model.predict(input_data, verbose=0).squeeze()

                i_pred = int(np.argmax(y_pred))
                conf = y_pred[i_pred]

                # Ignore low-confidence predictions
                if conf < 0.9:
                    continue

                action = actions[i_pred]
                action_seq.append(action)
                del action_seq[:-history_length]

                if len(action_seq) < history_length:
                    continue

                prev_action = action_seq[-history_length]
                current_action = action_seq[-1]

                # Only label the hand when the last three predictions agree
                this_action = '?'
                if action_seq[-1] == action_seq[-2] == action_seq[-3]:
                    this_action = action
                    this_num = i_pred

                cv2.putText(img, f'{this_action.upper()}', org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=2, color=(0, 255, 0), thickness=7)

            cv2.putText(img, f'Prev Action: {prev_action.upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(img, f'Current Action: {current_action.upper()}', (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow('Test', img)

        key = cv2.waitKey(5) & 0xFF

        # ESC quits
        if key == 27:
            break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

In [4]:
import pyautogui

# Print the screen resolution reported by pyautogui.
screen_size = pyautogui.size()
print(screen_size)

Size(width=3000, height=1920)


In [6]:
# Wait two seconds, then print the current mouse pointer position.
time.sleep(2)
pointer_position = pyautogui.position()
print(pointer_position)

Point(x=551, y=1339)


In [None]:
from tensorflow import keras
from tensorflow.keras import models
import mediapipe as mp
import numpy as np
import cv2
import pyautogui
import time

# 20%; the larger the value, the closer the hand must be to the camera to be
# recognised. Currently unused: it belonged to a disabled thumb-direction experiment.
THRESHOLD = 0.2
# True while a drive key ('w' or 's') is being held down via keyDown
action_done = False


def release_drive_keys():
    """Release both drive keys so that neither stays held down."""
    pyautogui.keyUp('w')
    pyautogui.keyUp('s')


# Live webcam test of the 8-gesture LSTM classifier with keyboard/mouse control.
# Intended for video demos; only a single hand is tracked.
max_num_hands = 1
actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
gestures = {
    0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
    6:'speed_down', 7:'bad_gesture'}

# Number of consecutive frames fed to the model per prediction
seq_length = 4
# Number of confident predictions collected before the Prev/Current overlay is updated
history_length = 8

# MediaPipe objects for hand detection and landmark drawing
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands = max_num_hands,
    min_detection_confidence = 0.5,
    min_tracking_confidence = 0.5)

model = keras.models.load_model("../model/LSTM_model_dataset100_epoch500.h5")

cap = cv2.VideoCapture(0)
seq = []          # most recent per-frame angle vectors (at most seq_length kept)
action_seq = []   # most recent confident predictions (at most history_length kept)

fps = 0
frame_count = 0
start_time = time.time()

# 'ON': go/back hold the key down; 'OFF': go/back send a single key press
key_input_mode = 'OFF'
prev_action = '?'
current_action = '?'

try:
    while cap.isOpened():

        ret, img = cap.read()
        if not ret:
            print("카메라 연결 실패")
            # Retry on a failed read instead of leaving the loop
            continue

        # Refresh the FPS estimate about once per second
        frame_count += 1
        elapsed_time = time.time() - start_time
        if elapsed_time > 1.0:
            fps = frame_count / elapsed_time
            frame_count = 0
            start_time = time.time()

        # Mirror the frame and convert to RGB before handing it to MediaPipe
        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = hands.process(img)

        cv2.putText(img, 'Hand Gesture Detecting System', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)
        cv2.putText(img, f'FPS: {int(fps)}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.putText(img, f'Key Auto Input Mode: {key_input_mode}', (10, 470), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Only run recognition when a hand was detected
        if result.multi_hand_landmarks is not None:

            for res in result.multi_hand_landmarks:

                joint = np.zeros((21, 4))
                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z, lm.visibility]

                # Vectors between connected landmark pairs (x, y, z only)
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19], :3]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], :3]
                v = v2 - v1 # [20, 3]

                # Normalise every vector to unit length
                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Dot products of adjacent unit vectors; rounding can push them
                # slightly outside [-1, 1], which would make arccos return NaN,
                # so clip before taking the angle.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0)) # [15,]
                angle = np.degrees(angle)

                data = angle

                seq.append(data)
                # Only the last seq_length frames are ever read; drop older ones
                # so the list does not grow for the whole session.
                del seq[:-seq_length]

                mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

                if len(seq) < seq_length:
                    continue

                input_data = np.expand_dims(np.array(seq[-seq_length:], dtype=np.float32), axis=0)

                # verbose=0 keeps Keras from printing a progress bar on every frame
                y_pred = model.predict(input_data, verbose=0).squeeze()

                i_pred = int(np.argmax(y_pred))
                conf = y_pred[i_pred]

                # Ignore low-confidence predictions
                if conf < 0.9:
                    continue

                action = actions[i_pred]
                action_seq.append(action)
                del action_seq[:-history_length]

                if len(action_seq) < history_length:
                    continue

                prev_action = action_seq[-history_length]
                current_action = action_seq[-1]

                # Only label the hand when the last three predictions agree
                this_action = '?'
                if action_seq[-1] == action_seq[-2] == action_seq[-3]:
                    this_action = action
                    this_num = i_pred

                # Pixelate the hand region for 'bad_gesture'
                if i_pred == 7:
                    frame_h, frame_w = img.shape[:2]
                    x1, y1 = (joint.min(axis=0)[:2] * [frame_w, frame_h] * 0.95).astype(int)
                    x2, y2 = (joint.max(axis=0)[:2] * [frame_w, frame_h] * 1.05).astype(int)

                    # Landmarks can lie outside the frame; clamp the box so the
                    # slice and the resized patch always have the same shape.
                    x1, x2 = (int(c) for c in np.clip([x1, x2], 0, frame_w))
                    y1, y2 = (int(c) for c in np.clip([y1, y2], 0, frame_h))

                    if x2 > x1 and y2 > y1:
                        # Shrink to 5% (at least 1 px per side), then blow back up
                        small_size = (max(1, int((x2 - x1) * 0.05)), max(1, int((y2 - y1) * 0.05)))
                        fy_img = img[y1:y2, x1:x2].copy()
                        fy_img = cv2.resize(fy_img, dsize=small_size, interpolation=cv2.INTER_NEAREST)
                        fy_img = cv2.resize(fy_img, dsize=(x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)

                        img[y1:y2, x1:x2] = fy_img

                cv2.putText(img, f'{this_action.upper()}', org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=2, color=(0, 255, 0), thickness=7)

                # 'speed_up' switches key-hold mode on
                if i_pred == 5:
                    key_input_mode = 'ON'

                if key_input_mode == 'ON':
                    # Hold mode: keep the drive key pressed until another gesture
                    if i_pred == 0:
                        # Release 's' first so forward and backward are never held together
                        pyautogui.keyUp('s')
                        pyautogui.keyDown('w')
                        action_done = True

                    elif i_pred == 1:
                        if action_done == True:
                            release_drive_keys()
                            action_done = False
                        pyautogui.keyDown('s')
                        action_done = True

                    elif i_pred == 2:
                        if action_done == True:
                            release_drive_keys()
                            # Nothing is held any more
                            action_done = False

                # 'speed_down' switches key-hold mode off
                if i_pred == 6:
                    key_input_mode = 'OFF'

                if key_input_mode == 'OFF':
                    if action_done == True:
                        release_drive_keys()
                        action_done = False

                    # Tap mode: a single key press per recognised gesture
                    if i_pred == 0:
                        pyautogui.press('w')
                        action_done = False

                    elif i_pred == 1:
                        pyautogui.press('s')
                        action_done = False

                    elif i_pred == 2:
                        if action_done == True:
                            release_drive_keys()
                            action_done = False

                # Spin gestures release any held key and move the mouse pointer
                if i_pred == 3:
                    if action_done == True:
                        release_drive_keys()
                        action_done = False

                    pyautogui.moveTo(760,540)
                    action_done = False

                elif i_pred == 4:
                    if action_done == True:
                        release_drive_keys()
                        action_done = False

                    pyautogui.moveTo(1160,540)
                    action_done = False

            cv2.putText(img, f'Prev Action: {prev_action.upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(img, f'Current Action: {current_action.upper()}', (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow('Test', img)

        key = cv2.waitKey(5) & 0xFF

        # ESC quits
        if key == 27:
            break
finally:
    # Never leave a drive key stuck down, and always free the camera/window
    release_drive_keys()
    action_done = False
    cap.release()
    cv2.destroyAllWindows()

---

# 02. KNN

## 02-1. KNN - 데이터셋 구축

In [None]:
# Setup for collecting KNN training samples from the webcam (one hand only).
max_num_hands = 1
count_click = 0  # number of samples recorded for the current gesture

# Gesture classes, keyed by integer label
gestures = dict(enumerate(
    ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']))
action_label = 0
action = str(gestures[action_label])

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hand_options = dict(
    max_num_hands=max_num_hands,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
hands = mp_hands.Hands(**hand_options)

# Seed array that new samples are stacked onto
file = np.genfromtxt('../data/gesture_train_tovstack.csv', delimiter=',')
print("csv파일 로드됨", file.shape)

cap = cv2.VideoCapture(0)

# Mouse callback for the 'Dataset' window.
# A left click appends the most recent angle/label row to the in-memory dataset.
def click(event, x, y, flags, param):
    """Append the latest sample row to ``file`` on a left mouse click.

    Args:
        event: OpenCV mouse event code.
        x, y, flags, param: Supplied by OpenCV; unused here.

    The row is read from the global ``data``. A click is ignored (with a
    message) when no sample exists yet or when its length does not match the
    column count of ``file``, instead of raising inside the callback.
    """
    global file, count_click
    if event != cv2.EVENT_LBUTTONDOWN:
        return

    # `data` only exists once a hand has been seen; look it up defensively.
    sample = globals().get('data')
    if sample is None or np.size(sample) != file.shape[-1]:
        print("손이 아직 인식되지 않아 샘플을 저장하지 않았습니다")
        return

    count_click += 1
    file = np.vstack((file, sample))  # stack the new row under the existing ones
    print(file.shape)

cv2.namedWindow('Dataset')
cv2.setMouseCallback('Dataset', click)


def save_collected_samples():
    """Write the rows collected for the current gesture to a timestamped CSV.

    The first row of ``file`` is skipped (``file[1:]``). The file name uses
    ``time.strftime('%X')``, so it contains the locale's time separators.
    """
    created_time = str(time.strftime('%X', time.localtime(time.time())))
    # exist_ok=True: do not raise if the directory already exists
    os.makedirs('../data/dataset', exist_ok=True)
    np.savetxt(os.path.join('../data/dataset', f'{action}_{created_time}.csv'), file[1:], delimiter=',')


try:
    while cap.isOpened():

        ret, img = cap.read()

        if not ret:
            print("카메라 연결 실패")
            # Retry on a failed read instead of leaving the loop
            continue

        # Mirror the frame and convert to RGB before handing it to MediaPipe
        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = hands.process(img)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Show the gesture being recorded and how many samples were clicked
        cv2.putText(img, action, org=(10, 30), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255), thickness=3)
        cv2.putText(img, str(count_click), org=(10, 80), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)

        # Only compute a sample when a hand was detected
        if result.multi_hand_landmarks is not None:

            for res in result.multi_hand_landmarks:

                joint = np.zeros((21, 3))

                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z]

                # Vectors between connected landmark pairs
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19],:]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],:]
                v = v2 - v1 # [20,3]

                # Normalise every vector to unit length
                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Dot products of adjacent unit vectors; rounding can push them
                # slightly outside [-1, 1], which would make arccos return NaN,
                # so clip before taking the angle.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0)) # [15,]

                angle = np.degrees(angle)

                # Sample row read by the click callback: 15 angles + label
                data = np.array([angle], dtype=np.float32)
                data = np.append(data, action_label)

                mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

        cv2.imshow('Dataset', img)

        key = cv2.waitKey(5) & 0xFF

        # 'a': save the current gesture's samples and move on to the next gesture
        if key == ord('a'):
            save_collected_samples()

            action_label = (action_label + 1) % 8  # wrap from 7 back to 0
            action = str(gestures[action_label])
            count_click = 0

            # Start the next gesture from a fresh seed array
            file = np.genfromtxt('../data/gesture_train_tovstack.csv', delimiter=',')
            print("csv파일 로드됨", file.shape)

        # ESC quits
        if key == 27:
            break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()


# Save whatever was collected for the gesture that was active on exit
action = str(gestures[action_label])
save_collected_samples()

## 02-2. KNN - 성능 검증

In [None]:
# Merge the per-gesture CSVs (10 samples per gesture) into one file.
dir_10 = '../data/dataset_10_231211'
names_10 = (
    'go_04:54:34 PM.csv',
    'back_04:55:04 PM.csv',
    'stop_04:55:31 PM.csv',
    'left_spin_04:55:49 PM.csv',
    'right_spin_04:56:08 PM.csv',
    'speed_up_04:57:54 PM.csv',
    'speed_down_04:58:18 PM.csv',
    'bad_gesture_05:00:03 PM.csv',
)

# One DataFrame per gesture, read in label order
(df_file_10_go, df_file_10_back, df_file_10_stop, df_file_10_left,
 df_file_10_right, df_file_10_up, df_file_10_down, df_file_10_bad) = (
    pd.read_csv(dir_10 + '/' + name, header=None) for name in names_10)

frames_10 = [df_file_10_go, df_file_10_back, df_file_10_stop, df_file_10_left,
             df_file_10_right, df_file_10_up, df_file_10_down, df_file_10_bad]
df_file_10 = pd.concat(frames_10, ignore_index=True)
df_file_10.to_csv("../data/dataset_10_231211/total_10.csv", index=None, header=None)

In [None]:
# Merge the per-gesture CSVs (100 samples per gesture) into one file.
dir_100 = '../data/dataset_100_231211_2'
names_100 = (
    'go_05:04:05 PM.csv',
    'back_05:06:55 PM.csv',
    'stop_05:09:41 PM.csv',
    'left_spin_05:12:50 PM.csv',
    'right_spin_05:15:07 PM.csv',
    'speed_up_05:18:02 PM.csv',
    'speed_down_05:20:26 PM.csv',
    'bad_gesture_05:23:23 PM.csv',
)

# One DataFrame per gesture, read in label order
(df_file_100_go, df_file_100_back, df_file_100_stop, df_file_100_left,
 df_file_100_right, df_file_100_up, df_file_100_down, df_file_100_bad) = (
    pd.read_csv(dir_100 + '/' + name, header=None) for name in names_100)

frames_100 = [df_file_100_go, df_file_100_back, df_file_100_stop, df_file_100_left,
              df_file_100_right, df_file_100_up, df_file_100_down, df_file_100_bad]
df_file_100 = pd.concat(frames_100, ignore_index=True)
df_file_100.to_csv("../data/dataset_100_231211_2/total_100.csv", index=None, header=None)

In [None]:
# Load the merged dataset used for the KNN experiments.
# Other files tried here: '../data/dataset_10_231211/total_10.csv',
# '../data/dataset_2_1/bad_gesture_03:15:29 AM.csv'.
# NOTE(review): the merge cell writes total_100.csv into
# '../data/dataset_100_231211_2/', but this reads '../data/dataset_100_231211/'
# — confirm which directory is intended.
dataset_csv = '../data/dataset_100_231211/total_100.csv'
file = np.genfromtxt(dataset_csv, delimiter=',')

In [None]:
file.shape

In [None]:
# Separate features (every column but the last) from labels (last column),
# then make a stratified 80/20 train/test split.
angle = file[:, :-1].astype(np.float32)
label = file[:, -1].astype(np.float32)

split_options = dict(test_size=0.2, random_state=2023, stratify=label)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    angle, label, **split_options)

In [None]:
len(X_test)

In [None]:
# Create an OpenCV k-nearest-neighbours model and fit it on the training rows
# (one sample per row).
knn = cv2.ml.KNearest_create()
knn.train(samples=X_train, layout=cv2.ml.ROW_SAMPLE, responses=y_train)

In [None]:
# Classify the test rows with k=3 and report the accuracy.
test_samples = np.asarray(X_test, dtype=np.float32)
ret, result, neighbours, dist = knn.findNearest(test_samples, k=3)

predicted_labels = result.ravel()
accuracy = accuracy_score(y_test, predicted_labels)
print(f'Accuracy: {accuracy}')

In [None]:
# from sklearn.metrics import classification_report

# y_pred = knn.predict(X_test)
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

In [None]:
# # 인식 결과를 단순히 출력하도록
# max_num_hands = 1

# gestures = {
#     0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
#     6:'speed_down', 7:'bad_gesture'}

# mp_hands = mp.solutions.hands
# mp_drawing = mp.solutions.drawing_utils

# hands = mp_hands.Hands(
#     max_num_hands=max_num_hands,
#     min_detection_confidence=0.5,
#     min_tracking_confidence=0.5)

# file = np.genfromtxt('../data/dataset_10_231211/total_10.csv', delimiter=',')
# angle = file[:,:-1].astype(np.float32)
# label = file[:, -1].astype(np.float32)
# knn = cv2.ml.KNearest_create()
# knn.train(angle, cv2.ml.ROW_SAMPLE, label)

# cap = cv2.VideoCapture(0)

# while cap.isOpened():
#     ret, img = cap.read()

#     if not ret:
#         print("카메라 연결 실패")
#         break
#         # continue

#     img = cv2.flip(img, 1)
#     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

#     result = hands.process(img)

#     img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

#     if result.multi_hand_landmarks is not None:

#         for res in result.multi_hand_landmarks:

#             joint = np.zeros((21, 3))

#             for j, lm in enumerate(res.landmark):
#                 joint[j] = [lm.x, lm.y, lm.z]

#             v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19],:]
#             v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],:]
#             v = v2 - v1 # [20,3]

#             v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

#             angle = np.arccos(np.einsum('nt,nt->n',
#                 v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:], 
#                 v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])) # [15,]

#             angle = np.degrees(angle)

#             data = np.array([angle], dtype=np.float32)
#             ret, results, neighbours, dist = knn.findNearest(data, 5)
#             idx = int(results[0][0])

#             if idx in gestures.keys():
#                 cv2.putText(img, text=gestures[idx].upper(), org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(255, 255, 255), thickness=2)

#             mp_drawing.draw_landmarks(img,
#                                       res,
#                                       mp_hands.HAND_CONNECTIONS
#                                     #   mp_hands.get_default_hand_landmarks_style(),
#                                     #   mp_hands.get_default_hand_connections_style()
#             )
    
#     cv2.imshow('Test', img)

#     key = cv2.waitKey(5) & 0xFF

#     if key == 27:
#         cv2.destroyAllWindows()
#         cap.release()
#         break

In [None]:
# Live webcam test of the KNN gesture classifier: simply draws the recognised
# gesture on the frame (one hand only).
max_num_hands = 1

gestures = {
    0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
    6:'speed_down', 7:'bad_gesture'}

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands=max_num_hands,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

# Train the KNN model on the full dataset: every column but the last is a
# feature, the last column is the label.
file = np.genfromtxt('../data/dataset_100_231211/total_100.csv', delimiter=',')
angle = file[:,:-1].astype(np.float32)
label = file[:, -1].astype(np.float32)
knn = cv2.ml.KNearest_create()
knn.train(angle, cv2.ml.ROW_SAMPLE, label)

cap = cv2.VideoCapture(0)

fps = 0
frame_count = 0
start_time = time.time()

try:
    while cap.isOpened():
        ret, img = cap.read()

        if not ret:
            print("카메라 연결 실패")
            # Retry on a failed read instead of leaving the loop
            continue

        # Refresh the FPS estimate about once per second
        frame_count += 1
        elapsed_time = time.time() - start_time
        if elapsed_time > 1.0:
            fps = frame_count / elapsed_time
            frame_count = 0
            start_time = time.time()

        # Mirror the frame and convert to RGB before handing it to MediaPipe
        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = hands.process(img)

        cv2.putText(img, 'KNN', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.putText(img, f'FPS: {int(fps)}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        if result.multi_hand_landmarks is not None:

            for res in result.multi_hand_landmarks:

                joint = np.zeros((21, 3))

                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z]

                # Vectors between connected landmark pairs
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19],:]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],:]
                v = v2 - v1 # [20,3]

                # Normalise every vector to unit length
                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Dot products of adjacent unit vectors; rounding can push them
                # slightly outside [-1, 1], which would make arccos return NaN,
                # so clip before taking the angle.
                cos_angle = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:])
                angle = np.arccos(np.clip(cos_angle, -1.0, 1.0)) # [15,]

                angle = np.degrees(angle)

                # Classify the angle vector with the 5 nearest neighbours
                data = np.array([angle], dtype=np.float32)
                ret, results, neighbours, dist = knn.findNearest(data, 5)
                idx = int(results[0][0])

                if idx in gestures:
                    cv2.putText(img, text=gestures[idx].upper(), org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=2, color=(0, 255, 0), thickness=7)

                mp_drawing.draw_landmarks(img, res, mp_hands.HAND_CONNECTIONS)

            # Status line for the last classified hand; guarded like the label
            # above so an unknown class index cannot raise KeyError.
            if idx in gestures:
                cv2.putText(img, f'Action: {idx}. {gestures[idx].upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        cv2.imshow('Test', img)

        key = cv2.waitKey(5) & 0xFF

        # ESC quits
        if key == 27:
            break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

# 03. RNN

## 03-1. RNN - 모델 생성 후 학습

In [None]:
# Load the merged dataset as a header-less DataFrame.
# Other file tried here: '../data/dataset_10_231211/total_10.csv'.
training_csv = '../data/dataset_100_231211/total_100.csv'
df_file = pd.read_csv(training_csv, delimiter=',', header=None)

In [None]:
df_file.shape

In [None]:
df_file

In [None]:
# Separate features (first 15 columns) from labels (last column), then make
# an 80/20 train/test split.
df_angle, df_label = df_file.iloc[:, 0:15], df_file.iloc[:, -1]

# stratify keeps the label distribution of each split close to that of the
# full dataset, so the training data contains every label evenly.
split_options = dict(test_size=0.2, random_state=2023, stratify=df_label)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_angle, df_label, **split_options)

In [None]:
# y_train
# y_test

In [None]:
# One-hot encode the labels.
encoder = OneHotEncoder(sparse_output = False)

y_train_np = y_train.to_numpy()
# Reshape the 1-D label array into a 2-D array with a single column
y_train_2d = y_train_np.reshape(-1, 1)
# Learn the label -> column mapping from the training labels only
y_train_encoded = encoder.fit_transform(y_train_2d)

y_test_np = y_test.to_numpy()
y_test_2d = y_test_np.reshape(-1, 1)
# Reuse the mapping fitted on the training labels. Re-fitting on the test
# labels would silently change the column layout whenever the test split is
# missing a class, and would overwrite the encoder that is later used for
# inverse_transform.
y_test_encoded = encoder.transform(y_test_2d)

# y_train_encoded
y_test_encoded

In [None]:
y_train_encoded.shape
# y_test_encoded.shape

In [None]:
# MLP 모델 생성
# 이진 분류
# model = models.Sequential()

# model.add(layers.Dense(input_dim=15, units=64, activation=None, kernel_initializer=initializers.he_uniform()))

# model.add(layers.Activation('elu'))

# model.add(layers.Dense(units=32, activation=None, kernel_initializer=initializers.he_uniform())) 
# model.add(layers.Activation('elu')) 

# model.add(layers.Dense(units=32, activation=None, kernel_initializer=initializers.he_uniform())) 
# model.add(layers.Activation('elu'))

# model.add(layers.Dropout(rate=0.5))

# # 출력 레이어
# model.add(layers.Dense(units=1, activation='sigmoid'))

# model.compile(optimizer=optimizers.Adam(),
#               loss=losses.binary_crossentropy,
#               metrics=[metrics.binary_accuracy])

In [None]:
# Build the multi-class classifier: three Dense+ELU stages, dropout, and an
# 8-way softmax output.
def dense_elu(units, **dense_kwargs):
    """Return a linear Dense layer (fresh He-uniform initializer) followed by ELU."""
    return [
        layers.Dense(units=units, activation=None,
                     kernel_initializer=initializers.he_uniform(), **dense_kwargs),
        layers.Activation('elu'),
    ]


model = models.Sequential([
    *dense_elu(64, input_dim=15),
    *dense_elu(32),
    *dense_elu(32),
    layers.Dropout(rate=0.5),
    # Output layer
    layers.Dense(units=8, activation='softmax'),
])

model.compile(
    optimizer=optimizers.Adam(),
    loss=losses.categorical_crossentropy,
    metrics=[metrics.categorical_accuracy],
)

In [None]:
# model = Sequential()

# model.add(layers.LSTM(64, return_sequences=True,
#                 input_shape=(70, 84))) 

# model.add(layers.LSTM(32, return_sequences=True))

# model.add(layers.LSTM(32))

# model.add(layers.Dense(8, activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#                 optimizer='adam',
#                 metrics=['accuracy'])

In [None]:
# Train the model, holding out 30% of the training data for validation.
fit_options = dict(batch_size=200, epochs=500, validation_split=0.3)
history = model.fit(X_train, y_train_encoded, **fit_options)

In [None]:
# Evaluate on the held-out test split and print loss and accuracy.
result = model.evaluate(X_test, y_test_encoded)
test_loss, test_accuracy = result

print('loss (cross-entropy) :', test_loss)
print('test accuracy :', test_accuracy)

In [None]:
# MLP 모델
# 이진 분류
# fig, loss_ax = plt.subplots(figsize=(16, 10))
# acc_ax = loss_ax.twinx()

# loss_ax.plot(history.history['loss'], 'y', label='train loss')
# loss_ax.plot(history.history['val_loss'], 'r', label='val loss')
# loss_ax.set_xlabel('epoch')
# loss_ax.set_ylabel('loss')
# loss_ax.legend(loc='upper left')

# acc = history.history['binary_accuracy']
# val_acc = history.history['val_binary_accuracy']

# acc_ax.plot(acc, 'b', label='train acc')
# acc_ax.plot(val_acc, 'g', label='val acc')
# acc_ax.set_ylabel('accuracy')
# acc_ax.legend(loc='upper left')

# plt.show()

In [None]:
# Multi-class training curves: loss on the left axis, accuracy on a twin right axis.
fig, loss_ax = plt.subplots(figsize=(16, 10))
acc_ax = loss_ax.twinx()

loss_curves = (('loss', 'y', 'train loss'), ('val_loss', 'r', 'val loss'))
for history_key, line_style, curve_label in loss_curves:
    loss_ax.plot(history.history[history_key], line_style, label=curve_label)
loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.legend(loc='upper left')

acc = history.history['categorical_accuracy']
val_acc = history.history['val_categorical_accuracy']

for series, line_style, curve_label in ((acc, 'b', 'train acc'), (val_acc, 'g', 'val acc')):
    acc_ax.plot(series, line_style, label=curve_label)
acc_ax.set_ylabel('accuracy')
acc_ax.legend(loc='upper left')

plt.show()

In [None]:
# Side-by-side seaborn plots of the training history:
# loss on the left, categorical accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize = (12, 4))

panels = (
    (axes[0], "loss", "Loss"),
    (axes[1], "categorical_accuracy", "Accuracy"),
)
for panel_ax, metric_key, metric_name in panels:
    epochs = range(len(history.history[metric_key]))
    sns.lineplot(x = epochs, y = history.history[metric_key],
                 ax = panel_ax, label = f'Training {metric_name}')
    sns.lineplot(x = epochs, y = history.history["val_" + metric_key],
                 ax = panel_ax, label = f'Validation {metric_name}')

axes[0].set_title("Loss")
axes[1].set_title("Accuracy")

sns.despine()
plt.show()

In [None]:
# Save the trained model to an HDF5 file.
model_path = '../model/model_100_231211_2_epoch500.h5'
model.save(model_path)

## 03-2. RNN - 성능 검증

In [None]:
X_test

In [None]:
y_test_encoded

In [None]:
# Take the row at position 13 of the test features and turn it into a
# one-row DataFrame.
df_angle_test = X_test.iloc[13, :].to_frame().T
df_angle_test

In [None]:
# Load a saved model and print the decoded label predicted for the single test row.
# NOTE(review): the save cell writes '../model/model_100_231211_2_epoch500.h5',
# while this loads '../model/model_epoch500.h5' — confirm the intended file.
model_test = keras.models.load_model("../model/model_epoch500.h5")
class_scores = model_test.predict(df_angle_test)
decoded_labels = encoder.inverse_transform(class_scores)
print(decoded_labels[0, 0])

In [None]:
# 카메라 인덱스 확인
# sudo apt-get install v4l-utils -y
# v4l2-ctl --list-devices

In [None]:
# Live webcam demo that simply displays the recognition result:
# grab a frame, detect a hand with MediaPipe, turn the landmarks into 15 joint
# angles, classify them with the loaded model and overlay the predicted label.
# Press ESC in the preview window to quit.
max_num_hands = 1
actions = ['go', 'back', 'stop', 'left_spin', 'right_spin', 'speed_up', 'speed_down', 'bad_gesture']
gestures = {
    0:'go', 1:'back', 2:'stop', 3:'left_spin', 4:'right_spin', 5:'speed_up',
    6:'speed_down', 7:'bad_gesture'}

# `encoder` is expected to exist already (fitted in an earlier cell); it is not
# re-created here so that inverse_transform matches the training labels.
# encoder = OneHotEncoder(sparse_output = False)

mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    max_num_hands=max_num_hands,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5)

model = keras.models.load_model("../model/RNN_model_dataset100_epoch500_231211.h5")

cap = cv2.VideoCapture(0)
seq = []
action_seq = []

fps = 0
frame_count = 0
start_time = time.time()

# try/finally guarantees the camera and the preview window are released even
# when the loop dies with an exception or the kernel is interrupted; otherwise
# the device stays locked until the kernel is restarted.
try:
    while cap.isOpened():

        ret, img = cap.read()

        if not ret:
            print("카메라 연결 실패")
            # Deliberately keep retrying instead of breaking. Note that the
            # ESC check below is not reached on this path, so a permanently
            # failing camera has to be stopped by interrupting the kernel.
            # break
            continue

        # Refresh the FPS estimate roughly once per second.
        frame_count += 1
        elapsed_time = time.time() - start_time
        if elapsed_time > 1.0:
            fps = frame_count / elapsed_time
            frame_count = 0
            start_time = time.time()

        # Mirror the frame and convert BGR -> RGB before handing it to MediaPipe.
        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        result = hands.process(img)

        # NOTE: these two labels are drawn while the buffer is still RGB, so
        # the (0, 0, 255) colour ends up blue after the conversion back to BGR.
        cv2.putText(img, 'RNN', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.putText(img, f'FPS: {int(fps)}', (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        if result.multi_hand_landmarks is not None:

            # Label predicted for the most recently processed hand in this frame.
            idx = None

            for res in result.multi_hand_landmarks:

                # 21 landmarks, (x, y, z) each.
                joint = np.zeros((21, 3))

                for j, lm in enumerate(res.landmark):
                    joint[j] = [lm.x, lm.y, lm.z]

                # 20 vectors between landmark pairs (start index -> end index).
                v1 = joint[[0,1,2,3,0,5,6,7,0,9,10,11,0,13,14,15,0,17,18,19],:]
                v2 = joint[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],:]
                v = v2 - v1 # [20,3]

                # Normalise to unit length so the dot product is the cosine.
                v = v / np.linalg.norm(v, axis=1)[:, np.newaxis]

                # Row-wise dot products between 15 pairs of consecutive vectors.
                cosines = np.einsum('nt,nt->n',
                    v[[0,1,2,4,5,6,8,9,10,12,13,14,16,17,18],:],
                    v[[1,2,3,5,6,7,9,10,11,13,14,15,17,18,19],:]) # [15,]

                # Rounding can push a cosine marginally outside [-1, 1], which
                # would make arccos return NaN; clip before taking the angle.
                angle = np.degrees(np.arccos(np.clip(cosines, -1.0, 1.0)))

                data = np.array([angle], dtype=np.float32)

                idx = encoder.inverse_transform(model.predict([data]))[0,0]

                if idx in gestures:
                    # Label next to landmark 0, in pixel coordinates.
                    cv2.putText(img, text=gestures[idx].upper(), org=(int(res.landmark[0].x * img.shape[1]), int(res.landmark[0].y * img.shape[0] + 20)), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=2, color=(0, 255, 0), thickness=7)

                mp_drawing.draw_landmarks(img,
                                          res,
                                          mp_hands.HAND_CONNECTIONS)

            # Same guard as the per-hand label above: an unmapped prediction
            # must not raise KeyError and kill the capture loop.
            if idx in gestures:
                cv2.putText(img, f'Action: {int(idx)}. {gestures[idx].upper()}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        cv2.imshow('Test', img)

        key = cv2.waitKey(5) & 0xFF

        if key == 27:  # ESC
            break
finally:
    cap.release()
    cv2.destroyAllWindows()