In [1]:
%%capture
!pip install mediapipe==0.9.0.1
!pip install protobuf==3.20.*
!pip install scikit-image

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf
import json
import mediapipe as mp
import matplotlib
import matplotlib.pyplot as plt
import random


from skimage.transform import resize
from mediapipe.framework.formats import landmark_pb2
from tensorflow import keras
from tensorflow.keras import layers
from tqdm.notebook import tqdm
from matplotlib import animation, rc

# For extraction and drawing
import cv2
from itertools import chain
# Short aliases for the MediaPipe solution modules. Of these, only mp_holistic is
# referenced by the cells shown in this notebook (see load_data_from_video).
mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils 
mp_drawing_styles = mp.solutions.drawing_styles

In [3]:
# Seed every general-purpose RNG the notebook draws from. NumPy's global generator is
# independent of Python's `random` module, so seeding only `random` leaves NumPy-based
# shuffling different on every run.
# NOTE(review): TensorFlow keeps its own RNG state (weight initialisation, fit-time
# shuffling) and is not seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [4]:
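# Reference only (kept commented out, never executed): names for the feature columns, in the
# same order extract_from_result() emits them - right hand, left hand, pose; all x, then y, then z.
# # Pose coordinates for hand movement.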
# LPOSE = [13, 15, 17, 19, 21]
# RPOSE = [14, 16, 18, 20, 22]
# POSE = LPOSE + RPOSE

# X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE]
# Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE]
# Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE]

# FEATURE_COLUMNS = X + Y + Z

from itertools import chain

# Pose landmark indices used to follow hand movement. In the MediaPipe Pose landmark map
# these are the elbow, wrist, pinky, index and thumb points; each right-side index is the
# matching left-side index plus one.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [left + 1 for left in LPOSE]
POSE = [*LPOSE, *RPOSE]

# Length of the zero-filled placeholder used when a hand is not detected.
N_HAND_LANDMARKS = 21


def _split_axes(landmarks, count, indices=None):
    """Return ``(xs, ys, zs)`` coordinate lists for one landmark group.

    Args:
        landmarks: object exposing a ``.landmark`` sequence whose items have ``x``, ``y``
            and ``z`` attributes, or a falsy value when nothing was detected.
        count: length of the zero-filled lists returned in the falsy case.
        indices: optional positions to pick from ``.landmark`` (in this order);
            ``None`` keeps every landmark in its stored order.
    """
    if not landmarks:
        return [0.0] * count, [0.0] * count, [0.0] * count

    points = landmarks.landmark
    if indices is not None:
        points = [points[i] for i in indices]
    return (
        [p.x for p in points],
        [p.y for p in points],
        [p.z for p in points],
    )


def extract_from_result(res):
    """Flatten one holistic result into a single feature list.

    Args:
        res: object exposing ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``. Each is either falsy (not detected) or has a
            ``.landmark`` sequence of points with ``x``/``y``/``z``.

    Returns:
        A flat list laid out axis by axis: all x values, then all y values, then all z
        values. Within each axis the order is right hand, left hand, then the pose
        landmarks selected by ``POSE``. Groups that were not detected are zero-filled
        (21 values per hand, ``len(POSE)`` for the pose), so the list has
        ``3 * (21 + 21 + len(POSE))`` entries whenever each detected hand carries
        21 landmarks.
    """
    px, py, pz = _split_axes(res.pose_landmarks, len(POSE), POSE)
    lx, ly, lz = _split_axes(res.left_hand_landmarks, N_HAND_LANDMARKS)
    rx, ry, rz = _split_axes(res.right_hand_landmarks, N_HAND_LANDMARKS)

    return list(chain(rx, lx, px, ry, ly, py, rz, lz, pz))

In [5]:
def load_data_from_video(path_to_video: str) -> list:
    """Run MediaPipe Holistic on every frame of a video and collect the features.

    Args:
        path_to_video: path handed to ``cv2.VideoCapture``.

    Returns:
        A list with one entry per decoded frame, each being the list produced by
        ``extract_from_result`` for that frame.

    Raises:
        IOError: if OpenCV cannot open the video. Without this check a missing or
            unreadable file would silently contribute zero frames to the dataset.
    """
    data = []
    video = cv2.VideoCapture(path_to_video)
    try:
        if not video.isOpened():
            raise IOError(f"Could not open video file: {path_to_video!r}")

        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while True:
                ok, frame = video.read()
                if not ok or frame is None:
                    # End of stream (or a decode failure): stop reading.
                    break

                # OpenCV decodes to BGR; convert to RGB before handing the frame to MediaPipe.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Mark the array that is actually passed on as read-only; cvtColor
                # returns a new array, so flagging the BGR frame would have no effect.
                rgb_frame.flags.writeable = False
                results = holistic.process(rgb_frame)
                data.append(extract_from_result(results))
    finally:
        # Always free the capture handle, including when opening or processing fails.
        video.release()

    return data

In [6]:
from collections import deque
from copy import deepcopy

# Number of consecutive frames in one training window.
MAX_LEN = 15


def preprocess_data(data: list, window_size=None):
    """Cut a per-frame feature list into overlapping fixed-length windows (stride 1).

    Args:
        data: sequence of frames, each frame being a sequence of numbers.
        window_size: frames per window; ``None`` (the default) uses ``MAX_LEN``.

    Returns:
        A list of windows, each a list of ``window_size`` frames. An input with ``n``
        frames yields ``max(0, n - window_size + 1)`` windows. Every frame in every
        window is a fresh list, so the result shares no mutable state with ``data``
        or between windows.

    Raises:
        ValueError: if the window size is smaller than 1.
    """
    if window_size is None:
        window_size = MAX_LEN
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    sliding_window = deque(maxlen=window_size)

    sequences = []
    for pose in data:
        sliding_window.append(pose)
        if len(sliding_window) == window_size:
            # Frames are flat sequences of numbers, so copying each frame into a new
            # list isolates the window just as a deep copy would, at far lower cost.
            sequences.append([list(frame) for frame in sliding_window])

    return sequences

In [31]:
# (folder, number of clips, class label). Clips are named 1.mp4 .. N.mp4 inside each folder;
# label 1 marks signing windows and label 0 marks not-signing windows.
SAMPLE_SETS = [
    ("signing samples", 5, 1),
    ("not signing samples", 9, 0),
]
SHUFFLE_SEED = 42

x = []
y = []
for folder, n_clips, label in SAMPLE_SETS:
    for i in tqdm(range(1, n_clips + 1)):
        data = load_data_from_video(f"{folder}/{i}.mp4")
        d = preprocess_data(data)
        x.extend(d)
        y.extend([label] * len(d))

# float32 halves the memory of the stacked windows compared with NumPy's float64 default.
x = np.array(x, dtype=np.float32)
y = np.array(y)
assert len(x) == len(y), "every window needs exactly one label"

# Shuffle windows and labels with the same permutation. A locally seeded generator makes
# the order identical on every run of this cell.
perm = np.random.default_rng(SHUFFLE_SEED).permutation(len(x))

x = x[perm]
y = y[perm]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

In [47]:
# Expected layout: (number of windows, MAX_LEN frames per window, features per frame).
x.shape

(19754, 15, 156)

In [48]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Minimal sequence classifier: a single LSTM unit reads each (MAX_LEN, 156) window and a
# two-way softmax scores the classes. The labels are integer class ids (0 or 1), hence
# the sparse categorical cross-entropy loss.
model = Sequential([
    LSTM(1, input_shape=(MAX_LEN, 156)),
    Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 1)                 632       
                                                                 
 dense_3 (Dense)             (None, 2)                 4         
                                                                 
Total params: 636
Trainable params: 636
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Train for 10 epochs, holding out 30% of the rows for validation.
# NOTE(review): x holds stride-1 overlapping windows that were shuffled before this call,
# so near-identical windows from the same clip end up on both sides of the split. The
# validation accuracy reported here is therefore likely optimistic; consider holding out
# whole clips instead.
model.fit(x, y, validation_split=0.3, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x266884e4820>

In [50]:
# Persist the trained model to a file in the current working directory.
model.save("detect_signing.hdf5")

In [51]:
# Score every window in x. This is the same array that was passed to fit, so the
# predictions include rows the model was trained on.
preds = model.predict(x)



In [52]:
# Number of windows whose highest-scoring class is 1 (argmax over the two model outputs).
np.argmax(preds, axis=1).sum()

9953

In [53]:
# Number of windows labelled 1, for comparison with the predicted count.
y.sum()

9906

In [26]:
# NOTE(review): the saved output for this cell comes from an earlier execution (count 26)
# and does not match the number of windows reported by x.shape; re-run after rebuilding x/y.
y.shape

(1932,)