#### Importing Libraries


In [4]:
!pip install mediapipe
import mediapipe as mp
from google.colab import files
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import os
import math
!pip install keras_preprocessing
from keras_preprocessing.sequence import pad_sequences

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


### Working on the Data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Extracting landmarks

In [9]:
def landmark_extractor(img):

  mp_hands = mp.solutions.hands

  with mp_hands.Hands(static_image_mode=True,
                      max_num_hands=2,
                      min_detection_confidence=0.5) as hands:
      
        landmark = []

        image = cv2.flip(cv2.imread(img), 1)

        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        landmarks = results.multi_hand_landmarks
        
        if landmarks != None:
          landmark_idx = mp_hands.HandLandmark.WRIST.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y,
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.THUMB_CMC.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.THUMB_MCP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()       

          landmark_idx = mp_hands.HandLandmark.THUMB_IP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()   

          landmark_idx = mp_hands.HandLandmark.THUMB_TIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.INDEX_FINGER_MCP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.INDEX_FINGER_PIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.INDEX_FINGER_DIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()       

          landmark_idx = mp_hands.HandLandmark.INDEX_FINGER_TIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()   

          landmark_idx = mp_hands.HandLandmark.MIDDLE_FINGER_MCP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.MIDDLE_FINGER_PIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.MIDDLE_FINGER_DIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.MIDDLE_FINGER_TIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()       

          landmark_idx = mp_hands.HandLandmark.RING_FINGER_MCP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()   

          landmark_idx = mp_hands.HandLandmark.RING_FINGER_PIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.RING_FINGER_DIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.RING_FINGER_TIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.PINKY_MCP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()       

          landmark_idx = mp_hands.HandLandmark.PINKY_PIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()   

          landmark_idx = mp_hands.HandLandmark.PINKY_DIP.value 
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

          landmark_idx = mp_hands.HandLandmark.PINKY_TIP.value
          landmark += (np.array([[lmk.landmark[landmark_idx].x, 
                        lmk.landmark[landmark_idx].y, 
                        lmk.landmark[landmark_idx].z] for lmk in landmarks]).flatten() if landmarks else np.zeros(3)).tolist()

        if len(landmark) == 126:
            row = np.around(landmark, decimals=5).reshape(1, -1)
        else:
            row = np.hstack((np.array(landmark), np.zeros(126-len(landmark)))).reshape(1, -1)
        return row

In [13]:
# Converting video clip to frames

def getFrame(vid_path, sec):
  vid = cv2.VideoCapture(vid_path)
  vid.set(cv2.CAP_PROP_POS_MSEC, sec*1000)
  ret, image = vid.read()
  path = vid_path[:-9]+"frames/"+vid_path[-9:-4]+f"{sec}.jpg"
  if ret:
    cv2.imwrite(path, image)

    return path


def gen_frames(vid_path, vid_length=2, frames_per_sec=15):

  n_iterations = vid_length * frames_per_sec
  frame_rate = 1 / frames_per_sec
  sec = 0
  frame_limit = 31
  dataset = np.empty((126)).reshape(1, -1)
  lst = [np.zeros(126)]

  for i in range(int(n_iterations)):
    sec += frame_rate
    sec = round(sec, 3)
    image = getFrame(vid_path, sec)
    if image:
      data_point = landmark_extractor(image)
      if len(data_point)<= frame_limit:
        dataset = np.append(dataset, data_point, axis=0)
      else :
        to_round = math.ceil(len(data_point)/frame_limit)
        dataset = np.append(data_point[::to_round])
  while len(dataset) <frame_limit:
    dataset = np.append(dataset,lst,axis=0)
  return dataset

#### Creating Train Dataset

In [14]:
video_files = ['/content/drive/MyDrive/Datasets/Capstone/chair/09848.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/chair/09849.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/chair/09850.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/chair/09869.mp4'
               ]

action_1 = np.empty((126)).reshape(1, -1)
action_1= np.delete(action_1,np.s_[:],axis=0)
for video in video_files:
  int_data1 = gen_frames(video)
  print("int-data",len(int_data1))
  action_padded = pad_sequences(int_data1, padding='post', maxlen=126, dtype='float32')
  print('action_padded',len(action_padded))
  action_1 = np.append(action_1, int_data1, axis=0)
  print("action_1",len(action_1))

int-data 31
action_padded 31
action_1 31
int-data 31
action_padded 31
action_1 62
int-data 31
action_padded 31
action_1 93
int-data 31
action_padded 31
action_1 124


In [15]:
video_files = ['/content/drive/MyDrive/Datasets/Capstone/computer/12311.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/computer/12313.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/computer/12328.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/computer/12338.mp4'
               ]

action_2 = np.empty((126)).reshape(1, -1)
action_2= np.delete(action_2,np.s_[:],axis=0)
for video in video_files:
  int_data2 = gen_frames(video)
  print("int-data",len(int_data2))
  #print(int_data2)
  action_padded = pad_sequences(int_data2, padding='post', maxlen=126, dtype='float32')
  print('action_padded',len(action_padded))
  action_2 = np.append(action_2, action_padded, axis=0)
  print("action_2",len(action_2))
  #print(action_2)

int-data 31
action_padded 31
action_2 31
int-data 31
action_padded 31
action_2 62
int-data 31
action_padded 31
action_2 93
int-data 31
action_padded 31
action_2 124


In [16]:
video_files = ['/content/drive/MyDrive/Datasets/Capstone/drink/17710.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/drink/17734.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/drink/65539.mp4',
               '/content/drive/MyDrive/Datasets/Capstone/drink/69302.mp4'
               ]

action_3 = np.empty((126)).reshape(1, -1)
action_3= np.delete(action_3,np.s_[:],axis=0)
for video in video_files:
  int_data3 = gen_frames(video)
  print("int-data",len(int_data3))
  action_padded = pad_sequences(int_data3, padding='post', maxlen=126, dtype='float32')
  print('action_padded',len(action_padded))
  action_3 = np.append(action_3, action_padded, axis=0)
  print("action_3",len(action_3))

int-data 31
action_padded 31
action_3 31
int-data 31
action_padded 31
action_3 62
int-data 31
action_padded 31
action_3 93
int-data 31
action_padded 31
action_3 124


In [19]:
x_train = np.append(action_1, action_2, axis=0)
x_train = np.append(x_train, action_3, axis=0)

In [20]:
x_train.shape

(372, 126)

In [17]:
train_y = []
for i in range(0,12):
  train_y = np.append(train_y,1).astype(int)
for i in range(0,12):
  train_y = np.append(train_y,2).astype(int)
for i in range(0,12):
  train_y = np.append(train_y,3).astype(int)

In [18]:
train_y.shape

(36,)

#### Creating Testing Dataset

In [23]:
# Converting video clip to frames

def getFrame(vid_path, sec):
  vid = cv2.VideoCapture(vid_path)
  vid.set(cv2.CAP_PROP_POS_MSEC, sec*1000)
  ret, image = vid.read()
  path = vid_path[:-9]+"frames_test/"+vid_path[-9:-4]+f"{sec}.jpg"
  if ret:
    cv2.imwrite(path, image)

    return path


def gen_frames(vid_path, vid_length=2, frames_per_sec=15):

  n_iterations = vid_length * frames_per_sec
  frame_rate = 1 / frames_per_sec
  sec = 0
  frame_limit = 31
  dataset = np.empty((126)).reshape(1, -1)
  lst = [np.zeros(126)]

  for i in range(int(n_iterations)):
    sec += frame_rate
    sec = round(sec, 3)
    image = getFrame(vid_path, sec)
    if image:
      data_point = landmark_extractor(image)
      if len(data_point)<= frame_limit:
        dataset = np.append(dataset, data_point, axis=0)
      else :
        to_round = math.ceil(len(data_point)/frame_limit)
        dataset = np.append(data_point[::to_round])
  while len(dataset) <frame_limit:
    dataset = np.append(dataset,lst,axis=0)
  return dataset

In [24]:
video_files = ['/content/drive/MyDrive/Datasets/Capstone/chair/09851.mp4',
               ]

action_1_test = np.empty((126)).reshape(1, -1)
action_1_test= np.delete(action_1,np.s_[:],axis=0)
for video in video_files:
  int_data1 = gen_frames(video)
  print("int-data",len(int_data1))
  action_padded = pad_sequences(int_data1, padding='post', maxlen=126, dtype='float32')
  print('action_padded',len(action_padded))
  action_1_test = np.append(action_1_test, int_data1, axis=0)
  print("action_1_test",len(action_1_test))

int-data 31
action_padded 31
action_1_test 31


In [25]:
video_files = ['/content/drive/MyDrive/Datasets/Capstone/computer/12320.mp4'
               ]
action_2_test = np.empty((126)).reshape(1, -1)
action_2_test = np.delete(action_2,np.s_[:],axis=0)
for video in video_files:
  int_data2 = gen_frames(video)
  print("int-data",len(int_data2))
  action_padded = pad_sequences(int_data2, padding='post', maxlen=126, dtype='float32')
  print('action_padded',len(action_padded))
  action_2_test = np.append(action_2_test, action_padded, axis=0)
  print("action_2_test",len(action_2_test))

int-data 31
action_padded 31
action_2_test 31


In [26]:
video_files = ['/content/drive/MyDrive/Datasets/Capstone/drink/17723.mp4'
               ]
action_3_test = np.empty((126)).reshape(1, -1)
action_3_test = np.delete(action_3,np.s_[:],axis=0)
for video in video_files:
  int_data3 = gen_frames(video)
  print("int-data",len(int_data3))
  action_padded = pad_sequences(int_data3, padding='post', maxlen=126, dtype='float32')
  print('action_padded',len(action_padded))
  action_3_test = np.append(action_3_test, action_padded, axis=0)
  print("action_3_test",len(action_3_test))

int-data 31
action_padded 31
action_3_test 31


In [27]:
x_test = np.append(action_1_test, action_2_test, axis=0)
x_test = np.append(x_test, action_3_test, axis=0)

In [28]:
x_test.shape

(93, 126)

In [33]:
test_y = []
for i in range(0,3):
  test_y = np.append(test_y,1).astype(int)
for i in range(0,3):
  test_y = np.append(test_y,2).astype(int)
for i in range(0,3):
  test_y = np.append(test_y,3).astype(int)
test_y.shape

(9,)

#### Reshaping Training Data

In [29]:
x_train = x_train.reshape(12,31,126)
x_train.shape

(12, 31, 126)

In [30]:
train_y = train_y.reshape(12,3)
train_y.shape

(12, 3)

In [31]:
x_test = x_test.reshape(3,31,126)
x_test.shape

(3, 31, 126)

In [34]:
test_y = test_y.reshape(3,3)
test_y.shape

(3, 3)

#### Model Generation

In [36]:
classes = ['chair','computer','drink']

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.callbacks import TensorBoard

In [39]:
max_len=31
feature_len=126
classes_len= len(x_train)
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=(max_len, feature_len)))
model.add(Dropout(0.25))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.25))
model.add(LSTM(128, return_sequences=False))
model.add(Dense(64))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 31, 256)           392192    
                                                                 
 dropout_2 (Dropout)         (None, 31, 256)           0         
                                                                 
 lstm_4 (LSTM)               (None, 31, 256)           525312    
                                                                 
 dropout_3 (Dropout)         (None, 31, 256)           0         
                                                                 
 lstm_5 (LSTM)               (None, 128)               197120    
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_1 (Batc  (None, 64)              

In [41]:
verbose, epochs, batch_size = 1, 25, 10
model.fit(x_train, train_y, validation_data = (x_test, test_y),epochs=epochs, batch_size=batch_size, verbose=verbose, shuffle=True)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fa4aa7d19d0>

In [42]:
model.save('action.h5')

In [43]:
model.load_weights('action.h5')

In [44]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [45]:
yhat = model.predict(x_test)



In [47]:
ytrue = np.argmax(test_y, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [48]:
ytrue

[0, 0, 0]

In [49]:
yhat

[1, 0, 1]

In [51]:
multilabel_confusion_matrix(ytrue, yhat)

array([[[0, 0],
        [2, 1]],

       [[1, 2],
        [0, 0]]])

In [52]:
accuracy_score(ytrue, yhat)

0.3333333333333333