# 0. Important Notes
This notebook was created for the project "Just Sign", which detects various signing motions for the song "Cupid" by FIFTY FIFTY.

Created with the help of [Nicholas Renotte's tutorial on action recognition](https://www.youtube.com/watch?v=doDUihpj6ro&t=2862s&ab_channel=NicholasRenotte).

# 1. Installing and Importing Dependencies

In [None]:
!pip install mediapipe tensorflow opencv-python scikit-learn

In [1]:
import os
import cv2
import time

import numpy as np
import mediapipe as mp

In [None]:
# Test webcam: show the live feed until "q" is pressed
video_capture = cv2.VideoCapture(0)

try:
    while video_capture.isOpened():
        # Read video feed
        success, frame = video_capture.read()

        # A failed read yields no frame; stop instead of handing it to imshow
        if not success:
            print("Could not read a frame from the webcam; stopping.")
            break

        # Show to screen
        cv2.imshow("OpenCV Feed", frame)

        # Quit application
        if cv2.waitKey(10) & 0xFF == ord("q"):
            break

finally:
    # Always free the camera and close the window, even after an error or interrupt
    video_capture.release()
    cv2.destroyAllWindows()

# 2. Setup Folders for Collection

In [2]:
# Root folder that holds the exported keypoint data
DATA_PATH = os.path.join("Training Data")

# Names of every action to detect, in a fixed order
all_actions = np.array([
    "HOPELESS", "SWEETHEART", "ALL", "MY", "LIFE",
    "COUPLE", "SURROUND", "TIME", "I", "GUESS",
    "MEANS", "SOMETHING", "WHY", "FEEL", "LONELY",
    "WISH", "FIND", "LOVER", "HUG", "NOW",
    "CRY", "IN", "ROOM", "SKEPTICAL", "LOVE",
    "BUT", "STILL", "WANT", "MORE", "GIVE",
    "NEW", "CHANCE", "CUPID", "STUPID", "HE",
    "MAKE", "THAT", "NOT", "REAL", "DUMB",
    "NO_DETECTIONS",
])

# Recording layout: 35 videos per action, 25 frames per video
num_sequences, sequence_length = 35, 25

In [None]:
# Create one directory per (action, sequence) pair.
# exist_ok=True keeps the cell safe to re-run without hiding real failures
# (e.g. permission errors), which a bare "except: pass" would swallow.
for action in all_actions:
    for sequence in range(num_sequences):
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

# 3. Collect Keypoints Sequences

In [3]:
# Short aliases for the MediaPipe modules used throughout the notebook:
# drawing helpers for rendering landmarks, and the holistic solution
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

In [4]:
def mediapipe_detection(image, model):
    """Run ``model`` on one frame and return the frame together with the results.

    The frame is converted from BGR to RGB and flagged read-only while
    ``model.process`` runs, then made writeable again and converted back to BGR.

    Returns:
        A tuple ``(image, results)``: the BGR frame after the round-trip
        conversion and whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Read-only while the model processes the frame
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [None]:
def draw_styled_landmarks(image, results):
    """Draw the face, pose, left-hand and right-hand landmarks of ``results`` on ``image``.

    Each landmark group is drawn with ``mp_drawing.draw_landmarks`` using its own
    landmark style; connections are always drawn in (224, 224, 224).
    The function returns nothing.
    """
    connection_colour = (224, 224, 224)

    # (results attribute, connection set, landmark colour, thickness,
    #  landmark circle radius, connection circle radius) — drawn in this order
    overlays = (
        ("face_landmarks", mp_holistic.FACEMESH_CONTOURS, (192, 255, 48), 1, 1, 1),
        ("pose_landmarks", mp_holistic.POSE_CONNECTIONS, (192, 255, 48), 2, 3, 2),
        ("left_hand_landmarks", mp_holistic.HAND_CONNECTIONS, (192, 255, 48), 2, 3, 2),
        ("right_hand_landmarks", mp_holistic.HAND_CONNECTIONS, (192, 101, 21), 2, 3, 2),
    )

    for attribute, connections, landmark_colour, thickness, landmark_radius, connection_radius in overlays:
        mp_drawing.draw_landmarks(
            image,
            getattr(results, attribute),
            connections,
            mp_drawing.DrawingSpec(color=landmark_colour, thickness=thickness, circle_radius=landmark_radius),
            mp_drawing.DrawingSpec(color=connection_colour, thickness=thickness, circle_radius=connection_radius),
        )

In [5]:
def extract_keypoints(results):
    """Flatten the landmarks in ``results`` into a single 1-D feature vector.

    The vector is the concatenation, in this order, of:
      * face:       x, y, z per landmark (zeros of length 468*3 when absent)
      * pose:       x, y, z, visibility per landmark (zeros of length 33*4 when absent)
      * left hand:  x, y, z per landmark (zeros of length 21*3 when absent)
      * right hand: x, y, z per landmark (zeros of length 21*3 when absent)
    """
    def _flatten(landmark_list, fields, expected_count):
        # A missing landmark group is represented by a block of zeros
        if not landmark_list:
            return np.zeros(expected_count * len(fields))

        rows = [[getattr(point, field) for field in fields] for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")
    parts = [
        _flatten(results.face_landmarks, xyz, 468),
        _flatten(results.pose_landmarks, xyz + ("visibility",), 33),
        _flatten(results.left_hand_landmarks, xyz, 21),
        _flatten(results.right_hand_landmarks, xyz, 21),
    ]

    return np.concatenate(parts)

In [None]:
video_capture = cv2.VideoCapture(0)

# Set when the user presses "q" or the camera stops delivering frames.
# A plain `break` only leaves the innermost loop, so the flag is checked by
# every outer loop to actually end the whole collection run.
stop_requested = False

try:
    # Setup the MediaPipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # Loop through all the actions
        for action in all_actions:
            if stop_requested:
                break

            # Loop through all the videos
            for sequence in range(num_sequences):
                if stop_requested:
                    break

                # Loop through all the frames
                for frame_num in range(sequence_length):
                    # Read video feed
                    success, frame = video_capture.read()

                    # A failed read yields no frame to process; end the run
                    if not success:
                        print("Could not read a frame from the webcam; stopping collection.")
                        stop_requested = True
                        break

                    # Make detections
                    image, results = mediapipe_detection(frame, holistic)

                    # Draw landmarks
                    draw_styled_landmarks(image, results)

                    # Announce the start of a new video on its first frame
                    if frame_num == 0:
                        cv2.putText(image, "STARTING COLLECTION", (120, 200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)

                    cv2.putText(image, f"Current Action: {action} - Video Number {sequence}", (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

                    # Show to screen
                    cv2.imshow("OpenCV Feed", image)

                    # Pause for a second before each video so the signer can get ready
                    if frame_num == 0:
                        cv2.waitKey(1000)

                    # Save the keypoints
                    keypoints = extract_keypoints(results)
                    np_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    np.save(np_path, keypoints)

                    # Quit application
                    if cv2.waitKey(10) & 0xFF == ord("q"):
                        stop_requested = True
                        break

finally:
    # Always free the camera and close the window, even after an error or interrupt
    video_capture.release()
    cv2.destroyAllWindows()

# 4. Preprocess Data and Create Labels and Features

In [6]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [7]:
# Map each action name to its position in all_actions
label_map = dict(zip(all_actions, range(len(all_actions))))

In [8]:
labels, sequences = [], []

for action in all_actions:
    action_dir = os.path.join(DATA_PATH, action)

    # Keep only numerically named entries: any other entry (e.g. a hidden file
    # created by the OS or an editor) would make an integer conversion fail.
    # Sorting by numeric value makes the load order independent of os.listdir.
    sequence_dirs = sorted(
        (name for name in os.listdir(action_dir) if name.isdigit()),
        key=int,
    )

    for sequence_dir in sequence_dirs:
        # One video = the first `sequence_length` saved frames, in frame order
        video = [
            np.load(os.path.join(action_dir, sequence_dir, f"{frame_num}.npy"))
            for frame_num in range(sequence_length)
        ]

        sequences.append(video)
        labels.append(label_map[action])

In [9]:
x = np.array(sequences)

# Fix the one-hot width to the number of actions so it always matches the
# model's output layer, even if the highest-numbered class has no recordings.
y = to_categorical(labels, num_classes=len(all_actions)).astype(int)

# 80/20 stratified split; a fixed random_state makes the split reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# 5. Build and Train LSTM Neural Network

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import Callback, TensorBoard
from tensorflow.keras.optimizers import SGD

In [11]:
class ModelCheckpoint(Callback):
    """Callback that saves the model periodically and once more when training ends.

    Files are written to the directory ``"<file_name> Checkpoints"``:
      * ``"<file_name> - Epoch <n>.h5"`` each time the number of completed
        epochs ``n`` is a multiple of ``epoch_freq``;
      * ``"<file_name>.h5"`` when training finishes.

    Args:
        file_name: Base name used for the checkpoint directory and files.
        epoch_freq: Save every this many completed epochs. A value of 0 or
            less disables the periodic saves (the final save still happens).
    """

    def __init__(self, file_name, epoch_freq):
        super().__init__()

        self.file_name = file_name
        self.epoch_freq = epoch_freq

        self.directory = f"{self.file_name} Checkpoints"

        # Create the checkpoint directory if needed; unlike a bare
        # "except: pass", exist_ok=True still surfaces real errors
        os.makedirs(self.directory, exist_ok=True)

    def on_epoch_end(self, epoch, logs=None):
        # Keras passes a 0-based epoch index; count completed epochs so that
        # "Epoch 50" is the model after 50 epochs and no file is written
        # right after the very first epoch.
        completed_epochs = epoch + 1

        if self.epoch_freq > 0 and completed_epochs % self.epoch_freq == 0:
            self.model.save(f"{self.directory}/{self.file_name} - Epoch {completed_epochs}.h5")

    def on_train_end(self, logs=None):
        # Final model, saved without an epoch suffix
        self.model.save(f"{self.directory}/{self.file_name}.h5")

In [12]:
def inject_noise(weights, noise_factor=0.01):
    """Return ``weights`` plus normally distributed noise of the same shape.

    Args:
        weights: Tensor or variable to perturb; it is not modified here.
        noise_factor: Standard deviation of the added noise.
    """
    noise = tf.random.normal(shape=weights.shape, stddev=noise_factor)
    return weights + noise

## Create and Train Model From Scratch

In [13]:
# Stacked-LSTM classifier.
# Input: 25 frames per video, 1662 values per frame
# (468*3 face + 33*4 pose + 21*3 per hand, the sizes used by extract_keypoints).
model = Sequential([
    LSTM(128, return_sequences=True, activation="tanh", input_shape=(25,1662)),
    LSTM(256, return_sequences=True, activation="tanh"),
    LSTM(128, return_sequences=False, activation="tanh"),
    Dense(128, activation="relu"),
    Dense(64, activation="relu"),
    # One softmax output per action
    Dense(all_actions.shape[0], activation="softmax"),
])

In [14]:
# Adding data augmentation for better generalization
for layer in model.layers:
    for weights in layer.trainable_variables:
        weights.assign(inject_noise(weights))

In [15]:
# SGD with categorical cross-entropy (labels are one-hot); track categorical accuracy
model.compile(
    optimizer=SGD(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)

# Print the layer-by-layer overview
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 25, 128)           916992    
                                                                 
 lstm_1 (LSTM)               (None, 25, 256)           394240    
                                                                 
 lstm_2 (LSTM)               (None, 128)               197120    
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 41)                2665      
                                                                 
Total params: 1,535,785
Trainable params: 1,535,785
Non-

In [16]:
# Callbacks: TensorBoard logging plus the custom checkpoint callback (every 50 epochs)
tensorboard = TensorBoard(log_dir=os.path.join("TensorBoard Logs"))
model_checkpoint = ModelCheckpoint(file_name="ASL Recognition Model", epoch_freq=50)

# Train for 500 epochs; the held-out split is used as validation data
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=500,
    callbacks=[tensorboard, model_checkpoint],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x197f1efebc0>

## Load and Train Existing Model

In [None]:
# Load a previously saved model so training can continue from it.
# NOTE(review): the ModelCheckpoint callback in this notebook names its files
# "<file_name> - Epoch <n>.h5" and is created with file_name="ASL Recognition Model",
# whereas this path reads "ASL Recognition - Epoch 750.h5" (no "Model") —
# confirm that this file actually exists before running.
model = tf.keras.models.load_model("ASL Recognition Model Checkpoints/ASL Recognition - Epoch 750.h5")

In [None]:
tensorboard = TensorBoard(log_dir=os.path.join("TensorBoard Logs"))
model_checkpoint = ModelCheckpoint(file_name="ASL Recognition Model", epoch_freq=50)

# Keras treats `epochs` as the index of the final epoch, not as a number of
# additional epochs. Resuming at epoch 500 for 250 more epochs therefore needs
# epochs=750; with epochs=250 (< initial_epoch) no training would take place.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    initial_epoch=500,
    epochs=750,
    callbacks=[tensorboard, model_checkpoint],
)

# 6. Make Real-Time Predictions

In [24]:
# Rolling window of the most recent keypoint vectors
sequence = []

# Minimum softmax probability a prediction needs before it is reported
threshold = 0.5

# Number of new frames collected between two consecutive predictions
prediction_stride = 3

video_capture = cv2.VideoCapture(0)
model = tf.keras.models.load_model("ASL Recognition Model Checkpoints/ASL Recognition Model - Epoch 400.h5")

try:
    # Setup the MediaPipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while video_capture.isOpened():
            # Read video feed
            success, frame = video_capture.read()

            # A failed read yields no frame to process; stop cleanly
            if not success:
                print("Could not read a frame from the webcam; stopping.")
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Make predictions
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)

            # Predict once a full window (same length as the training videos) is available
            if len(sequence) == sequence_length:
                result = model.predict(np.expand_dims(sequence, axis=0), verbose=0)[0]
                best_index = np.argmax(result)

                # Report only predictions the model is sufficiently confident about
                if result[best_index] > threshold:
                    print(all_actions[best_index])

                # Drop the oldest frames so the next prediction happens
                # `prediction_stride` frames from now
                sequence = sequence[prediction_stride:]

            # Show to screen
            cv2.imshow("OpenCV Feed", image)

            # Quit application
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break

finally:
    # Always free the camera and close the window, even after an error or interrupt
    video_capture.release()
    cv2.destroyAllWindows()

NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
HE
FIND
FIND
FIND
FIND
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
THAT
THAT
THAT
I
I
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
SWEETHEART
SWEETHEART
SWEETHEART
COUPLE
ROOM
ROOM
ROOM
LOVER
LOVER
LIFE
GIVE
CHANCE
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
HE
HE
HE
HE
THAT
THAT
NOW
HE
HE
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
I
I
I
I
I
I
I
GIVE
GIVE
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
MY
MY
MY
WISH
WISH
WISH
FE