# 0. Important Notes
This notebook was created for the project "Just Sign" which is able to detect various signing motions for the song "Cupid" by FIFTY FIFTY.

Created with the help of [Nicholas Renotte's tutorial on action recogniton](https://www.youtube.com/watch?v=doDUihpj6ro&t=2862s&ab_channel=NicholasRenotte).

# 1. Installing and Importing Dependencies

In [None]:
!pip install mediapipe tensorflow opencv-python scikit-learn

In [1]:
import os
import cv2
import time

import numpy as np
import mediapipe as mp

In [None]:
# Test webcam
video_capture = cv2.VideoCapture(0)

while True:
    # Read video feed
    success, frame = video_capture.read()
    
    # Show to screen
    cv2.imshow("OpenCV Feed", frame)
    
    # Quit application
    if cv2.waitKey(10) & 0xFF == ord("q"):
        break
        
video_capture.release()
cv2.destroyAllWindows()

# 2. Setup Folders for Collection

In [2]:
# Path for exported data
DATA_PATH = os.path.join("Training Data")

# Array of all actions to detect
all_actions = np.array([
                        "HOPELESS", 
                        "SWEETHEART", 
                        "ALL", 
                        "ME", 
                        "LIFE", 
                        "COUPLE", 
                        "SURROUND", 
                        "TIME", 
                        "I", 
                        "GUESS",  
                        "MEANS", 
                        "SOMETHING", 
                        "WHY", 
                        "FEEL",
                        "LONELY", 
                        "WISH", 
                        "FIND", 
                        "LOVER", 
                        "HUG", 
                        "NOW",           
                        "CRY", 
                        "ROOM",
                        "SKEPTICAL",
                        "LOVE", 
                        "BUT", 
                        "STILL", 
                        "MORE", 
                        "GIVE", 
                        "NEW", 
                        "CHANCE", 
                        "CUPID",
                        "STUPID",
                        "HE",
                        "MAKE", 
                        "THAT",
                        "NOT", 
                        "REAL",  
                        "DUMB",
                        "NO_DETECTIONS"
                        ])

# Each action has 35 videos consisting of 25 frames
num_sequences = 35
sequence_length = 25

In [None]:
# Create directories for each action
for action in all_actions:
    for sequence in range(num_sequences):
        try:
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
            
        except:
            pass

# 3. Collect Keypoints Sequences

In [3]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [4]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    
    results = model.process(image)
    
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    return image, results

In [5]:
def draw_styled_landmarks(image, results):
    # Draw face connections
    mp_drawing.draw_landmarks(image,
                              results.face_landmarks,
                              mp_holistic.FACEMESH_CONTOURS,
                              mp_drawing.DrawingSpec(color=(192, 255, 48), thickness=1, circle_radius=1), 
                              mp_drawing.DrawingSpec(color=(224, 224, 224), thickness=1, circle_radius=1))
    
    # Draw pose connections
    mp_drawing.draw_landmarks(image,
                              results.pose_landmarks,
                              mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(192, 255, 48), thickness=2, circle_radius=3), 
                              mp_drawing.DrawingSpec(color=(224, 224, 224), thickness=2, circle_radius=2))
    
    # Draw left hand connections
    mp_drawing.draw_landmarks(image,
                              results.left_hand_landmarks,
                              mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(192, 255, 48), thickness=2, circle_radius=3), 
                              mp_drawing.DrawingSpec(color=(224, 224, 224), thickness=2, circle_radius=2))
    
    # Draw right hand connections
    mp_drawing.draw_landmarks(image,
                              results.right_hand_landmarks,
                              mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(192, 101, 21), thickness=2, circle_radius=3), 
                              mp_drawing.DrawingSpec(color=(224, 224, 224), thickness=2, circle_radius=2))

In [6]:
def extract_keypoints(results):
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    left_hand = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    right_hand = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    
    return np.concatenate([face, pose, left_hand, right_hand])

In [None]:
video_capture = cv2.VideoCapture(0)

# Setup the MediaPipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    # Loop through all the actions
    for action in all_actions:
        # Loop through all the videos
        for sequence in range(num_sequences):
            # Loop through all the frames
            for frame_num in range(sequence_length):
                # Read video feed
                success, frame = video_capture.read()

                # Make detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw landmarks
                draw_styled_landmarks(image, results)

                # Apply wait logic for recording
                if frame_num == 0:
                    cv2.putText(image, "STARTING COLLECTION", (120, 200),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)

                    cv2.putText(image, f"Current Action: {action} - Video Number {sequence}", (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    
                    # Show to screen
                    cv2.imshow("OpenCV Feed", image)
                    cv2.waitKey(500)
                    
                else:
                    cv2.putText(image, f"Current Action: {action} - Video Number {sequence}", (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    
                    # Show to screen
                    cv2.imshow("OpenCV Feed", image)
                      
                # Save the keypoints
                keypoints = extract_keypoints(results)
                np_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(np_path, keypoints)
                
                # Quit application
                if cv2.waitKey(10) & 0xFF == ord("q"):
                    break
            
    video_capture.release()
    cv2.destroyAllWindows()

# 4. Preprocess Data and Create Labels and Features

In [7]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
label_map = {label:num for num, label in enumerate(all_actions)}

In [None]:
labels, sequences = [], []

for action in all_actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        video = []
        
        for frame_num in range(sequence_length):
            frame = np.load(os.path.join(DATA_PATH, action, str(sequence), f"{frame_num}.npy"))
            video.append(frame)
              
        sequences.append(video)
        labels.append(label_map[action])

In [None]:
x = np.array(sequences)
y = to_categorical(labels).astype(int)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, shuffle=True, stratify=y)

# 5. Build and Train LSTM Neural Network

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import Callback, TensorBoard
from tensorflow.keras.optimizers import SGD

In [None]:
class ModelCheckpoint(Callback):
    def __init__(self, file_name, epoch_freq):
        super().__init__()
        
        self.file_name = file_name
        self.epoch_freq = epoch_freq
        
        self.directory = f"{self.file_name} Model Checkpoints"
        
        # Make a directory for file checkpoints if it does not exist already
        try:
            os.makedirs(self.directory)
        
        except:
            pass
        
    def on_epoch_end(self, epoch, logs=None):
        if self.epoch_freq > 0 and epoch % self.epoch_freq == 0:
            self.model.save(f"{self.directory}/{self.file_name} - Epoch {epoch}.h5")
        
    def on_train_end(self, logs=None):
        self.model.save(f"{self.directory}/{self.file_name}.h5")

In [None]:
def inject_noise(weights, noise_factor=0.01):
    return weights + tf.random.normal(shape=weights.shape, stddev=noise_factor)

## Create and Train Model From Scratch

In [None]:
model = Sequential()

model.add(LSTM(128, return_sequences=True, activation="tanh", input_shape=(25,1662)))

model.add(LSTM(256, return_sequences=True, activation="tanh"))
model.add(LSTM(128, return_sequences=False, activation="tanh"))
model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))

model.add(Dense(all_actions.shape[0], activation="softmax"))

In [None]:
# Adding data augmentation for better generalization
for layer in model.layers:
    for weights in layer.trainable_variables:
        weights.assign(inject_noise(weights))

In [None]:
model.compile(optimizer=SGD(learning_rate=0.001), loss='categorical_crossentropy', metrics=["categorical_accuracy"])
model.summary()

In [None]:
tensorboard = TensorBoard(log_dir=os.path.join("TensorBoard Logs"))
model_checkpoint = ModelCheckpoint(file_name="ASL Recognition", epoch_freq=50)

model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=500, callbacks=[tensorboard, model_checkpoint])

## Load and Train Existing Model

In [None]:
model = tf.keras.models.load_model("ASL Recognition Model Checkpoints/ASL Recognition - Epoch 200.h5")

In [None]:
tensorboard = TensorBoard(log_dir=os.path.join("TensorBoard Logs"))
model_checkpoint = ModelCheckpoint(file_name="ASL Recognition", epoch_freq=50)

model.fit(x_train, y_train, validation_data=(x_test, y_test), initial_epoch=350, epochs=500, callbacks=[tensorboard, model_checkpoint])

# 6. Make Real-Time Predictions

In [10]:
sequence = []
threshold = 0.5

video_capture = cv2.VideoCapture(0)
model = tf.keras.models.load_model("ASL Recognition Model.h5")

# Setup the MediaPipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while video_capture.isOpened():
        # Read video feed
        success, frame = video_capture.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        draw_styled_landmarks(image, results)

        # Make predictions
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-25:]
        
        if len(sequence) == 25:
            result = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(all_actions[np.argmax(result)])
        
        # Show to screen
        cv2.imshow("OpenCV Feed", image)

        # Quit application
        if cv2.waitKey(10) & 0xFF == ord("q"):
            break
            
    video_capture.release()
    cv2.destroyAllWindows()

NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
HE
HE
HE
HE
HE
HE
HE
HE
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTI

NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
DUMB
DUMB
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
STUPID
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY

DUMB
DUMB
DUMB
DUMB
DUMB
SKEPTICAL
SKEPTICAL
SKEPTICAL
SKEPTICAL
SKEPTICAL
SKEPTICAL
SKEPTICAL
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
SOMETHING
FIND
FIND
FIND
FIND
FIND
FIND
FIND
FIND
FIND
FIND
FIND
FIND
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
WHY
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
NO_DETECTIONS
