### Import dependencies and utilities

In [None]:
# !pip3 install tensorflow tensorflow-gpu opencv-python mediapipe scikit-learn matplotlib

In [None]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp

### Detection and drawing functions

In [None]:
# Variables from Mediapipe for detection and drawing
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [None]:
# detection(img, model): This function processes an image and returns the model's predictions about what it detects (hands, face, pose)
#     img: The image we want to process
#     model: The model that will make the predictions

def detection(img, model):
    """Run ``model`` on a BGR frame and return the frame together with the results.

    Args:
        img: Image array, treated as BGR (it is converted with COLOR_BGR2RGB).
        model: Object exposing ``process(image)``; it receives the RGB copy.

    Returns:
        tuple: ``(image, results)`` where ``image`` is the frame converted back
        to BGR and ``results`` is whatever ``model.process`` returned.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Mark the buffer read-only while the model runs, then make it writable again.
    rgb.flags.writeable = False
    results = model.process(rgb)
    rgb.flags.writeable = True

    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr, results
    

In [None]:
# show_landmarks(img, results): This function shows the connections and landmarks of face, hands and pose. It also adds styles.
#     img: The image we want to process
#     results: Results given by the predictor

def show_landmarks(img, results):
    """Draw the face, pose and hand landmarks with their connections onto ``img``.

    Nothing is returned; the overlays are drawn on the image that is passed in.

    Args:
        img: Image the overlays are drawn on.
        results: Prediction object exposing ``face_landmarks``,
            ``pose_landmarks``, ``right_hand_landmarks`` and
            ``left_hand_landmarks``.
    """
    spec = mp_drawing.DrawingSpec
    hand_points = dict(color=(0, 0, 255), thickness=2, circle_radius=1)
    hand_lines = dict(color=(228, 19, 206), thickness=2, circle_radius=2)

    # (landmarks, connections, landmark style, connection style), in draw order.
    layers = (
        (
            results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
            spec(color=(0, 0, 255), thickness=1, circle_radius=1),
            spec(color=(10, 255, 0), thickness=1, circle_radius=1),
        ),
        (
            results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
            spec(color=(0, 0, 255), thickness=2, circle_radius=1),
            spec(color=(234, 232, 24), thickness=2, circle_radius=2),
        ),
        (
            results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            spec(**hand_points), spec(**hand_lines),
        ),
        (
            results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            spec(**hand_points), spec(**hand_lines),
        ),
    )
    for landmarks, connections, point_style, line_style in layers:
        mp_drawing.draw_landmarks(img, landmarks, connections, point_style, line_style)

### Webcam test

In [None]:

# Live preview: run the holistic model on webcam frames and draw the landmarks.
capture = cv2.VideoCapture(1)

if not capture.isOpened():
    print("Camera is not available")

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:  # Setting mediapipe model
        while capture.isOpened():

            # Read the next frame; stop instead of passing an empty frame on.
            grabbed, frame = capture.read()
            if not grabbed:
                print("Could not read a frame from the camera")
                break

            # Model results prediction
            img, results = detection(frame, holistic)

            # Show landmarks
            show_landmarks(img, results)

            cv2.imshow("OpenCV video", img)

            if cv2.waitKey(1) == ord("q"):
                break
finally:
    # Always free the camera and close the window, even if a frame fails.
    capture.release()
    cv2.destroyAllWindows()


### Get landmarks

In [None]:
# Number of values each body part contributes to the per-frame feature vector.
NUM_POSE_LANDMARKS = 4 * 33  # 33 pose landmarks: x, y, z plus a visibility attribute
NUM_FACE_LANDMARKS = 3 * 468  # 468 face landmarks: x, y, z
NUM_HAND_LANDMARKS = 3 * 21  # 21 landmarks per hand: x, y, z
# Pose + face + both hands.
TOTAL_LANDMARKS = sum((NUM_POSE_LANDMARKS, NUM_FACE_LANDMARKS, 2 * NUM_HAND_LANDMARKS))

In [None]:
# Attribute names read from each landmark, in output order.
_XYZ = ("x", "y", "z")
_XYZ_VISIBILITY = ("x", "y", "z", "visibility")


def _flatten_landmarks(landmark_list, fields):
    """Return the given fields of every landmark in ``landmark_list`` as a flat 1-D array."""
    return np.array(
        [[getattr(point, name) for name in fields] for point in landmark_list.landmark]
    ).flatten()


def get_landmarks(results):
    """Build the per-frame feature vector from a holistic prediction.

    Each detected part contributes its landmark values (x, y, z; pose also
    adds visibility). A part that was not detected contributes a zero vector
    of the matching size, so the output length stays constant.

    Args:
        results: Prediction object exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` sequence.

    Returns:
        numpy.ndarray: 1-D array ordered face, pose, left hand, right hand.
    """
    face = (
        _flatten_landmarks(results.face_landmarks, _XYZ)
        if results.face_landmarks
        else np.zeros(NUM_FACE_LANDMARKS)
    )
    pose = (
        _flatten_landmarks(results.pose_landmarks, _XYZ_VISIBILITY)
        if results.pose_landmarks
        else np.zeros(NUM_POSE_LANDMARKS)
    )
    left_hand = (
        _flatten_landmarks(results.left_hand_landmarks, _XYZ)
        if results.left_hand_landmarks
        else np.zeros(NUM_HAND_LANDMARKS)
    )
    right_hand = (
        _flatten_landmarks(results.right_hand_landmarks, _XYZ)
        if results.right_hand_landmarks
        else np.zeros(NUM_HAND_LANDMARKS)
    )

    # Return all landmarks concatenated
    return np.concatenate([face, pose, left_hand, right_hand])


### Setting up folders for datasets

#### Train

In [None]:

# # SOURCE_PATH = os.path.join("source")
# DATASET_PATH = os.path.join("train/dataset")
# signs = np.array(["hola", "gracias", "atencion"])
# # signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos"])
# # signs = np.array(["comoestas", "buenosdias", "bienvenidos"])

# '''Only used on Camera Version'''
# num_videos = 30
# len_videos = 30


#### Test

In [None]:
# Root folder of the dataset that the capture cells below write to.
DATASET_PATH = os.path.join("test/dataset_D_8signs_60")
# signs = np.array(["hola", "gracias", "atencion"])
signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos", "porfavor", "adios"])
# signs = np.array(["comoestas", "buenosdias", "bienvenidos"])
# signs = np.array(["adios", "porfavor"])
# signs = np.array(["adios", "porfavor"])

'''Only used on Camera Version'''
num_videos = 60  # videos recorded per sign
len_videos = 30  # frames recorded per video

#### Create folders for processed data

In [None]:
# import shutil

# '''Webcam Version'''
# try:
#     shutil.rmtree(DATASET_PATH)
# except Exception as e:
#     print("os.remove() failed: ", e.strerror)
    
# Pre-create one folder per (sign, video index). exist_ok=True makes re-runs
# harmless while real failures (e.g. permissions) are still reported.
for sign in signs:
    for video_index in range(num_videos):
        os.makedirs(os.path.join(DATASET_PATH, sign, str(video_index)), exist_ok=True)

'''Video File Version'''
# for sign in signs:
#     videos = os.listdir(os.path.join(SOURCE_PATH, sign))
#     for video_index, video in enumerate(videos):
#         try:
#             os.makedirs(os.path.join(DATASET_PATH, sign, str(video_index)))
#         except:
#             pass


### Code for data join

In [None]:
# Renumber the video folders in PATH1 so their names continue after the highest
# index found in PATH2.
# NOTE(review): PATH1 and PATH2 currently point to the same folder - confirm
# which dataset is the source and which is the destination before running.
PATH1 = os.path.join("train/dataset_ABC_8signs")
PATH2 = os.path.join("train/dataset_ABC_8signs")
signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos","adios", "porfavor"])
# signs = np.array(["hola", "gracias", "atencion"])
# signs = np.array(["adios", "porfavor"])

for sign in signs:
    videos = list(map(int, os.listdir(os.path.join(PATH2, sign))))
    print(videos)
    # Highest index already taken; -1 when the folder is empty so numbering starts at 0.
    nextname = max(videos, default=-1)
    print(nextname)
    # Rename by the folder's real name (in ascending order) rather than by its
    # position in the listing, which only matches when names are exactly 0..n-1.
    videos2 = sorted(map(int, os.listdir(os.path.join(PATH1, sign))))
    print(videos2)
    for video in videos2:
        nextname += 1
        os.rename(os.path.join(PATH1, sign, str(video)), os.path.join(PATH1, sign, str(nextname)))
        
        

In [None]:
# Reset folder names
PATH = os.path.join("test/dataset_C_6signs")
# signs = np.array(["hola", "gracias", "atencion"])
# signs = np.array(["adios", "porfavor"])
signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos"])
for sign in signs:
    # Process folders in ascending numeric order: the i-th smallest name is
    # always >= i, so the target name is either the folder itself or unused.
    videos = sorted(map(int, os.listdir(os.path.join(PATH, sign))))
    print(videos)
    # Renumber every folder found (not a fixed count) to 0..n-1.
    for video_index, video in enumerate(videos):
        if video == video_index:
            continue  # already has the right name
        os.rename(os.path.join(PATH, sign, str(video)), os.path.join(PATH, sign, str(video_index)))

### Create datasets

#### Camera Version

In [None]:
# Plain webcam preview (no model) to check that the camera works.
camera = cv2.VideoCapture(1)
if not camera.isOpened():
    print("error")
try:
    while camera.isOpened():
        grabbed, frame = camera.read()
        if not grabbed:
            # No frame delivered: stop instead of handing None to imshow.
            break
        cv2.imshow("webcam", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    camera.release()
    cv2.destroyAllWindows()

In [None]:

camera = cv2.VideoCapture(1)


def _capture_dataset(camera, holistic):
    """Record landmark files for every sign, video and frame.

    For each sign, ``num_videos`` videos of ``len_videos`` frames are read
    from ``camera``; the landmarks of each frame are saved with ``np.save``
    under ``DATASET_PATH/<sign>/<video>/<frame>``.

    Args:
        camera: Opened ``cv2.VideoCapture``.
        holistic: MediaPipe holistic model passed to ``detection``.

    Returns:
        bool: True when everything was recorded, False when the capture was
        stopped early (the user pressed 'q' or a frame could not be read).
    """
    for sign in signs:
        for video in range(num_videos):
            for video_frame in range(len_videos):

                # Read the next frame; abort the whole capture if none arrives.
                grabbed, frame = camera.read()
                if not grabbed:
                    print("Could not read a frame from the camera.")
                    return False

                # Model results prediction
                img, results = detection(frame, holistic)

                # Show landmarks
                show_landmarks(img, results)

                # Capture functionality: announce each new video and pause
                if video_frame == 0:
                    cv2.rectangle(img, (0,0), (640, 40), (255, 255, 255), -1)
                    cv2.putText(img, 'Comenzando captura de {} Video [{}]'
                                .format(sign.upper(), str(video)), (3,30), cv2.FONT_HERSHEY_SIMPLEX, .75, (0,0,0), 1, cv2.LINE_AA)

                    cv2.imshow('Sign Language Recognition', img)
                    cv2.waitKey(2000) # Wait 2sec
                else:
                    cv2.imshow('Sign Language Recognition', img)

                # Landmarks saving
                all_landmarks = get_landmarks(results)
                path = os.path.join(DATASET_PATH, sign, str(video), str(video_frame))
                np.save(path, all_landmarks)

                # Returning leaves all three loops at once; a plain break
                # would only leave the innermost frame loop.
                if cv2.waitKey(1) == ord('q'):
                    return False
    return True


if camera.isOpened() is False:
    print("Camera is not available.")
else:
    try:
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Setting mediapipe model
            _capture_dataset(camera, holistic)
    finally:
        camera.release()
        cv2.destroyAllWindows()


#### Video Files Version

In [None]:
'''
frames_length = []
NUM_POSE_LANDMARKS = 33 * 4 # 33 landmarks. 3 coordinates and 1 visibility attribute per landmark
NUM_FACE_LANDMARKS = 468 * 3 # 468 landmarks. 3 coordinates per landmark
NUM_HAND_LANDMARKS = 21 * 3 # 21 landmarks. 3 coordinates per landmark
TOTAL_LANDMARKS = NUM_POSE_LANDMARKS + NUM_FACE_LANDMARKS + NUM_HAND_LANDMARKS * 2

for sign in signs:
    videos = os.listdir(os.path.join(SOURCE_PATH, sign))
    print(videos)
    for video_index, video in enumerate(videos):
        video_path = os.path.join(SOURCE_PATH, sign, video)
        cap = cv2.VideoCapture(video_path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frames_length.append(frame_count)

MAX_FRAME_LENGTH = max(frames_length)
print(MAX_FRAME_LENGTH)


frame_time = 1
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Setting mediapipe model
    for sign in signs:
        videos = os.listdir(os.path.join(SOURCE_PATH, sign))
        for video_index, video in enumerate(videos):
            video_path = os.path.join(SOURCE_PATH, sign, video)
            cap = cv2.VideoCapture(video_path)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            print(frame_count)
            for video_frame in range(MAX_FRAME_LENGTH):
                # Read frames and show
                ret, frame = cap.read()
                
                if not ret:
                    zero_landmarks = [0 for x in range(TOTAL_LANDMARKS)]
                    path = os.path.join(DATASET_PATH, sign, str(video_index), str(video_frame))
                    np.save(path, zero_landmarks)
                    continue

                # Model results prediction
                img, results = detection(frame, holistic)    

                # Show landmarks
                show_landmarks(img, results)

                # Capture funtionality
                if video_frame == 0:
                    cv2.putText(img, 'Comenzando captura', (150,200), cv2.FONT_HERSHEY_SIMPLEX, 1, (60,35,239), 4, cv2.LINE_AA)
                    cv2.putText(img, 'Capturando frames para {} - Video [{}]'
                                .format(sign.upper(), video), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (60,35,239), 1, cv2.LINE_AA)
                    cv2.imshow('Sign Language Recognition', img)
                    cv2.waitKey(2000) # Wait 2sec
                else:
                    cv2.putText(img, 'Capturando frames para {} - Video [{}]'
                                .format(sign.upper(), video), (15,12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (60,35,239), 1, cv2.LINE_AA)
                    cv2.imshow('Sign Language Recognition', img)

                # Landmarks saving
                all_landmarks = get_landmarks(results)
                path = os.path.join(DATASET_PATH, sign, str(video_index), str(video_frame))
                np.save(path, all_landmarks)

                if cv2.waitKey(frame_time) == ord('q'):
                    break
    cap.release()
    cv2.destroyAllWindows()
'''

### Get data

In [None]:
# videos = []
# labels = []
# DATASET_PATH = "dataset_fran_3signs"

# '''Webcam Version'''
# for sign in signs:
#     for video in range(num_videos):
#         video_aux = []
#         for frame in range(len_videos):
#             frame_aux = np.load(os.path.join(DATASET_PATH, sign, str(video), "{}.npy".format(frame)))
#             video_aux.append(frame_aux)
#         videos.append(video_aux)
#         labels.append(label_map[sign])

'''Video File Version'''
# for sign in signs:
#     videos_list = os.listdir(os.path.join(DATASET_PATH, sign))
#     for video in videos_list:
#         video_aux = []
#         frames = os.listdir(os.path.join(DATASET_PATH, sign, video))
#         for frame in frames:
#             frame_aux = np.load(os.path.join(DATASET_PATH, sign, str(video), frame))
#             video_aux.append(frame_aux)
#         videos.append(video_aux)
#         labels.append(label_map[sign])


In [None]:
# NUM_POSE_LANDMARKS = 33 * 4 # 33 landmarks. 3 coordinates and 1 visibility attribute per landmark
# NUM_FACE_LANDMARKS = 468 * 3 # 468 landmarks. 3 coordinates per landmark
# NUM_HAND_LANDMARKS = 21 * 3 # 21 landmarks. 3 coordinates per landmark
# TOTAL_LANDMARKS = NUM_POSE_LANDMARKS + NUM_FACE_LANDMARKS + NUM_HAND_LANDMARKS * 2

In [None]:
# X = np.array(videos)
# y = to_categorical(np.array(labels)).astype(int)

In [None]:
# num_videos = X.shape[0]
# num_videos

## LSTM

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

### 3 signs model (hola, gracias, atencion)

In [None]:
# Dataset folder and sign vocabulary used for the 3-sign model.
DATASET_PATH = os.path.join("train/dataset_AB_3signs")
signs = np.array(["hola", "gracias", "atencion"])

# '''Only used on Camera Version'''
num_videos = 60  # videos loaded per sign
len_videos = 30  # frames loaded per video

In [None]:
# Map every sign name to its position in `signs` (used as the class index).
label_map = dict(zip(signs, range(len(signs))))
label_map

In [None]:
videos = []
labels = []

'''Webcam Version'''
# One entry per video: the list of its per-frame landmark arrays, plus its class index.
for sign in signs:
    sign_dir = os.path.join(DATASET_PATH, sign)
    for video in range(num_videos):
        video_dir = os.path.join(sign_dir, str(video))
        videos.append([
            np.load(os.path.join(video_dir, "{}.npy".format(frame_index)))
            for frame_index in range(len_videos)
        ])
        labels.append(label_map[sign])

In [None]:
# Stack the loaded videos into one array and one-hot encode the integer labels.
X = np.array(videos)
y = to_categorical(np.array(labels)).astype(int)

In [None]:
# Notebook display of the feature array.
X

In [None]:
# Print the shapes of the feature and label arrays as a sanity check.
print(X.shape)
print(y.shape)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [None]:
# TensorBoard callback that writes its logs to logs/3signs_AB_3.
logs_path = os.path.join('logs/3signs_AB_3')
tensorboard = TensorBoard(log_dir=logs_path)


In [None]:
# Number of classes: one output unit per sign.
output = len(signs)
output

In [None]:
# del model_3signs
# Two stacked LSTM layers followed by a small dense classifier; the input is a
# sequence of len_videos frames with TOTAL_LANDMARKS values each.
model_3signs = Sequential([
    LSTM(32, return_sequences=True, activation='relu', input_shape=(len_videos, TOTAL_LANDMARKS)),
    LSTM(32, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(output, activation='softmax'),
])

# Alternative (deeper) architecture kept for reference:
# del model_3signs
# model_3signs = Sequential()
# model_3signs.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(len_videos,TOTAL_LANDMARKS)))
# model_3signs.add(LSTM(128, return_sequences=True, activation='relu'))
# model_3signs.add(LSTM(64, return_sequences=False, activation='relu'))
# model_3signs.add(Dense(64, activation='relu'))
# model_3signs.add(Dense(32, activation='relu'))
# model_3signs.add(Dense(output, activation='softmax'))

model_3signs.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)


In [None]:
'''Loading model'''
# model_3signs.load_weights('model_3signs.h5')

In [None]:
# Train on the whole dataset (no validation data is passed), logging to TensorBoard.
model_3signs.fit(X, y, epochs=4000, callbacks=[tensorboard], shuffle=True)


In [None]:
# Create the target folder first so a finished training run is not lost
# because 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
model_3signs.save('models/model_3signs_AB.h5')

### 8 signs model (hola, gracias, atencion, como estas, buenos días, bienvenidos, adios, por favor)

In [None]:
# Dataset folder and sign vocabulary (8 signs) used for the larger model.
DATASET_PATH = os.path.join("train/dataset_ABC_8signs")
signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos","adios", "porfavor"])

'''Only used on Camera Version'''
num_videos = 180  # videos loaded per sign
len_videos = 30  # frames loaded per video

In [None]:
# Map every sign name to its position in `signs` (used as the class index).
label_map = dict(zip(signs, range(len(signs))))
label_map

In [None]:
videos = []
labels = []

'''Webcam Version'''
# One entry per video: the list of its per-frame landmark arrays, plus its class index.
for sign in signs:
    sign_dir = os.path.join(DATASET_PATH, sign)
    for video in range(num_videos):
        video_dir = os.path.join(sign_dir, str(video))
        videos.append([
            np.load(os.path.join(video_dir, "{}.npy".format(frame_index)))
            for frame_index in range(len_videos)
        ])
        labels.append(label_map[sign])

In [None]:
# Stack the loaded videos into one array and one-hot encode the integer labels.
X = np.array(videos)
y = to_categorical(np.array(labels)).astype(int)

In [None]:
# Print the shapes of the feature and label arrays as a sanity check.
print(X.shape)
print(y.shape)

In [None]:
# TensorBoard callback that writes its logs to logs/8signs_ABC_6.
logs_path = os.path.join('logs/8signs_ABC_6')
tensorboard = TensorBoard(log_dir=logs_path)



In [None]:
# Number of classes: one output unit per sign.
output = len(signs)
output

In [None]:
# del model_6signs


# Single LSTM layer followed by a dense classifier with light dropout; the
# input is a sequence of len_videos frames with TOTAL_LANDMARKS values each.
model_6signs = Sequential([
    LSTM(256, return_sequences=False, activation='relu', input_shape=(len_videos, TOTAL_LANDMARKS)),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(output, activation='softmax'),
])

# Alternative (stacked LSTM) architecture kept for reference:
# model_6signs = Sequential()
# model_6signs.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(len_videos,TOTAL_LANDMARKS)))
# model_6signs.add(LSTM(64, return_sequences=True, activation='relu'))
# model_6signs.add(LSTM(64, return_sequences=False, activation='relu'))
# model_6signs.add(Dense(64, activation='relu'))
# model_6signs.add(Dense(output, activation='softmax'))


model_6signs.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)


In [None]:
# Train on the whole dataset (no validation data is passed), logging to TensorBoard.
model_6signs.fit(X, y, epochs=1000, callbacks=[tensorboard], shuffle=True)

In [None]:
# Create the target folder first so a finished training run is not lost
# because 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
model_6signs.save('models/8S-3_v2.h5')

# Evaluation

In [None]:
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score
from keras.models import load_model

In [None]:
# Dataset folder and sign vocabulary used for evaluation.
DATASET_PATH = os.path.join("test/dataset_B_8signs_60")
# signs = np.array(["hola", "gracias", "atencion"])
signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos", "adios", "porfavor"])

# '''Only used on Camera Version'''
num_videos = 60  # videos loaded per sign
len_videos = 30  # frames loaded per video

In [None]:
# Map every sign name to its position in `signs` (used as the class index).
label_map = dict(zip(signs, range(len(signs))))
label_map

In [None]:
videos = []
labels = []

'''Webcam Version'''
# One entry per video: the list of its per-frame landmark arrays, plus its class index.
for sign in signs:
    sign_dir = os.path.join(DATASET_PATH, sign)
    for video in range(num_videos):
        video_dir = os.path.join(sign_dir, str(video))
        videos.append([
            np.load(os.path.join(video_dir, "{}.npy".format(frame_index)))
            for frame_index in range(len_videos)
        ])
        labels.append(label_map[sign])

In [None]:
# Stack the evaluation videos, one-hot encode their labels and print both shapes.
X_test = np.array(videos)
y_test = to_categorical(np.array(labels)).astype(int)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Load the saved model from disk and predict on the evaluation set.
model = load_model("models/8S-3_v2.h5")
y_pred = model.predict(X_test)
y_pred

In [None]:
# Turn the one-hot / probability rows into class indices, then look up the sign names.
y_test_lab = list(signs[np.argmax(y_test, axis=1)])
y_pred_lab = list(signs[np.argmax(y_pred, axis=1)])

# matrix = metrics.confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
# multilabel_confusion_matrix(y_test_lab, y_pred_lab)
print(y_test_lab)
print(y_pred_lab)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of true vs. predicted sign names (normalize='true').
ConfusionMatrixDisplay.from_predictions(
   y_test_lab, y_pred_lab, normalize='true', cmap='bone', xticks_rotation=60)

plt.show()

### Real Time Test

In [None]:
''' ORIGINAL '''

len_videos = 30
video_frames = []
sign_sequence = []
confidence = 0.9  # NOTE(review): not used anywhere in this cell
model_6signs.load_weights('models/8S-3_v2.h5')
signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos","adios", "porfavor"])
# signs = np.array(["hola", "gracias", "atencion"])
# model = load_model("models/model_6signs_2000ep_v2.h5")
model = model_6signs
# NOTE(review): fourcc -1 and the fixed 640x480 size depend on the platform and
# the camera resolution - confirm that output.mp4 is actually being written.
out = cv2.VideoWriter('output.mp4', -1, 20.0, (640,480))

camera = cv2.VideoCapture(1)

if camera.isOpened() is False: print("Camera is not available.")

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Setting mediapipe model
        while camera.isOpened():
            for num_frame in range(len_videos):
                # Read the next frame; stop if the camera delivers nothing.
                grabbed, frame = camera.read()
                if not grabbed:
                    print("Could not read a frame from the camera.")
                    camera.release()  # makes the outer while condition False
                    break

                # Model results prediction
                img, results = detection(frame, holistic)

                # Show landmarks
                show_landmarks(img, results)

                # Keep a sliding window with the last len_videos frames
                landmarks = get_landmarks(results)
                video_frames.append(landmarks)
                video_frames = video_frames[-len_videos:]

                if num_frame == 0:
                    cv2.putText(img, 'Comenzando captura', (100,200), cv2.FONT_HERSHEY_SIMPLEX, .75, (0,0,255), 1, cv2.LINE_AA)

                    cv2.imshow('Sign Language Recognition', img)
                    cv2.waitKey(1000) # Wait 1sec
                else:
                    # Predict once per window, on its last frame
                    if num_frame == len_videos - 1:
                        pred = model.predict(np.expand_dims(video_frames, axis=0))[0]
                        sign_pred = signs[np.argmax(pred)]

                        # Only append when the sign differs from the previous one
                        if len(sign_sequence) > 0:
                            if sign_pred != sign_sequence[-1]:
                                sign_sequence.append(sign_pred)
                        else:
                            sign_sequence.append(sign_pred)

                    # Show at most the last four recognised signs
                    if len(sign_sequence) > 4:
                        sign_sequence = sign_sequence[-4:]
                    cv2.rectangle(img, (0,0), (640, 40), (255, 255, 255), -1)
                    cv2.putText(img, ' '.join(sign_sequence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, .75, (0,0,0), 1, cv2.LINE_AA)
                    cv2.imshow('Sign Language Recognition', img)

                out.write(img)

                if cv2.waitKey(1) == ord("q"):
                    camera.release()  # makes the outer while condition False
                    break
finally:
    # Always free the camera, finalise the video file and close the window.
    camera.release()
    out.release()
    cv2.destroyAllWindows()

In [None]:
''' TEST '''
'''
len_videos = 30
video_frames = []
sign_sequence = []
confidence = 0.9
model_3signs.load_weights('models/3S-2.h5')
# signs = np.array(["hola", "gracias", "atencion", "comoestas", "buenosdias", "bienvenidos","adios", "porfavor"])
signs = np.array(["hola", "gracias", "atencion"])
# model = load_model("models/model_6signs_2000ep_v2.h5")
model = model_3signs
out = cv2.VideoWriter('output.mp4', -1, 20.0, (640,480))

camera = cv2.VideoCapture(1)

if camera.isOpened() is False: print("Camera is not available.")
    
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic: # Setting mediapipe model
    while camera.isOpened():
        for num_frame in range(len_videos):
            # Read frames and show them
            _, frame = camera.read()

            # Model results prediction
            img, results = detection(frame, holistic)

            # Show landmarks
            show_landmarks(img, results)

            # Predictions
            landmarks = get_landmarks(results)
            video_frames.append(landmarks)
            video_frames = video_frames[-30:]
            
            if len(video_frames) == 30:
                pred = model.predict(np.expand_dims(video_frames, axis=0))[0]
                sign_pred = signs[np.argmax(pred)]
        
                if pred[np.argmax(pred)] > confidence:
                    if len(sign_sequence) > 0:
                        if sign_pred != sign_sequence[-1]:
                            sign_sequence.append(sign_pred)
                    else:
                        sign_sequence.append(sign_pred)

            if len(sign_sequence) > 4:
                sign_sequence = sign_sequence[-4:]
                
                
            img = show_probabilities(pred, signs, img, colors)
            cv2.rectangle(img, (0,0), (640, 40), (255, 255, 255), -1)
            cv2.putText(img, ' | '.join(sign_sequence), (3,30), cv2.FONT_HERSHEY_SIMPLEX, .75, (0,0,0), 1, cv2.LINE_AA)
            cv2.imshow('Sign Language Recognition', img)


            if cv2.waitKey(1) == ord("q"):
                camera.release()
                break
    out.release()
    cv2.destroyAllWindows()
    '''

In [None]:
colors = [(245,117,16),(117,245,16),(16,117,245)]
def show_probabilities(pred, signs, img, colors):
    """Return a copy of ``img`` with one probability bar and label per class.

    Args:
        pred: Iterable of per-class probabilities; bar ``i`` is
            ``int(prob * 100)`` pixels wide.
        signs: Sequence of class names, indexed like ``pred``.
        img: Image to annotate; it is copied, not modified.
        colors: Non-empty sequence of colour tuples. Colours are reused
            cyclically when there are more classes than colours.

    Returns:
        The annotated copy of ``img``.
    """
    output = img.copy()
    for index, prob in enumerate(pred):
        top = 60 + index * 40
        # Wrap around so more classes than colours does not raise IndexError.
        color = colors[index % len(colors)]
        cv2.rectangle(output, (0, top), (int(prob*100), top + 30), color, -1)
        cv2.putText(output, signs[index], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output

In [None]:
# Plot the probability bars for the latest prediction vector `pred` (the
# MediaPipe `results` object is not a probability vector). The image is BGR,
# so convert it to RGB for matplotlib.
plt.imshow(cv2.cvtColor(show_probabilities(pred, signs, img, colors), cv2.COLOR_BGR2RGB))