In [None]:
import cv2
import time
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import mediapipe as mp
from collections import deque

# Rebuild THOSnet model structure
def transformer_decoder_block(query, key_value, head_size, num_heads, ff_dim, dropout=0.3):
    """Apply one pre-norm decoder block to ``query``.

    The block chains three residual stages: self-attention on ``query``,
    cross-attention from the result onto ``key_value``, and a two-layer
    feed-forward network.

    Args:
        query: Sequence tensor refined by the block.
        key_value: Sequence tensor attended to in the cross-attention stage.
        head_size: ``key_dim`` passed to both attention layers.
        num_heads: Number of heads in both attention layers.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Rate used inside the attention layers and by every
            ``Dropout`` layer in the block.

    Returns:
        The output tensor of the final residual ``Add``; its last dimension
        equals ``query.shape[-1]``.

    NOTE(review): layers are instantiated in the same order as before on
    purpose -- saved weights are presumably matched against that order, so
    confirm before rearranging anything here.
    """
    # Stage 1: self-attention over the normalised query, plus residual.
    normed_query = layers.LayerNormalization(epsilon=1e-6)(query)
    self_attended = layers.MultiHeadAttention(
        key_dim=head_size,
        num_heads=num_heads,
        dropout=dropout,
    )(normed_query, normed_query)
    self_attended = layers.Dropout(dropout)(self_attended)
    after_self = layers.Add()([self_attended, query])

    # Stage 2: cross-attention onto the other sequence, plus residual.
    normed_self = layers.LayerNormalization(epsilon=1e-6)(after_self)
    cross_attended = layers.MultiHeadAttention(
        key_dim=head_size,
        num_heads=num_heads,
        dropout=dropout,
    )(normed_self, key_value)
    cross_attended = layers.Dropout(dropout)(cross_attended)
    after_cross = layers.Add()([cross_attended, after_self])

    # Stage 3: feed-forward network projected back to the query width.
    normed_cross = layers.LayerNormalization(epsilon=1e-6)(after_cross)
    hidden = layers.Dense(ff_dim, activation='gelu')(normed_cross)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Dense(query.shape[-1])(hidden)
    return layers.Add()([projected, after_cross])

def build_thosnet(input_shape=(30, 63)):
    """Assemble the two-stream THOSnet classifier.

    Each hand stream goes through its own bidirectional LSTM. The two
    encoded streams then attend to each other through a pair of decoder
    blocks, and the concatenated result is flattened and classified by a
    small dense head ending in a 9-way softmax.

    Args:
        input_shape: Per-stream input shape without the batch axis; the
            default is ``(30, 63)``.

    Returns:
        A ``tf.keras.Model`` taking ``[left_hand, right_hand]`` inputs and
        producing the softmax output.

    NOTE(review): the layer creation order is unchanged on purpose, since
    saved weights are presumably matched against it -- confirm before
    reordering.
    """
    left_in = tf.keras.Input(shape=input_shape, name="left_hand")
    right_in = tf.keras.Input(shape=input_shape, name="right_hand")

    # Independent recurrent encoders, one per hand.
    left_seq = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(left_in)
    right_seq = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(right_in)

    # Each stream queries the other one.
    left_on_right = transformer_decoder_block(
        left_seq, right_seq, head_size=256, num_heads=8, ff_dim=64
    )
    right_on_left = transformer_decoder_block(
        right_seq, left_seq, head_size=256, num_heads=8, ff_dim=64
    )

    fused = layers.Concatenate()([left_on_right, right_on_left])
    hidden = layers.Flatten()(fused)

    # Dense head: 128 -> 64 units, each followed by dropout.
    for units in (128, 64):
        hidden = layers.Dense(units, activation='gelu')(hidden)
        hidden = layers.Dropout(0.3)(hidden)

    probabilities = layers.Dense(9, activation='softmax')(hidden)
    return tf.keras.Model(inputs=[left_in, right_in], outputs=probabilities)

# Rebuild the network so the trained weights can be loaded into it.
model = build_thosnet()

# The weights file is not part of the repo itself; download it via the
# Google Drive link given in the repo and place it at this path.
model.load_weights('models/thosnet_weights.h5')

# MediaPipe hand tracker: up to two hands per frame, with 0.75 thresholds
# for both detection and tracking confidence.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=2,
    min_detection_confidence=0.75,
    min_tracking_confidence=0.75)
mp_draw = mp.solutions.drawing_utils

# Sliding window holding the most recent 30 per-frame feature vectors.
buffer = deque(maxlen=30)
class_names = [f"Gesture_{i}" for i in range(9)]

# Change the index 0 if you have multiple cameras. The AVFoundation backend
# was requested to try out a GoPro for predictions; it only exists on Apple
# platforms, so fall back to OpenCV's default backend when it cannot open
# the device instead of silently ending up with a dead capture.
cap = cv2.VideoCapture(0, cv2.CAP_AVFOUNDATION)
if not cap.isOpened():
    cap.release()
    cap = cv2.VideoCapture(0)

def draw_label(img, text, pos, bg_color=(0,0,0), text_color=(0,255,0), font_scale=2, thickness=1):
    """Render ``text`` on ``img`` over a translucent backing box with a shadow.

    Args:
        img: Image to draw on; it is modified in place.
        text: String to render.
        pos: ``(x, y)`` origin passed to ``cv2.putText`` for the main text.
        bg_color: Fill colour of the backing rectangle.
        text_color: Colour of the main text.
        font_scale: Font scale passed to OpenCV.
        thickness: Stroke thickness of the main text; the shadow uses
            ``thickness + 1``.

    Returns:
        None.
    """
    left, base_y = pos
    typeface = cv2.FONT_HERSHEY_DUPLEX
    pad = 10

    # Measure the text so the backing box can be sized around it.
    text_size, _ = cv2.getTextSize(text, typeface, font_scale, thickness)
    text_w, text_h = text_size

    # Paint the filled box on a copy, then blend that copy back at 60% weight.
    box_top_left = (left, base_y - text_h - pad // 2)
    box_bottom_right = (left + text_w + pad, base_y + pad)
    boxed = img.copy()
    cv2.rectangle(boxed, box_top_left, box_bottom_right, bg_color, cv2.FILLED)
    opacity = 0.6
    cv2.addWeighted(boxed, opacity, img, 1 - opacity, 0, img)

    # Black, slightly thicker copy of the text offset by 2 px acts as the shadow.
    shadow_shift = 2
    shadow_origin = (left + shadow_shift, base_y + shadow_shift)
    cv2.putText(img, text, shadow_origin, typeface, font_scale, (0,0,0), thickness + 1, cv2.LINE_AA)

    # Main text on top.
    cv2.putText(img, text, (left, base_y), typeface, font_scale, text_color, thickness, cv2.LINE_AA)

# There is currently no "No Gesture" class: once the 30-frame buffer is full,
# every frame gets classified as one of the gestures, including frames in
# which no hand was detected (those contribute all-zero features).
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB input, while OpenCV delivers BGR frames.
        img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)

        # Slot 0 holds the hand labelled 'Left', slot 1 any other label;
        # a slot stays all-zero when that hand is not detected.
        features = np.zeros((2, 21, 3))
        if results.multi_hand_landmarks:
            for lm, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
                coords = np.array([[pt.x, pt.y, pt.z] for pt in lm.landmark])
                idx = 0 if handedness.classification[0].label == 'Left' else 1
                features[idx] = coords
                mp_draw.draw_landmarks(frame, lm, mp_hands.HAND_CONNECTIONS)

        # One 126-value vector per frame: 63 left-hand values, then 63 right-hand.
        lh_vec = features[0].reshape(-1)
        rh_vec = features[1].reshape(-1)
        buffer.append(np.concatenate([lh_vec, rh_vec]))

        if len(buffer) == 30:
            seq = np.asarray(buffer, dtype=np.float32)[None, :, :]
            # Call the model directly instead of model.predict(): predict()
            # is meant for batch processing and adds noticeable per-call
            # overhead when invoked once per frame on a single sample.
            probs = model([seq[:, :, :63], seq[:, :, 63:]], training=False)
            label = class_names[int(np.argmax(probs))]
        else:
            label = "Collecting..."

        # draw top-left label box
        draw_label(frame, label, (15, 110), bg_color=(30,30,30), text_color=(50,230,50))

        cv2.imshow('THOSnet Live', frame)
        # Esc (key code 27) ends the session.
        if cv2.waitKey(1) & 0xFF == 27:
            break
finally:
    # Always free the camera, the MediaPipe graph and the preview window,
    # even when the loop is interrupted or an exception is raised.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()
