In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [3]:
def build_encoder(embedding_dim=64):
    """Build the sequence encoder that maps one gesture to a fixed-size vector.

    The model accepts a batch of variable-length sequences with three
    features per timestep (x, y, z). Timesteps whose three features are all
    exactly 0.0 are masked and skipped by the recurrent layers, so zero
    padding does not influence the embedding. Note that a genuine (0, 0, 0)
    sample is masked as well.

    Args:
        embedding_dim: Width of the final linear projection, i.e. the size of
            the returned embedding vector.

    Returns:
        A `keras.Model` named "gesture_encoder" mapping inputs of shape
        (batch, timesteps, 3) to embeddings of shape (batch, embedding_dim).
    """
    sequence_in = keras.Input(shape=(None, 3))  # variable-length sequence of (x,y,z)

    stack = [
        layers.Masking(mask_value=0.0),
        # First recurrent layer keeps the per-timestep outputs for the next one.
        layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
        # Second recurrent layer collapses the sequence to a single vector.
        layers.Bidirectional(layers.LSTM(32)),
        layers.Dense(128, activation="relu"),
        layers.Dense(embedding_dim),  # final embedding, no activation
    ]

    features = sequence_in
    for layer in stack:
        features = layer(features)

    return keras.Model(sequence_in, features, name="gesture_encoder")

# Build a single encoder instance with the default embedding size and print
# its layer/parameter table. The same instance is later applied to both
# inputs of the siamese model, which is what makes the two branches share
# weights.
encoder = build_encoder()
encoder.summary()

In [5]:
def euclidean_distance(embeddings, epsilon=1e-7):
    """Return the row-wise L2 distance between two batches of embeddings.

    Args:
        embeddings: A pair `(first, second)` of tensors with identical shape
            (batch, embedding_dim).
        epsilon: Lower bound applied to the squared distance before the
            square root is taken.

    Returns:
        Tensor of shape (batch, 1) holding the distance for each pair.
    """
    first, second = embeddings
    squared = tf.reduce_sum(tf.square(first - second), axis=1, keepdims=True)
    # The derivative of sqrt is infinite at 0, so a pair of identical
    # embeddings would back-propagate NaN through a plain norm. Clamping the
    # squared distance keeps the gradient finite.
    return tf.sqrt(tf.maximum(squared, epsilon))


gest1 = keras.Input(shape=(None, 3))
gest2 = keras.Input(shape=(None, 3))

# Both branches call the same `encoder` instance, so they share weights.
enc1 = encoder(gest1)
enc2 = encoder(gest2)

# L2 distance between the two embeddings, shape (batch, 1)
distance = layers.Lambda(euclidean_distance)([enc1, enc2])

siamese = keras.Model([gest1, gest2], distance, name="siamese_network")
siamese.summary()

In [7]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss for pairs scored by the distance between embeddings.

    Pairs labelled "same" are penalised by their squared distance; pairs
    labelled "different" are penalised only while they are closer than
    `margin`:

        loss = mean((1 - y) * d**2 + y * max(margin - d, 0)**2)

    Args:
        y_true: Pair labels, 0 = same gesture, 1 = different gesture. Any
            numeric or boolean dtype is accepted; it is cast to the dtype of
            `y_pred`. It should have the same shape as `y_pred`: a (batch,)
            label vector combined with (batch, 1) distances would broadcast
            to (batch, batch) and average over the wrong pairs.
        y_pred: Distance between the two embeddings of each pair.
        margin: Distance beyond which a "different" pair contributes no loss.

    Returns:
        Scalar tensor with the mean loss over all elements.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # TensorFlow does not promote dtypes implicitly, so integer or boolean
    # labels would raise when combined with floating-point distances.
    y_true = tf.cast(y_true, y_pred.dtype)

    same_term = (1.0 - y_true) * tf.square(y_pred)
    different_term = y_true * tf.square(tf.maximum(margin - y_pred, 0.0))
    return tf.reduce_mean(same_term + different_term)

In [8]:
# Train the pair model with Adam (learning rate 1e-3), minimising the
# contrastive loss on the predicted embedding distance.
siamese.compile(
    loss=contrastive_loss,
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
)