In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, Input
import numpy as np

%load_ext autoreload 
%autoreload 2

In [2]:

EMBED_DIM = 64  # final embedding size; build_encoder() uses it as the width of the "embedding" Dense layer

# Embedding-normalisation layer, registered with Keras so it can be serialized
@tf.keras.utils.register_keras_serializable()
class L2Normalization(tf.keras.layers.Layer):
    """Layer that rescales its input to unit L2 norm along axis 1."""

    def call(self, inputs):
        """Return ``inputs`` divided by its L2 norm computed over axis 1."""
        unit_norm = tf.math.l2_normalize(inputs, axis=1)
        return unit_norm

def build_encoder(input_shape=(128, 128, 1), embed_dim=None):
    """Build the CNN that maps one distance matrix to a unit-length embedding.

    Args:
        input_shape: ``(height, width, channels)`` of a single input matrix.
            Defaults to the 128x128 single-channel distance matrix. The
            global max-pool after the last convolution removes the spatial
            axes, so the dense head does not depend on height/width.
        embed_dim: Width of the output embedding. ``None`` (the default)
            uses the module-level ``EMBED_DIM``, looked up at call time.

    Returns:
        A Keras ``Model`` named ``"gesture_encoder"`` taking a batch of
        matrices and returning L2-normalised embeddings of shape
        ``(batch, embed_dim)``.
    """
    if embed_dim is None:
        embed_dim = EMBED_DIM

    # Input: one distance matrix with a trailing channel axis
    mat_in = Input(shape=input_shape, name="dist_matrix")

    # Three conv stages with growing filter counts; the first two halve the
    # spatial resolution, the last is collapsed by a global max-pool.
    x = layers.Conv2D(32, (5, 5), activation="relu", padding="same")(mat_in)
    x = layers.MaxPool2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(x)
    x = layers.MaxPool2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = layers.GlobalMaxPooling2D()(x)
    mat_feat = layers.Dense(128, activation="relu")(x)

    # Linear projection to the embedding, then normalise each row to unit length
    emb = layers.Dense(embed_dim, activation=None, name="embedding")(mat_feat)
    emb = L2Normalization()(emb)

    return models.Model(mat_in, emb, name="gesture_encoder")



In [3]:
def build_siamese(encoder):
    """Wrap a shared encoder into a two-input model that outputs pair distances.

    Args:
        encoder: Single-input Keras model that maps a batch of matrices to a
            batch of embedding rows. The same instance (and therefore the
            same weights) is applied to both inputs.

    Returns:
        A Keras ``Model`` taking ``[matrix_A, matrix_B]`` and returning the
        Euclidean distance between the two embeddings, shape ``(batch, 1)``.
        The squared distance is floored at the Keras epsilon before the
        square root, so the smallest reported distance is ``sqrt(epsilon)``
        rather than exactly 0.
    """
    # Take the per-sample shape from the encoder itself so the two inputs can
    # never drift out of sync with what the encoder was built for.
    sample_shape = tuple(encoder.input_shape[1:])
    matA = Input(shape=sample_shape, name="matrix_A")
    matB = Input(shape=sample_shape, name="matrix_B")

    embA = encoder(matA)
    embB = encoder(matB)

    def _euclidean_distance(pair):
        """Row-wise Euclidean distance between the two tensors in ``pair``."""
        left, right = pair
        sum_sq = tf.reduce_sum(tf.square(left - right), axis=1, keepdims=True)
        # The gradient of sqrt is infinite at 0; identical embeddings would
        # otherwise inject inf/NaN into the weights during backprop.
        return tf.sqrt(tf.maximum(sum_sq, tf.keras.backend.epsilon()))

    dist = layers.Lambda(_euclidean_distance)([embA, embB])

    return models.Model([matA, matB], dist)

In [4]:
def contrastive_loss(y_true, y_pred, margin=1.0):
    """Contrastive loss over embedding distances.

    Same-class pairs are penalised by their squared distance; different-class
    pairs are penalised by the squared amount by which their distance falls
    short of ``margin`` (zero once they are at least ``margin`` apart).

    Args:
        y_true: 1 if the pair is the same class, 0 if different.
        y_pred: distance between the two embeddings of each pair.
        margin: distance beyond which different-class pairs incur no loss.

    Returns:
        Scalar tensor: the mean loss over all elements.
    """
    # Labels may arrive as ints or bools; without the cast the arithmetic
    # below would mix dtypes with the float distances and raise.
    y_true = tf.cast(y_true, y_pred.dtype)

    squared = tf.square(y_pred)
    margin_squared = tf.square(tf.maximum(margin - y_pred, 0))
    return tf.reduce_mean(y_true * squared + (1 - y_true) * margin_squared)
    

In [5]:
import numpy as np
import random

def make_pairs(matrices, labels, batch_size=8):
    """Yield endless batches of distance-matrix pairs and same/different targets.

    Every pair starts from a uniformly random anchor sample. With probability
    0.5 the anchor is paired with a *different* sample of the same class
    (target 1.0); otherwise it is paired with a sample from a uniformly chosen
    other class (target 0.0). If the requested kind of pair cannot exist for
    the anchor (its class has a single sample, or the data has a single
    class) the other kind is produced instead, so the generator neither spins
    forever nor fails mid-stream.

    Args:
        matrices: Indexable collection of per-sample matrices of equal shape.
        labels: Hashable class label for each sample, aligned with ``matrices``.
        batch_size: Number of pairs per yielded batch.

    Yields:
        ``((matA, matB), y)`` where ``matA`` and ``matB`` stack ``batch_size``
        matrices each and ``y`` has shape ``(batch_size, 1)``.

    Raises:
        ValueError: When iteration starts, if ``matrices`` and ``labels``
            differ in length or if no pair of either kind can be formed.
    """
    num_samples = len(matrices)
    if num_samples != len(labels):
        raise ValueError(
            f"matrices and labels must have the same length "
            f"(got {num_samples} and {len(labels)})"
        )

    # Pre-group indices by class for fast positive sampling
    class_to_idxs = {}
    for idx, c in enumerate(labels):
        class_to_idxs.setdefault(c, []).append(idx)

    class_labels = list(class_to_idxs)
    has_negatives = len(class_labels) > 1
    if not has_negatives and num_samples < 2:
        # Zero samples, or one sample in one class: nothing can be paired.
        raise ValueError("make_pairs needs at least two samples to form a pair")

    while True:
        matA_batch, matB_batch, y_batch = [], [], []

        for _ in range(batch_size):
            anchor_idx = random.randrange(num_samples)
            anchor_label = labels[anchor_idx]
            same_class = class_to_idxs[anchor_label]

            # 50/50 split, overridden only when the drawn kind is impossible.
            want_positive = random.random() < 0.5
            if len(same_class) < 2:
                want_positive = False  # singleton class: no positive partner exists
            elif not has_negatives:
                want_positive = True   # single class: no negative partner exists

            if want_positive:
                # Rejection-sample a partner; terminates because the class
                # is known to hold at least two distinct indices.
                partner_idx = random.choice(same_class)
                while partner_idx == anchor_idx:
                    partner_idx = random.choice(same_class)
                target = 1.0
            else:
                neg_label = random.choice(
                    [lbl for lbl in class_labels if lbl != anchor_label]
                )
                partner_idx = random.choice(class_to_idxs[neg_label])
                target = 0.0

            matA_batch.append(matrices[anchor_idx])
            matB_batch.append(matrices[partner_idx])
            y_batch.append(target)

        yield (
            (np.array(matA_batch), np.array(matB_batch)),
            np.array(y_batch).reshape(-1, 1)
        )

In [6]:
# One encoder instance, shared by both branches of the Siamese model
encoder = build_encoder()
siamese = build_siamese(encoder)

# The Siamese model outputs a distance, which the contrastive loss consumes directly
siamese.compile(
    loss=contrastive_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)




In [7]:
import numpy as np

# SECURITY: allow_pickle=True means loading this archive can execute arbitrary
# code embedded in it; only use a gesture_dataset.npz from a trusted source.
# The context manager closes the underlying archive once the arrays are read.
with np.load('gesture_dataset.npz', allow_pickle=True) as data:  # must allow pickle for object arrays
    X = data['X']  # gestures
    y = data['y']  # labels

    # 0-d object array -> the Python object it wraps (expected to be a dict)
    class_to_label = data['class_to_label'].item()


In [None]:
def compute_distance_matrix(points):
    """
    Pairwise Euclidean distances between the rows of ``points``.

    points: (num_points, 3), any array-like; converted to float32
    returns: (num_points, num_points) matrix whose [i, j] entry is the
             distance between point i and point j
    """
    coords = np.asarray(points, dtype=np.float32)
    # Broadcast (n, 1, 3) against (1, n, 3) to get every pairwise difference
    as_rows = coords[:, None, :]
    as_cols = coords[None, :, :]
    return np.linalg.norm(as_rows - as_cols, axis=-1)

# Convert to NumPy arrays. X becomes (num_gestures, num_points, 3); num_points
# must be 128 so each distance matrix is 128x128, as the checks below enforce.
X = np.array([np.array(p, dtype=np.float32) for p in X])
y = np.array(y)

# Build distance matrices for all gestures
matrices = np.array([compute_distance_matrix(p) for p in X])
matrices = matrices[..., np.newaxis]  # add channel dimension for Conv2D

# Fail fast with a readable message instead of a shape error deep inside
# tf.data / Keras once training starts.
assert matrices.shape[1:] == (128, 128, 1), (
    f"expected distance matrices of shape (128, 128, 1), got {matrices.shape[1:]}"
)
assert len(matrices) == len(y), (
    f"got {len(matrices)} gestures but {len(y)} labels"
)

# Parameters
batch_size = 8
steps_per_epoch = max(1, len(matrices) // batch_size)
epochs = 20

import tensorflow as tf


def generator_fn():
    """Return a new pair-batch generator over the notebook-level ``matrices`` and ``y``."""
    pair_batches = make_pairs(matrices, y, batch_size)
    return pair_batches
# Use TF dataset, mostly to avoid restructuring other code to use numpy types.
# The signature describes one yielded batch: a pair of float32 matrix stacks
# plus a float32 column of targets, all with a free batch dimension.
train_dataset = tf.data.Dataset.from_generator(
    generator=generator_fn,
    output_signature=(
        (
            tf.TensorSpec(dtype=tf.float32, shape=(None, 128, 128, 1)),
            tf.TensorSpec(dtype=tf.float32, shape=(None, 128, 128, 1)),
        ),
        tf.TensorSpec(dtype=tf.float32, shape=(None, 1)),
    ),
)

# Train using the TF Dataset; the generator never ends, so steps_per_epoch
# is what delimits an epoch.
history = siamese.fit(
    x=train_dataset,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
)

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - loss: 0.3037
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 0.2150
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.1608
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.2074
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.1249
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.1826
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 0.1687
Epoch 8/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 0.1290
Epoch 9/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.0811
Epoch 10/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 0.1099
Epoch 11/20
[1m5/5

In [9]:
# Save only the encoder (not the two-input Siamese wrapper) to a .keras file
encoder.save("gesture_encoder_model.keras")