# Load dataset

In [1]:
%%capture
!pip install mediapipe==0.9.0.1
!pip install protobuf==3.20.*
!pip install scikit-image

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf
import json
import mediapipe
import matplotlib
import matplotlib.pyplot as plt
import random

from skimage.transform import resize
from mediapipe.framework.formats import landmark_pb2
from tensorflow import keras
from tensorflow.keras import layers
from tqdm.notebook import tqdm
from matplotlib import animation, rc

In [4]:
random.seed(42)

In [3]:
print("TensorFlow v" + tf.__version__)
print("Mediapipe v" + mediapipe.__version__)

TensorFlow v2.10.1
Mediapipe v0.9.0.1


In [5]:
dataset_df = pd.read_csv('kaggle_dataset/train.csv')
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (94477, 4)


In [6]:
dataset_df.head()

Unnamed: 0,path,participant_id,sequence_id,sign
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie


In [7]:
dataset_df.value_counts("sign")

sign
listen    415
look      414
shhh      411
donkey    410
mouse     408
         ... 
person    312
dance     312
beside    310
vacuum    307
zipper    299
Name: count, Length: 250, dtype: int64

In [8]:
signs = sorted(set(dataset_df.sign))
signs_to_idx = {sign : i for i, sign in enumerate(signs)}
idx_to_sign = {i : sign for sign, i in signs_to_idx.items()}

# Sample

In [9]:
# Transform the dataframe to be tha same format as the fingerspelling dataset (1 row for each frame)

with open("name_to_idx.json", "r") as json_file:
    name_to_idx = json.load(json_file)

def transform_df(df):
    new_rows = []
    
    # Sort the values by frame so that we can create the new row for each frame
    df = df.sort_values("frame")
    
    # Create new row for the first frame (NaN for default value)
    new_row = [np.nan]*len(name_to_idx)
    new_row[0] = df["frame"][0]
    for i, row in df.iterrows():
        # If we switch reach a new frame we save the old one and initialize a new empty one
        if new_row[0] != row["frame"]:
            new_rows.append(new_row)
            new_row = [np.nan]*len(name_to_idx)
            new_row[0] = row["frame"]
        
        new_row[name_to_idx[f"x_{row['type']}_{row['landmark_index']}"]] = row['x']
        new_row[name_to_idx[f"y_{row['type']}_{row['landmark_index']}"]] = row['y']
        new_row[name_to_idx[f"z_{row['type']}_{row['landmark_index']}"]] = row['z']
        
    return pd.DataFrame(new_rows, columns=name_to_idx.keys())

In [30]:
def random_sequence():
    # Fetch sequence_id, file_id, phrase from first row
    path, sequence_id, sign = dataset_df.sample(n=1).iloc[0][['path', 'sequence_id', 'sign']]
    print(f"sequence_id: {sequence_id}, path: {path}, sign: {sign}")
    
    # Fetch data from parquet file (This dataset stores each landmark in a separate row)
    sample_sequence_df_wrong_format = pq.read_table(f"kaggle_dataset/{path}",).to_pandas()
    print("Full sequence dataset shape is {}".format(sample_sequence_df_wrong_format.shape))
    
    return sample_sequence_df_wrong_format

In [31]:
sample_sequence_df_wrong_format = random_sequence()

sequence_id: 1895414557, path: train_landmark_files/34503/1895414557.parquet, sign: head
Full sequence dataset shape is (78735, 7)


In [32]:
%%time
sample_sequence_df = transform_df(sample_sequence_df_wrong_format)
sample_sequence_df.head()

CPU times: total: 8.02 s
Wall time: 11 s


Unnamed: 0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
0,0,0.40047,0.42121,0.413655,0.403119,0.422148,0.420262,0.412606,0.287118,0.410297,...,,,,,,,,,,
1,1,0.401217,0.419098,0.411615,0.401396,0.420222,0.418534,0.411466,0.286664,0.409502,...,,,,,,,,,,
2,2,0.400293,0.419002,0.411728,0.401248,0.420066,0.418372,0.411345,0.286973,0.409468,...,,,,,,,,,,
3,3,0.400219,0.419336,0.411985,0.401528,0.420417,0.418719,0.411653,0.286871,0.409748,...,,,,,,,,,,
4,4,0.407448,0.428098,0.42052,0.408788,0.428836,0.426585,0.418127,0.291277,0.415483,...,,,,,,,,,,


## Animation preview

In [33]:
matplotlib.rcParams['animation.embed_limit'] = 2**128
matplotlib.rcParams['savefig.pad_inches'] = 0
rc('animation', html='jshtml')

def create_animation(images):
    fig = plt.figure(figsize=(6, 9))
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    im=ax.imshow(images[0], cmap="gray")
    plt.close(fig)
    
    def animate_func(i):
        im.set_array(images[i])
        return [im]

    return animation.FuncAnimation(fig, animate_func, frames=len(images), interval=1000/10)

mp_pose = mediapipe.solutions.pose
mp_hands = mediapipe.solutions.hands
mp_holistic = mediapipe.solutions.holistic
mp_drawing = mediapipe.solutions.drawing_utils 
mp_drawing_styles = mediapipe.solutions.drawing_styles

def collect_landmarks_oftype(_type: str, frame_data: pd.Series):
    x_type = frame_data.filter(regex=f"x_{_type}.*").values
    y_type = frame_data.filter(regex=f"y_{_type}.*").values
    z_type = frame_data.filter(regex=f"z_{_type}.*").values
    landmarks = landmark_pb2.NormalizedLandmarkList()
    for x, y, z in zip(x_type, y_type, z_type):
        landmarks.landmark.add(x=x, y=y, z=z)
    return landmarks
        
def produce_video_from_seq(seq_df: pd.DataFrame):
    frames = []
    for seq_idx in range(len(seq_df)):
        frame = np.zeros((600, 600, 3))
        frame_data = seq_df.iloc[seq_idx]
        
        # Right hand
        right_hand_landmarks = collect_landmarks_oftype("right_hand", frame_data)
        mp_drawing.draw_landmarks(
                frame,
                right_hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
        
        # Left Hand
        left_hand_landmarks = collect_landmarks_oftype("left_hand", frame_data)
        mp_drawing.draw_landmarks(
                frame,
                left_hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
        
        # Face
        face_landmarks = collect_landmarks_oftype("face", frame_data)
        mp_drawing.draw_landmarks(
            frame,
            face_landmarks,
            mp_holistic.FACEMESH_CONTOURS,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles
            .get_default_face_mesh_contours_style())
        
        # Pose
        pose_landmarks = collect_landmarks_oftype("pose", frame_data)
        mp_drawing.draw_landmarks(
            frame,
            pose_landmarks,
            mp_holistic.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.
            get_default_pose_landmarks_style())
        
        frames.append(frame.astype(np.uint8))
    return frames

In [39]:
video = produce_video_from_seq(transform_df(random_sequence()))
create_animation(video)

sequence_id: 4010215284, path: train_landmark_files/55372/4010215284.parquet, sign: dance
Full sequence dataset shape is (26607, 7)


# Init feature vector

In [40]:
# Pose coordinates for hand movement.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = LPOSE + RPOSE

X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE] + [f'x_face_{i}' for i in range(468)]
Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE] + [f'y_face_{i}' for i in range(468)]
Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE] + [f'z_face_{i}' for i in range(468)]

FEATURE_COLUMNS = X + Y + Z

X_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "x_" in col]
Y_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "y_" in col]
Z_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "z_" in col]

RHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "right" in col]
LHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "left" in col]
RPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col[-2:]) in RPOSE]
LPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col[-2:]) in LPOSE]

# Save to TFRecords

In [41]:
# Takes around 15 hours
def save_tfrecords_to(folder: str):
    # Loop through each file_id
    for _, dataset_df_row in tqdm(dataset_df.iterrows(), total=len(dataset_df)):
        pq_file = dataset_df_row["path"]
        
        # Fetch the parquet file
        parquet_df = transform_df(pq.read_table(f"kaggle_dataset/{pq_file}",).to_pandas())

        # File name for the updated data
        tf_file = f"{folder}/{dataset_df_row['participant_id']}_{dataset_df_row['sequence_id']}.tfrecord"
        parquet_numpy = parquet_df.to_numpy()
        # Initialize the pointer to write the output of each `for loop` below as a sequence into the file.
        with tf.io.TFRecordWriter(tf_file) as file_writer:
            frames = parquet_df.to_numpy()
            phrase = dataset_df_row["sign"]
            features = {FEATURE_COLUMNS[i]: tf.train.Feature(float_list=tf.train.FloatList(value=frames[:, i])) for i in range(len(FEATURE_COLUMNS))}
            features["phrase"] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(phrase, 'utf-8')]))
            record_bytes = tf.train.Example(features=tf.train.Features(feature=features)).SerializeToString()
            file_writer.write(record_bytes)

folder_name = "preprocessed"
if not os.path.isdir(folder_name):
    os.mkdir(folder_name)
    save_tfrecords_to(folder_name)
else:
    print("Warning: Tfrecords already exist! Delete the folder to regenerate them!")



# Fetch from TFRecords

In [42]:
tf_records = []
for _, dataset_df_row in dataset_df.iterrows():
    tf_records.append(f"{folder_name}/{dataset_df_row['participant_id']}_{dataset_df_row['sequence_id']}.tfrecord")
print(f"List of {len(tf_records)} TFRecord files.")

List of 94477 TFRecord files.


In [51]:
FRAME_LEN = 50

def pre_process(x):
    # Pad end of sequence to be exactly 50 frames long
    if tf.shape(x)[0] < FRAME_LEN:
        x = tf.pad(x, ([[0, FRAME_LEN-tf.shape(x)[0]], [0, 0]]))
    else:
        x = x[:FRAME_LEN, :]
        
    # Replace NaN values with 0
    x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
    
    return x

In [52]:
def decode_fn(record_bytes):
    schema = {COL: tf.io.VarLenFeature(dtype=tf.float32) for COL in FEATURE_COLUMNS}
    schema["phrase"] = tf.io.FixedLenFeature([], dtype=tf.string)
    features = tf.io.parse_single_example(record_bytes, schema)
    phrase = features["phrase"]
    landmarks = ([tf.sparse.to_dense(features[COL]) for COL in FEATURE_COLUMNS])
    # Transpose to maintain the original shape of landmarks data.
    landmarks = tf.transpose(landmarks)
    
    return landmarks, phrase

In [53]:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=list(signs_to_idx.keys()),
        values=list(signs_to_idx.values()),
    ),
    default_value=tf.constant(-1),
    name="labels"
)

def convert_fn(landmarks, phrase):
    phrase = table.lookup(phrase)
    return pre_process(landmarks), phrase

In [54]:
batch_size = 64
train_len = int(0.8 * len(tf_records))

train_ds = tf.data.TFRecordDataset(tf_records[:train_len]).map(decode_fn).map(convert_fn).shuffle(buffer_size=batch_size).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE).cache()
valid_ds = tf.data.TFRecordDataset(tf_records[train_len:]).map(decode_fn).map(convert_fn).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE).cache()

In [115]:
# Define the number of data points you want to print
num_datapoints_to_print = 1

# Create an iterator for the train and valid datasets
train_iterator = iter(train_ds)
valid_iterator = iter(valid_ds)

# Print data points from the training dataset
print("Training Data:")
for _ in range(num_datapoints_to_print):
    try:
        landmarks, phrase = next(train_iterator)
        print("Landmarks:")
        print(type(landmarks))
        print(landmarks.shape)
        print(landmarks.numpy())
        print("Phrase:")
        print(phrase.shape)
        print(phrase)
        print("-" * 40)
    except StopIteration:
        break

Training Data:
Landmarks:
<class 'tensorflow.python.framework.ops.EagerTensor'>
(64, 50, 1560)
[[[ 0.00000000e+00  3.90668094e-01  3.99899721e-01 ... -3.42257321e-02
   -6.10955991e-02 -8.70539919e-02]
  [ 1.00000000e+00  3.82463247e-01  3.91410619e-01 ... -4.75108661e-02
   -7.99100548e-02 -1.13738149e-01]
  [ 2.00000000e+00  3.78545374e-01  3.91116053e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  ...
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]]

 [[ 1.70000000e+01  4.66028064e-01  4.70217824e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.80000000e+01  4.72612560e-01  4.75420505e-01 ...  0.00000000e+00
    0.00000000e+00  0.00000000e+00]
  [ 1.90000000e+01  4.75529402e-01  4.810

# Creating the Model

In [119]:
class LandmarkEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, strides=2, padding="same", activation="relu"
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        return self.conv3(x)

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class Transformer(keras.Model):
    def __init__(
        self,
        num_hid,
        num_head,
        num_feed_forward,
        source_maxlen,
        num_layers_enc,
        num_classes,
    ):
        super().__init__()
        self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.acc_metric = keras.metrics.Mean(name="edit_dist")

        self.model = keras.Sequential(
            [
                layers.Input(shape=(source_maxlen, 1560)),
                LandmarkEmbedding(num_hid=num_hid, maxlen=source_maxlen)
            ] + 
            [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ] + 
            [
                layers.Flatten(),
                layers.Dense(256, activation="relu"),
                layers.Dense(num_classes, activation="softmax")
            ]
        )

    def call(self, inputs, training):
        return self.model(inputs, training)

    @property
    def metrics(self):
        return [self.loss_metric]
    
    def train_step(self, batch):
        source = batch[0] # Source sequence
        target_class_labels = batch[1] # target label

        with tf.GradientTape() as tape:
            preds = self(source, training=True)
            loss = self.loss_function(target_class_labels, preds)

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Update accuracy metric
        accuracy = tf.reduce_mean(tf.keras.metrics.sparse_categorical_accuracy(target_class_labels, preds))
        self.acc_metric.update_state(accuracy)

        # Update loss metric
        self.loss_metric.update_state(loss)

        return {"loss": self.loss_metric.result(), "accuracy": self.acc_metric.result()}

    def predict_class(self, input_sequence):
        input_sequence = tf.convert_to_tensor(input_sequence)
        predictions = self(input_sequence, training=False)
        predicted_class = tf.argmax(predictions, axis=-1).numpy()
        return predicted_class
    
    def predict_prob(self, input_sequence):
        input_sequence = tf.convert_to_tensor(input_sequence)
        predictions = self(input_sequence, training=False)
        return predictions

In [120]:
model = Transformer(
        num_hid=32,
        num_head=2,
        num_feed_forward=32,
        source_maxlen=FRAME_LEN,
        num_layers_enc=1,
        num_classes=len(signs_to_idx),
)

optimizer = keras.optimizers.Adam(0.0001)
model.compile(optimizer="adam")

In [121]:
history = model.fit(train_ds, validation_data=valid_ds, callbacks=[], epochs=1)

    246/Unknown - 26s 90ms/step - loss: 5.5356 - accuracy: 0.0058

KeyboardInterrupt: 

In [122]:
batch = next(iter(valid_ds))
pred = model.predict_class(batch[0])

In [123]:
pred

array([132,  35, 134, 134,  92, 133, 139, 133,  92,   4, 189, 133, 134,
       134, 134,   4,   4, 132, 134,  47, 134, 134, 134, 189, 133, 133,
       132, 134, 132, 132,  47, 132, 133, 133, 134, 134, 132,  92, 134,
       134,  47, 133, 134,  92, 133, 132, 133, 133, 189, 132,  92,  92,
       134, 133,   4,  92, 134, 133, 134, 134, 134, 133, 133,   4],
      dtype=int64)

In [124]:
probs = model.predict_prob(batch[0])
max(probs[0].numpy())

0.011367946

In [90]:
model.summary()

Model: "transformer_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 landmark_embedding_6 (Landm  (None, None, 32)         571744    
 arkEmbedding)                                                   
                                                                 
 sequential_13 (Sequential)  (None, None, 32)          582400    
                                                                 
 flatten_6 (Flatten)         multiple                  0         
                                                                 
 dense_26 (Dense)            multiple                  0 (unused)
                                                                 
 dense_27 (Dense)            multiple                  0 (unused)
                                                                 
Total params: 582,404
Trainable params: 582,400
Non-trainable params: 4
_______________________________________________

In [33]:
# Transformer variables are customized from original keras tutorial to suit this dataset.
# Reference: https://www.kaggle.com/code/shlomoron/aslfr-a-simple-transformer/notebook

# batch = next(iter(valid_ds))

# # The vocabulary to convert predicted indices into characters
# idx_to_char = list(char_to_num.keys())
# display_cb = DisplayOutputs(
#     batch, idx_to_char, target_start_token_idx=char_to_num['<'], target_end_token_idx=char_to_num['>']
# )  # set the arguments as per vocabulary index for '<' and '>'


# loss_fn = tf.keras.losses.CategoricalCrossentropy(
#     from_logits=True, label_smoothing=0.1,
# )




In [337]:
batches = [batch for batch in valid_ds]

preds_list = []
ground_truth_list = []

for batch in batches[:1]:
    source = batch[0]
    target = batch[1].numpy()
    bs = tf.shape(source)[0]
    preds = model.generate(source, start_token_idx)
    preds = preds.numpy()

    for i in range(bs):
        target_text = "".join([idx_to_char[_] for _ in target[i, :]])
        ground_truth_list.append(target_text.replace('P', ''))
        prediction = ""
        for idx in preds[i, :]:
            prediction += idx_to_char[idx]
            if idx == end_token_idx:
                break
        preds_list.append(prediction)

for i in range(30):
    print(ground_truth_list[i])
    print(preds_list[i])
    print('\n~~~\n')

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [8,1560], [batch]: [5,1560] [Op:IteratorGetNext]