# Testing

This notebook is meant for testing the trained encoder-decoder models.

# Dependencies

In [1]:
%%capture
!pip install pandas
!pip install pyarrow
!pip install tensorflow
!pip install protobuf==3.20.*
!pip install mediapipe==0.9.0.1

In [2]:
import os
import json
import shutil
import random
import matplotlib
import numpy as np
from itertools import chain
from collections import deque
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from collections import Counter
from matplotlib import animation, rc
from IPython.display import display, Image

import cv2
import pandas as pd
import mediapipe as mp
import tensorflow as tf
from tensorflow import keras
import pyarrow.parquet as pq
from tensorflow.keras import layers
from mediapipe.framework.formats import landmark_pb2

In [3]:
!python --version
print("TensorFlow v" + tf.__version__)
print("Mediapipe v" + mp.__version__)

Python 3.9.13
TensorFlow v2.14.0
Mediapipe v0.9.0.1


In [4]:
seed = 42
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
cv2.setRNGSeed(seed)

# Fetch from TfRecords

In [17]:
PATH_KAGGLE_DS = "kaggle_dataset"
dataset_df = pd.read_csv(os.path.join(PATH_KAGGLE_DS, "supplemental_metadata.csv"))
PATH_TFRECORD_DS = os.path.join(PATH_KAGGLE_DS, "test_tfrecords")
tf_records = dataset_df.file_id.map(lambda x: os.path.join(PATH_TFRECORD_DS, f"{x}.tfrecord")).unique()
print(f"List of {len(tf_records)} TFRecord files.")

List of 53 TFRecord files.


In [18]:
with open(os.path.join(PATH_TFRECORD_DS, "feature_columns.json"), 'r') as f:
    json_str = f.read()
FEATURE_COLUMNS = json.loads(json_str)
FEATURE_COLUMNS[:10]

['x_right_hand_0',
 'x_right_hand_1',
 'x_right_hand_2',
 'x_right_hand_3',
 'x_right_hand_4',
 'x_right_hand_5',
 'x_right_hand_6',
 'x_right_hand_7',
 'x_right_hand_8',
 'x_right_hand_9']

In [19]:
# These points represent the hands, elbows, and shoulders.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]

# Facial information isn't necessary, but the nose will serve as a midpoint for normalizing the data, as it is usually located in the middle of the frame.
FPOSE = [0] # Nose as midpoint

# Collecting the indices of certain important/distinct sets of features.
# This can be beneficial during the preprocessing step.
RHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "right" in col]
LHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "left" in col]
RPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col.split("_")[-1]) in RPOSE]
LPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col.split("_")[-1]) in LPOSE]
MID_POINT_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col.split("_")[-1]) == 0] # Nose

In [20]:
def decode_fn(record_bytes):
    schema = {COL: tf.io.VarLenFeature(dtype=tf.float32) for COL in FEATURE_COLUMNS}
    schema["phrase"] = tf.io.FixedLenFeature([], dtype=tf.string)
    features = tf.io.parse_single_example(record_bytes, schema)
    phrase = features["phrase"]
    landmarks = ([tf.sparse.to_dense(features[COL]) for COL in FEATURE_COLUMNS])
    # Transpose to maintain the original shape of landmarks data.
    landmarks = tf.transpose(landmarks)
    
    return landmarks, phrase

In [21]:
# The default mapping that came with the dataset was changed:
# padding is represented with the number 0
# start_token is 60
# end_token is 61
with open (os.path.join(PATH_KAGGLE_DS, "character_to_prediction_index.json"), "r") as f:
    char_to_num = json.load(f)
    
char_to_num = {c:char_to_num[c]+1 for c in char_to_num}

# Add pad_token, start pointer and end pointer to the dict
pad_token = 'P'
pad_token_idx = 0
char_to_num[pad_token] = pad_token_idx

start_token = '<'
start_token_idx = 60
char_to_num[start_token] = start_token_idx

end_token = '>'
end_token_idx = 61
char_to_num[end_token] = end_token_idx

num_to_char = {j:i for i,j in char_to_num.items()}

## Preprocess phrase

In [22]:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=list(char_to_num.keys()),
        values=list(char_to_num.values()),
    ),
    default_value=tf.constant(-1),
    name="tf_char_to_num"
)

# Function to decode the characters and pad the phrases
MAX_PHRASE_LEN = 31 + 2 # The start and end token take space as well
def preprocess_phrase(phrase):
    phrase = start_token + phrase + end_token
    phrase = tf.strings.bytes_split(phrase)
    phrase = table.lookup(phrase)
    
    max_len_plus = MAX_PHRASE_LEN + 1
    amount_to_pad = max_len_plus - tf.shape(phrase)[0]
    
    if amount_to_pad > 0:
        phrase = tf.pad(phrase, paddings=[[0, amount_to_pad]], mode = 'CONSTANT', constant_values = pad_token_idx)
    else:
        phrase = phrase[:max_len_plus]

    return phrase

Notice that landmarks don't need to be preprocessed, the saved model should contain it's own

In [39]:
def preprocess(landmark, phrase):
    phrase = preprocess_phrase(phrase)
    return (landmark, phrase[:-1]), phrase[1:] # Shifted phrase for encoder-decoder architecture

## Create TFDataset

In [40]:
def get_dataset(tfrecords, batch_size=1, repeat=False, shuffle=False, drop_remainder=False, cache=False):
    ds = tf.data.TFRecordDataset(tf_records)
    ds = ds.map(decode_fn, tf.data.AUTOTUNE)
    # Note: preprocessing can happen before and after the batching (if you can preprocess the whole batch at once to save computation time)
    ds = ds.map(preprocess, tf.data.AUTOTUNE)
    
    if repeat: 
        ds = ds.repeat()
    
    if shuffle:
        ds = ds.shuffle(shuffle)
        options = tf.data.Options()
        options.experimental_deterministic = (False)
        ds = ds.with_options(options)

    if batch_size >= 1:
        # There's also a padded_batch version of this function
        ds = ds.batch(batch_size, drop_remainder=drop_remainder)
        
    ds = ds.prefetch(tf.data.AUTOTUNE)

    # If the system doesn't have enough RAM caching might slow down the process
    if cache:
        ds = ds.cache()
    
    return ds

test_ds = get_dataset(tf_records, batch_size=1, cache=True)

In [41]:
lm_shape = None
phrase_shape = None

# Create an iterator for the train and valid datasets
test_iterator = iter(test_ds)

# Print data points from the training dataset
print("Training Data:\n")
(landmarks, context), phrase = next(test_iterator)

# Save shapes
lm_shape = landmarks.shape[2]
phrase_shape = phrase.shape[0]
print("Saved shapes:")
print(f"lm_shape: {lm_shape}")
print(f"phrase_shape: {phrase_shape}")
print("-" * 40)
print()

print("Encoder input - first in batch (Landmarks:)")
print(type(landmarks))
print(landmarks.shape)
print(landmarks[0])
print("-" * 40)
print()

print("Decoder input (Context):")
print(context.shape)
print(context[0])
print("-" * 40)
print()

print("Model target output (Phrase):")
print(phrase.shape)
print(phrase[0])
print("-" * 40)

Training Data:

Saved shapes:
lm_shape: 159
phrase_shape: 1
----------------------------------------

Encoder input - first in batch (Landmarks:)
<class 'tensorflow.python.framework.ops.EagerTensor'>
(1, 178, 159)
tf.Tensor(
[[ 0.12627919  0.22162548  0.3116899  ... -2.7103877  -2.565694
  -2.1914308 ]
 [        nan         nan         nan ... -2.791268   -2.6042914
  -2.3839378 ]
 [        nan         nan         nan ... -2.7606401  -2.5794535
  -2.3470478 ]
 ...
 [        nan         nan         nan ... -2.8067975  -2.6485422
  -1.3479928 ]
 [ 0.13762568  0.20138527  0.25962168 ... -2.979717   -2.6853716
  -1.9384248 ]
 [ 0.10258354  0.1709002   0.24561381 ... -3.1564484  -2.9503818
  -2.201033  ]], shape=(178, 159), dtype=float32)
----------------------------------------

Decoder input (Context):
(1, 33)
tf.Tensor(
[60 41 46 52 37 50 37 51 52 41 46 39  1 47 34 51 37 50 54 33 52 41 47 46
  1 55 33 51  1 45 33 36 37], shape=(33,), dtype=int32)
----------------------------------------


# Load Model

In [46]:
loaded_model = tf.saved_model.load("GRU_local_test")

In [47]:
loaded_model.predict(np.zeros((FRAME_LEN, len(FEATURE_COLUMNS))), "abc")

{'result': <tf.Tensor: shape=(), dtype=string, numpy=b'h'>,
 'confidence': <tf.Tensor: shape=(), dtype=float32, numpy=0.55950594>}

In [48]:
loaded_model.info()

<tf.Tensor: shape=(159,), dtype=string, numpy=
array([b'x_right_hand_0', b'x_right_hand_1', b'x_right_hand_2',
       b'x_right_hand_3', b'x_right_hand_4', b'x_right_hand_5',
       b'x_right_hand_6', b'x_right_hand_7', b'x_right_hand_8',
       b'x_right_hand_9', b'x_right_hand_10', b'x_right_hand_11',
       b'x_right_hand_12', b'x_right_hand_13', b'x_right_hand_14',
       b'x_right_hand_15', b'x_right_hand_16', b'x_right_hand_17',
       b'x_right_hand_18', b'x_right_hand_19', b'x_right_hand_20',
       b'x_left_hand_0', b'x_left_hand_1', b'x_left_hand_2',
       b'x_left_hand_3', b'x_left_hand_4', b'x_left_hand_5',
       b'x_left_hand_6', b'x_left_hand_7', b'x_left_hand_8',
       b'x_left_hand_9', b'x_left_hand_10', b'x_left_hand_11',
       b'x_left_hand_12', b'x_left_hand_13', b'x_left_hand_14',
       b'x_left_hand_15', b'x_left_hand_16', b'x_left_hand_17',
       b'x_left_hand_18', b'x_left_hand_19', b'x_left_hand_20',
       b'x_pose_13', b'x_pose_15', b'x_pose_17', b'x_pos