# Load dataset

In [1]:
%%capture
!pip install mediapipe==0.9.0.1
!pip install protobuf==3.20.*
!pip install scikit-image

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf
import json
import mediapipe as mp
import matplotlib
import matplotlib.pyplot as plt
import random

from skimage.transform import resize
from mediapipe.framework.formats import landmark_pb2
from tensorflow import keras
from tensorflow.keras import layers
from tqdm.notebook import tqdm
from matplotlib import animation, rc

In [3]:
random.seed(42)

In [4]:
print("TensorFlow v" + tf.__version__)
print("Mediapipe v" + mp.__version__)

TensorFlow v2.10.1
Mediapipe v0.9.0.1


In [5]:
dataset_df = pd.read_csv('kaggle_dataset/train.csv')
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (67208, 5)


In [6]:
dataset_df.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road


In [7]:
def random_sequence():
    # Fetch sequence_id, file_id, phrase from first row
    sequence_id, file_id, phrase, path = dataset_df.sample(n=1).iloc[0][['sequence_id', 'file_id', 'phrase', 'path']]
    print(f"sequence_id: {sequence_id}, file_id: {file_id}, phrase: {phrase}")
    
    # Fetch data from parquet file
    sample_sequence_df = pq.read_table(f"kaggle_dataset/{path}",
        filters=[[('sequence_id', '=', sequence_id)],]).to_pandas()
    print("Full sequence dataset shape is {}".format(sample_sequence_df.shape))
    
    return sample_sequence_df

In [8]:
random_sequence()

sequence_id: 724075833, file_id: 1997878546, phrase: 497047 h curdln
Full sequence dataset shape is (168, 1630)


Unnamed: 0_level_0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
724075833,0,0.786434,0.776844,0.780788,0.766125,0.776602,0.778432,0.785005,0.677212,0.787416,...,-0.104818,-0.113127,-0.029980,-0.084545,-0.092706,-0.081955,-0.050120,-0.087263,-0.086312,-0.073613
724075833,1,0.787180,0.780678,0.784941,0.770578,0.780537,0.782694,0.790051,0.682130,0.792696,...,-0.099645,-0.106326,-0.061372,-0.097754,-0.096225,-0.087035,-0.085064,-0.109499,-0.100732,-0.088831
724075833,2,0.793597,0.782876,0.788007,0.773621,0.782761,0.785362,0.794120,0.685221,0.797460,...,-0.094775,-0.104225,-0.057479,-0.087467,-0.091252,-0.088096,-0.079550,-0.100361,-0.097426,-0.091146
724075833,3,0.802086,0.788098,0.793293,0.778835,0.787990,0.790697,0.799857,0.691625,0.803360,...,-0.076394,-0.085473,-0.041041,-0.065504,-0.070884,-0.069076,-0.062428,-0.077883,-0.076435,-0.071316
724075833,4,0.801681,0.788089,0.794191,0.780036,0.787941,0.791108,0.801792,0.696289,0.805896,...,-0.052315,-0.058263,-0.024027,-0.045400,-0.050463,-0.047200,-0.042314,-0.056826,-0.054951,-0.047908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724075833,163,0.791813,0.773413,0.779153,0.764597,0.773078,0.775914,0.785739,0.680122,0.789645,...,-0.069648,-0.062977,-0.019566,-0.070515,-0.072828,-0.054326,-0.036306,-0.061253,-0.050579,-0.032215
724075833,164,0.784764,0.769529,0.774728,0.760267,0.769236,0.771802,0.780711,0.673648,0.784197,...,-0.075887,-0.067889,-0.022807,-0.074588,-0.075767,-0.056188,-0.037176,-0.063427,-0.053399,-0.035096
724075833,165,0.781417,0.766167,0.771535,0.757285,0.765947,0.768689,0.778089,0.671787,0.781850,...,-0.075671,-0.068756,-0.023036,-0.072662,-0.073342,-0.054439,-0.037177,-0.061334,-0.050071,-0.031552
724075833,166,0.775487,0.761833,0.766774,0.752508,0.761624,0.764130,0.772850,0.667942,0.776583,...,,,,,,,,,,


In [9]:
# import json

# name_to_idx = {}
# for i, name in enumerate(sample_sequence_df.columns):
#     name_to_idx[name] = i
    
# json.dumps()

## Animation preview

In [10]:
matplotlib.rcParams['animation.embed_limit'] = 2**128
matplotlib.rcParams['savefig.pad_inches'] = 0
rc('animation', html='jshtml')

def create_animation(images):
    fig = plt.figure(figsize=(6, 9))
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    im=ax.imshow(images[0], cmap="gray")
    plt.close(fig)
    
    def animate_func(i):
        im.set_array(images[i])
        return [im]

    return animation.FuncAnimation(fig, animate_func, frames=len(images), interval=1000/10)

mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils 
mp_drawing_styles = mp.solutions.drawing_styles

def collect_landmarks_oftype(_type: str, frame_data: pd.Series):
    x_type = frame_data.filter(regex=f"x_{_type}.*").values
    y_type = frame_data.filter(regex=f"y_{_type}.*").values
    z_type = frame_data.filter(regex=f"z_{_type}.*").values
    landmarks = landmark_pb2.NormalizedLandmarkList()
    for x, y, z in zip(x_type, y_type, z_type):
        landmarks.landmark.add(x=x, y=y, z=z)
    return landmarks
        
def produce_video_from_seq(seq_df: pd.DataFrame):
    frames = []
    for seq_idx in range(len(seq_df)):
        frame = np.zeros((600, 600, 3))
        frame_data = seq_df.iloc[seq_idx]
        
        # Right hand
        right_hand_landmarks = collect_landmarks_oftype("right_hand", frame_data)
        mp_drawing.draw_landmarks(
                frame,
                right_hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
        
        # Left Hand
        left_hand_landmarks = collect_landmarks_oftype("left_hand", frame_data)
        mp_drawing.draw_landmarks(
                frame,
                left_hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
        
        # Face
        face_landmarks = collect_landmarks_oftype("face", frame_data)
        mp_drawing.draw_landmarks(
            frame,
            face_landmarks,
            mp_holistic.FACEMESH_CONTOURS,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles
            .get_default_face_mesh_contours_style())
        
        # Pose
        pose_landmarks = collect_landmarks_oftype("pose", frame_data)
        mp_drawing.draw_landmarks(
            frame,
            pose_landmarks,
            mp_holistic.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.
            get_default_pose_landmarks_style())
        
        frames.append(frame.astype(np.uint8))
    return frames

In [11]:
video = produce_video_from_seq(random_sequence())
create_animation(video)

sequence_id: 2145563587, file_id: 450474571, phrase: 7216 springhill furnace
Full sequence dataset shape is (336, 1630)


# Save to TFRecords

In [12]:
# Pose coordinates for hand movement.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = LPOSE + RPOSE

X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE]
Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE]
Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE]

FEATURE_COLUMNS = X + Y + Z

X_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "x_" in col]
Y_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "y_" in col]
Z_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "z_" in col]

RHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "right" in col]
LHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "left" in col]
RPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col[-2:]) in RPOSE]
LPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col[-2:]) in LPOSE]

In [13]:
def save_tfrecords_to(folder: str):
    # Loop through each file_id
    for file_path in tqdm(dataset_df.path.unique()):
        # Parquet file name
        pq_file = f"kaggle_dataset/{file_path}"
        
        parts = file_path.split('/')
        last_part = parts[-1]
        number_string = last_part.split('.')[0]
        file_id = int(number_string)
        
        # Filter train.csv and fetch entries only for the relevant file_id
        file_df = dataset_df.loc[dataset_df["file_id"] == file_id]
        # Fetch the parquet file
        parquet_df = pq.read_table(pq_file,columns=['sequence_id'] + FEATURE_COLUMNS).to_pandas()

        # File name for the updated data
        tf_file = f"{folder}/{file_id}.tfrecord"
        parquet_numpy = parquet_df.to_numpy()
        # Initialize the pointer to write the output of each `for loop` below as a sequence into the file.
        with tf.io.TFRecordWriter(tf_file) as file_writer:
            # Loop through each sequence in file.
            for seq_id, phrase in zip(file_df.sequence_id, file_df.phrase):
                # Fetch sequence data
                frames = parquet_numpy[parquet_df.index == seq_id]

                # Calculate the number of rows that only have NaN values -> on how many frames each hand is fully visible
                r_nonan = np.sum(np.sum(np.isnan(frames[:, RHAND_IDX]), axis = 1) == 0)
                l_nonan = np.sum(np.sum(np.isnan(frames[:, LHAND_IDX]), axis = 1) == 0)
                # Number of frames where the dominant hand is visible (dominant = visible on more frames than the other hand)
                num_of_visible = max(r_nonan, l_nonan)

                # TODO experiment with this
                if 2*len(phrase) < num_of_visible:
                    features = {FEATURE_COLUMNS[i]: tf.train.Feature(float_list=tf.train.FloatList(value=frames[:, i])) for i in range(len(FEATURE_COLUMNS))}
                    features["phrase"] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(phrase, 'utf-8')]))
                    record_bytes = tf.train.Example(features=tf.train.Features(feature=features)).SerializeToString()
                    file_writer.write(record_bytes)

In [14]:
folder_name = "preprocessed"
if not os.path.isdir(folder_name):
    os.mkdir(folder_name)
    save_tfrecords_to(folder_name)
else:
    print(f"Warning: Tfrecords already exist! Delete the {folder_name} folder to regenerate them!")



# Fetch from TFRecords

In [15]:
tf_records = dataset_df.file_id.map(lambda x: f'{folder_name}/{x}.tfrecord').unique()
print(f"List of {len(tf_records)} TFRecord files.")

List of 68 TFRecord files.


In [16]:
with open ("kaggle_dataset/character_to_prediction_index.json", "r") as f:
    char_to_num = json.load(f)

# Add pad_token, start pointer and end pointer to the dict
pad_token = 'P'
start_token = '<'
end_token = '>'
pad_token_idx = 59
start_token_idx = 60
end_token_idx = 61

char_to_num[pad_token] = pad_token_idx
char_to_num[start_token] = start_token_idx
char_to_num[end_token] = end_token_idx
num_to_char = {j:i for i,j in char_to_num.items()}

In [17]:
FRAME_LEN = 128

# Function to resize and add padding.
def resize_pad(x):
    if tf.shape(x)[0] < FRAME_LEN:
        x = tf.pad(x, ([[0, FRAME_LEN-tf.shape(x)[0]], [0, 0], [0, 0]]))
    else:
        x = tf.image.resize(x, (FRAME_LEN, tf.shape(x)[1]))
    return x

# Detect the dominant hand from the number of NaN values.
# Dominant hand will have less NaN values since it is in frame moving.
def pre_process(x):
    rhand = tf.gather(x, RHAND_IDX, axis=1)
    lhand = tf.gather(x, LHAND_IDX, axis=1)
    rpose = tf.gather(x, RPOSE_IDX, axis=1)
    lpose = tf.gather(x, LPOSE_IDX, axis=1)
    
    rnan_idx = tf.reduce_any(tf.math.is_nan(rhand), axis=1)
    lnan_idx = tf.reduce_any(tf.math.is_nan(lhand), axis=1)
    
    rnans = tf.math.count_nonzero(rnan_idx)
    lnans = tf.math.count_nonzero(lnan_idx)
    
    # For dominant hand
    if rnans > lnans:
        hand = lhand
        pose = lpose
        
        hand_x = hand[:, 0*(len(LHAND_IDX)//3) : 1*(len(LHAND_IDX)//3)]
        hand_y = hand[:, 1*(len(LHAND_IDX)//3) : 2*(len(LHAND_IDX)//3)]
        hand_z = hand[:, 2*(len(LHAND_IDX)//3) : 3*(len(LHAND_IDX)//3)]
        hand = tf.concat([1-hand_x, hand_y, hand_z], axis=1)
        
        pose_x = pose[:, 0*(len(LPOSE_IDX)//3) : 1*(len(LPOSE_IDX)//3)]
        pose_y = pose[:, 1*(len(LPOSE_IDX)//3) : 2*(len(LPOSE_IDX)//3)]
        pose_z = pose[:, 2*(len(LPOSE_IDX)//3) : 3*(len(LPOSE_IDX)//3)]
        pose = tf.concat([1-pose_x, pose_y, pose_z], axis=1)
    else:
        hand = rhand
        pose = rpose
    
    hand_x = hand[:, 0*(len(LHAND_IDX)//3) : 1*(len(LHAND_IDX)//3)]
    hand_y = hand[:, 1*(len(LHAND_IDX)//3) : 2*(len(LHAND_IDX)//3)]
    hand_z = hand[:, 2*(len(LHAND_IDX)//3) : 3*(len(LHAND_IDX)//3)]
    hand = tf.concat([hand_x[..., tf.newaxis], hand_y[..., tf.newaxis], hand_z[..., tf.newaxis]], axis=-1)
    
    mean = tf.math.reduce_mean(hand, axis=1)[:, tf.newaxis, :]
    std = tf.math.reduce_std(hand, axis=1)[:, tf.newaxis, :]
    hand = (hand - mean) / std

    pose_x = pose[:, 0*(len(LPOSE_IDX)//3) : 1*(len(LPOSE_IDX)//3)]
    pose_y = pose[:, 1*(len(LPOSE_IDX)//3) : 2*(len(LPOSE_IDX)//3)]
    pose_z = pose[:, 2*(len(LPOSE_IDX)//3) : 3*(len(LPOSE_IDX)//3)]
    pose = tf.concat([pose_x[..., tf.newaxis], pose_y[..., tf.newaxis], pose_z[..., tf.newaxis]], axis=-1)
    
    x = tf.concat([hand, pose], axis=1)
    x = resize_pad(x)
    
    x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
    x = tf.reshape(x, (FRAME_LEN, len(LHAND_IDX) + len(LPOSE_IDX)))
    return x

In [18]:
def decode_fn(record_bytes):
    schema = {COL: tf.io.VarLenFeature(dtype=tf.float32) for COL in FEATURE_COLUMNS}
    schema["phrase"] = tf.io.FixedLenFeature([], dtype=tf.string)
    features = tf.io.parse_single_example(record_bytes, schema)
    phrase = features["phrase"]
    landmarks = ([tf.sparse.to_dense(features[COL]) for COL in FEATURE_COLUMNS])
    # Transpose to maintain the original shape of landmarks data.
    landmarks = tf.transpose(landmarks)
    
    return landmarks, phrase

In [19]:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=list(char_to_num.keys()),
        values=list(char_to_num.values()),
    ),
    default_value=tf.constant(-1),
    name="class_weight"
)

def convert_fn(landmarks, phrase):
    # Add start and end pointers to phrase.
    phrase = start_token + phrase + end_token
    phrase = tf.strings.bytes_split(phrase)
    phrase = table.lookup(phrase)
    # Vectorize and add padding.
    phrase = tf.pad(phrase, paddings=[[0, 64 - tf.shape(phrase)[0]]], mode = 'CONSTANT', constant_values = pad_token_idx)
    # Apply pre_process function to the landmarks.
    return pre_process(landmarks), phrase

In [20]:
batch_size = 32
train_len = int(0.8 * len(tf_records))

train_ds = tf.data.TFRecordDataset(tf_records[:train_len]).map(decode_fn).map(convert_fn).batch(batch_size).shuffle(buffer_size=4*batch_size).prefetch(buffer_size=tf.data.AUTOTUNE).cache()
valid_ds = tf.data.TFRecordDataset(tf_records[train_len:]).map(decode_fn).map(convert_fn).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE).cache()

In [52]:
lm_shape = None
phrase_shape = None

print("Training Data:")
landmarks, phrase = next(iter(train_ds))

print("Landmarks:")
print(landmarks.shape)
print(landmarks)
lm_shape = (None, landmarks.shape[1], landmarks.shape[2])

print("Phrase:")
print(phrase.shape)
print(phrase)
print("-" * 40)
phrase_shape = (None, phrase.shape[1]-1)

Training Data:
Landmarks:
(32, 128, 78)
tf.Tensor(
[[[-1.4810245   2.0565      1.5850285  ...  0.55934006  0.73609525
   -3.4163632 ]
  [-1.3199145   1.6872301   1.8427417  ...  0.5486241   0.7277294
   -3.4686215 ]
  [-1.1565096   1.7119513   2.4802382  ...  0.50233     0.7655075
   -3.231312  ]
  ...
  [ 0.36633322  2.2592149   1.6072173  ...  0.17582156  0.72810966
   -2.698698  ]
  [ 0.53779     2.2021127   1.6466563  ...  0.16285028  0.73231
   -2.866583  ]
  [ 0.71902776  2.318088    1.8093301  ...  0.18219906  0.75319046
   -2.8884072 ]]

 [[-0.5239163   1.992671    2.496867   ...  0.36825734  0.75486326
   -3.2191818 ]
  [-0.43437085  2.094459    2.424247   ...  0.36232826  0.7228026
   -2.9221902 ]
  [-0.78552026  2.3062243   1.0469462  ...  0.37401614  0.7569099
   -2.912945  ]
  ...
  [ 0.          0.          0.         ...  0.42549008  0.74501777
   -3.1533344 ]
  [ 0.          0.          0.         ...  0.43058455  0.7065478
   -3.1033032 ]
  [ 0.          0.          0.

In [53]:
print(lm_shape)
print(phrase_shape)

(None, 128, 78)
(None, 63)


# Creating the Model

In [62]:
# class TokenEmbedding(layers.Layer):
#     def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
#         super().__init__()
#         self.num_hid = num_hid
#         self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
#         self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)
#         '''
#         self.pos_emb = tf.math.divide(
#             self.positional_encoding(maxlen-1, num_hid),
#             tf.math.sqrt(tf.cast(num_hid, tf.float32)))
#         '''
#         self.pos_emb = self.positional_encoding(maxlen-1, num_hid)

#     def call(self, x):
#         maxlen = tf.shape(x)[-1]
#         x = self.emb(x)
#         x = tf.math.multiply(x, tf.math.sqrt(tf.cast(self.num_hid, tf.float32)))
#         '''
#         positions = tf.range(start=0, limit=maxlen, delta=1)
#         positions = self.pos_emb(positions)
#         return x + positions
#         '''
#         return x + self.pos_emb[:maxlen, :]
    
#     def positional_encoding(self, maxlen, num_hid):
#         depth = num_hid/2
#         positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
#         depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
#         angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
#         angle_rads = tf.linalg.matmul(positions, angle_rates)
#         pos_encoding = tf.concat(
#           [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
#           axis=-1) 
#         return pos_encoding

# class LandmarkEmbedding(layers.Layer):
#     def __init__(self, num_hid=64, maxlen=100):
#         super().__init__()
#         self.conv1 = tf.keras.layers.Conv1D(
#             num_hid, 11, padding="same", activation="relu"
#         )
#         self.conv2 = tf.keras.layers.Conv1D(
#             num_hid, 11, padding="same", activation="relu"
#         )
#         self.conv3 = tf.keras.layers.Conv1D(
#             num_hid, 11, padding="same", activation="relu"
#         )
#         self.pos_emb = self.positional_encoding(maxlen, num_hid)
#         self.maxlen = maxlen
#         self.num_hid = num_hid

#     def call(self, x):
#         x = self.conv1(x)
#         x = self.conv2(x)
#         x = self.conv3(x)
        
#         x = tf.math.multiply(x, tf.math.sqrt(tf.cast(self.num_hid, tf.float32)))
#         x = x + self.pos_emb
        
#         return x
    
#     def positional_encoding(self, maxlen, num_hid):
#         depth = num_hid/2
#         positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
#         depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
#         angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
#         angle_rads = tf.linalg.matmul(positions, angle_rates)
#         pos_encoding = tf.concat(
#           [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
#           axis=-1) 
#         return pos_encoding

# class TransformerEncoder(layers.Layer):
#     def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
#         super().__init__()
#         self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.ffn = keras.Sequential(
#             [
#                 layers.Dense(feed_forward_dim, activation="relu"),
#                 layers.Dense(embed_dim),
#             ]
#         )
#         self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
#         self.dropout1 = layers.Dropout(rate)
#         self.dropout2 = layers.Dropout(rate)

#     def call(self, inputs, training):
#         attn_output = self.att(inputs, inputs)
#         attn_output = self.dropout1(attn_output, training=training)
#         out1 = self.layernorm1(inputs + attn_output)
#         ffn_output = self.ffn(out1)
#         ffn_output = self.dropout2(ffn_output, training=training)
#         return self.layernorm2(out1 + ffn_output)

# class TransformerDecoder(layers.Layer):
#     def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
#         super().__init__()
#         self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
#         self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
#         self.self_att = layers.MultiHeadAttention(
#             num_heads=num_heads, key_dim=embed_dim
#         )
#         self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.self_dropout = layers.Dropout(0.5)
#         self.enc_dropout = layers.Dropout(0.1)
#         self.ffn_dropout = layers.Dropout(0.1)
#         self.ffn = keras.Sequential(
#             [
#                 layers.Dense(feed_forward_dim, activation="relu"),
#                 layers.Dense(embed_dim),
#             ]
#         )

#     def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
#         """Masks the upper half of the dot product matrix in self attention.

#         This prevents flow of information from future tokens to current token.
#         1's in the lower triangle, counting from the lower right corner.
#         """
#         i = tf.range(n_dest)[:, None]
#         j = tf.range(n_src)
#         m = i >= j - n_src + n_dest
#         mask = tf.cast(m, dtype)
#         mask = tf.reshape(mask, [1, n_dest, n_src])
#         mult = tf.concat(
#             [batch_size[..., tf.newaxis], tf.constant([1, 1], dtype=tf.int32)], 0
#         )
#         return tf.tile(mask, mult)

#     def call(self, enc_out, target, training):
#         input_shape = tf.shape(target)
#         batch_size = input_shape[0]
#         seq_len = input_shape[1]
#         causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
#         target_att = self.self_att(target, target, attention_mask=causal_mask)
#         target_norm = self.layernorm1(target + self.self_dropout(target_att, training = training))
#         enc_out = self.enc_att(target_norm, enc_out)
#         enc_out_norm = self.layernorm2(self.enc_dropout(enc_out, training = training) + target_norm)
#         ffn_out = self.ffn(enc_out_norm)
#         ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out, training = training))
#         return ffn_out_norm

# class Transformer(keras.Model):
#     def __init__(
#         self,
#         num_hid=64,
#         num_head=4,
#         num_feed_forward=128,
#         source_maxlen=100,
#         target_maxlen=100,
#         num_layers_enc=2,
#         num_layers_dec=1,
#         num_classes=60,
#     ):
#         super().__init__()
#         self.loss_metric = keras.metrics.Mean(name="loss")
#         self.acc_metric = keras.metrics.Mean(name="edit_dist")
#         self.num_layers_enc = num_layers_enc
#         self.num_layers_dec = num_layers_dec
#         self.target_maxlen = target_maxlen
#         self.num_classes = num_classes

#         self.enc_input = LandmarkEmbedding(num_hid=num_hid, maxlen=source_maxlen)
#         self.dec_input = TokenEmbedding(
#             num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
#         )

#         self.encoder = keras.Sequential(
#             [self.enc_input]
#             + [
#                 TransformerEncoder(num_hid, num_head, num_feed_forward)
#                 for _ in range(num_layers_enc)
#             ]
#         )

#         for i in range(num_layers_dec):
#             setattr(
#                 self,
#                 f"dec_layer_{i}",
#                 TransformerDecoder(num_hid, num_head, num_feed_forward),
#             )

#         self.classifier = layers.Dense(num_classes)

#     def decode(self, enc_out, target, training):
#         y = self.dec_input(target)
#         for i in range(self.num_layers_dec):
#             y = getattr(self, f"dec_layer_{i}")(enc_out, y, training)
#         return y

#     def call(self, inputs, training):
#         source = inputs[0]
#         target = inputs[1]
#         x = self.encoder(source, training)
#         y = self.decode(x, target, training)
#         return self.classifier(y)

#     @property
#     def metrics(self):
#         return [self.loss_metric]

#     def train_step(self, batch):
#         """Processes one batch inside model.fit()."""
#         source = batch[0]
#         target = batch[1]

#         input_shape = tf.shape(target)
#         batch_size = input_shape[0]
        
#         dec_input = target[:, :-1]
#         dec_target = target[:, 1:]
#         with tf.GradientTape() as tape:
#             preds = self([source, dec_input])
#             one_hot = tf.one_hot(dec_target, depth=self.num_classes)
#             mask = tf.math.logical_not(tf.math.equal(dec_target, pad_token_idx))
#             loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
#         trainable_vars = self.trainable_variables
#         gradients = tape.gradient(loss, trainable_vars)
#         self.optimizer.apply_gradients(zip(gradients, trainable_vars))
#         # Computes the Levenshtein distance between sequences since the evaluation
#         # metric for this contest is the normalized total levenshtein distance.
#         edit_dist = tf.edit_distance(tf.sparse.from_dense(target), 
#                                      tf.sparse.from_dense(tf.cast(tf.argmax(preds, axis=1), tf.int32)))
#         edit_dist = tf.reduce_mean(edit_dist)
#         self.acc_metric.update_state(edit_dist)
#         self.loss_metric.update_state(loss)
#         return {"loss": self.loss_metric.result(), "edit_dist": self.acc_metric.result()}

#     def test_step(self, batch):
#         source = batch[0]
#         target = batch[1]

#         input_shape = tf.shape(target)
#         batch_size = input_shape[0]
        
#         dec_input = target[:, :-1]
#         dec_target = target[:, 1:]
#         preds = self([source, dec_input])
#         one_hot = tf.one_hot(dec_target, depth=self.num_classes)
#         mask = tf.math.logical_not(tf.math.equal(dec_target, pad_token_idx))
#         loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
#         # Computes the Levenshtein distance between sequences since the evaluation
#         # metric for this contest is the normalized total levenshtein distance.
#         edit_dist = tf.edit_distance(tf.sparse.from_dense(target), 
#                                      tf.sparse.from_dense(tf.cast(tf.argmax(preds, axis=1), tf.int32)))
#         edit_dist = tf.reduce_mean(edit_dist)
#         self.acc_metric.update_state(edit_dist)
#         self.loss_metric.update_state(loss)
#         return {"loss": self.loss_metric.result(), "edit_dist": self.acc_metric.result()}

#     def generate(self, source, target_start_token_idx):
#         """Performs inference over one batch of inputs using greedy decoding."""
#         bs = tf.shape(source)[0]
#         enc = self.encoder(source, training = False)
#         dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
#         dec_logits = []
#         for i in range(self.target_maxlen - 1):
#             dec_out = self.decode(enc, dec_input, training = False)
#             logits = self.classifier(dec_out)
#             logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
#             last_logit = logits[:, -1][..., tf.newaxis]
#             dec_logits.append(last_logit)
#             dec_input = tf.concat([dec_input, last_logit], axis=-1)
#         return dec_input
    
#     def generate_precisely(self, source, target_start_token_idx, how_many):
#         bs = tf.shape(source)[0]
#         enc = self.encoder(source, training = False)
#         dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
#         dec_logits = []
#         for i in range(how_many):
#             dec_out = self.decode(enc, dec_input, training = False)
#             logits = self.classifier(dec_out)
#             logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
#             last_logit = logits[:, -1][..., tf.newaxis]
#             dec_logits.append(last_logit)
#             dec_input = tf.concat([dec_input, last_logit], axis=-1)
#         return dec_input
    
#     def generate_one(self, source, target_start_token_idx):
#         bs = tf.shape(source)[0]
#         enc = self.encoder(source, training = False)
#         dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
#         dec_logits = []
#         dec_out = self.decode(enc, dec_input, training = False)
#         logits = self.classifier(dec_out)
#         results = tf.argmax(logits, axis=-1, output_type=tf.int32)
#         return results[0][0].numpy(), max(tf.nn.softmax(logits)[0][0]).numpy()

In [39]:
# class DisplayOutputs(keras.callbacks.Callback):
#     def __init__(
#         self, batch, idx_to_token, target_start_token_idx=60, target_end_token_idx=61
#     ):
#         """Displays a batch of outputs after every 4 epoch

#         Args:
#             batch: A test batch
#             idx_to_token: A List containing the vocabulary tokens corresponding to their indices
#             target_start_token_idx: A start token index in the target vocabulary
#             target_end_token_idx: An end token index in the target vocabulary
#         """
#         self.batch = batch
#         self.target_start_token_idx = target_start_token_idx
#         self.target_end_token_idx = target_end_token_idx
#         self.idx_to_char = idx_to_token

#     def on_epoch_end(self, epoch, logs=None):
#         if epoch % 4 != 0:
#             return
#         source = self.batch[0]
#         target = self.batch[1].numpy()
#         bs = tf.shape(source)[0]
#         preds = self.model.generate(source, self.target_start_token_idx)
#         preds = preds.numpy()
#         for i in range(bs):
#             target_text = "".join([self.idx_to_char[_] for _ in target[i, :]])
#             prediction = ""
#             for idx in preds[i, :]:
#                 prediction += self.idx_to_char[idx]
#                 if idx == self.target_end_token_idx:
#                     break
#             print(f"target:     {target_text.replace('-','')}")
#             print(f"prediction: {prediction}\n")

# batch = next(iter(valid_ds))

# idx_to_char = list(char_to_num.keys())
# display_cb = DisplayOutputs(
#     batch, idx_to_char, target_start_token_idx=char_to_num['<'], target_end_token_idx=char_to_num['>']
# )

In [63]:
# model = Transformer(
#     num_hid=200,
#     num_head=4,
#     num_feed_forward=256, # was 400 originally
#     source_maxlen = FRAME_LEN,
#     target_maxlen=64,
#     num_layers_enc=2,
#     num_layers_dec=1,
#     num_classes=len(char_to_num),
# )

# loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1,)
# optimizer = keras.optimizers.Adam(0.0001)
# model.compile(optimizer=optimizer, loss=loss_fn)

# model.build([lm_shape, phrase_shape])

# model.summary()

In [70]:
# https://medium.com/@max_garber/simple-keras-transformer-model-74724a83bb83

# class EncoderLayer(tf.keras.layers.Layer):
#     def __init__(self,  d_model = 512, num_heads = 8, dff = 2048, dropout = 0.0):
#         super(EncoderLayer, self).__init__()
    
#         self.multi_head_attention =  MultiHeadAttention(d_model, num_heads)
#         self.dropout_attention = tf.keras.layers.Dropout(dropout)
#         self.add_attention = tf.keras.layers.Add()
#         self.layer_norm_attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
#         self.dense1 = tf.keras.layers.Dense(dff, activation='relu')
#         self.dense2 = tf.keras.layers.Dense(d_model)
#         self.dropout_dense = tf.keras.layers.Dropout(dropout)
#         self.add_dense = tf.keras.layers.Add()
#         self.layer_norm_dense = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
#     def call(self, inputs, mask=None, training=None):
#         # print(mask)
#         attention = self.multi_head_attention([inputs,inputs,inputs], mask = [mask,mask])
#         attention = self.dropout_attention(attention, training = training)
#         x = self.add_attention([inputs , attention])
#         x = self.layer_norm_attention(x)
#         # x = inputs
    
#         ## Feed Forward
#         dense = self.dense1(x)
#         dense = self.dense2(dense)
#         dense = self.dropout_dense(dense, training = training)
#         x = self.add_dense([x , dense])
#         x = self.layer_norm_dense(x)
    
#         return x

# class Encoder(tf.keras.layers.Layer):
#     def __init__(self, input_vocab_size, num_layers = 4, d_model = 512, num_heads = 8, dff = 2048, maximum_position_encoding = 10000, dropout = 0.0):
#         super(Encoder, self).__init__()

#         self.d_model = d_model

#         self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model, mask_zero=True)
#         self.pos = positional_encoding(maximum_position_encoding, d_model)

#         self.encoder_layers = [ EncoderLayer(d_model = d_model, num_heads = num_heads, dff = dff, dropout = dropout) for _ in range(num_layers)]

#         self.dropout = tf.keras.layers.Dropout(dropout)

#     def call(self, inputs, mask=None, training=None):
#         x = self.embedding(inputs)
#         # positional encoding
#         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) 
#         x += self.pos[: , :tf.shape(x)[1], :]

#         x = self.dropout(x, training=training)

#         #Encoder layer
#         embedding_mask = self.embedding.compute_mask(inputs)
#         for encoder_layer in self.encoder_layers:
#             x = encoder_layer(x, mask = embedding_mask)

#         return x

#     def compute_mask(self, inputs, mask=None):
#         return self.embedding.compute_mask(inputs)
    
# class DecoderLayer(tf.keras.layers.Layer):
#     def __init__(self,  d_model = 512, num_heads = 8, dff = 2048, dropout = 0.0):
#         super(DecoderLayer, self).__init__()
    
#         self.multi_head_attention1 =  MultiHeadAttention(d_model, num_heads, causal = True)
#         self.dropout_attention1 = tf.keras.layers.Dropout(dropout)
#         self.add_attention1 = tf.keras.layers.Add()
#         self.layer_norm_attention1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
#         self.multi_head_attention2 =  MultiHeadAttention(d_model, num_heads)
#         self.dropout_attention2 = tf.keras.layers.Dropout(dropout)
#         self.add_attention2 = tf.keras.layers.Add()
#         self.layer_norm_attention2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    
#         self.dense1 = tf.keras.layers.Dense(dff, activation='relu')
#         self.dense2 = tf.keras.layers.Dense(d_model)
#         self.dropout_dense = tf.keras.layers.Dropout(dropout)
#         self.add_dense = tf.keras.layers.Add()
#         self.layer_norm_dense = tf.keras.layers.LayerNormalization(epsilon=1e-6)

#     def call(self, inputs, mask=None, training=None):
#         # print(mask)
#         attention = self.multi_head_attention1([inputs[0],inputs[0],inputs[0]], mask = [mask[0],mask[0]])
#         attention = self.dropout_attention1(attention, training = training)
#         x = self.add_attention1([inputs[0] , attention])
#         x = self.layer_norm_attention1(x)
        
#         attention = self.multi_head_attention2([x, inputs[1],inputs[1]], mask = [mask[0],mask[1]])
#         attention = self.dropout_attention2(attention, training = training)
#         x = self.add_attention1([x , attention])
#         x = self.layer_norm_attention1(x)
    
    
#         ## Feed Forward
#         dense = self.dense1(x)
#         dense = self.dense2(dense)
#         dense = self.dropout_dense(dense, training = training)
#         x = self.add_dense([x , dense])
#         x = self.layer_norm_dense(x)
    
#         return x
    
# class Decoder(tf.keras.layers.Layer):
#     def __init__(self, target_vocab_size, num_layers = 4, d_model = 512, num_heads = 8, dff = 2048, maximum_position_encoding = 10000, dropout = 0.0):
#         super(Decoder, self).__init__()
#         self.d_model = d_model
#         self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model, mask_zero=True)
#         self.pos = positional_encoding(maximum_position_encoding, d_model)
#         self.decoder_layers = [ DecoderLayer(d_model = d_model, num_heads = num_heads, dff = dff, dropout = dropout)  for _ in range(num_layers)]
#         self.dropout = tf.keras.layers.Dropout(dropout)

#     def call(self, inputs, mask=None, training=None):
#         x = self.embedding(inputs[0])
#         # positional encoding
#         x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
#         x += self.pos[: , :tf.shape(x)[1], :]
#         x = self.dropout(x, training=training)
#         #Decoder layer
#         embedding_mask = self.embedding.compute_mask(inputs[0])
#         for decoder_layer in self.decoder_layers:
#             x = decoder_layer([x,inputs[1]], mask = [embedding_mask, mask])
#         return x

#     # Comment this out if you want to use the masked_loss()
#     def compute_mask(self, inputs, mask=None):
#         return self.embedding.compute_mask(inputs[0])
    
# class MultiHeadAttention(tf.keras.layers.Layer):
#     def __init__(self, d_model = 512, num_heads = 8, causal=False, dropout=0.0):
#         super(MultiHeadAttention, self).__init__()
    
#         assert d_model % num_heads == 0
#         depth = d_model // num_heads
    
#         self.w_query = tf.keras.layers.Dense(d_model)
#         self.split_reshape_query = tf.keras.layers.Reshape((-1,num_heads,depth))  
#         self.split_permute_query = tf.keras.layers.Permute((2,1,3))      
    
#         self.w_value = tf.keras.layers.Dense(d_model)
#         self.split_reshape_value = tf.keras.layers.Reshape((-1,num_heads,depth))
#         self.split_permute_value = tf.keras.layers.Permute((2,1,3))
    
#         self.w_key = tf.keras.layers.Dense(d_model)
#         self.split_reshape_key = tf.keras.layers.Reshape((-1,num_heads,depth))
#         self.split_permute_key = tf.keras.layers.Permute((2,1,3))
    
#         self.attention = tf.keras.layers.Attention(causal=causal, dropout=dropout)
#         self.join_permute_attention = tf.keras.layers.Permute((2,1,3))
#         self.join_reshape_attention = tf.keras.layers.Reshape((-1,d_model))
    
#         self.dense = tf.keras.layers.Dense(d_model)

#     def call(self, inputs, mask=None, training=None):
#         q = inputs[0]
#         v = inputs[1]
#         k = inputs[2] if len(inputs) > 2 else v
    
#         query = self.w_query(q)
#         query = self.split_reshape_query(query)    
#         query = self.split_permute_query(query)                 
    
#         value = self.w_value(v)
#         value = self.split_reshape_value(value)
#         value = self.split_permute_value(value)
    
#         key = self.w_key(k)
#         key = self.split_reshape_key(key)
#         key = self.split_permute_key(key)
    
#         if mask is not None:
#             if mask[0] is not None:
#                 mask[0] = tf.keras.layers.Reshape((-1,1))(mask[0])
#                 mask[0] = tf.keras.layers.Permute((2,1))(mask[0])
#             if mask[1] is not None:
#                 mask[1] = tf.keras.layers.Reshape((-1,1))(mask[1])
#                 mask[1] = tf.keras.layers.Permute((2,1))(mask[1])
    
#         attention = self.attention([query, value, key], mask=mask)
#         attention = self.join_permute_attention(attention)
#         attention = self.join_reshape_attention(attention)
    
#         x = self.dense(attention)
    
#         return x
    
# # Hyperparameters
# num_layers = 4
# d_model = 128
# dff = 512
# num_heads = 8
# dropout_rate = 0.1

# # Size of input vocab plus start and end tokens
# input_vocab_size = tokenizer_pt.vocab_size + 2
# target_vocab_size = tokenizer_en.vocab_size + 2

# input = tf.keras.layers.Input(shape=(None,))
# target = tf.keras.layers.Input(shape=(None,))

# encoder = Encoder(input_vocab_size, num_layers = num_layers, d_model = d_model, num_heads = num_heads, dff = dff, dropout = dropout_rate)
# decoder = Decoder(target_vocab_size, num_layers = num_layers, d_model = d_model, num_heads = num_heads, dff = dff, dropout = dropout_rate)

# x = encoder(input)
# x = decoder([target, x] , mask = encoder.compute_mask(input))
# x = tf.keras.layers.Dense(target_vocab_size)(x)

# model = tf.keras.models.Model(inputs=[input, target], outputs=x)

# model.summary()

# optimizer = tf.keras.optimizers.Adam(CustomSchedule(d_model), beta_1=0.9, beta_2=0.98, 
#                                      epsilon=1e-9)

# loss = tf.keras.losses.SparseCategoricalCrossentropy(
#     from_logits=True, reduction='none')

# def masked_loss(y_true, y_pred):
#     mask = tf.math.logical_not(tf.math.equal(y_true, 0))
#     _loss = loss(y_true, y_pred)
#     mask = tf.cast(mask, dtype=_loss.dtype)
#     _loss *= mask
#     return tf.reduce_sum(_loss)/tf.reduce_sum(mask)

# metrics = [loss, masked_loss, tf.keras.metrics.SparseCategoricalAccuracy()]

# model.compile(optimizer=optimizer, loss = loss, metrics = metrics) # masked_

In [87]:
# https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/transformer.ipynb#scrollTo=1Rz82wEs5biZ

def positional_encoding(length, depth):
    depth = depth/2

    positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)
    
    angle_rates = 1 / (10000**depths)         # (1, depth)
    angle_rads = positions * angle_rates      # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1) 

    return tf.cast(pos_encoding, dtype=tf.float32)

MAX_LEN = 2048

class PositionalTokenEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)  # , mask_zero=True
        self.pos_encoding = positional_encoding(length=MAX_LEN, depth=d_model)

#     def compute_mask(self, *args, **kwargs):
#         return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

NUM_OF_FILTERS_CONV = 64
NUM_CONV_LAYERS = 3
FILTER_SIZE = 11
    
class PositionalLandmarkEmbedding(tf.keras.layers.Layer):
    def __init__(self, len_of_seq, d_model):
        super().__init__()
        self.d_model = d_model
        self.len_of_seq = len_of_seq
        # maybe add input for Convs
        self.conv_block = tf.keras.models.Sequential([
            tf.keras.layers.Conv1D(NUM_OF_FILTERS_CONV, FILTER_SIZE, padding="same", activation="relu")
            for _ in range(NUM_CONV_LAYERS)
        ])
        self.pos_encoding = positional_encoding(length=MAX_LEN, depth=d_model)
        
    def call(self, x):
        #length = tf.shape(x)[1]
        x = self.conv_block(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :self.len_of_seq, :]
        return x
    
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)
       
        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores
    
        x = self.add([x, attn_output])
        x = self.layernorm(x)
    
        return x
    
class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x
    
class CausalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x,
            use_causal_mask = True)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x
    
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
          tf.keras.layers.Dense(dff, activation='relu'),
          tf.keras.layers.Dense(d_model),
          tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x) 
        return x
    
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
    
        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)
    
        self.ffn = FeedForward(d_model, dff)
    
    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        return x
    
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, len_of_seq, num_layers, d_model, num_heads,
               dff, dropout_rate=0.1):
        super().__init__()
    
        self.d_model = d_model
        self.num_layers = num_layers
    
        self.pos_embedding = PositionalLandmarkEmbedding(
            len_of_seq, d_model)
    
        self.enc_layers = [
            EncoderLayer(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        # `x` is landmark sequences with shape: (batch, seq_len, features)
        x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.
        
        x = self.dropout(x)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
    
        return x  # Shape `(batch_size, seq_len, d_model)`.
    
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()
    
        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)
        
        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)
    
        self.ffn = FeedForward(d_model, dff)

    def call(self, x, context):
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)
    
        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.cross_attention.last_attn_scores
    
        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x
    
class Decoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
        super(Decoder, self).__init__()
    
        self.d_model = d_model
        self.num_layers = num_layers
    
        self.pos_embedding = PositionalTokenEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dec_layers = [
            DecoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)]
    
        self.last_attn_scores = None

    def call(self, x, context):
        # `x` is token-IDs shape (batch, target_seq_len)
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)
    
        x = self.dropout(x)
    
        for i in range(self.num_layers):
            x  = self.dec_layers[i](x, context)
    
        self.last_attn_scores = self.dec_layers[-1].last_attn_scores
    
        # The shape of x is (batch_size, target_seq_len, d_model).
        return x
    
class Transformer(tf.keras.Model):
    def __init__(self, *, len_lm_seq, num_enc_layers, num_dec_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.encoder = Encoder(len_of_seq=len_lm_seq,
                               num_layers=num_enc_layers, d_model=d_model,
                               num_heads=num_heads, dff=dff,
                               dropout_rate=dropout_rate)
        
        self.decoder = Decoder(num_layers=num_dec_layers, d_model=d_model,
                               num_heads=num_heads, dff=dff,
                               vocab_size=target_vocab_size,
                               dropout_rate=dropout_rate)
    
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        landmark_seq, prev_gen_context  = inputs
    
        encoded_lm_seq = self.encoder(landmark_seq)  # (batch_size, landmark_seq_len, d_model)
        x = self.decoder(prev_gen_context, encoded_lm_seq)  # (batch_size, target_len, d_model)
        logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

        try:
            # Drop the keras mask, so it doesn't scale the losses/metrics.
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass
    
        return logits

In [88]:
transformer = Transformer(
    len_lm_seq=FRAME_LEN,
    num_enc_layers=2,
    num_dec_layers=1,
    d_model=512,
    num_heads=4,
    dff=512,
    input_vocab_size=len(char_to_num),
    target_vocab_size=len(char_to_num),
    dropout_rate=0.1)

In [68]:
model.predict(next(iter(train_ds)))

ValueError: in user code:

    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_file5c9eh3mb.py", line 13, in tf__call
        y = ag__.converted_call(ag__.ld(self).decode, (ag__.ld(x), ag__.ld(target), ag__.ld(training)), None, fscope)
    File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_filejcfmrm9x.py", line 10, in tf__decode
        y = ag__.converted_call(ag__.ld(self).dec_input, (ag__.ld(target),), None, fscope)
    File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_file1uzb_ts_.py", line 16, in tf__call
        retval_ = ag__.ld(x) + ag__.ld(self).pos_emb[:ag__.ld(maxlen), :]

    ValueError: Exception encountered when calling layer "transformer_3" "                 f"(type Transformer).
    
    in user code:
    
        File "C:\Users\mdancso\AppData\Local\Temp\ipykernel_21316\2149693067.py", line 197, in call  *
            y = self.decode(x, target, training)
        File "C:\Users\mdancso\AppData\Local\Temp\ipykernel_21316\2149693067.py", line 188, in decode  *
            y = self.dec_input(target)
        File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_file1uzb_ts_.py", line 16, in tf__call
            retval_ = ag__.ld(x) + ag__.ld(self).pos_emb[:ag__.ld(maxlen), :]
    
        ValueError: Exception encountered when calling layer "token_embedding_3" "                 f"(type TokenEmbedding).
        
        in user code:
        
            File "C:\Users\mdancso\AppData\Local\Temp\ipykernel_21316\2149693067.py", line 23, in call  *
                return x + self.pos_emb[:maxlen, :]
        
            ValueError: Dimensions must be equal, but are 64 and 63 for '{{node transformer_3/token_embedding_3/add}} = AddV2[T=DT_FLOAT](transformer_3/token_embedding_3/Mul, transformer_3/token_embedding_3/strided_slice_1)' with input shapes: [32,64,200], [63,200].
        
        
        Call arguments received by layer "token_embedding_3" "                 f"(type TokenEmbedding):
          • x=tf.Tensor(shape=(32, 64), dtype=int32)
    
    
    Call arguments received by layer "transformer_3" "                 f"(type Transformer):
      • inputs=('tf.Tensor(shape=(32, 128, 78), dtype=float32)', 'tf.Tensor(shape=(32, 64), dtype=int32)')
      • training=False


In [125]:
%%time
history = model.fit(train_ds, validation_data=valid_ds, verbose = 2, callbacks=[], epochs=2)

Epoch 1/2


KeyboardInterrupt: 

In [58]:
model_name = "test"

tf.saved_model.save(model, "test_saving_the_model")

# # Save weights
# model.save_weights(f'{model_name}_weights.h5')

# # Save the model's configuration to a file
# model_json = model.to_json()
# with open(f'{model_name}_config.json', 'w') as json_file:
#     json_file.write(model_json)

ValueError: Model <__main__.Transformer object at 0x000001B3F0900460> cannot be saved either because the input shape is not available or because the forward pass of the model is not defined.To define a forward pass, please override `Model.call()`. To specify an input shape, either call `build(input_shape)` directly, or call the model on actual data using `Model()`, `Model.fit()`, or `Model.predict()`. If you have a custom training step, please make sure to invoke the forward pass in train step through `Model.__call__`, i.e. `model(inputs)`, as opposed to `model.call()`.

# Testing

In [136]:
model_to_be_loaded = "test"

# model = keras.models.load_model(f"{model_to_be_loaded}_weights.h5")

with open(f'{model_to_be_loaded}_config.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Recreate the model from the loaded configuration
loaded_model = tf.keras.models.model_from_json(loaded_model_json, custom_objects={'Transformer': Transformer})

In [137]:
loaded_model.compile(optimizer=optimizer, loss=loss_fn)

In [138]:
%%capture
history = loaded_model.fit(train_ds.take(1), validation_data=valid_ds.take(1), verbose = 2, callbacks=[], epochs=1)

ValueError: in user code:

    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\mdancso\AppData\Local\Temp\ipykernel_16940\2055258342.py", line 215, in train_step
        preds = self([source, dec_input])
    File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_filedg9fi4be.py", line 12, in tf__call
        x = ag__.converted_call(ag__.ld(self).encoder, (ag__.ld(source), ag__.ld(training)), None, fscope)
    File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_filefigk7vpz.py", line 14, in tf__call
        x = ag__.ld(x) + ag__.ld(self).pos_emb

    ValueError: Exception encountered when calling layer "transformer_16" "                 f"(type Transformer).
    
    in user code:
    
        File "C:\Users\mdancso\AppData\Local\Temp\ipykernel_16940\2055258342.py", line 196, in call  *
            x = self.encoder(source, training)
        File "C:\Users\mdancso\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\mdancso\AppData\Local\Temp\__autograph_generated_filefigk7vpz.py", line 14, in tf__call
            x = ag__.ld(x) + ag__.ld(self).pos_emb
    
        ValueError: Exception encountered when calling layer "landmark_embedding_16" "                 f"(type LandmarkEmbedding).
        
        in user code:
        
            File "C:\Users\mdancso\AppData\Local\Temp\ipykernel_16940\2055258342.py", line 58, in call  *
                x = x + self.pos_emb
        
            ValueError: Dimensions must be equal, but are 250 and 100 for '{{node transformer_16/sequential_66/landmark_embedding_16/add}} = AddV2[T=DT_FLOAT](transformer_16/sequential_66/landmark_embedding_16/Mul, transformer_16/sequential_66/landmark_embedding_16/add/y)' with input shapes: [?,250,64], [100,64].
        
        
        Call arguments received by layer "landmark_embedding_16" "                 f"(type LandmarkEmbedding):
          • x=tf.Tensor(shape=(None, 250, 78), dtype=float32)
    
    
    Call arguments received by layer "transformer_16" "                 f"(type Transformer):
      • inputs=['tf.Tensor(shape=(None, 250, 78), dtype=float32)', 'tf.Tensor(shape=(None, None), dtype=int32)']
      • training=False


In [134]:
batch = next(iter(train_ds))
loaded_model.generate_one(batch[0], start_token_idx)

# Load the model's weights
loaded_model.load_weights(f"{model_to_be_loaded}_weights.h5")

InvalidArgumentError: Exception encountered when calling layer "landmark_embedding_15" "                 f"(type LandmarkEmbedding).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [32,250,64] vs. [100,64] [Op:AddV2]

Call arguments received by layer "landmark_embedding_15" "                 f"(type LandmarkEmbedding):
  • x=tf.Tensor(shape=(32, 250, 78), dtype=float32)

## On Test dataset

In [56]:
batches = [batch for batch in valid_ds]

preds_list = []
ground_truth_list = []

for batch in batches[:1]:
    source = batch[0]
    target = batch[1].numpy()
    bs = tf.shape(source)[0]
    preds = model.generate(source, start_token_idx)
    preds = preds.numpy()

    for i in range(bs):
        target_text = "".join([idx_to_char[_] for _ in target[i, :]])
        ground_truth_list.append(target_text.replace('P', ''))
        prediction = ""
        for idx in preds[i, :]:
            prediction += idx_to_char[idx]
            if idx == end_token_idx:
                break
        preds_list.append(prediction)

for i in range(30):
    print(ground_truth_list[i])
    print(preds_list[i])
    print('\n~~~\n')

AttributeError: 'Transformer' object has no attribute 'generate'

## Real life testing

In [43]:
import cv2
from IPython.display import display, Image

In [44]:
# # Pose coordinates for hand movement.
# LPOSE = [13, 15, 17, 19, 21]
# RPOSE = [14, 16, 18, 20, 22]
# POSE = LPOSE + RPOSE

# X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE]
# Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE]
# Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE]

# FEATURE_COLUMNS = X + Y + Z

from itertools import chain

# Pose coordinates for hand movement.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = LPOSE + RPOSE

def extract_from_result(res):
    # Extract specific pose landmarks if available
    px = []
    py = []
    pz = []
    if res.pose_landmarks:
        for i in POSE:
            lm = res.pose_landmarks.landmark[i]
            px.append(lm.x)
            py.append(lm.y)
            pz.append(lm.z)
    else:
        px = [0.0]*len(POSE)
        py = [0.0]*len(POSE)
        pz = [0.0]*len(POSE)

    # Extract left hand landmarks if available
    lx = []
    ly = []
    lz = []
    if res.left_hand_landmarks:
        for lm in res.left_hand_landmarks.landmark:
            lx.append(lm.x)
            ly.append(lm.y)
            lz.append(lm.z)
    else:
        lx = [0.0]*21
        ly = [0.0]*21
        lz = [0.0]*21

    # Extract right hand landmarks if available
    rx = []
    ry = []
    rz = []
    if res.right_hand_landmarks:
        for lm in res.right_hand_landmarks.landmark:
            rx.append(lm.x)
            ry.append(lm.y)
            rz.append(lm.z)
    else:
        rx = [0.0]*21
        ry = [0.0]*21
        rz = [0.0]*21

    return list(chain(rx, lx, px, ry, ly, py, rz, lz, pz))

In [45]:
is_signing_model = keras.models.load_model("detect_signing.hdf5")
is_signing_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 1)                 632       
                                                                 
 dense_3 (Dense)             (None, 2)                 4         
                                                                 
Total params: 636
Trainable params: 636
Non-trainable params: 0
_________________________________________________________________


In [46]:
# Többségi döntéses kiértékelés with signing detection

# camera - 0
# video - "path/to/file"
file_or_camera = "bear.mp4"

from collections import Counter
from collections import deque

inner_fifo = deque(maxlen=10)
confidence_number = 7
INP_LEN = 15
signing_detection_model_input = list(np.zeros((INP_LEN, 156)))
model_data = []
last_output = start_token_idx
whole_pred = ""

def process_model_output(new_output: int):
    global last_output
    global whole_pred
    
    whole_pred += idx_to_char[new_output]
    
    inner_fifo.append(new_output)
    
    predicted_idx, count = Counter(inner_fifo).most_common(1)[0]
    if count >= confidence_number:
        if last_output != predicted_idx:
            last_output = predicted_idx
            
            # Predicted the end
            if predicted_idx == end_token_idx:
                # restart the detection
                last_output = start_token_idx
                inner_fifo.clear()
                print("\\n")
            else:
                print(idx_to_char[predicted_idx], end="")


video = cv2.VideoCapture(file_or_camera)
display_handle=display(None, display_id=True)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        while True:
            _, frame = video.read()

            if frame is None:
                break

            # Resize the frame to the target width and height
            image = cv2.resize(frame, (360, 240))

            # To improve performance, optionally mark the image as not writeable to
            # pass by reference.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            results = holistic.process(image)
            data = extract_from_result(results)
            
            signing_detection_model_input.pop(0)
            signing_detection_model_input.append(data)
            
#             is_signing = np.argmax(is_signing_model.predict(np.array([signing_detection_model_input]), verbose=0)) == 1
#             if is_signing:
#                 print("*", end="")
#                 if len(model_data) >= INP_LEN:
#                     model_data.pop(0)
#                 model_data.append(data)
#                 inp = pre_process(model_data)
#                 preds = model.generate_precisely(np.array([inp]), last_output, 1)
#                 process_model_output(preds[0, 1].numpy())
#             else:
#                 print("-", end="")

            if len(model_data) >= INP_LEN:
                model_data.pop(0)
            model_data.append(data)
            inp = pre_process(model_data)
            preds = model.generate_precisely(np.array([inp]), last_output, 1)
            process_model_output(preds[0, 1].numpy())
            

            # Draw landmark annotation on the image.
#             image.flags.writeable = True
#             image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
#             mp_drawing.draw_landmarks(
#                 image,
#                 results.face_landmarks,
#                 mp_holistic.FACEMESH_CONTOURS,
#                 landmark_drawing_spec=None,
#                 connection_drawing_spec=mp_drawing_styles
#                 .get_default_face_mesh_contours_style())

#             mp_drawing.draw_landmarks(
#                 image,
#                 results.pose_landmarks,
#                 mp_holistic.POSE_CONNECTIONS,
#                 landmark_drawing_spec=mp_drawing_styles
#                 .get_default_pose_landmarks_style())

#             mp_drawing.draw_landmarks(
#                 image,
#                 results.left_hand_landmarks,
#                 mp_holistic.HAND_CONNECTIONS,
#                 landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
#             )

#             mp_drawing.draw_landmarks(
#                 image,
#                 results.right_hand_landmarks,
#                 mp_holistic.HAND_CONNECTIONS,
#                 landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
#             )

            #frame = cv2.flip(image, 1)
            _, frame = cv2.imencode('.jpeg', image)
            display_handle.update(Image(data=frame.tobytes()))
except KeyboardInterrupt:
    pass
finally:
    video.release()
    display_handle.update(None)

None

beloun

In [49]:
# várjunk amíg megtelik egy frame aztán pred greedy módon

model_data = []
video = cv2.VideoCapture("whale.mp4")
display_handle=display(None, display_id=True)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        while True:
            _, frame = video.read()
            
            if frame is None:
                break

            # Resize the frame to the target width and height
            image = cv2.resize(frame, (360, 240))

            # To improve performance, optionally mark the image as not writeable to
            # pass by reference.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            results = holistic.process(image)
            data = extract_from_result(results)
            if len(model_data) == 100:
                print("Frames filled up")
                break
                
            model_data.append(data)
           
            # Draw landmark annotation on the image.
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            mp_drawing.draw_landmarks(
                image,
                results.face_landmarks,
                mp_holistic.FACEMESH_CONTOURS,
                landmark_drawing_spec=None,
                connection_drawing_spec=mp_drawing_styles
                .get_default_face_mesh_contours_style())

            mp_drawing.draw_landmarks(
                image,
                results.pose_landmarks,
                mp_holistic.POSE_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles
                .get_default_pose_landmarks_style())

            mp_drawing.draw_landmarks(
                image,
                results.left_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
            )

            mp_drawing.draw_landmarks(
                image,
                results.right_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
            )

            #frame = cv2.flip(image, 1)
            _, frame = cv2.imencode('.jpeg', image)
            display_handle.update(Image(data=frame.tobytes()))
except KeyboardInterrupt:
    pass
finally:
    video.release()
    display_handle.update(None)
    

# Run the model
inp = pre_process(model_data)
preds = model.generate(np.array([inp]), start_token_idx)
prediction = ""
for idx in preds[0, 1:]:
    prediction += idx_to_char[idx]
    last_char = idx_to_char[idx]
    if idx == end_token_idx:
        break
        
prediction

None

Frames filled up


'whe whal a lew ho>'

In [219]:
# Változtatások:
# *teljes alakos kirajzolás
# *positional encoding eredmények jobbak lettek:
#     <2796 west golden willow drive>
#     <279796 west gollen willow drive>
#     ~~~
#     <973-471-9887>
#     <+44-21-41-99>
#     ~~~
#     <497-723-6992>
#     <497-723-6928>
#     ~~~
#     <reallyloud.co.uk/simaii>
#     <realloud.com.ru/simai>
#     ~~~
#     <kkaicd1.pixnet.net>
#     <aicol-pig-ponet>
#     ~~~
#     <8260 john r bowdoin>
#     <820 johnrbowdoin>
#     ~~~
#     <56 paper birch drive>
#     <566 penbirch rdive>
#     ~~~
#     <gand-chudai-hardcor.html>
#     <randandingarcarcer.com.com.tk>
#     ~~~
#     <2708 west 77th>
#     <2708 west7th>
#     ~~~
#     <https://www.keainfo.gr>
#     <https://www.kreafo.hth>
#     ~~~
#     <288 fuller lake>
#     <28858 forlanerl>
#     ~~~
#     <mser/oki-guide>
#     <mser/okide>
#     ~~~
#     <220 north 47th avenue east>
#     <20 north panust peareas>
#     ~~~
#     <www.sudinfo.be>
#     <www.suturafa.com>
#     ~~~
#     <69 grant point>
#     <6691 paroira>
#     ~~~
#     <fibrain.pl/chapter-145/thanghr>
#     <fibrain-1choter-4lan.hau>
#     ~~~
#     <+351-521-8895>
#     <+335-3572-895>
#     ~~~
#     <via-piero-gobetti/brandi-love>
#     <vingiera-gro-betindi.ndi.ve>
#     ~~~
#     <amir le>
#     <amoir le>
#     ~~~
#     <automantenimiento-sa>
#     <monymimiento-to-sa>
#     ~~~
#     <6499 nfd 5053>
#     <649-998-5053>
#     ~~~
#     <7870 preston place>
#     <9753 parroplangl plane>
#     ~~~
#     <6870 scabisuit lane>
#     <6870 scabisuisuit lane>
#     ~~~
#     <sunshine mayer>
#     <sutrashines/mayer>
#     ~~~
#     <408-249-4707>
#     <487-823-4430>
#     ~~~
#     <arabradio.us/vana>
#     <arabrasdio.rs/vana>
#     ~~~
#     <televisoresponse>
#     <60146000009>
#     ~~~
#     <www.voices.com/bitesize>
#     <www.voropes.com/besit>
#     ~~~
#     <qd.razavi.ac.ir/bunya>
#     <stda-da-vir-acirra>
#     ~~~
#     <+246-987-50-80-02-294>
#     <+246-987-50-80-80294>
#     ~~~
# *classification modellt nem sikerült betanítani továbbra sem
# (más modellekkel is próbálkoztam azok sem mentek, szóval valahogy az adattal lehet gond)
# találtam másik adathalmazt is hozzá következőkben nem 250 hanem sokkal kevesebb signt fogok neki betanítani
# *real worlddel való összekötés sikerei? olvasd tovább:
#     Kísérletek:
#     van előre egy szekvencia, és azt kell fordítani, az egész jól megy
#     ha viszont egy folyamatos videóból kell ráadásul egymással nem kapcsolatban lévő részeket fordítani az nem az igazi
#     1. ötlet: mindig egy újabb karaktert generálunk, és egy fifóból többségi döntéssel prediktálunk
#        eredmények:
#             Ha mindig start tokentől generáltatok akkor nem zavarodik össze de rossz a kimenet
#             Ha mindig az utolsó többségi döntés alapján meghatározott karakter szerint döntünk akkor az elején magabiztos, de utána összekavarodik
#             Ha mindig az utolsó predikciót használjuk akkor káosz
#             Szerintem nem kellene függni a generált kimeneti szekvenciától a következő kimenetnek
#             Implementing stopping migtht also help (so implement signing detection)
#             e kapcsán kipróbáltam mivan ha csak 1-1 szavakat adok oda neki nem sokkal jobb de kicsit igen
#                 pl.: 2747473|hyenana|an
#                 be bearur
#             További ötletek, máshogy tanítsuk a modellt, ne csak az utolsó karaktert vágjuk le hanem bármit

#     2.ötlet: várjunk amíg össze nem gyűlik egy frame és greedy kiértékelést alkalmazva fordítsuk le
#     eredmények: (jobb, de van zaj benne bőven)
#     'bearu>' - bear
#     'janna ag>' - hyena
#     '/zebra>' - zebra
#     'jattiger>' - tiger
#     'www.whale>' - whale
    
# *címek:
#     1. Hallássérult jelelők támogatása Mesterséges intelligencián alapuló automatizált fordítassal,
#     szekvenciális kamerakép adatok felhasználásával
    
#     2. Hallássérult jelelők támogatasa Mesterséges intelligencián alapuló kamerakép szöveggé leképzésével,
#     valamint elterjedt szekvencia feldolgozásra alkalmas architektúrák összehasonlítása
    
#     3. Jelnyelv szöveggé fordítása, szekvenciafeldolgozásra alkalmas mesterséges intelligencia modellekkel

# Továbbiakban:
# Ha sikerülne az összekötés lehetne más modelleknek utánanézni kipróbálni (már találtam pár cikket erről)
# A klasszifikációt meg kell végre szerelni (jó kérdés hogyan:)
# Saját kamerával való tesztelés nagyon lassan (hátha nagyon basic jelelőknek működni fog)
# Hogyan detektáljuk, hogy valaki jelel-e éppen (szerintem a modellek pontosságán lehetne vele javítani)


# Kérdések:
# Mennyire van megnyomkodva a program vagy a kód tdk/szakdogán
# Az egy oldalasba mit kell belerakni? (mivan ha nem tudom összehozni a kettőt együtt? fingerspelling and sign pred)
# Forráskódok hivatkozása?


# Save as TFlite

In [315]:
 class TFLiteModel(tf.Module):
    def __init__(self, model):
        super(TFLiteModel, self).__init__()
        self.target_start_token_idx = start_token_idx
        self.target_end_token_idx = end_token_idx
        # Load the feature generation and main models
        self.model = model
    
    @tf.function(input_signature=[
        tf.TensorSpec(shape=[None, len(FEATURE_COLUMNS)], dtype=tf.float32, name='inputs'),
        tf.TensorSpec(shape=[], dtype=tf.int32, name='last_output'),
        tf.TensorSpec(shape=[], dtype=tf.int32, name='max_len'),
        tf.TensorSpec(shape=[], dtype=tf.bool, name='training')
    ])
    def call(self, inputs, last_output, max_len, training=False):
        # Preprocess Data
        x = tf.cast(inputs, tf.float32)
        x = x[None]
        x = tf.cond(tf.shape(x)[1] == 0, lambda: tf.zeros((1, 1, len(FEATURE_COLUMNS))), lambda: tf.identity(x))
        x = x[0]
        x = pre_process(x)
        x = x[None]
        x = self.model.generate_precisely(x, last_output, max_len)
        x = x[0]
        idx = tf.argmax(tf.cast(tf.equal(x, self.target_end_token_idx), tf.int32))
        idx = tf.where(tf.math.less(idx, 1), tf.constant(2, dtype=tf.int64), idx)
        x = x[1:idx]
        x = tf.one_hot(x, 59)
        return {'outputs': x}
    
    @tf.function(input_signature=[
        tf.TensorSpec(shape=[None, len(FEATURE_COLUMNS)], dtype=tf.float32, name='inputs'),
        tf.TensorSpec(shape=[], dtype=tf.int32, name='last_output'),
        tf.TensorSpec(shape=[], dtype=tf.int32, name='max_len'),
        tf.TensorSpec(shape=[], dtype=tf.bool, name='training')
    ])
    def __call__(self, inputs, last_output, max_len, training=False):
        # Preprocess Data
        x = tf.cast(inputs, tf.float32)
        x = x[None]
        x = tf.cond(tf.shape(x)[1] == 0, lambda: tf.zeros((1, 1, len(FEATURE_COLUMNS))), lambda: tf.identity(x))
        x = x[0]
        x = pre_process(x)
        x = x[None]
        x = self.model.generate_precisely(x, last_output, max_len)
        x = x[0]
        idx = tf.argmax(tf.cast(tf.equal(x, self.target_end_token_idx), tf.int32))
        idx = tf.where(tf.math.less(idx, 1), tf.constant(2, dtype=tf.int64), idx)
        x = x[1:idx]
        x = tf.one_hot(x, 59)
        return {'outputs': x}
    
tflitemodel_base = TFLiteModel(model)

In [316]:
model.save_weights("transformer.h5")

In [317]:
keras_model_converter = tf.lite.TFLiteConverter.from_keras_model(tflitemodel_base)
keras_model_converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]#, tf.lite.OpsSet.SELECT_TF_OPS]
tflite_model = keras_model_converter.convert()
with open('transformer.tflite', 'wb') as f:
    f.write(tflite_model)

TypeError: __call__(inputs, last_output, max_len, training) missing required arguments: last_output, max_len.