# Load dataset

In [1]:
%%capture
!pip install mediapipe==0.9.0.1
!pip install protobuf==3.20.*
!pip install scikit-image

In [2]:
import os
import shutil
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import tensorflow as tf
import json
import mediapipe as mp
import matplotlib
import matplotlib.pyplot as plt
import random

from skimage.transform import resize
from mediapipe.framework.formats import landmark_pb2
from tensorflow import keras
from tensorflow.keras import layers
from tqdm.notebook import tqdm
from matplotlib import animation, rc

In [3]:
random.seed(42)

In [4]:
print("TensorFlow v" + tf.__version__)
print("Mediapipe v" + mp.__version__)

TensorFlow v2.10.1
Mediapipe v0.9.0.1


In [5]:
dataset_df = pd.read_csv('kaggle_dataset/train.csv')
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (67208, 5)


In [6]:
dataset_df.head()

Unnamed: 0,path,file_id,sequence_id,participant_id,phrase
0,train_landmarks/5414471.parquet,5414471,1816796431,217,3 creekhouse
1,train_landmarks/5414471.parquet,5414471,1816825349,107,scales/kuhaylah
2,train_landmarks/5414471.parquet,5414471,1816909464,1,1383 william lanier
3,train_landmarks/5414471.parquet,5414471,1816967051,63,988 franklin lane
4,train_landmarks/5414471.parquet,5414471,1817123330,89,6920 northeast 661st road


In [7]:
def random_sequence():
    # Fetch sequence_id, file_id, phrase from first row
    sequence_id, file_id, phrase = dataset_df.sample(n=1).iloc[0][['sequence_id', 'file_id', 'phrase']]
    print(f"sequence_id: {sequence_id}, file_id: {file_id}, phrase: {phrase}")
    
    # Fetch data from parquet file
    sample_sequence_df = pq.read_table(f"kaggle_dataset/train_landmarks/{str(file_id)}.parquet",
        filters=[[('sequence_id', '=', sequence_id)],]).to_pandas()
    print("Full sequence dataset shape is {}".format(sample_sequence_df.shape))
    
    return sample_sequence_df

In [8]:
random_sequence()

sequence_id: 53030664, file_id: 474255203, phrase: www.anfis.net
Full sequence dataset shape is (116, 1630)


Unnamed: 0_level_0,frame,x_face_0,x_face_1,x_face_2,x_face_3,x_face_4,x_face_5,x_face_6,x_face_7,x_face_8,...,z_right_hand_11,z_right_hand_12,z_right_hand_13,z_right_hand_14,z_right_hand_15,z_right_hand_16,z_right_hand_17,z_right_hand_18,z_right_hand_19,z_right_hand_20
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53030664,0,0.530842,0.515960,0.522205,0.508281,0.515136,0.517146,0.524425,0.437660,0.525860,...,-0.090730,-0.099647,-0.043395,-0.083442,-0.091687,-0.084706,-0.052167,-0.077950,-0.074363,-0.063247
53030664,1,0.520698,0.513968,0.520709,0.506615,0.513041,0.515001,0.522036,0.432432,0.522813,...,-0.105827,-0.124586,-0.057947,-0.089035,-0.111434,-0.125782,-0.069354,-0.099526,-0.108586,-0.111513
53030664,2,0.522492,0.508267,0.515425,0.501700,0.507253,0.509458,0.517589,0.434301,0.519483,...,-0.101133,-0.118927,-0.057003,-0.090000,-0.110448,-0.122024,-0.071178,-0.101662,-0.108915,-0.110276
53030664,3,0.532963,0.508277,0.516833,0.500466,0.506789,0.508699,0.516122,0.433806,0.517046,...,-0.098701,-0.117609,-0.054274,-0.085762,-0.105237,-0.116657,-0.066429,-0.094741,-0.095997,-0.093443
53030664,4,0.527446,0.506865,0.514316,0.499597,0.505757,0.507919,0.515705,0.432278,0.516804,...,-0.105924,-0.123557,-0.055986,-0.099904,-0.116467,-0.120133,-0.068876,-0.110006,-0.115161,-0.111158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53030664,111,0.492751,0.462393,0.476929,0.459667,0.460510,0.464301,0.478123,0.413001,0.480535,...,-0.057819,-0.051423,-0.013008,-0.060685,-0.051231,-0.030315,-0.026352,-0.053981,-0.039515,-0.020559
53030664,112,0.488510,0.463094,0.477605,0.461078,0.461400,0.465489,0.480044,0.413224,0.482895,...,-0.059172,-0.053366,-0.013070,-0.060777,-0.053220,-0.033255,-0.026398,-0.052642,-0.038594,-0.019982
53030664,113,0.493473,0.464982,0.479076,0.462758,0.463343,0.467348,0.481601,0.413828,0.484426,...,-0.061561,-0.058218,-0.023809,-0.065159,-0.053866,-0.034784,-0.038531,-0.060639,-0.044552,-0.025618
53030664,114,0.498007,0.467312,0.481916,0.464733,0.465473,0.469365,0.483684,0.418353,0.486642,...,-0.051555,-0.050471,-0.015218,-0.047847,-0.041781,-0.028375,-0.026410,-0.042944,-0.031145,-0.016445


In [9]:
# import json

# name_to_idx = {}
# for i, name in enumerate(sample_sequence_df.columns):
#     name_to_idx[name] = i
    
# json.dumps()

## Animation preview

In [10]:
matplotlib.rcParams['animation.embed_limit'] = 2**128
matplotlib.rcParams['savefig.pad_inches'] = 0
rc('animation', html='jshtml')

def create_animation(images):
    fig = plt.figure(figsize=(6, 9))
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    im=ax.imshow(images[0], cmap="gray")
    plt.close(fig)
    
    def animate_func(i):
        im.set_array(images[i])
        return [im]

    return animation.FuncAnimation(fig, animate_func, frames=len(images), interval=1000/10)

mp_pose = mp.solutions.pose
mp_hands = mp.solutions.hands
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils 
mp_drawing_styles = mp.solutions.drawing_styles

def collect_landmarks_oftype(_type: str, frame_data: pd.Series):
    x_type = frame_data.filter(regex=f"x_{_type}.*").values
    y_type = frame_data.filter(regex=f"y_{_type}.*").values
    z_type = frame_data.filter(regex=f"z_{_type}.*").values
    landmarks = landmark_pb2.NormalizedLandmarkList()
    for x, y, z in zip(x_type, y_type, z_type):
        landmarks.landmark.add(x=x, y=y, z=z)
    return landmarks
        
def produce_video_from_seq(seq_df: pd.DataFrame):
    frames = []
    for seq_idx in range(len(seq_df)):
        frame = np.zeros((600, 600, 3))
        frame_data = seq_df.iloc[seq_idx]
        
        # Right hand
        right_hand_landmarks = collect_landmarks_oftype("right_hand", frame_data)
        mp_drawing.draw_landmarks(
                frame,
                right_hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
        
        # Left Hand
        left_hand_landmarks = collect_landmarks_oftype("left_hand", frame_data)
        mp_drawing.draw_landmarks(
                frame,
                left_hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style())
        
        # Face
        face_landmarks = collect_landmarks_oftype("face", frame_data)
        mp_drawing.draw_landmarks(
            frame,
            face_landmarks,
            mp_holistic.FACEMESH_CONTOURS,
            landmark_drawing_spec=None,
            connection_drawing_spec=mp_drawing_styles
            .get_default_face_mesh_contours_style())
        
        # Pose
        pose_landmarks = collect_landmarks_oftype("pose", frame_data)
        mp_drawing.draw_landmarks(
            frame,
            pose_landmarks,
            mp_holistic.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.
            get_default_pose_landmarks_style())
        
        frames.append(frame.astype(np.uint8))
    return frames

In [217]:
video = produce_video_from_seq(random_sequence())
create_animation(video)

sequence_id: 376918879, file_id: 2118949241, phrase: /tradicia/lamodepourhomme2013
Full sequence dataset shape is (172, 1630)


# Save to TFRecords

In [12]:
# Pose coordinates for hand movement.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = LPOSE + RPOSE

X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE]
Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE]
Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE]

FEATURE_COLUMNS = X + Y + Z

X_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "x_" in col]
Y_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "y_" in col]
Z_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "z_" in col]

RHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if "right" in col]
LHAND_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "left" in col]
RPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col[-2:]) in RPOSE]
LPOSE_IDX = [i for i, col in enumerate(FEATURE_COLUMNS)  if  "pose" in col and int(col[-2:]) in LPOSE]

In [13]:
def save_tfrecords_to(folder: str):
    # Loop through each file_id
    for file_id in tqdm(dataset_df.file_id.unique()):
        # Parquet file name
        pq_file = f"kaggle_dataset/train_landmarks/{file_id}.parquet"
        # Filter train.csv and fetch entries only for the relevant file_id
        file_df = dataset_df.loc[dataset_df["file_id"] == file_id]
        # Fetch the parquet file
        parquet_df = pq.read_table(f"kaggle_dataset/train_landmarks/{str(file_id)}.parquet",
                                  columns=['sequence_id'] + FEATURE_COLUMNS).to_pandas()

        # File name for the updated data
        tf_file = f"{folder}/{file_id}.tfrecord"
        parquet_numpy = parquet_df.to_numpy()
        # Initialize the pointer to write the output of each `for loop` below as a sequence into the file.
        with tf.io.TFRecordWriter(tf_file) as file_writer:
            # Loop through each sequence in file.
            for seq_id, phrase in zip(file_df.sequence_id, file_df.phrase):
                # Fetch sequence data
                frames = parquet_numpy[parquet_df.index == seq_id]

                # Calculate the number of rows that only have NaN values -> on how many frames each hand is fully visible
                r_nonan = np.sum(np.sum(np.isnan(frames[:, RHAND_IDX]), axis = 1) == 0)
                l_nonan = np.sum(np.sum(np.isnan(frames[:, LHAND_IDX]), axis = 1) == 0)
                # Number of frames where the dominant hand is visible (dominant = visible on more frames than the other hand)
                num_of_visible = max(r_nonan, l_nonan)

                # TODO experiment with this
                if 2*len(phrase) < num_of_visible:
                    features = {FEATURE_COLUMNS[i]: tf.train.Feature(float_list=tf.train.FloatList(value=frames[:, i])) for i in range(len(FEATURE_COLUMNS))}
                    features["phrase"] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(phrase, 'utf-8')]))
                    record_bytes = tf.train.Example(features=tf.train.Features(feature=features)).SerializeToString()
                    file_writer.write(record_bytes)

In [14]:
folder_name = "preprocessed"
if not os.path.isdir(folder_name):
    os.mkdir(folder_name)
    save_tfrecords_to(folder_name)
else:
    print(f"Warning: Tfrecords already exist! Delete the {folder_name} folder to regenerate them!")



# Fetch from TFRecords

In [15]:
tf_records = dataset_df.file_id.map(lambda x: f'{folder_name}/{x}.tfrecord').unique()
print(f"List of {len(tf_records)} TFRecord files.")

List of 68 TFRecord files.


In [16]:
with open ("kaggle_dataset/character_to_prediction_index.json", "r") as f:
    char_to_num = json.load(f)

# Add pad_token, start pointer and end pointer to the dict
pad_token = 'P'
start_token = '<'
end_token = '>'
pad_token_idx = 59
start_token_idx = 60
end_token_idx = 61

char_to_num[pad_token] = pad_token_idx
char_to_num[start_token] = start_token_idx
char_to_num[end_token] = end_token_idx
num_to_char = {j:i for i,j in char_to_num.items()}

In [17]:
FRAME_LEN = 128

# Function to resize and add padding.
def resize_pad(x):
    if tf.shape(x)[0] < FRAME_LEN:
        x = tf.pad(x, ([[0, FRAME_LEN-tf.shape(x)[0]], [0, 0], [0, 0]]))
    else:
        x = tf.image.resize(x, (FRAME_LEN, tf.shape(x)[1]))
    return x

# Detect the dominant hand from the number of NaN values.
# Dominant hand will have less NaN values since it is in frame moving.
def pre_process(x):
    rhand = tf.gather(x, RHAND_IDX, axis=1)
    lhand = tf.gather(x, LHAND_IDX, axis=1)
    rpose = tf.gather(x, RPOSE_IDX, axis=1)
    lpose = tf.gather(x, LPOSE_IDX, axis=1)
    
    rnan_idx = tf.reduce_any(tf.math.is_nan(rhand), axis=1)
    lnan_idx = tf.reduce_any(tf.math.is_nan(lhand), axis=1)
    
    rnans = tf.math.count_nonzero(rnan_idx)
    lnans = tf.math.count_nonzero(lnan_idx)
    
    # For dominant hand
    if rnans > lnans:
        hand = lhand
        pose = lpose
        
        hand_x = hand[:, 0*(len(LHAND_IDX)//3) : 1*(len(LHAND_IDX)//3)]
        hand_y = hand[:, 1*(len(LHAND_IDX)//3) : 2*(len(LHAND_IDX)//3)]
        hand_z = hand[:, 2*(len(LHAND_IDX)//3) : 3*(len(LHAND_IDX)//3)]
        hand = tf.concat([1-hand_x, hand_y, hand_z], axis=1)
        
        pose_x = pose[:, 0*(len(LPOSE_IDX)//3) : 1*(len(LPOSE_IDX)//3)]
        pose_y = pose[:, 1*(len(LPOSE_IDX)//3) : 2*(len(LPOSE_IDX)//3)]
        pose_z = pose[:, 2*(len(LPOSE_IDX)//3) : 3*(len(LPOSE_IDX)//3)]
        pose = tf.concat([1-pose_x, pose_y, pose_z], axis=1)
    else:
        hand = rhand
        pose = rpose
    
    hand_x = hand[:, 0*(len(LHAND_IDX)//3) : 1*(len(LHAND_IDX)//3)]
    hand_y = hand[:, 1*(len(LHAND_IDX)//3) : 2*(len(LHAND_IDX)//3)]
    hand_z = hand[:, 2*(len(LHAND_IDX)//3) : 3*(len(LHAND_IDX)//3)]
    hand = tf.concat([hand_x[..., tf.newaxis], hand_y[..., tf.newaxis], hand_z[..., tf.newaxis]], axis=-1)
    
    mean = tf.math.reduce_mean(hand, axis=1)[:, tf.newaxis, :]
    std = tf.math.reduce_std(hand, axis=1)[:, tf.newaxis, :]
    hand = (hand - mean) / std

    pose_x = pose[:, 0*(len(LPOSE_IDX)//3) : 1*(len(LPOSE_IDX)//3)]
    pose_y = pose[:, 1*(len(LPOSE_IDX)//3) : 2*(len(LPOSE_IDX)//3)]
    pose_z = pose[:, 2*(len(LPOSE_IDX)//3) : 3*(len(LPOSE_IDX)//3)]
    pose = tf.concat([pose_x[..., tf.newaxis], pose_y[..., tf.newaxis], pose_z[..., tf.newaxis]], axis=-1)
    
    x = tf.concat([hand, pose], axis=1)
    x = resize_pad(x)
    
    x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)
    x = tf.reshape(x, (FRAME_LEN, len(LHAND_IDX) + len(LPOSE_IDX)))
    return x

In [18]:
def decode_fn(record_bytes):
    schema = {COL: tf.io.VarLenFeature(dtype=tf.float32) for COL in FEATURE_COLUMNS}
    schema["phrase"] = tf.io.FixedLenFeature([], dtype=tf.string)
    features = tf.io.parse_single_example(record_bytes, schema)
    phrase = features["phrase"]
    landmarks = ([tf.sparse.to_dense(features[COL]) for COL in FEATURE_COLUMNS])
    # Transpose to maintain the original shape of landmarks data.
    landmarks = tf.transpose(landmarks)
    
    return landmarks, phrase

In [19]:
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=list(char_to_num.keys()),
        values=list(char_to_num.values()),
    ),
    default_value=tf.constant(-1),
    name="class_weight"
)

def convert_fn(landmarks, phrase):
    # Add start and end pointers to phrase.
    phrase = start_token + phrase + end_token
    phrase = tf.strings.bytes_split(phrase)
    phrase = table.lookup(phrase)
    # Vectorize and add padding.
    phrase = tf.pad(phrase, paddings=[[0, 64 - tf.shape(phrase)[0]]], mode = 'CONSTANT', constant_values = pad_token_idx)
    # Apply pre_process function to the landmarks.
    return pre_process(landmarks), phrase

In [20]:
batch_size = 32
train_len = int(0.8 * len(tf_records))

train_ds = tf.data.TFRecordDataset(tf_records[:train_len]).map(decode_fn).map(convert_fn).batch(batch_size).shuffle(buffer_size=4*batch_size).prefetch(buffer_size=tf.data.AUTOTUNE).cache()
valid_ds = tf.data.TFRecordDataset(tf_records[train_len:]).map(decode_fn).map(convert_fn).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE).cache()

In [21]:
# Define the number of data points you want to print
num_batches_to_print = 1

# Create an iterator for the train and valid datasets
train_iterator = iter(train_ds)
valid_iterator = iter(valid_ds)

# Print data points from the training dataset
print("Training Data:")
for _ in range(num_batches_to_print):
    try:
        landmarks, phrase = next(train_iterator)
        print("Landmarks:")
        print(type(landmarks))
        print(landmarks.shape)
        print(landmarks.numpy())
        print("Phrase:")
        print(phrase.shape)
        print(phrase)
        print("-" * 40)
    except StopIteration:
        break

Training Data:
Landmarks:
<class 'tensorflow.python.framework.ops.EagerTensor'>
(32, 128, 78)
[[[-0.83641106  1.8348206   2.516369   ...  0.5250211   0.74593127
   -2.3379188 ]
  [-0.637652    1.8677006   2.6974423  ...  0.4998536   0.7128169
   -2.6081285 ]
  [-0.5520263   1.8656126   2.7049568  ...  0.48201883  0.68131167
   -2.1073463 ]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.          0.          0.         ...  0.5122966   0.87864864
   -2.9377735 ]
  [ 0.          0.          0.         ...  0.5499121   0.87477225
   -2.9149811 ]
  [-1.4413708   2.7023942   1.4361221  ...  0.6070491   0.83566314
   -3.20782   ]
  ...
  [ 0.          0.          0.         ...  0.35960558  0.76781934
   -2.9576807 ]
  [ 0.          0.          0.         ...  0.43007067  0.95855016
   -2.3931198 ]
  [ 

# Creating the Model

In [104]:
class TokenEmbedding(layers.Layer):
    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.num_hid = num_hid
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        #self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)
        '''
        self.pos_emb = tf.math.divide(
            self.positional_encoding(maxlen-1, num_hid),
            tf.math.sqrt(tf.cast(num_hid, tf.float32)))
        '''
        self.pos_emb = self.positional_encoding(maxlen-1, num_hid)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        x = self.emb(x)
        x = tf.math.multiply(x, tf.math.sqrt(tf.cast(self.num_hid, tf.float32)))
        '''
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        return x + positions
        '''
        return x + self.pos_emb[:maxlen, :]
    
    def positional_encoding(self, maxlen, num_hid):
        depth = num_hid/2
        positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
        depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
        angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
        angle_rads = tf.linalg.matmul(positions, angle_rates)
        pos_encoding = tf.concat(
          [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
          axis=-1) 
        return pos_encoding


class LandmarkEmbedding(layers.Layer):
    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, padding="same", activation="relu"
        )
        self.pos_emb = self.positional_encoding(maxlen, num_hid)
        self.maxlen = maxlen
        self.num_hid = num_hid

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        
        x = tf.math.multiply(x, tf.math.sqrt(tf.cast(self.num_hid, tf.float32)))
        x = x + self.pos_emb
        
        return x
    
    def positional_encoding(self, maxlen, num_hid):
        depth = num_hid/2
        positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
        depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
        angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
        angle_rads = tf.linalg.matmul(positions, angle_rates)
        pos_encoding = tf.concat(
          [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
          axis=-1) 
        return pos_encoding

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        mult = tf.concat(
            [batch_size[..., tf.newaxis], tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, enc_out, target, training):
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att, training = training))
        enc_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out, training = training) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out, training = training))
        return ffn_out_norm

class Transformer(keras.Model):
    def __init__(
        self,
        num_hid=64,
        num_head=4,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=2,
        num_layers_dec=1,
        num_classes=60,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.acc_metric = keras.metrics.Mean(name="edit_dist")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = LandmarkEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target, training):
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y, training)
        return y

    def call(self, inputs, training):
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source, training)
        y = self.decode(x, target, training)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        source = batch[0]
        target = batch[1]

        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        with tf.GradientTape() as tape:
            preds = self([source, dec_input])
            one_hot = tf.one_hot(dec_target, depth=self.num_classes)
            mask = tf.math.logical_not(tf.math.equal(dec_target, pad_token_idx))
            loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Computes the Levenshtein distance between sequences since the evaluation
        # metric for this contest is the normalized total levenshtein distance.
        edit_dist = tf.edit_distance(tf.sparse.from_dense(target), 
                                     tf.sparse.from_dense(tf.cast(tf.argmax(preds, axis=1), tf.int32)))
        edit_dist = tf.reduce_mean(edit_dist)
        self.acc_metric.update_state(edit_dist)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result(), "edit_dist": self.acc_metric.result()}

    def test_step(self, batch):        
        source = batch[0]
        target = batch[1]

        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        mask = tf.math.logical_not(tf.math.equal(dec_target, pad_token_idx))
        loss = self.compiled_loss(one_hot, preds, sample_weight=mask)
        # Computes the Levenshtein distance between sequences since the evaluation
        # metric for this contest is the normalized total levenshtein distance.
        edit_dist = tf.edit_distance(tf.sparse.from_dense(target), 
                                     tf.sparse.from_dense(tf.cast(tf.argmax(preds, axis=1), tf.int32)))
        edit_dist = tf.reduce_mean(edit_dist)
        self.acc_metric.update_state(edit_dist)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result(), "edit_dist": self.acc_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding."""
        bs = tf.shape(source)[0]
        enc = self.encoder(source, training = False)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input, training = False)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = logits[:, -1][..., tf.newaxis]
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input
    
    def generate_precisely(self, source, target_start_token_idx, how_many):
        bs = tf.shape(source)[0]
        enc = self.encoder(source, training = False)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        dec_logits = []
        for i in range(how_many):
            dec_out = self.decode(enc, dec_input, training = False)
            logits = self.classifier(dec_out)
            logits = tf.argmax(logits, axis=-1, output_type=tf.int32)
            last_logit = logits[:, -1][..., tf.newaxis]
            dec_logits.append(last_logit)
            dec_input = tf.concat([dec_input, last_logit], axis=-1)
        return dec_input

In [23]:
class DisplayOutputs(keras.callbacks.Callback):
    def __init__(
        self, batch, idx_to_token, target_start_token_idx=60, target_end_token_idx=61
    ):
        """Displays a batch of outputs after every 4 epoch

        Args:
            batch: A test batch
            idx_to_token: A List containing the vocabulary tokens corresponding to their indices
            target_start_token_idx: A start token index in the target vocabulary
            target_end_token_idx: An end token index in the target vocabulary
        """
        self.batch = batch
        self.target_start_token_idx = target_start_token_idx
        self.target_end_token_idx = target_end_token_idx
        self.idx_to_char = idx_to_token

    def on_epoch_end(self, epoch, logs=None):
        if epoch % 4 != 0:
            return
        source = self.batch[0]
        target = self.batch[1].numpy()
        bs = tf.shape(source)[0]
        preds = self.model.generate(source, self.target_start_token_idx)
        preds = preds.numpy()
        for i in range(bs):
            target_text = "".join([self.idx_to_char[_] for _ in target[i, :]])
            prediction = ""
            for idx in preds[i, :]:
                prediction += self.idx_to_char[idx]
                if idx == self.target_end_token_idx:
                    break
            print(f"target:     {target_text.replace('-','')}")
            print(f"prediction: {prediction}\n")

In [105]:
batch = next(iter(valid_ds))

idx_to_char = list(char_to_num.keys())
display_cb = DisplayOutputs(
    batch, idx_to_char, target_start_token_idx=char_to_num['<'], target_end_token_idx=char_to_num['>']
)

model = Transformer(
    num_hid=200,
    num_head=4,
    num_feed_forward=400,
    source_maxlen = FRAME_LEN,
    target_maxlen=64,
    num_layers_enc=2,
    num_layers_dec=1,
    num_classes=62,
)
loss_fn = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.1,
)


optimizer = keras.optimizers.Adam(0.0001)
model.compile(optimizer=optimizer, loss=loss_fn)

In [123]:
%%time
history = model.fit(train_ds, validation_data=valid_ds, verbose = 2, callbacks=[], epochs=7)

Epoch 1/7
1263/1263 - 720s - loss: 0.8114 - edit_dist: 1.0730 - val_loss: 0.6843 - val_edit_dist: 1.0715 - 720s/epoch - 570ms/step
Epoch 2/7
1263/1263 - 711s - loss: 0.5983 - edit_dist: 1.0668 - val_loss: 0.5544 - val_edit_dist: 1.0648 - 711s/epoch - 563ms/step
Epoch 3/7
1263/1263 - 683s - loss: 0.5230 - edit_dist: 1.0607 - val_loss: 0.5182 - val_edit_dist: 1.0595 - 683s/epoch - 541ms/step
Epoch 4/7
1263/1263 - 684s - loss: 0.4912 - edit_dist: 1.0567 - val_loss: 0.5009 - val_edit_dist: 1.0559 - 684s/epoch - 542ms/step
Epoch 5/7
1263/1263 - 684s - loss: 0.4706 - edit_dist: 1.0539 - val_loss: 0.4898 - val_edit_dist: 1.0532 - 684s/epoch - 542ms/step
Epoch 6/7
1263/1263 - 685s - loss: 0.4560 - edit_dist: 1.0517 - val_loss: 0.4827 - val_edit_dist: 1.0513 - 685s/epoch - 542ms/step
Epoch 7/7
1263/1263 - 685s - loss: 0.4444 - edit_dist: 1.0504 - val_loss: 0.4781 - val_edit_dist: 1.0501 - 685s/epoch - 542ms/step
CPU times: total: 16h 8min 15s
Wall time: 1h 20min 52s


# Testing

## On Test dataset

In [124]:
batches = [batch for batch in valid_ds]

preds_list = []
ground_truth_list = []

for batch in batches[:1]:
    source = batch[0]
    target = batch[1].numpy()
    bs = tf.shape(source)[0]
    preds = model.generate(source, start_token_idx)
    preds = preds.numpy()

    for i in range(bs):
        target_text = "".join([idx_to_char[_] for _ in target[i, :]])
        ground_truth_list.append(target_text.replace('P', ''))
        prediction = ""
        for idx in preds[i, :]:
            prediction += idx_to_char[idx]
            if idx == end_token_idx:
                break
        preds_list.append(prediction)

for i in range(30):
    print(ground_truth_list[i])
    print(preds_list[i])
    print('\n~~~\n')

<2796 west golden willow drive>
<279796 west gollen willow drive>

~~~

<973-471-9887>
<+44-21-41-99>

~~~

<497-723-6992>
<497-723-6928>

~~~

<reallyloud.co.uk/simaii>
<realloud.com.ru/simai>

~~~

<kkaicd1.pixnet.net>
<aicol-pig-ponet>

~~~

<8260 john r bowdoin>
<820 johnrbowdoin>

~~~

<56 paper birch drive>
<566 penbirch rdive>

~~~

<gand-chudai-hardcor.html>
<randandingarcarcer.com.com.tk>

~~~

<2708 west 77th>
<2708 west7th>

~~~

<https://www.keainfo.gr>
<https://www.kreafo.hth>

~~~

<288 fuller lake>
<28858 forlanerl>

~~~

<mser/oki-guide>
<mser/okide>

~~~

<220 north 47th avenue east>
<20 north panust peareas>

~~~

<www.sudinfo.be>
<www.suturafa.com>

~~~

<69 grant point>
<6691 paroira>

~~~

<fibrain.pl/chapter-145/thanghr>
<fibrain-1choter-4lan.hau>

~~~

<+351-521-8895>
<+335-3572-895>

~~~

<via-piero-gobetti/brandi-love>
<vingiera-gro-betindi.ndi.ve>

~~~

<amir le>
<amoir le>

~~~

<automantenimiento-sa>
<monymimiento-to-sa>

~~~

<6499 nfd 5053>
<649-998-5053>


## Real life testing

In [27]:
import cv2
from IPython.display import display, Image

In [201]:
# # Pose coordinates for hand movement.
# LPOSE = [13, 15, 17, 19, 21]
# RPOSE = [14, 16, 18, 20, 22]
# POSE = LPOSE + RPOSE

# X = [f'x_right_hand_{i}' for i in range(21)] + [f'x_left_hand_{i}' for i in range(21)] + [f'x_pose_{i}' for i in POSE]
# Y = [f'y_right_hand_{i}' for i in range(21)] + [f'y_left_hand_{i}' for i in range(21)] + [f'y_pose_{i}' for i in POSE]
# Z = [f'z_right_hand_{i}' for i in range(21)] + [f'z_left_hand_{i}' for i in range(21)] + [f'z_pose_{i}' for i in POSE]

# FEATURE_COLUMNS = X + Y + Z

from itertools import chain

# Pose coordinates for hand movement.
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = LPOSE + RPOSE

def extract_from_result(result):
    min_landmarks = 5
    is_empty = results.left_hand_landmarks and len(results.left_hand_landmarks.landmark) < min_landmarks and results.right_hand_landmarks and len(results.right_hand_landmarks.landmark) < min_landmarks
    
    
    # Extract specific pose landmarks if available
    px = []
    py = []
    pz = []
    if results.left_hand_landmarks and not is_empty:
        for i in POSE:
            lm = results.pose_landmarks.landmark[i]
            px.append(lm.x)
            py.append(lm.y)
            pz.append(lm.z)
    else:
        px = [0.0]*len(POSE)
        py = [0.0]*len(POSE)
        pz = [0.0]*len(POSE)

    # Extract left hand landmarks if available
    lx = []
    ly = []
    lz = []
    if results.left_hand_landmarks and not is_empty:
        for lm in results.left_hand_landmarks.landmark:
            lx.append(lm.x)
            ly.append(lm.y)
            lz.append(lm.z)
    else:
        lx = [0.0]*21
        ly = [0.0]*21
        lz = [0.0]*21

    # Extract right hand landmarks if available
    rx = []
    ry = []
    rz = []
    if results.right_hand_landmarks and not is_empty:
        for lm in results.right_hand_landmarks.landmark:
            rx.append(lm.x)
            ry.append(lm.y)
            rz.append(lm.z)
    else:
        rx = [0.0]*21
        ry = [0.0]*21
        rz = [0.0]*21

    return list(chain(rx, lx, px, ry, ly, py, rz, lz, pz))

In [209]:
# camera - 0
# video - "path/to/file"
file_or_camera = "bear.mp4"

In [210]:
# Többségi döntéses kiértékelés

from collections import Counter

fifo_size = 20
confidence_number = 13

model_data = []

inner_fifo = []
last_output = start_token_idx
whole_pred = ""
def process_model_output(new_output: int):
    global last_output
    global whole_pred
    
    whole_pred += idx_to_char[new_output]
    
    if len(inner_fifo) == fifo_size:
        predicted_idx = Counter(inner_fifo).most_common(1)[0][0]
        if last_output != predicted_idx:
            last_output = predicted_idx
            if predicted_idx == end_token_idx:
                last_output = start_token_idx
                print("stop")
                inner_fifo.clear()
            else:
                print(idx_to_char[predicted_idx])
            
    if len(inner_fifo) == fifo_size:
        inner_fifo.pop(0)
    inner_fifo.append(new_output)

model_start_token = start_token_idx
video = cv2.VideoCapture(file_or_camera)
display_handle=display(None, display_id=True)
i = -1
last_frame = None
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        while True:
            i += 1
            if i % 2 == 0:
                _, frame = video.read()
                last_frame = frame
            else:
                frame = last_frame

            if frame is None:
                break

            # Resize the frame to the target width and height
            image = cv2.resize(frame, (360, 240))

            # To improve performance, optionally mark the image as not writeable to
            # pass by reference.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            results = holistic.process(image)
            data = extract_from_result(results)
            if len(model_data) >= 50:
                model_data.pop(0)
            model_data.append(data)
            inp = pre_process(model_data)
            preds = model.generate_precisely(np.array([inp]), last_output, 1)
#             prediction = ""
#             for idx in preds[0, 1:]:
#                 prediction += idx_to_char[idx]
#                 last_char = idx_to_char[idx]
#                 if idx == end_token_idx:
#                     break
            process_model_output(preds[0, 1].numpy())
            

            # Draw landmark annotation on the image.
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            mp_drawing.draw_landmarks(
                image,
                results.face_landmarks,
                mp_holistic.FACEMESH_CONTOURS,
                landmark_drawing_spec=None,
                connection_drawing_spec=mp_drawing_styles
                .get_default_face_mesh_contours_style())

            mp_drawing.draw_landmarks(
                image,
                results.pose_landmarks,
                mp_holistic.POSE_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles
                .get_default_pose_landmarks_style())

            mp_drawing.draw_landmarks(
                image,
                results.left_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
            )

            mp_drawing.draw_landmarks(
                image,
                results.right_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
            )

            #frame = cv2.flip(image, 1)
            _, frame = cv2.imencode('.jpeg', image)
            display_handle.update(Image(data=frame.tobytes()))
except KeyboardInterrupt:
    pass
finally:
    video.release()
    display_handle.update(None)

None

b
e
 
b
e
a
r
u
r


In [188]:
whole_pred

''

In [216]:
# várjunk amíg megtelik egy frame aztán pred greedy módon

model_data = []
video = cv2.VideoCapture("whale.mp4")
display_handle=display(None, display_id=True)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5,min_tracking_confidence=0.5) as holistic:
        while True:
            _, frame = video.read()
            
            if frame is None:
                break

            # Resize the frame to the target width and height
            image = cv2.resize(frame, (360, 240))

            # To improve performance, optionally mark the image as not writeable to
            # pass by reference.
            image.flags.writeable = False
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            results = holistic.process(image)
            data = extract_from_result(results)
            if len(model_data) == FRAME_LEN:
                print("Frames filled up")
                break
                
            model_data.append(data)
           
            # Draw landmark annotation on the image.
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            mp_drawing.draw_landmarks(
                image,
                results.face_landmarks,
                mp_holistic.FACEMESH_CONTOURS,
                landmark_drawing_spec=None,
                connection_drawing_spec=mp_drawing_styles
                .get_default_face_mesh_contours_style())

            mp_drawing.draw_landmarks(
                image,
                results.pose_landmarks,
                mp_holistic.POSE_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles
                .get_default_pose_landmarks_style())

            mp_drawing.draw_landmarks(
                image,
                results.left_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
            )

            mp_drawing.draw_landmarks(
                image,
                results.right_hand_landmarks,
                mp_holistic.HAND_CONNECTIONS,
                landmark_drawing_spec=mp_drawing_styles.get_default_hand_landmarks_style()
            )

            #frame = cv2.flip(image, 1)
            _, frame = cv2.imencode('.jpeg', image)
            display_handle.update(Image(data=frame.tobytes()))
except KeyboardInterrupt:
    pass
finally:
    video.release()
    display_handle.update(None)
    

# Run the model
inp = pre_process(model_data)
preds = model.generate(np.array([inp]), start_token_idx)
prediction = ""
for idx in preds[0, 1:]:
    prediction += idx_to_char[idx]
    last_char = idx_to_char[idx]
    if idx == end_token_idx:
        break
        
prediction

None

'www.whale>'

In [218]:
# Változtatások:
# *teljes alakos kirajzolás
# *positional encoding eredmények jobbak lettek:
    # <2796 west golden willow drive>
    # <279796 west gollen willow drive>
    # ~~~
    # <973-471-9887>
    # <+44-21-41-99>
    # ~~~
    # <497-723-6992>
    # <497-723-6928>
    # ~~~
    # <reallyloud.co.uk/simaii>
    # <realloud.com.ru/simai>
    # ~~~
    # <kkaicd1.pixnet.net>
    # <aicol-pig-ponet>
    # ~~~
    # <8260 john r bowdoin>
    # <820 johnrbowdoin>
    # ~~~
    # <56 paper birch drive>
    # <566 penbirch rdive>
    # ~~~
    # <gand-chudai-hardcor.html>
    # <randandingarcarcer.com.com.tk>
    # ~~~
    # <2708 west 77th>
    # <2708 west7th>
    # ~~~
    # <https://www.keainfo.gr>
    # <https://www.kreafo.hth>
    # ~~~
    # <288 fuller lake>
    # <28858 forlanerl>
    # ~~~
    # <mser/oki-guide>
    # <mser/okide>
    # ~~~
    # <220 north 47th avenue east>
    # <20 north panust peareas>
    # ~~~
    # <www.sudinfo.be>
    # <www.suturafa.com>
    # ~~~
    # <69 grant point>
    # <6691 paroira>
    # ~~~
    # <fibrain.pl/chapter-145/thanghr>
    # <fibrain-1choter-4lan.hau>
    # ~~~
    # <+351-521-8895>
    # <+335-3572-895>
    # ~~~
    # <via-piero-gobetti/brandi-love>
    # <vingiera-gro-betindi.ndi.ve>
    # ~~~
    # <amir le>
    # <amoir le>
    # ~~~
    # <automantenimiento-sa>
    # <monymimiento-to-sa>
    # ~~~
    # <6499 nfd 5053>
    # <649-998-5053>
    # ~~~
    # <7870 preston place>
    # <9753 parroplangl plane>
    # ~~~
    # <6870 scabisuit lane>
    # <6870 scabisuisuit lane>
    # ~~~
    # <sunshine mayer>
    # <sutrashines/mayer>
    # ~~~
    # <408-249-4707>
    # <487-823-4430>
    # ~~~
    # <arabradio.us/vana>
    # <arabrasdio.rs/vana>
    # ~~~
    # <televisoresponse>
    # <60146000009>
    # ~~~
    # <www.voices.com/bitesize>
    # <www.voropes.com/besit>
    # ~~~
    # <qd.razavi.ac.ir/bunya>
    # <stda-da-vir-acirra>
    # ~~~
    # <+246-987-50-80-02-294>
    # <+246-987-50-80-80294>
    # ~~~
# *classification modellt nem sikerült betanítani továbbra sem
# (más modellekkel is próbálkoztam azok sem mentek, szóval valahogy az adattal lehet gond)
# találtam másik adathalmazt is hozzá következőkben nem 250 hanem sokkal kevesebb signt fogok neki betanítani
# *real worlddel való összekötés sikerei? olvasd tovább:
    # Kísérletek:
    # van előre egy szekvencia, és azt kell fordítani, az egész jól megy
    # ha viszont egy folyamatos videóból kell ráadásul egymással nem kapcsolatban lévő részeket fordítani az nem az igazi
    # 1. ötlet: mindig egy újabb karaktert generálunk, és egy fifóból többségi döntéssel prediktálunk
    #    eredmények:
            # Ha mindig start tokentől generáltatok akkor nem zavarodik össze de rossz a kimenet
            # Ha mindig az utolsó többségi döntés alapján meghatározott karakter szerint döntünk akkor az elején magabiztos, de utána összekavarodik
            # Ha mindig az utolsó predikciót használjuk akkor káosz
            # Szerintem nem kellene függni a generált kimeneti szekvenciától a következő kimenetnek
            # Implementing stopping migtht also help (so implement signing detection)
            # e kapcsán kipróbáltam mivan ha csak 1-1 szavakat adok oda neki nem sokkal jobb de kicsit igen
                # pl.: 2747473|hyenana|an
                # be bearur
            # További ötletek, máshogy tanítsuk a modellt, ne csak az utolsó karaktert vágjuk le hanem bármit

    # 2.ötlet: várjunk amíg össze nem gyűlik egy frame és greedy kiértékelést alkalmazva fordítsuk le
    # eredmények: (jobb, de van zaj benne bőven)
    # 'bearu>' - bear
    # 'janna ag>' - hyena
    # '/zebra>' - zebra
    # 'jattiger>' - tiger
    # 'www.whale>' - whale
    
# *címek:
    # 1. Hallássérult jelelők támogatása Mesterséges intelligencián alapuló automatizált fordítassal,
    # szekvenciális kamerakép adatok felhasználásával
    
    # 2. Hallássérult jelelők támogatasa Mesterséges intelligencián alapuló kamerakép szöveggé leképzésével,
    # valamint elterjedt szekvencia feldolgozásra alkalmas architektúrák összehasonlítása
    
    # 3. Jelnyelv szöveggé fordítása, szekvenciafeldolgozásra alkalmas mesterséges intelligencia modellekkel

# Továbbiakban:
# Ha sikerülne az összekötés lehetne más modelleknek utánanézni kipróbálni (már találtam pár cikket erről)
# A klasszifikációt meg kell végre szerelni (jó kérdés hogyan:)
# Saját kamerával való tesztelés nagyon lassan (hátha nagyon basic jelelőknek működni fog)
# Hogyan detektáljuk, hogy valaki jelel-e éppen (szerintem a modellek pontosságán lehetne vele javítani)


# Kérdések:
# Mennyire van megnyomkodva a program vagy a kód tdk/szakdogán
# Az egy oldalasba mit kell belerakni? (mivan ha nem tudom összehozni a kettőt együtt? fingerspelling and sign pred)
# Forráskódok hivatkozása?
