In [21]:
import tensorflow as tf
from tensorflow import keras
import json
import numpy as np
import cv2
import mediapipe as mp
import time
import os
import math
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.utils import to_categorical
from gtts import gTTS
from fractions import Fraction

In [22]:
# Load the WLASL metadata. JSON text is UTF-8, so request that explicitly
# instead of relying on the platform's default text encoding.
with open('dataset/WLASL/info.json', 'r', encoding='utf-8') as ds_info_f:
    ds_info = json.load(ds_info_f)

In [23]:
# Number of top-level entries loaded from info.json.
len(ds_info)

2000

In [24]:
# Unique glosses, gloss to video_id map, signer info with gloss + video_id

def get_required_info(ds_info, remove_missing=True):
    """Index the WLASL metadata by gloss and by signer.

    Args:
        ds_info: Iterable of gloss entries as loaded from ``info.json``. Each
            entry is read for its ``'gloss'`` and ``'instances'`` keys, and
            each instance for ``'video_id'`` and ``'signer_id'``.
        remove_missing: When true, skip instances whose
            ``dataset/WLASL/videos/<video_id>.mp4`` file does not exist.

    Returns:
        A ``(glosses, gloss_video_id_map, signer_info)`` tuple.
        ``glosses`` lists the gloss of every entry in input order, including
        glosses for which no instance was retained.
        ``gloss_video_id_map`` maps gloss -> sorted list of video ids.
        ``signer_info`` maps signer id -> gloss -> list of video ids.
        A gloss without any retained instance has no key in either mapping.
    """
    glosses = []
    gloss_video_id_map = {}
    signer_info = {}

    for word in ds_info:
        gloss = word['gloss']
        glosses.append(gloss)

        for inst in word['instances']:
            video_id = inst['video_id']
            if remove_missing and not os.path.exists(f'dataset/WLASL/videos/{video_id}.mp4'):
                continue

            gloss_video_id_map.setdefault(gloss, []).append(video_id)
            signer_info.setdefault(inst['signer_id'], {}).setdefault(gloss, []).append(video_id)

        # Every instance may have been filtered out (or the entry may have
        # none at all); the gloss then has no key and indexing it would raise.
        if gloss in gloss_video_id_map:
            gloss_video_id_map[gloss].sort()

    return glosses, gloss_video_id_map, signer_info

In [25]:
# Build the gloss list, the gloss -> video-id map and the per-signer index.
# remove_missing keeps its default (True), so instances without a local .mp4 are skipped.
glosses, gloss_video_id_map, signer_info = get_required_info(ds_info)

In [26]:
# Target vocabulary. The list order fixes each word's class index:
# label_map enumerates it, and generate_speech indexes it with the argmax of the prediction.
words = ['better', 'late', 'than', 'never']

In [27]:
# One-hot encode the class indices 0 .. len(words)-1; row i is the label of words[i].
y = to_categorical(list(range(len(words)))).astype(int)

In [28]:
# Pair every word with its one-hot row of y (same order as `words`).
label_map = dict(zip(words, y))
label_map

{'better': array([1, 0, 0, 0]),
 'late': array([0, 1, 0, 0]),
 'than': array([0, 0, 1, 0]),
 'never': array([0, 0, 0, 1])}

In [29]:
# Extract holistic landmarks for a fixed number of frames of a video file
# v_id: WLASL video id; the file is read from ./dataset/WLASL/videos/{v_id}.mp4
# n_frames: required number of frames to extract from the file

from fractions import Fraction

def save_holistic(path, holistic, image):
    """Run the holistic model on one BGR frame and save its landmarks to ``path``.

    The saved 1-D array is the concatenation, in this order, of
    pose (33 x [x, y, z, visibility]), face (468 x [x, y, z]),
    left hand (21 x [x, y, z]) and right hand (21 x [x, y, z]).
    Any part the model did not return is replaced by zeros of the same length.
    """
    def flatten_part(detected, fields, count):
        # A missing detection becomes a zero block of the expected length.
        if not detected:
            return np.zeros(count * len(fields))
        rows = [[getattr(point, field) for field in fields] for point in detected.landmark]
        return np.array(rows).flatten()

    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Mark the frame read-only while the model processes it, then restore.
    rgb.flags.writeable = False
    results = holistic.process(rgb)
    rgb.flags.writeable = True

    xyz = ('x', 'y', 'z')
    parts = [
        flatten_part(results.pose_landmarks, xyz + ('visibility',), 33),
        flatten_part(results.face_landmarks, xyz, 468),
        flatten_part(results.left_hand_landmarks, xyz, 21),
        flatten_part(results.right_hand_landmarks, xyz, 21),
    ]
    np.save(path, np.concatenate(parts))
    
def sequential_frames(v_id, src, holistic, n_frames):
    """Write landmark files ``0.npy`` .. ``{n_frames - 1}.npy`` from consecutive reads.

    One frame is read from ``src`` per slot. A slot whose read fails (for
    example because the video has run out of frames) is stored as an
    all-zero vector of length 1662 instead of model output.
    """
    frame_idx = 0
    while frame_idx < n_frames:
        ok, frame = src.read()
        target = f'./wlasl-info/landmarks-sentence/{v_id}/{frame_idx}.npy'
        if not ok:
            np.save(target, np.zeros((1662, )))
        else:
            save_holistic(target, holistic, frame)
        frame_idx += 1

def frames_from_video(v_id, n_frames):
    """Extract holistic landmarks for ``n_frames`` frames of one WLASL video.

    Reads ``./dataset/WLASL/videos/{v_id}.mp4`` and writes ``0.npy`` ..
    ``{n_frames - 1}.npy`` into ``./wlasl-info/landmarks-sentence/{v_id}/``
    (created if necessary; existing files are overwritten).

    Sampling:
        * If the reported frame count is at most ``n_frames``, frames are
          taken consecutively via ``sequential_frames``.
        * Otherwise ``video_length / n_frames`` is rounded to one decimal and
          approximated by a fraction ``num/den``. The video is then consumed
          in repeating groups: ``den`` consecutive frames are kept, the next
          ``num - den`` frames are skipped.

    A frame that cannot be read is stored as a zero vector of length 1662.

    Args:
        v_id: Video id (file stem of the ``.mp4``).
        n_frames: Number of landmark files to produce.
    """
    mp_holistic = mp.solutions.holistic
    out_dir = f'./wlasl-info/landmarks-sentence/{v_id}'

    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        src = cv2.VideoCapture(f'./dataset/WLASL/videos/{v_id}.mp4')
        try:
            video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)
            # os.makedirs takes permission *bits*: the former literal 777 is
            # decimal (0o1411), not rwxrwxrwx, and could leave the directory
            # unwritable. Use the default mode, and tolerate re-runs.
            os.makedirs(out_dir, exist_ok=True)

            if video_length <= n_frames:
                sequential_frames(v_id, src, holistic, n_frames)
            else:
                f = 0
                ratio = round(video_length/n_frames, 1)
                frac = Fraction(ratio).limit_denominator(10)

                # For these two ratios a coarser fraction is used
                # (3/2 and 2/1 respectively) instead of 17/10 and 19/10.
                if ratio == 1.7 or ratio == 1.9:
                    frac = Fraction(ratio).limit_denominator(2)

                num, den = frac.numerator, frac.denominator
                while f < n_frames:
                    # Keep up to `den` consecutive frames ...
                    for _ in range(min(den, n_frames-f)):
                        ret, frame = src.read()
                        if ret:
                            save_holistic(f'{out_dir}/{f}.npy', holistic, frame)
                        else:
                            np.save(f'{out_dir}/{f}.npy', np.zeros((1662, )))
                        f += 1
                    # ... then discard the next `num - den` frames.
                    for _ in range(num-den):
                        src.read()
        finally:
            # Release the capture even if reading or saving fails.
            src.release()

In [10]:
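# One-off preprocessing, kept commented out: writes the landmark files for every video of the target words.
# for gloss in words:
#     for v_id in gloss_video_id_map[gloss]:
#         frames_from_video(v_id, 30)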

In [30]:
def get_train_val_test_data(glosses, label_map, split_ratio=(0.8, 0.9), video_id_map=None):
    """Split each gloss's video ids into train / validation / test partitions.

    The split is done per gloss, in the stored order of its video ids (no
    shuffling): the first ``split_ratio[0]`` fraction goes to train, the
    range up to ``split_ratio[1]`` to validation, the rest to test. Cut
    points are truncated to integers.

    Args:
        glosses: Glosses to include; each must be a key of ``label_map`` and
            of the video-id mapping.
        label_map: Mapping gloss -> label (e.g. a one-hot array).
        split_ratio: Two cumulative fractions ``(train_end, val_end)``.
        video_id_map: Mapping gloss -> list of video ids. When omitted, the
            notebook-level ``gloss_video_id_map`` is used, as before.

    Returns:
        Six NumPy arrays: ``train_ids, train_labels, val_ids, val_labels,
        test_ids, test_labels``. Each labels array has one entry per id.
    """
    if video_id_map is None:
        # Backward-compatible fallback to the notebook global.
        video_id_map = gloss_video_id_map

    train_ids, train_labels = [], []
    val_ids, val_labels = [], []
    test_ids, test_labels = [], []

    for gloss in glosses:
        v_ids = video_id_map[gloss]
        split_1 = int(len(v_ids) * split_ratio[0])
        split_2 = int(len(v_ids) * split_ratio[1])
        label = label_map[gloss]

        partitions = (
            (train_ids, train_labels, v_ids[:split_1]),
            (val_ids, val_labels, v_ids[split_1:split_2]),
            (test_ids, test_labels, v_ids[split_2:]),
        )
        for ids_out, labels_out, chunk in partitions:
            ids_out.extend(chunk)
            # Size labels from the actual slice so ids and labels always align.
            labels_out.extend([label] * len(chunk))

    return (
        np.array(train_ids), np.array(train_labels),
        np.array(val_ids), np.array(val_labels),
        np.array(test_ids), np.array(test_labels),
    )

In [31]:
# Return order of get_train_val_test_data:
# a / b = train video ids / labels, c / d = validation ids / labels, e / f_ = test ids / labels.
a, b, c, d, e, f_ = get_train_val_test_data(words, label_map)

In [32]:
# Shapes in the order: train ids, train labels, val ids, val labels, test ids, test labels.
a.shape, b.shape, c.shape, d.shape, e.shape, f_.shape

((37,), (37, 4), (5,), (5, 4), (7,), (7, 4))

In [33]:
class DataGenerator(tf.keras.utils.Sequence):
    """Batch generator over pre-extracted landmark windows stored as ``.npy`` files.

    Each sample is the stack of ``n_frames`` arrays loaded from
    ``./wlasl-info/landmarks-sentence/{v_id}/{frame}.npy``.

    Args:
        v_ids: Video ids, one per sample.
        labels: Labels aligned with ``v_ids``.
        n_frames: Number of frame files loaded per video.
        batch_size: Samples per batch.
        training: When true, the sample order is shuffled at construction
            and again in ``on_epoch_end``.
        drop_remainder: When true, a trailing batch smaller than
            ``batch_size`` is skipped (the previous behaviour). By default it
            is kept so that no sample is silently left out.
    """

    def __init__(self, v_ids, labels, n_frames, batch_size = 2, training = False, drop_remainder = False):
        # Run the base-class constructor; newer Keras releases expect
        # subclasses to do so, and it is harmless on older ones.
        super().__init__()
        # __getitem__ indexes with an index array, which needs ndarrays.
        self.v_ids = np.asarray(v_ids)
        self.labels = np.asarray(labels)
        self.n_frames = n_frames
        self.training = training
        self.batch_size = batch_size
        self.drop_remainder = drop_remainder
        self.indexes = np.arange(len(self.v_ids))
        if self.training:
            np.random.shuffle(self.indexes)

    def get_video_frames(self, v_ids, classes):
        """Load the landmark windows for ``v_ids`` and pair them with ``classes``.

        Returns:
            ``(X, y)`` where ``X[i]`` stacks the ``n_frames`` frame arrays of
            ``v_ids[i]`` along axis 0 and ``y[i]`` is ``classes[i]``.
        """
        result_X = []
        result_y = []
        for index, v_id in enumerate(v_ids):
            window = [
                np.load(f'./wlasl-info/landmarks-sentence/{v_id}/{f}.npy')
                for f in range(self.n_frames)
            ]
            result_X.append(np.stack(window, axis=0))
            result_y.append(classes[index])

        return np.array(result_X), np.array(result_y)

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.get_video_frames(self.v_ids[indexes], self.labels[indexes])

    def __len__(self):
        """Number of batches per epoch."""
        n_samples = len(self.v_ids)
        if self.drop_remainder:
            return n_samples // self.batch_size
        # Include the final, possibly shorter, batch.
        return math.ceil(n_samples / self.batch_size)

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when training."""
        if self.training:
            np.random.shuffle(self.indexes)

In [34]:
# Generators over the train and validation splits, 30 landmark frames per video.
# Only the training generator shuffles its sample order.
train_gen = DataGenerator(v_ids=a, labels=b, n_frames=30, training=True)
val_gen = DataGenerator(v_ids=c, labels=d, n_frames=30)

In [35]:
# Load the previously saved Keras model from models/wlasl-sentence.
model = keras.models.load_model('models/wlasl-sentence')



In [36]:
def generate_speech(folder, model, words, n_frames=30):
    """Predict one word per clip under ``folder`` and speak the resulting sentence.

    Every immediate sub-directory of ``folder`` whose name is a number is
    treated as one clip; clips are processed in ascending numeric order.
    For each clip the files ``0.npy`` .. ``{n_frames - 1}.npy`` are loaded,
    passed to ``model.predict`` as a batch of one, and the arg-max index is
    looked up in ``words``. The words are joined with spaces, synthesised
    with gTTS into ``output.mp3`` and handed to the shell to be opened.

    Args:
        folder: Directory containing the numbered clip sub-directories.
        model: Object with a Keras-style ``predict`` method.
        words: Class-index -> word lookup (same order as used for training).
        n_frames: Number of frame files to load per clip.

    Returns:
        The predicted sentence.
    """
    # Only direct, numerically named children are clips. Walking the whole
    # tree would also count nested or unrelated directories.
    clip_names = sorted(
        (entry.name for entry in os.scandir(folder) if entry.is_dir() and entry.name.isdigit()),
        key=int,
    )

    sentence = []
    for clip in clip_names:
        # Load from the folder that was passed in, not a hard-coded location.
        window = [np.load(os.path.join(folder, clip, f'{f}.npy')) for f in range(n_frames)]
        scores = model.predict(np.array([window]))
        sentence.append(words[np.argmax(scores)])

    sentence = ' '.join(sentence)
    speech = gTTS(text=sentence, lang='en', slow=False)
    speech.save("output.mp3")
    # NOTE: `start` is the Windows shell launcher; on other platforms this
    # call will not open a player.
    os.system("start output.mp3")
    return sentence

In [37]:
# Predict one word per clip folder of the test sentence, then synthesise and open output.mp3.
generate_speech('./wlasl-info/landmarks-sentence-test/', model, words)

