# Download dependencies

In [7]:
# Run `pip list` in a shell to show the installed packages and their versions.
!pip list

Package                                  Version
---------------------------------------- ---------------
absl-py                                  1.4.0
aiofiles                                 22.1.0
aiohttp                                  3.9.3
aiohttp-cors                             0.7.0
aiorwlock                                1.4.0
aiosignal                                1.3.1
aiosqlite                                0.19.0
annotated-types                          0.6.0
anyio                                    4.2.0
apache-beam                              2.46.0
archspec                                 0.2.2
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
array-record                             0.5.0
arrow                                    1.3.0
asttokens                                2.4.1
astunparse                               1.6.3
async-timeout                            4.0.3
attrs                                    23

In [8]:
!pip install opencv-python matplotlib imageio gdown tensorflow



In [9]:
import os
import cv2
import tensorflow as tf
import numpy as np
from typing import List
from matplotlib import pyplot as plt
import imageio

2024-03-07 08:59:55.668614: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-07 08:59:56.482056: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-07 08:59:59.100570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib/python3.10/site-packages/cv2/../../lib64:/usr/local/cuda/lib64:/usr/loca

# 1. Build Data Loading Functions

In [30]:
def load_video(path:str) -> "tf.Tensor":
    '''
    Load a video file and return its cropped, grayscale, normalized frames.

    Each frame read with OpenCV is converted with tf.image.rgb_to_grayscale and
    cropped to rows 190:236 and columns 80:220. The collected frames are then
    normalized by subtracting their mean and dividing by their standard deviation.

    Args:
        path: location of the video file, handed to cv2.VideoCapture.

    Returns:
        A float32 tensor holding one cropped frame per successfully read frame.

    Raises:
        ValueError: if no frame could be read (e.g. missing, unreadable or empty file).
    '''
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count is not a guarantee that every frame decodes;
                # stop here instead of passing a missing frame to TensorFlow.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236,80:220,:])
    finally:
        # Release the capture handle even if decoding or conversion raised.
        cap.release()

    if not frames:
        raise ValueError(f"Could not read any frames from video: {path!r}")

    # NOTE(review): the mean is computed and subtracted *before* the cast to float32,
    # so if the frames are integer-typed (as 8-bit OpenCV frames presumably are) this
    # happens in integer arithmetic. Left unchanged on purpose: altering it would change
    # the inputs that already-trained weights expect. Confirm before touching.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

In [31]:
# Character vocabulary used for the labels: the lowercase letters a-z, the
# apostrophe, '?', '!', the digits 1-9 and the space character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

In [32]:
# Two Keras StringLookup layers built from `vocab`:
#   char_to_num maps characters to integer indices,
#   num_to_char is the inverted lookup (indices back to characters).
# Both use the empty string as the out-of-vocabulary token.
char_to_num = tf.keras.layers.StringLookup(oov_token="", vocabulary=vocab)
num_to_char = tf.keras.layers.StringLookup(
    invert=True,
    oov_token="",
    vocabulary=char_to_num.get_vocabulary(),
)

print(f"The vocabulary is: {char_to_num.get_vocabulary()} (size ={char_to_num.vocabulary_size()})")

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' '] (size =40)


In [33]:
# char_to_num.get_vocabulary()

In [34]:
# char_to_num(['n','i','c','k'])

In [35]:
# num_to_char([14,  9,  3, 11])

In [36]:
def load_alignments(path:str) -> "tf.Tensor":
    '''
    Load an alignment file and encode its tokens as character indices.

    Every line is split on whitespace and its third field is taken as the token.
    Tokens equal to 'sil' are dropped. The remaining tokens are separated by single
    spaces, split into individual characters and mapped through char_to_num.

    Args:
        path: location of the alignment file.

    Returns:
        A 1-D tensor of character indices as produced by char_to_num, without the
        leading separator space.
    '''
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines carry no token; skip them rather than
            # failing with an IndexError on fields[2].
            if len(fields) < 3 or fields[2] == 'sil':
                continue
            # Each token is preceded by a space; the very first one is stripped below.
            tokens.extend((' ', fields[2]))
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

In [37]:
def load_data(path: str):
    '''
    Load the frames and the encoded alignments that belong to one video.

    Only the file stem of `path` is used: the video is read from
    data/s1/<stem>.mpg and the alignments from data/alignments/s1/<stem>.align.

    Args:
        path: path of the video, given as a string tensor (as passed in by
            tf.py_function), as bytes, or as a plain str. Both '/' and '\\'
            are accepted as separators.

    Returns:
        A (frames, alignments) tuple as returned by load_video and load_alignments.
    '''
    # Unwrap a tensor to its raw value, then decode bytes to text.
    if hasattr(path, 'numpy'):
        path = path.numpy()
    if isinstance(path, bytes):
        path = path.decode('utf-8')

    # Normalize Windows separators so the stem is extracted the same way on every
    # platform, and strip only the final extension.
    file_name = os.path.splitext(os.path.basename(path.replace('\\', '/')))[0]

    video_path = os.path.join('data','s1',f'{file_name}.mpg')
    alignment_path = os.path.join('data','alignments','s1',f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)

    return frames, alignments

In [46]:
# type(frames)

In [39]:
# test_path = '.\\data\\s1\\bbal6n.mpg'

In [47]:
# test_path_2 = './data/s1/bbal6n.mpg'

In [48]:
# tf.convert_to_tensor(test_path).numpy().decode('utf-8').split('\\')[-1].split('.')[0]

In [49]:
# frames, alignments = load_data(tf.convert_to_tensor(test_path_2))

In [50]:
# plt.imshow(frames[40])

In [51]:
# type(alignments)

In [52]:
# type(frames)

In [54]:
#Overall, this line of code takes the numerical alignments, converts them into strings using the num_to_char mapping,
#and then concatenates them into a single string representation.

# tf.strings.reduce_join([bytes.decode(x) for x in num_to_char(alignments.numpy()).numpy()])

In [55]:
def mappable_function(path:str) ->List[str]:
    '''
    Adapter that lets load_data be used from Dataset.map.

    load_data is wrapped in tf.py_function; the wrapped call is declared to
    produce a (tf.float32, tf.int64) pair, which is returned unchanged.
    '''
    output_types = (tf.float32, tf.int64)
    return tf.py_function(func=load_data, inp=[path], Tout=output_types)

# 2. Create Data Pipeline

In [57]:
from matplotlib import pyplot as plt

In [58]:
# Build the tf.data input pipeline: list the video files, shuffle them once, load
# each video with its alignments, pad/batch in pairs, prefetch, and finally split
# the batches into a training and a test set.
#
# list_files shuffles the file names by default and does so independently for every
# iterator. `train` and `test` below each iterate `data` on their own, so with the
# default they would see different orders and the two splits could overlap.
# Listing deterministically and shuffling exactly once (fixed seed, no reshuffling,
# buffer covering every file) keeps the splits disjoint and reproducible.
data = tf.data.Dataset.list_files('./data/s1/*.mpg', shuffle=False)
data = data.shuffle(max(500, len(data)), seed=42, reshuffle_each_iteration=False)
data = data.map(mappable_function)
data = data.padded_batch(2, padded_shapes=([75,None,None,None],[40]))
data = data.prefetch(tf.data.AUTOTUNE)
# Split by batch: the first 450 batches are used for training, the rest for testing.
train = data.take(450)
test = data.skip(450)

In [90]:
# len(data)

501

In [62]:
#this line of code fetches the next batch of data from the TensorFlow dataset data and converts it into NumPy arrays for further processing or analysis in the notebook.

# frames, alignments = data.as_numpy_iterator().next()

In [63]:
# sample = data.as_numpy_iterator()

In [64]:
# len(sample)

In [65]:
# val = sample.next(); val[0]

In [65]:
# len(val[0][1])

75

In [66]:
# This snippet saves the frames of the first video in the batch as an animated GIF: it squeezes out the singleton dimension, multiplies the pixel values by 255, and writes the result with imageio.mimsave() at 10 fps.

# imageio.mimsave('./animation.gif', np.squeeze(val[0][0])* 255, fps=10)

In [67]:
# plt.imshow(val[0][0][74])

In [68]:
# tf.strings.reduce_join([num_to_char(word) for word in val[1][1]])

# 3. Design the Deep Neural Network

In [69]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler

In [71]:
# Retrieves the shape of the first video in the first batch of the TensorFlow dataset `data`.

# data.as_numpy_iterator().next()[0][0].shape

In [72]:
# Model architecture
def initiate_model():
    '''
    Build and return the (uncompiled) Sequential network.

    Layout: three Conv3D -> ReLU -> MaxPool3D stages (128, 256 and 75 filters),
    a per-timestep Flatten, two bidirectional LSTM layers each followed by
    dropout, and a softmax Dense layer with vocabulary_size + 1 outputs.
    '''
    model = Sequential()

    # Convolutional stages; only the first one declares the input shape.
    conv_stages = (
        (128, {'input_shape': (75,46,140,1)}),
        (256, {}),
        (75, {}),
    )
    for filters, extra_kwargs in conv_stages:
        model.add(Conv3D(filters, 3, padding='same', **extra_kwargs))
        model.add(Activation('relu'))
        model.add(MaxPool3D((1,2,2)))

    # Flatten the feature maps of every timestep independently.
    model.add(TimeDistributed(Flatten()))

    # Two identical recurrent stages.
    for _ in range(2):
        model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
        model.add(Dropout(.5))

    n_outputs = char_to_num.vocabulary_size()+1
    model.add(Dense(n_outputs, kernel_initializer='he_normal', activation='softmax'))

    return model

In [73]:
# Build the network defined in initiate_model().
model = initiate_model()
# model.summary()

In [74]:
# yhat = model.predict(val[0])

In [75]:
# tf.strings.reduce_join([num_to_char(x) for x in tf.argmax(yhat[0],axis=1)])

In [76]:
# model.input_shape

In [77]:
# model.output_shape

# 4. Setup Training Options and Train

In [78]:
def scheduler(epoch, lr):
    '''
    Learning-rate schedule for the LearningRateScheduler callback.

    The rate is left unchanged for epochs 0-29; from epoch 30 on it is multiplied
    by exp(-0.1) on every call, i.e. it decays exponentially.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate.

    Returns:
        The learning rate to use, always as a plain Python float (the callback
        expects a float, not a tensor).
    '''
    if epoch < 30:
        return float(lr)
    return float(lr * np.exp(-0.1))

In [79]:
def CTCLoss(y_true, y_pred):
    '''
    CTC loss between the label batch and the prediction batch.

    Every sample is given the same input length (second dimension of y_pred) and
    the same label length (second dimension of y_true); both are passed as int64
    column vectors with one row per sample to tf.keras.backend.ctc_batch_cost.
    '''
    n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
    # Column of ones, one row per sample, used to broadcast the scalar lengths.
    per_sample_ones = tf.ones(shape=(n_samples, 1), dtype="int64")

    pred_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

    return tf.keras.backend.ctc_batch_cost(
        y_true,
        y_pred,
        pred_steps * per_sample_ones,
        label_steps * per_sample_ones,
    )

In [80]:
class ProduceExample(tf.keras.callbacks.Callback):
    '''
    Callback that prints original and predicted sentences for one batch at the end
    of every epoch, as a quick qualitative check of training progress.

    Args:
        dataset: a dataset yielding (frames, alignments) batches; one batch is
            consumed per epoch.
    '''
    def __init__(self, dataset) -> None:
        super().__init__()
        # Keep the dataset itself so the iterator can be recreated once exhausted.
        self._source = dataset
        self.dataset = dataset.as_numpy_iterator()

    def _next_batch(self):
        '''Return the next batch, restarting the iterator when it runs out.'''
        try:
            return next(self.dataset)
        except StopIteration:
            # The dataset holds fewer batches than there are epochs: start over
            # instead of aborting training.
            self.dataset = self._source.as_numpy_iterator()
            return next(self.dataset)

    def on_epoch_end(self, epoch, logs=None) -> None:
        data = self._next_batch()
        yhat = self.model.predict(data[0])
        # One input length per sample, taken from the prediction itself so that a
        # batch of any size (e.g. a final partial batch) decodes correctly.
        input_length = [yhat.shape[1]] * len(yhat)
        decoded = tf.keras.backend.ctc_decode(yhat, input_length, greedy=False)[0][0].numpy()
        for x in range(len(yhat)):
            print('Original:', tf.strings.reduce_join(num_to_char(data[1][x])).numpy().decode('utf-8'))
            print('Prediction:', tf.strings.reduce_join(num_to_char(decoded[x])).numpy().decode('utf-8'))
            print('~'*100)

In [81]:
# Model compilation
def compile_model(model):
    '''
    Compile `model` with the legacy Adam optimizer (learning rate 0.0001) and
    CTCLoss as the loss function, then return the same model object.
    '''
    adam = tf.keras.optimizers.legacy.Adam(learning_rate=0.0001)
    model.compile(loss=CTCLoss, optimizer=adam)
    return model

In [82]:
# Compile the model with the optimizer and loss set up in compile_model().
model = compile_model(model)

In [86]:
# ModelCheckpoint callback that writes the model's weights only (save_weights_only=True)
# to models/checkpoint.
# NOTE(review): save_best_only is not set and save_freq=1 is an integer, which the Keras
# documentation describes as "save every N batches" - so as written this does not wait for
# the monitored loss to improve. Confirm whether save_freq='epoch' (optionally together
# with save_best_only=True) was intended.
checkpoint_callback = ModelCheckpoint(os.path.join('models','checkpoint'), monitor='loss', save_freq=1, save_weights_only=True) 

In [87]:
# LearningRateScheduler callback driven by the scheduler() function defined above.
schedule_callback = LearningRateScheduler(scheduler)

In [88]:
# Callback that prints original vs. predicted sentences for one batch of the test set
# at the end of each epoch (see ProduceExample).
example_callback = ProduceExample(test)

In [None]:
# Train on `train` for 100 epochs, validating on `test`, with the checkpoint,
# learning-rate-schedule and example-printing callbacks attached.
history = model.fit(train, validation_data=test, epochs=100, callbacks=[checkpoint_callback, schedule_callback, example_callback])

Epoch 1/100

[mpeg1video @ 0x7f59b0036180] ac-tex damaged at 22 17




[mpeg1video @ 0x7f59dc0662c0] ac-tex damaged at 22 17




[mpeg1video @ 0x7f5a000186c0] ac-tex damaged at 22 17


Original: bin red by m six now
Prediction: bin red by m six now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: place red in v seven again
Prediction: place red in v seven again
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 7/100
  8/450 [..............................] - ETA: 23:55 - loss: 2.7282

[mpeg1video @ 0x7f59f4014ec0] ac-tex damaged at 22 17




[mpeg1video @ 0x7f5a405bd940] ac-tex damaged at 22 17


Original: set blue by n six now
Prediction: set blue by n six now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: set blue by h two now
Prediction: set blue by h two now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 8/100

[mpeg1video @ 0x7f59e801e180] ac-tex damaged at 22 17




[mpeg1video @ 0x7f598c11a1c0] ac-tex damaged at 22 17


Original: lay white in e five soon
Prediction: lay white in e five son
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: bin red in f six please
Prediction: bin red in f six please
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 9/100

[mpeg1video @ 0x7f59b802dc80] ac-tex damaged at 22 17




[mpeg1video @ 0x7f59d0003200] ac-tex damaged at 22 17


Original: lay white in r four please
Prediction: lay white in r four please
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: bin green by u three soon
Prediction: bin gren by u thre son
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 10/100

[mpeg1video @ 0x7f59f405d200] ac-tex damaged at 22 17




[mpeg1video @ 0x7f5a1402ef00] ac-tex damaged at 22 17


Original: bin green with b seven again
Prediction: bin gren with b seven again
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: lay red in k two now
Prediction: lay red in k two now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 34/100

[mpeg1video @ 0x7f59f8039a00] ac-tex damaged at 22 17




[mpeg1video @ 0x7f5a7c0d6040] ac-tex damaged at 22 17


Original: lay red in d eight now
Prediction: lay red in d eight now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: bin red at f eight now
Prediction: bin red at f eight now
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 35/100

[mpeg1video @ 0x7f59a40b0540] ac-tex damaged at 22 17


Original: place red at p five soon
Prediction: place red at p five son
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: set white by c one soon
Prediction: set white by c one son
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 36/100
 32/450 [=>............................] - ETA: 22:33 - loss: 2.4567

[mpeg1video @ 0x7f59d0078ac0] ac-tex damaged at 22 17




[mpeg1video @ 0x7f59f4096a80] ac-tex damaged at 22 17


Original: place blue at c one soon
Prediction: place blue at c one son
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Original: place red with q four please
Prediction: place red with q four please
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Epoch 37/100

# 5. Make a Prediction 

In [268]:
# import gdown

In [269]:
# url = 'https://drive.google.com/uc?id=1vWscXs4Vt0a_1IH1-ct2TCgXAZT-N3_Y'
# output = 'checkpoints.zip'
# gdown.download(url, output, quiet=False)
# gdown.extractall('checkpoints.zip', 'models_yt')

Downloading...
From (original): https://drive.google.com/uc?id=1vWscXs4Vt0a_1IH1-ct2TCgXAZT-N3_Y
From (redirected): https://drive.google.com/uc?id=1vWscXs4Vt0a_1IH1-ct2TCgXAZT-N3_Y&confirm=t&uuid=f12e3f1f-4d24-43da-94a1-4c3dce24a04e
To: /home/jupyter/checkpoints.zip
100%|██████████| 94.5M/94.5M [00:03<00:00, 24.9MB/s]


['models/checkpoint.index',
 'models/__MACOSX/._checkpoint.index',
 'models/checkpoint.data-00000-of-00001',
 'models/__MACOSX/._checkpoint.data-00000-of-00001',
 'models/checkpoint',
 'models/__MACOSX/._checkpoint']

In [38]:
# Load the weights stored under models/checkpoint into the model.
model.load_weights('models/checkpoint')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f199c443ee0>

In [67]:
# Iterator over the test batches that yields NumPy values.
test_data = test.as_numpy_iterator()
test_data

<tensorflow.python.data.ops.dataset_ops._NumpyIterator at 0x7f199c324040>

In [40]:
# Take the next batch from the test iterator: a (frames, alignments) pair.
sample = test_data.next()
# sample

[mpeg1video @ 0x7f18f40b5ac0] ac-tex damaged at 22 17


In [55]:
# Number of elements in the batch tuple (frames and alignments).
len(sample)

2

In [41]:
# Run the model on the frames of the sampled batch.
yhat = model.predict(sample[0])



In [42]:
# Ground truth: map each label sequence back to characters and join them into one string.
print('~'*100, 'REAL TEXT')
[tf.strings.reduce_join(num_to_char(sentence)) for sentence in sample[1]]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ REAL TEXT


[<tf.Tensor: shape=(), dtype=string, numpy=b'place white at q three again'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'lay red in e one again'>]

In [43]:
# Greedy CTC decoding. The per-sample input lengths are derived from the prediction
# (time steps x batch size) instead of being hard-coded for a batch of exactly two.
decoded = tf.keras.backend.ctc_decode(yhat, input_length=[yhat.shape[1]] * len(yhat), greedy=True)[0][0].numpy()

In [44]:
# Predictions: map each decoded index sequence back to characters and join them.
print('~'*100, 'PREDICTIONS')
[tf.strings.reduce_join(num_to_char(sentence)) for sentence in decoded]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PREDICTIONS


[<tf.Tensor: shape=(), dtype=string, numpy=b'place white at q three again'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'lay red in e one again'>]

# Test on a Video

In [68]:
# Load one video and its alignments directly with load_data (bypassing the tf.data pipeline).
sample = load_data(tf.convert_to_tensor('./data/s1/bbab8n.mpg'))

In [69]:
# Ground truth for the single video: indices back to characters, joined into one string.
print('~'*100, 'REAL TEXT')
[tf.strings.reduce_join(num_to_char(sentence)) for sentence in [sample[1]]]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ REAL TEXT


[<tf.Tensor: shape=(), dtype=string, numpy=b'bin blue at b eight now'>]

In [70]:
# Add a leading batch dimension to the frames before predicting on the single video.
yhat = model.predict(tf.expand_dims(sample[0], axis=0))



In [71]:
# Greedy CTC decoding; the input length is taken from the prediction's time dimension
# rather than being hard-coded.
decoded = tf.keras.backend.ctc_decode(yhat, input_length=[yhat.shape[1]] * len(yhat), greedy=True)[0][0].numpy()

In [72]:
# Prediction for the single video: decoded indices back to characters, joined.
print('~'*100, 'PREDICTIONS')
[tf.strings.reduce_join(num_to_char(sentence)) for sentence in decoded]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PREDICTIONS


[<tf.Tensor: shape=(), dtype=string, numpy=b'bin wree at o one soon'>]