In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import cv2
import imageio as im
import os

2025-01-23 13:56:58.845212: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-23 13:56:58.862561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-23 13:56:58.882617: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8473] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-23 13:56:58.888407: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1471] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-23 13:56:58.901918: I tensorflow/core/platform/cpu_feature_guar

In [2]:
import cv2
import numpy as np

def preprocess_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        mouth_crop = gray_frame[190:236, 80:220]  # Crop mouth region
        frames.append(mouth_crop)
    cap.release()

    # Normalize frames
    frames = np.array(frames, dtype=np.float32)
    mean = np.mean(frames)
    std = np.std(frames)
    normalized_frames = (frames - mean) / std
    return normalized_frames

In [3]:
def parse_alignment(alignment_path, vocab):
    with open(alignment_path, 'r') as f:
        lines = f.readlines()
    
    tokens = []
    for line in lines:
        start, end, word = line.strip().split()
        if word != "sil":
            tokens.append(" ")
            tokens.extend(list(word))
    return [vocab[char] for char in tokens if char in vocab]

In [4]:
def create_data_pipeline(video_dir, alignment_dir, vocab):
    video_paths = [
        os.path.join(video_dir, fname)
        for fname in os.listdir(video_dir)
        if fname.endswith('.mpg')  # Update to match video file extension
    ]
    alignment_paths = [
        os.path.join(alignment_dir, fname)
        for fname in os.listdir(alignment_dir)
        if fname.endswith('.align')  # Update to match alignment file extension
    ]

    video_alignment_pairs = []
    
    # Pair videos and alignments based on their filenames
    for video_path in video_paths:
        video_name = os.path.basename(video_path).split('.')[0]
        corresponding_alignment = os.path.join(alignment_dir, f"{video_name}.align")
        if os.path.exists(corresponding_alignment):
            video_alignment_pairs.append((video_path, corresponding_alignment))

    for video_path, alignment_path in video_alignment_pairs:
        video_data = preprocess_video(video_path)
        alignment_data = parse_alignment(alignment_path, vocab)
        yield video_data, alignment_data

# Example vocabulary
vocab = {char: idx for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz ", start=1)}

# Define directories with raw strings to avoid escape issues
video_directory = r"data/s1"  # Update to your actual video directory
alignment_directory = r"data/alignments/s1"  # Update to your actual alignment directory

pipeline = create_data_pipeline(video_directory, alignment_directory, vocab)

# Iterate through the pipeline and process data
for video_data, alignment_data in pipeline:
    print("Video Data Shape:", video_data.shape)
    print("Alignment Data:", alignment_data)
    print("------")

Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 20, 23, 15, 27, 14, 15, 23]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 20, 8, 18, 5, 5, 27, 19, 15, 15, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 6, 15, 21, 18, 27, 16, 12, 5, 1, 19, 5]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 6, 9, 22, 5, 27, 1, 7, 1, 9, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 12, 27, 19, 9, 24, 27, 14, 15, 23]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 12, 27, 19, 5, 22, 5, 14, 27, 19, 15, 15, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 12, 27, 5, 9, 7, 8, 20, 27, 16, 12, 5, 

[mpeg1video @ 0x349a8b40] ac-tex damaged at 22 17


Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 3, 27, 15, 14, 5, 27, 1, 7, 1, 9, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 3, 27, 26, 5, 18, 15, 27, 16, 12, 5, 1, 19, 5]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 9, 27, 20, 23, 15, 27, 14, 15, 23]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 9, 27, 20, 8, 18, 5, 5, 27, 19, 15, 15, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 9, 27, 6, 15, 21, 18, 27, 16, 12, 5, 1, 19, 5]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 9, 27, 6, 9, 22, 5, 27, 1, 7, 1, 9, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 19, 5, 20, 27, 7, 18, 5, 5, 14, 27, 9, 14, 27, 15, 27,

In [8]:
###############################################
import warnings
warnings.filterwarnings('ignore')

In [14]:

import tensorflow as tf
import numpy as np
import cv2
import os
from glob import glob
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, BatchNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tqdm.keras import TqdmCallback  
import datetime


class LipReadingDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_path, alignment_path, batch_size=32, frame_length=75, 
                 image_height=46, image_width=140, **kwargs):
        super().__init__(**kwargs)
        self.data_path = data_path
        self.alignment_path = alignment_path
        self.batch_size = batch_size
        self.frame_length = frame_length
        self.image_height = image_height
        self.image_width = image_width

        self.video_paths = sorted(glob(os.path.join(data_path, '*.mpg')))
        self.alignment_paths = sorted(glob(os.path.join(alignment_path, '*.align')))
        
        print(f"Found {len(self.video_paths)} video files and {len(self.alignment_paths)} alignment files")
        self.vocabulary = self._create_word_vocabulary()
            
        self.char_to_num = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="")
        self.num_to_char = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="", invert=True)
    
    def _create_word_vocabulary(self):
        words = set()
        print(f"Processing alignment files from: {self.alignment_path}")
        
        for align_path in self.alignment_paths:
            try:
                with open(align_path, 'r') as f:
                    content = f.read().strip().split()
                    words.update([content[i] for i in range(2, len(content), 3)])
            except Exception as e:
                print(f"Error processing {align_path}: {str(e)}")
        
        words.discard('sil')
        vocabulary = sorted(list(words))
        
        if not vocabulary:
            print("No words found in alignment files. Using default vocabulary.")
            vocabulary = ['bin', 'blue', 'at', 'f', 'two', 'now']
        
        print(f"Vocabulary size: {len(vocabulary)}")
        return vocabulary
    
    def __len__(self):
        return max(1, len(self.video_paths) // self.batch_size)
    
    def _process_video(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            mouth = gray[190:236, 80:220]
            mouth = cv2.resize(mouth, (self.image_width, self.image_height))
            frames.append(mouth)
        
        cap.release()
        
        frames = np.array(frames, dtype=np.float32)
        frames = (frames - frames.mean()) / (frames.std() + 1e-6)
        
        if len(frames) < self.frame_length:
            pad_length = self.frame_length - len(frames)
            frames = np.pad(frames, ((0, pad_length), (0, 0), (0, 0)), mode='constant')
        else:
            frames = frames[:self.frame_length]
        
        return frames
    
    def _process_alignment(self, alignment_path):
        with open(alignment_path, 'r') as f:
            content = f.read().strip().split()
        
        words = [content[i] for i in range(2, len(content), 3) if content[i] != 'sil']
        text = ' '.join(words)
        return self.char_to_num(tf.convert_to_tensor(text.split()))
    
    def __getitem__(self, idx):
        batch_videos = self.video_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_alignments = self.alignment_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        X = np.zeros((len(batch_videos), self.frame_length, self.image_height, self.image_width, 1))
        Y = np.zeros((len(batch_videos), len(self.vocabulary)))
        
        for i, (video_path, align_path) in enumerate(zip(batch_videos, batch_alignments)):
            frames = self._process_video(video_path)
            X[i] = frames.reshape(self.frame_length, self.image_height, self.image_width, 1)
            
            labels = self._process_alignment(align_path)
            Y[i] = tf.reduce_max(tf.one_hot(labels, len(self.vocabulary)), axis=0)
        
        return X, Y


def build_model(frame_length, image_height, image_width, vocabulary_size):
    model = Sequential([
        tf.keras.Input(shape=(frame_length, image_height, image_width, 1)), 
        Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Conv3D(128, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Conv3D(256, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Reshape((-1, 256)),
        
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.5),
        
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(vocabulary_size, activation='softmax')
    ])
    
    return model


def train_and_save_main_model(data_dir, alignment_dir, batch_size=8):
    print("Starting training for sentence-level prediction...")
    
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    model_dir = f"models_main_{timestamp}"
    os.makedirs(model_dir, exist_ok=True)
    
    data_generator = LipReadingDataGenerator(data_dir, alignment_dir, batch_size=batch_size)
    
    model = build_model(
        frame_length=75,
        image_height=46,
        image_width=140,
        vocabulary_size=len(data_generator.vocabulary)
    )
    
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    callbacks = [
        ModelCheckpoint(
            os.path.join(model_dir, 'lip_reading_main_best.keras'),
            save_best_only=True,
            monitor='accuracy'
        ),
        EarlyStopping(
            monitor='loss',
            patience=10,
            restore_best_weights=True
        ),
        TqdmCallback(verbose=1)
    ]

    print("Training started...")
    model.fit(
        data_generator,
        epochs=10,
        callbacks=callbacks
    )

    final_model_path = os.path.join(model_dir, 'model.h5')
    model.save(final_model_path)
    print(f"Final model saved: {final_model_path}")

    vocab_path = os.path.join(model_dir, 'vocabulary_main.txt')
    with open(vocab_path, 'w') as f:
        f.write('\n'.join(data_generator.vocabulary))
    print(f"Vocabulary saved: {vocab_path}")

    return model, model_dir

In [15]:
def predict_with_main_model(model_dir, video_path):
    vocab_path = os.path.join(model_dir, 'vocabulary_main.txt')
    with open(vocab_path, 'r') as f:
        vocabulary = f.read().splitlines()

    data_generator = LipReadingDataGenerator("", "")
    data_generator.vocabulary = vocabulary
    data_generator.char_to_num = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, oov_token="")
    data_generator.num_to_char = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, oov_token="", invert=True)

    model = load_model(os.path.join(model_dir, 'model.h5'))

    frames = data_generator._process_video(video_path)
    frames = frames.reshape(1, data_generator.frame_length, 
                            data_generator.image_height, 
                            data_generator.image_width, 1)

    prediction = model.predict(frames)
    predicted_indices = tf.argmax(prediction, axis=1)
    predicted_text = data_generator.num_to_char(predicted_indices)

    return ' '.join(predicted_text.numpy().decode('utf-8').split())


if __name__ == "__main__":
    data_dir = r"data/s1"
    alignment_dir = r"data/alignments/s1"

    print("Training main model...")
    main_model, main_model_dir = train_and_save_main_model(data_dir, alignment_dir)

    test_video = r"data\s1\bbaf2n.mpg"

    print("\nMaking predictions...")
    sentence_prediction = predict_with_main_model(main_model_dir, test_video)
    print(f"Predicted sentence: {sentence_prediction}")

Training main model...
Starting training for sentence-level prediction...
Found 1000 video files and 1000 alignment files
Processing alignment files from: data/alignments/s1
Vocabulary size: 52



  0%|                                                                                                                        | 0/10 [00:00<?, ?epoch/s]

Training started...



[A%|                                                                                                                    | 0.00/125 [00:00<?, ?batch/s]

Epoch 1/10






[A[A[AW0000 00:00:1737641349.508824   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.515870   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.522684   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.585451   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.615710   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.623498   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.691668   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.732139   15306 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1737641349.7447

InternalError: Graph execution error:

Detected at node CudnnRNN defined at (most recent call last):
<stack traces unavailable>
Failed to call DoRnnForward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 256, 128, 1, 4140, 8, 128] 
	 [[{{node CudnnRNN}}]]
	 [[sequential_3/bidirectional_6/forward_lstm_6/PartitionedCall]] [Op:__inference_train_function_51333]