In [27]:
import os
import sys

# Redirect stdout to /dev/null
original_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Restore stdout
sys.stdout = original_stdout

# print the acknowledgement
print("Finished processing files.")

Finished processing files.


In [28]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, BatchNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from glob import glob
import cv2

class LipReadingDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_path, alignment_path, batch_size=16, frame_length=75, 
                 image_height=46, image_width=140, video_paths=None, alignment_paths=None, 
                 fixed_vocabulary=None, **kwargs):
        super().__init__(**kwargs)
        self.data_path = data_path
        self.alignment_path = alignment_path
        self.batch_size = batch_size
        self.frame_length = frame_length
        self.image_height = image_height
        self.image_width = image_width

        # Use provided paths or generate from directory
        if video_paths is None:
            self.video_paths = sorted(glob(os.path.join(data_path, '*.mpg')))
        else:
            self.video_paths = video_paths
        
        if alignment_paths is None:
            self.alignment_paths = sorted(glob(os.path.join(alignment_path, '*.align')))
        else:
            self.alignment_paths = alignment_paths
        
        print(f"Found {len(self.video_paths)} video files and {len(self.alignment_paths)} alignment files")
        
        # Use fixed vocabulary if provided, otherwise create from data
        if fixed_vocabulary is not None:
            self.vocabulary = fixed_vocabulary
        else:
            self.vocabulary = self._create_word_vocabulary()
            
        self.char_to_num = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="")
        self.num_to_char = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="", invert=True)

    def _create_word_vocabulary(self):
        words = set()
        print(f"Processing alignment files from: {self.alignment_path}")
        
        for align_path in self.alignment_paths:
            try:
                with open(align_path, 'r') as f:
                    content = f.read().strip().split()
                    words.update([content[i] for i in range(2, len(content), 3)])
            except Exception as e:
                print(f"Error processing {align_path}: {str(e)}")
        
        words.discard('sil')
        vocabulary = sorted(list(words))
        
        if not vocabulary:
            print("No words found in alignment files. Using default vocabulary.")
            vocabulary = ['bin', 'blue', 'at', 'f', 'two', 'now']
        
        print(f"Vocabulary size: {len(vocabulary)}")
        return vocabulary

    def __len__(self):
        return max(1, len(self.video_paths) // self.batch_size)
    
    def _process_video(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            mouth = gray[190:236, 80:220]
            mouth = cv2.resize(mouth, (self.image_width, self.image_height))
            frames.append(mouth)
        
        cap.release()
        
        frames = np.array(frames, dtype=np.float32)
        frames = (frames - frames.mean()) / (frames.std() + 1e-6)
        
        if len(frames) < self.frame_length:
            pad_length = self.frame_length - len(frames)
            frames = np.pad(frames, ((0, pad_length), (0, 0), (0, 0)), mode='constant')
        else:
            frames = frames[:self.frame_length]
        
        return frames
    
    def _process_alignment(self, alignment_path):
        with open(alignment_path, 'r') as f:
            content = f.read().strip().split()
        
        words = [content[i] for i in range(2, len(content), 3) if content[i] != 'sil']
        text = ' '.join(words)
        return self.char_to_num(tf.convert_to_tensor(text.split()))
    
    def __getitem__(self, idx):
        batch_videos = self.video_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_alignments = self.alignment_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        X = np.zeros((len(batch_videos), self.frame_length, self.image_height, self.image_width, 1))
        Y = np.zeros((len(batch_videos), len(self.vocabulary)))
        
        for i, (video_path, align_path) in enumerate(zip(batch_videos, batch_alignments)):
            frames = self._process_video(video_path)
            X[i] = frames.reshape(self.frame_length, self.image_height, self.image_width, 1)
            
            labels = self._process_alignment(align_path)
            Y[i] = tf.reduce_max(tf.one_hot(labels, len(self.vocabulary)), axis=0)
        
        return X, Y

In [29]:
def build_model(frame_length, image_height, image_width, vocabulary_size):
    model = Sequential([
        tf.keras.Input(shape=(frame_length, image_height, image_width, 1)), 
        Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Conv3D(128, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Conv3D(256, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Reshape((-1, 256)),
        
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.5),
        
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(vocabulary_size, activation='softmax')

    ])
    
    return model

In [30]:
# TRAINING THE MODEL ON FIRST HALF SET OF THE DATA ALONE

In [31]:
import os
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau  # Import ReduceLROnPlateau

def create_callbacks(model_dir, model_name, monitor='val_loss', patience=5):
    """Creates and returns a list of Keras callbacks."""

    checkpoint_path = os.path.join(model_dir, f"{model_name}_best.keras")  # Saves best model
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor=monitor,
        save_best_only=True,
        save_weights_only=False,
        verbose=1
    )

    early_stopping = EarlyStopping(
        monitor=monitor,
        patience=patience,
        restore_best_weights=True,  # Important: Restores the best weights
        verbose=1
    )

    reduce_lr = ReduceLROnPlateau(
        monitor=monitor,
        factor=0.5,  # Reduce learning rate by half
        patience=patience // 2,  # Reduce LR patience usually less than ES
        min_lr=1e-6,  # Set a minimum learning rate
        verbose=1
    )

    return [checkpoint, early_stopping, reduce_lr]

def train_and_save_main_model(data_dir, alignment_dir, batch_size=16):
    print("Starting training for sentence-level prediction...")

    model_dir = "models_main"
    os.makedirs(model_dir, exist_ok=True)

    full_data_generator = LipReadingDataGenerator(data_dir, alignment_dir, batch_size=batch_size)
    full_vocabulary = full_data_generator.vocabulary

    total_videos = len(full_data_generator.video_paths)
    mid_point = total_videos // 2

    # --- First Half Training ---
    first_half_generator = LipReadingDataGenerator(
        data_path=data_dir,
        alignment_path=alignment_dir,
        batch_size=batch_size,
        video_paths=full_data_generator.video_paths[:mid_point],
        alignment_paths=full_data_generator.alignment_paths[:mid_point],
        fixed_vocabulary=full_vocabulary
    )

    model = build_model(frame_length=75, image_height=46, image_width=140, vocabulary_size=len(full_vocabulary))
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

    first_half_callbacks = create_callbacks(model_dir, 'lip_reading_first_half', monitor='accuracy')
    train_model(model, first_half_generator, epochs=30, callbacks=first_half_callbacks, dataset_split_name="first half")


    # --- Save the model and vocabulary after training on the first half ---
    save_model_and_vocabulary(model, model_dir, 'lip_reading_first_half', full_vocabulary)

    print("Training on the first half complete.  Model and vocabulary saved. "
          "You can now evaluate the model's performance on the first half "
          "and decide whether to proceed with training on the second half.")

In [32]:
# ACTUAL CODE OF FULL TRAINED MODEL ( UPDATED ON GITHUB )
"""def train_and_save_main_model(data_dir, alignment_dir, batch_size=16):
    print("Starting training for sentence-level prediction...")
    
    model_dir = "models_main"
    os.makedirs(model_dir, exist_ok=True)
    
    # Create full data generator to get consistent vocabulary
    full_data_generator = LipReadingDataGenerator(data_dir, alignment_dir, batch_size=batch_size)
    full_vocabulary = full_data_generator.vocabulary
    
    # Split video paths into two halves
    total_videos = len(full_data_generator.video_paths)
    mid_point = total_videos // 2
    
    # First half training
    first_half_generator = LipReadingDataGenerator(
        data_path=data_dir, 
        alignment_path=alignment_dir, 
        batch_size=batch_size,
        video_paths=full_data_generator.video_paths[:mid_point],
        alignment_paths=full_data_generator.alignment_paths[:mid_point],
        fixed_vocabulary=full_vocabulary
    )
    
    # Build and compile the model
    model = build_model(
        frame_length=75,
        image_height=46,
        image_width=140,
        vocabulary_size=len(full_vocabulary)
    )
    
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks for first half
    first_half_callbacks = [
        ModelCheckpoint(
            os.path.join(model_dir, 'lip_reading_first_half_best.keras'),
            save_best_only=True,
            monitor='accuracy'
        ),
        EarlyStopping(
            monitor='loss',
            patience=7,
            restore_best_weights=True
        )
    ]

    # Train on first half
    print("Training on first half of the dataset...")
    model.fit(
        first_half_generator,
        epochs=30,
        callbacks=first_half_callbacks
    )
    
    # Second half training
    second_half_generator = LipReadingDataGenerator(
        data_path=data_dir, 
        alignment_path=alignment_dir, 
        batch_size=batch_size,
        video_paths=full_data_generator.video_paths[mid_point:],
        alignment_paths=full_data_generator.alignment_paths[mid_point:],
        fixed_vocabulary=full_vocabulary
    )
    
    # Callbacks for second half
    second_half_callbacks = [
        ModelCheckpoint(
            os.path.join(model_dir, 'lip_reading_full_dataset_best.keras'),
            save_best_only=True,
            monitor='accuracy'
        ),
        EarlyStopping(
            monitor='loss',
            patience=7,
            restore_best_weights=True
        )
    ]

    # Train on second half
    print("Training on second half of the dataset...")
    model.fit(
        second_half_generator,
        epochs=30,
        callbacks=second_half_callbacks
    )

    # Save final model
    final_model_path = os.path.join(model_dir, 'lip_reading_full_model.h5')
    model.save(final_model_path)
    print(f"Final model saved: {final_model_path}")

    # Save vocabulary
    vocab_path = os.path.join(model_dir, 'lip_reading_full_model.h5')
    model.save(final_model_path)
    print(f"Final model saved: {final_model_path}")

    # Save vocabulary
    vocab_path = os.path.join(model_dir, 'vocabulary_main.txt')
    with open(vocab_path, 'w') as f:
        f.write('\n'.join(full_vocabulary))
    print(f"Vocabulary saved: {vocab_path}")
"""
;

''

In [33]:
"""if __name__ == "__main__":
    data_dir = r"/kaggle/input/mouth-map-comp/data/s1"
    alignment_dir = r"/kaggle/input/mouth-map-comp/data/alignments/s1"
    
    print("Training main model...")
    train_and_save_main_model(data_dir, alignment_dir)
"""

'if __name__ == "__main__":\n    data_dir = r"/kaggle/input/mouth-map-comp/data/s1"\n    alignment_dir = r"/kaggle/input/mouth-map-comp/data/alignments/s1"\n    \n    print("Training main model...")\n    train_and_save_main_model(data_dir, alignment_dir)\n'

In [None]:
#######   GENERATED USING GPT LOGICAL REASONING.
## DEEPSEEK, GEMINI, KIMI'S APPRAOCHES ARE YET TO BE TESTED 

In [36]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, BatchNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from glob import glob
import cv2

# -------------------------------------------------------------------
# Data Generator for Lip Reading
# -------------------------------------------------------------------
class LipReadingDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_path, alignment_path, batch_size=16, frame_length=75, 
                 image_height=46, image_width=140, video_paths=None, alignment_paths=None, 
                 fixed_vocabulary=None, **kwargs):
        super().__init__(**kwargs)
        self.data_path = data_path
        self.alignment_path = alignment_path
        self.batch_size = batch_size
        self.frame_length = frame_length
        self.image_height = image_height
        self.image_width = image_width

        # Use provided paths or search the directory
        if video_paths is None:
            self.video_paths = sorted(glob(os.path.join(data_path, '*.mpg')))
        else:
            self.video_paths = video_paths
        
        if alignment_paths is None:
            self.alignment_paths = sorted(glob(os.path.join(alignment_path, '*.align')))
        else:
            self.alignment_paths = alignment_paths
        
        print(f"Found {len(self.video_paths)} video files and {len(self.alignment_paths)} alignment files")
        
        # Build vocabulary either from the fixed vocabulary provided or from the data.
        if fixed_vocabulary is not None:
            self.vocabulary = fixed_vocabulary
        else:
            self.vocabulary = self._create_word_vocabulary()
            
        # Create lookup layers for converting between words and numbers.
        self.char_to_num = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token=""
        )
        self.num_to_char = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="", invert=True
        )

    def _create_word_vocabulary(self):
        words = set()
        print(f"Processing alignment files from: {self.alignment_path}")
        for align_path in self.alignment_paths:
            try:
                with open(align_path, 'r') as f:
                    content = f.read().strip().split()
                    # The alignment file is assumed to have a pattern where every third token (starting at index 2) is a word.
                    words.update([content[i] for i in range(2, len(content), 3)])
            except Exception as e:
                print(f"Error processing {align_path}: {str(e)}")
        # Remove the silence token if present
        words.discard('sil')
        vocabulary = sorted(list(words))
        if not vocabulary:
            print("No words found in alignment files. Using default vocabulary.")
            vocabulary = ['bin', 'blue', 'at', 'f', 'two', 'now']
        print(f"Vocabulary size: {len(vocabulary)}")
        return vocabulary

    def __len__(self):
        return max(1, len(self.video_paths) // self.batch_size)
    
    def _process_video(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Convert to grayscale
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # Crop to the mouth region (hard-coded crop coordinates)
            mouth = gray[190:236, 80:220]
            # Resize the cropped image to the desired size
            mouth = cv2.resize(mouth, (self.image_width, self.image_height))
            frames.append(mouth)
        cap.release()
        frames = np.array(frames, dtype=np.float32)
        # Normalize the frames: zero mean and unit variance
        frames = (frames - frames.mean()) / (frames.std() + 1e-6)
        # Pad if the number of frames is less than frame_length, or trim if more
        if len(frames) < self.frame_length:
            pad_length = self.frame_length - len(frames)
            frames = np.pad(frames, ((0, pad_length), (0, 0), (0, 0)), mode='constant')
        else:
            frames = frames[:self.frame_length]
        return frames
    
    def _process_alignment(self, alignment_path):
        with open(alignment_path, 'r') as f:
            content = f.read().strip().split()
        # Extract every third token starting at index 2 (ignoring 'sil')
        words = [content[i] for i in range(2, len(content), 3) if content[i] != 'sil']
        text = ' '.join(words)
        # Convert words to numerical IDs using the lookup layer
        return self.char_to_num(tf.convert_to_tensor(text.split()))
    
    def __getitem__(self, idx):
        batch_videos = self.video_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_alignments = self.alignment_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        # Initialize arrays for the batch data.
        X = np.zeros((len(batch_videos), self.frame_length, self.image_height, self.image_width, 1))
        Y = np.zeros((len(batch_videos), len(self.vocabulary)))
        
        for i, (video_path, align_path) in enumerate(zip(batch_videos, batch_alignments)):
            frames = self._process_video(video_path)
            # Add channel dimension (1 for grayscale)
            X[i] = frames.reshape(self.frame_length, self.image_height, self.image_width, 1)
            
            # Process alignment and convert to a multi-hot vector.
            labels = self._process_alignment(align_path)
            # One-hot encode each label and then use reduce_max to combine them into a single vector.
            Y[i] = tf.reduce_max(tf.one_hot(labels, len(self.vocabulary)), axis=0)
        
        return X, Y

# -------------------------------------------------------------------
# Model Architecture
# -------------------------------------------------------------------
def build_model(frame_length, image_height, image_width, vocabulary_size):
    model = Sequential([
        tf.keras.Input(shape=(frame_length, image_height, image_width, 1)),
        # First 3D convolution block
        Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        # Second 3D convolution block
        Conv3D(128, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        # Third 3D convolution block
        Conv3D(256, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        # Reshape to combine the spatial dimensions into a feature vector per time step.
        Reshape((-1, 256)),
        
        # Temporal modeling with Bidirectional LSTMs
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        
        # Dense layers for classification
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(vocabulary_size, activation='softmax')
    ])
    
    return model

# -------------------------------------------------------------------
# Revised Training Routine
# -------------------------------------------------------------------
def train_and_save_model(data_dir, alignment_dir, batch_size=16, epochs=30):
    # -----------------------------------------------------------
    # Step 1: Build the full vocabulary using all available data.
    # -----------------------------------------------------------
    full_data_generator = LipReadingDataGenerator(data_dir, alignment_dir, batch_size=batch_size)
    full_vocabulary = full_data_generator.vocabulary
    
    # Retrieve all file paths (assumes they are in the same order)
    all_video_paths = full_data_generator.video_paths
    all_align_paths = full_data_generator.alignment_paths
    
    total_files = len(all_video_paths)
    print(f"Total number of files: {total_files}")
    
    # -----------------------------------------------------------
    # Step 2: Split the data into training and validation sets.
    # Training: first half of the data.
    # Validation: 70% of the second half.
    # -----------------------------------------------------------
    mid_point = total_files // 2
    train_video_paths = all_video_paths[:mid_point]
    train_align_paths = all_align_paths[:mid_point]
    
    second_half_video_paths = all_video_paths[mid_point:]
    second_half_align_paths = all_align_paths[mid_point:]
    
    # Use 70% of the second half for validation.
    val_count = int(len(second_half_video_paths) * 0.7)
    val_video_paths = second_half_video_paths[:val_count]
    val_align_paths = second_half_align_paths[:val_count]
    
    print(f"Training on {len(train_video_paths)} files.")
    print(f"Validating on {len(val_video_paths)} files.")
    
    # -----------------------------------------------------------
    # Step 3: Create data generators for training and validation.
    # -----------------------------------------------------------
    train_generator = LipReadingDataGenerator(
        data_path=data_dir, 
        alignment_path=alignment_dir, 
        batch_size=batch_size,
        video_paths=train_video_paths,
        alignment_paths=train_align_paths,
        fixed_vocabulary=full_vocabulary
    )
    
    val_generator = LipReadingDataGenerator(
        data_path=data_dir, 
        alignment_path=alignment_dir, 
        batch_size=batch_size,
        video_paths=val_video_paths,
        alignment_paths=val_align_paths,
        fixed_vocabulary=full_vocabulary
    )
    
    # -----------------------------------------------------------
    # Step 4: Build, compile, and train the model.
    # -----------------------------------------------------------
    model = build_model(
        frame_length=75,
        image_height=46,
        image_width=140,
        vocabulary_size=len(full_vocabulary)
    )
    
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    model_dir = "models_main"
    os.makedirs(model_dir, exist_ok=True)
    
    # Callbacks to save the best model and to perform early stopping.
    callbacks = [
        ModelCheckpoint(
            os.path.join(model_dir, 'lip_reading_best_model.keras'),
            save_best_only=True,
            monitor='val_accuracy',
            mode='max'
        ),
        EarlyStopping(
            monitor='val_loss',
            patience=7,
            restore_best_weights=True
        )
    ]
    
    print("Starting training...")
    history = model.fit(
        train_generator,
        epochs=epochs,
        validation_data=val_generator,
        callbacks=callbacks
    )
    
    # -----------------------------------------------------------
    # Step 5: Save the final model and the vocabulary.
    # -----------------------------------------------------------
    final_model_path = os.path.join(model_dir, 'lip_reading_full_model.h5')
    model.save(final_model_path)
    print(f"Final model saved: {final_model_path}")
    
    vocab_path = os.path.join(model_dir, 'vocabulary_main.txt')
    with open(vocab_path, 'w') as f:
        f.write('\n'.join(full_vocabulary))
    print(f"Vocabulary saved: {vocab_path}")
    
    return model, history


In [37]:

# -------------------------------------------------------------------
# Example usage:
# Set your directories for the video (.mpg) files and alignment (.align) files.
data_dir = r"/kaggle/input/mouth-map-comp/data/s1"
alignment_dir = r"/kaggle/input/mouth-map-comp/data/alignments/s1"
train_and_save_model(data_dir, alignment_dir, batch_size=16, epochs=30)


Found 1000 video files and 1000 alignment files
Processing alignment files from: /kaggle/input/mouth-map-comp/data/alignments/s1
Vocabulary size: 52
Total number of files: 1000
Training on 500 files.
Validating on 350 files.
Found 500 video files and 500 alignment files
Found 350 video files and 350 alignment files
Starting training...
Epoch 1/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 4s/step - accuracy: 0.0411 - loss: 23.3512 - val_accuracy: 0.0060 - val_loss: 23.3720
Epoch 2/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 3s/step - accuracy: 0.0640 - loss: 22.9763 - val_accuracy: 0.0000e+00 - val_loss: 23.3792
Epoch 3/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 3s/step - accuracy: 0.0721 - loss: 23.0465 - val_accuracy: 0.0000e+00 - val_loss: 23.9857
Epoch 4/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 3s/step - accuracy: 0.1188 - loss: 25.1400 - val_accuracy: 0.0000e+00 - val_loss: 25.919

(<Sequential name=sequential_6, built=True>,
 <keras.src.callbacks.history.History at 0x7efd4496a530>)

In [39]:
def predict_on_video(model_path, video_path, vocabulary_path, frame_length=75, 
                     image_height=46, image_width=140):
    """Make prediction on a single video file"""
    # Load vocabulary
    with open(vocabulary_path, 'r') as f:
        vocabulary = f.read().strip().split('\n')
    
    # Load model
    model = load_model(model_path)
    
    # Process video
    frames = []
    cap = cv2.VideoCapture(video_path)
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        mouth = gray[190:236, 80:220]
        mouth = cv2.resize(mouth, (image_width, image_height))
        frames.append(mouth)
    
    cap.release()
    
    frames = np.array(frames, dtype=np.float32)
    frames = (frames - frames.mean()) / (frames.std() + 1e-6)
    
    if len(frames) < frame_length:
        pad_length = frame_length - len(frames)
        frames = np.pad(frames, ((0, pad_length), (0, 0), (0, 0)), mode='constant')
    else:
        frames = frames[:frame_length]
    
    frames = frames.reshape(1, frame_length, image_height, image_width, 1)
    
    # Make prediction
    prediction = model.predict(frames)[0]
    top_indices = np.argsort(prediction)[-5:][::-1]  # Get top 5 predictions
    
    results = []
    for idx in top_indices:
        results.append({
            'word': vocabulary[idx],
            'confidence': float(prediction[idx])
        })
    
    return results

results = predict_on_video(
    model_path="/kaggle/working/models_main/lip_reading_best_model.keras",
    video_path="/kaggle/input/mouth-map-comp/data/s1/bbaf2n.mpg",
    vocabulary_path="/kaggle/working/models_main/vocabulary_main.txt"
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 874ms/step


In [40]:
results

[{'word': 'blue', 'confidence': 0.02360173501074314},
 {'word': 'j', 'confidence': 0.022521479055285454},
 {'word': 's', 'confidence': 0.022424446418881416},
 {'word': 'm', 'confidence': 0.022119032219052315},
 {'word': 'x', 'confidence': 0.022104639559984207}]