In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import cv2
import imageio as im
import os

In [2]:
import cv2
import numpy as np

def preprocess_video(video_path, mouth_box=(190, 236, 80, 220)):
    """Read a video, crop every frame to the mouth region and standardize it.

    Each decoded frame is converted from BGR to grayscale and sliced to
    ``mouth_box``. The stacked crops are then normalized with the mean and
    standard deviation computed over the whole clip (not per frame).

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        mouth_box: ``(top, bottom, left, right)`` pixel bounds, applied as
            ``frame[top:bottom, left:right]``. The default reproduces the
            region that used to be hard-coded.

    Returns:
        A ``float32`` array of the normalized crops stacked along the first
        axis. When no frame could be read the (empty) array is returned
        as-is instead of being normalized.
    """
    top, bottom, left, right = mouth_box
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frames.append(gray_frame[top:bottom, left:right])  # Crop mouth region
    finally:
        # Release the capture handle even if decoding or cropping raised.
        cap.release()

    frames = np.array(frames, dtype=np.float32)
    if frames.size == 0:
        # Nothing to normalize; mean/std of an empty array would only
        # produce NaN and runtime warnings.
        return frames

    mean = np.mean(frames)
    std = np.std(frames)
    if std == 0:
        # A constant clip has zero spread; dividing by 1 yields all zeros
        # rather than the NaN that 0 / 0 would give.
        std = 1.0
    return (frames - mean) / std


In [3]:
def parse_alignment(alignment_path, vocab):
    """Convert an alignment file into a list of vocabulary indices.

    Every non-blank line must consist of exactly three whitespace-separated
    fields, of which only the third (the word) is used. Words equal to
    ``"sil"`` are skipped. Each remaining word contributes a ``" "`` token
    followed by its characters, so the token stream starts with a space.

    Args:
        alignment_path: Path of the alignment text file.
        vocab: Mapping from single characters to integer ids. Characters
            that are not keys of ``vocab`` are dropped from the result.

    Returns:
        A list with ``vocab[char]`` for every kept character, in file order.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields.
    """
    tokens = []
    with open(alignment_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only lines (e.g. a trailing newline)
                # carry no alignment and must not break the unpacking below.
                continue
            _start, _end, word = fields
            if word != "sil":
                tokens.append(" ")
                tokens.extend(word)
    return [vocab[char] for char in tokens if char in vocab]


In [14]:
def create_data_pipeline(video_dir, alignment_dir, vocab):
    """Yield ``(video_data, alignment_data)`` for every video with an alignment.

    Files in ``video_dir`` ending in ``.mpg`` are visited in sorted filename
    order. A video is used only when ``alignment_dir`` contains a file with
    the same name but the ``.align`` extension; videos without one are
    skipped.

    Args:
        video_dir: Directory containing the ``.mpg`` videos.
        alignment_dir: Directory containing the ``.align`` files.
        vocab: Character-to-index mapping forwarded to ``parse_alignment``.

    Yields:
        Tuples of ``preprocess_video(video_path)`` and
        ``parse_alignment(alignment_path, vocab)``.

    Raises:
        FileNotFoundError: On first iteration, if ``alignment_dir`` is not an
            existing directory.
    """
    if not os.path.isdir(alignment_dir):
        raise FileNotFoundError(f"Alignment directory not found: {alignment_dir}")

    # Sorting makes the iteration order reproducible; os.listdir gives no
    # ordering guarantee.
    for fname in sorted(os.listdir(video_dir)):
        if not fname.endswith('.mpg'):  # Update to match video file extension
            continue

        # splitext removes only the final extension, so a name such as
        # "a.b.mpg" is matched with "a.b.align" rather than "a.align".
        video_name = os.path.splitext(fname)[0]
        alignment_path = os.path.join(alignment_dir, f"{video_name}.align")
        if not os.path.exists(alignment_path):
            continue

        video_path = os.path.join(video_dir, fname)
        video_data = preprocess_video(video_path)
        alignment_data = parse_alignment(alignment_path, vocab)
        yield video_data, alignment_data

# Example vocabulary: 'a'-'z' map to 1-26 and the space character to 27
# (enumeration starts at 1, so index 0 is never assigned).
vocab = {char: idx for idx, char in enumerate("abcdefghijklmnopqrstuvwxyz ", start=1)}

# Build the directories with os.path.join so they resolve on every OS; a
# literal backslash is a path separator only on Windows.
video_directory = os.path.join("data", "s1")  # Update to your actual video directory
alignment_directory = os.path.join("data", "alignments", "s1")  # Update to your actual alignment directory

pipeline = create_data_pipeline(video_directory, alignment_directory, vocab)

# Iterate through the pipeline and process data
for video_data, alignment_data in pipeline:
    print("Video Data Shape:", video_data.shape)
    print("Alignment Data:", alignment_data)
    print("------")

Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 20, 23, 15, 27, 14, 15, 23]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 20, 8, 18, 5, 5, 27, 19, 15, 15, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 6, 15, 21, 18, 27, 16, 12, 5, 1, 19, 5]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 6, 27, 6, 9, 22, 5, 27, 1, 7, 1, 9, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 12, 27, 19, 9, 24, 27, 14, 15, 23]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 12, 27, 19, 5, 22, 5, 14, 27, 19, 15, 15, 14]
------
Video Data Shape: (75, 46, 140)
Alignment Data: [27, 2, 9, 14, 27, 2, 12, 21, 5, 27, 1, 20, 27, 12, 27, 5, 9, 7, 8, 20, 27, 16, 12, 5, 