In [None]:
# Enter the preparation/ folder (relative to the current working directory);
# the `requirements.txt` installed further down is resolved relative to it.
%cd preparation

In [None]:
# Use the %pip magic rather than `!pip`: %pip installs into the environment of the
# running kernel, whereas `!pip` runs whichever pip is first on the shell PATH.
%pip install torch torchvision torchaudio pytorch-lightning sentencepiece av

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (20 kB)
Collecting av
  Downloading av-14.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft

In [None]:
# %pip (not `!pip`) so the requirements land in the running kernel's environment.
# The path is relative: this cell expects the working directory to contain requirements.txt.
%pip install -r requirements.txt

Collecting ffmpeg-python (from -r requirements.txt (line 3))
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [None]:
%cd /content/drive/MyDrive/Colab\ Notebooks/lip_reading
# Clone only when the checkout is missing: `git clone` aborts with
# "destination path ... already exists" on every re-run otherwise.
!test -d face_detection || git clone https://github.com/hhj1897/face_detection.git
%cd face_detection
# Fetch the files tracked with Git LFS, then install the package in editable mode.
!git lfs pull
%pip install -e .
%cd ..

/content/drive/MyDrive/Colab Notebooks/lip_reading
fatal: destination path 'face_detection' already exists and is not an empty directory.
/content/drive/MyDrive/Colab Notebooks/lip_reading/face_detection
Obtaining file:///content/drive/MyDrive/Colab%20Notebooks/lip_reading/face_detection
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: ibug_face_detection
  Running setup.py develop for ibug_face_detection
Successfully installed ibug_face_detection-0.1.0
/content/drive/MyDrive/Colab Notebooks/lip_reading


In [None]:
# Runs relative to the current working directory.
# Clone only when the checkout is missing: `git clone` aborts with
# "destination path ... already exists" on every re-run otherwise.
!test -d face_alignment || git clone https://github.com/hhj1897/face_alignment.git
%cd face_alignment
# Install the package in editable mode, then step back out of the checkout.
%pip install -e .
%cd ..

fatal: destination path 'face_alignment' already exists and is not an empty directory.
/content/drive/MyDrive/Colab Notebooks/lip_reading/face_alignment
Obtaining file:///content/drive/MyDrive/Colab%20Notebooks/lip_reading/face_alignment
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: ibug_face_alignment
  Running setup.py develop for ibug_face_alignment
Successfully installed ibug_face_alignment-0.1.0
/content/drive/MyDrive/Colab Notebooks/lip_reading


In [None]:
import sys
import os

# Parent directories of the ibug packages (the two repository checkouts).
face_alignment_path = "/content/drive/MyDrive/Colab Notebooks/lip_reading/face_alignment"
face_detection_path = "/content/drive/MyDrive/Colab Notebooks/lip_reading/face_detection"

# Put both checkouts at the front of sys.path. Any existing entry is removed first so
# that re-running this cell keeps exactly one copy of each path instead of stacking
# duplicates; the resulting order matches a single fresh run (face_detection first).
for repo_path in (face_alignment_path, face_detection_path):
    while repo_path in sys.path:
        sys.path.remove(repo_path)
    sys.path.insert(0, repo_path)

# Verify imports; on failure, report the error and dump sys.path to help diagnose it.
try:
    from ibug.face_alignment import FANPredictor
    print("Successfully imported FANPredictor")
    from ibug.face_detection import RetinaFacePredictor
    print("Successfully imported RetinaFacePredictor")
except ImportError as e:
    print(f"Import Error: {e}")
    print("\nCurrent sys.path:")
    for p in sys.path:
        print(p)

Successfully imported FANPredictor
Successfully imported RetinaFacePredictor


In [None]:
# Switch to the tutorials folder: the cells below use paths relative to it
# ("../", "./visual_asr.pth", "data/video").
%cd /content/drive/MyDrive/Colab\ Notebooks/lip_reading/auto_avsr/tutorials

/content/drive/MyDrive/Colab Notebooks/lip_reading/auto_avsr/tutorials


**Note:** To run this tutorial, make sure your current working directory is the `tutorials` folder.

In [None]:
import sys

# Make the parent directory importable so the project modules used below
# (e.g. `datamodule`, `preparation`) can be found. The entry is relative, so it
# resolves against the current working directory at import time.
# Remove any existing copy first so re-running the cell does not add duplicates.
while "../" in sys.path:
    sys.path.remove("../")
sys.path.insert(0, "../")

In [None]:
import os
import torch
import torchaudio
import torchvision

## 1. Build an inference pipeline

The InferencePipeline carries out the following three steps:

1. Load audio or video data
2. Run pre-processing functions
3. Run inference

In [None]:
import os
from lightning import ModelModule
from datamodule.transforms import AudioTransform, VideoTransform

In [None]:
import argparse

# Parse an empty argument list to obtain a blank Namespace; the settings the
# model needs (such as `modality`) are attached to it as attributes later on.
parser = argparse.ArgumentParser()
args, _ = parser.parse_known_args([])

In [None]:
class InferencePipeline(torch.nn.Module):
    """Run speech recognition on a single media file for one modality.

    The pipeline loads the audio or video from disk, applies the matching
    pre-processing and test-time transform, and feeds the result to the
    wrapped ``ModelModule``.

    Args:
        args: Namespace handed to ``ModelModule``; ``args.modality`` selects
            the ``"audio"`` or ``"video"`` branch.
        ckpt_path: Path of the checkpoint that is loaded into
            ``ModelModule(args).model`` with ``load_state_dict``.
        detector: Landmark detector backend used for video input, either
            ``"mediapipe"`` or ``"retinaface"``.

    Raises:
        ValueError: If the modality is ``"video"`` and ``detector`` is not one
            of the supported backends.
    """

    def __init__(self, args, ckpt_path, detector="retinaface"):
        super().__init__()
        self.modality = args.modality
        if self.modality == "audio":
            self.audio_transform = AudioTransform(subset="test")
        elif self.modality == "video":
            # The detector packages are imported lazily so that only the
            # selected backend has to be installed.
            if detector == "mediapipe":
                from preparation.detectors.mediapipe.detector import LandmarksDetector
                from preparation.detectors.mediapipe.video_process import VideoProcess
                self.landmarks_detector = LandmarksDetector()
            elif detector == "retinaface":
                from preparation.detectors.retinaface.detector import LandmarksDetector
                from preparation.detectors.retinaface.video_process import VideoProcess
                # Fall back to the CPU when no GPU is visible instead of
                # failing on a hard-coded "cuda:0".
                device = "cuda:0" if torch.cuda.is_available() else "cpu"
                self.landmarks_detector = LandmarksDetector(device=device)
            else:
                # Fail here rather than later with a missing-attribute error
                # inside forward().
                raise ValueError(
                    f"Unsupported detector: {detector!r} "
                    "(expected 'mediapipe' or 'retinaface')."
                )
            self.video_process = VideoProcess(convert_gray=False)
            self.video_transform = VideoTransform(subset="test")

        # Keep every tensor on the CPU regardless of where it was saved.
        # NOTE(review): torch.load unpickles the file, so only load checkpoints
        # from a trusted source.
        ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
        self.modelmodule = ModelModule(args)
        self.modelmodule.model.load_state_dict(ckpt)
        self.modelmodule.eval()

    def forward(self, data_filename):
        """Transcribe the media file at ``data_filename``.

        Args:
            data_filename: Path to an existing audio or video file.

        Returns:
            For ``"audio"``: whatever ``self.modelmodule`` returns for the
            transformed waveform.
            For ``"video"``: the tuple ``(predicted, confidence_score,
            nbest_hyps, best_hype)`` unpacked from ``self.modelmodule``.

        Raises:
            AssertionError: If ``data_filename`` is not an existing file.
            ValueError: If ``self.modality`` is neither ``"audio"`` nor
                ``"video"``.
        """
        data_filename = os.path.abspath(data_filename)
        assert os.path.isfile(data_filename), f"data_filename: {data_filename} does not exist."

        if self.modality == "audio":
            audio, sample_rate = self.load_audio(data_filename)
            audio = self.audio_process(audio, sample_rate)
            # Swap the two axes of the mono waveform before the transform.
            audio = audio.transpose(1, 0)
            audio = self.audio_transform(audio)
            with torch.no_grad():
                # Return the result; it used to be computed and then dropped.
                return self.modelmodule(audio)

        if self.modality == "video":
            video = self.load_video(data_filename)
            landmarks = self.landmarks_detector(video)
            video = self.video_process(video, landmarks)
            video = torch.tensor(video)
            # Move the last axis to position 1 (presumably frames x channels x
            # height x width afterwards - TODO confirm against VideoProcess).
            video = video.permute((0, 3, 1, 2))
            video = self.video_transform(video)
            with torch.no_grad():
                predicted, confidence_score, nbest_hyps, best_hype = self.modelmodule(video)
            return predicted, confidence_score, nbest_hyps, best_hype

        raise ValueError(
            f"Unsupported modality: {self.modality!r} (expected 'audio' or 'video')."
        )

    def load_audio(self, data_filename):
        """Read an audio file with torchaudio and return ``(waveform, sample_rate)``."""
        waveform, sample_rate = torchaudio.load(data_filename, normalize=True)
        return waveform, sample_rate

    def load_video(self, data_filename):
        """Read a video file and return the first element of ``read_video`` as a NumPy array."""
        return torchvision.io.read_video(data_filename, pts_unit="sec")[0].numpy()

    def audio_process(self, waveform, sample_rate, target_sample_rate=16000):
        """Resample ``waveform`` to ``target_sample_rate`` if needed and average over dim 0.

        The mean keeps the reduced dimension, so the result has size 1 along
        dim 0 (a single channel for waveforms laid out as channels x time).
        """
        if sample_rate != target_sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, target_sample_rate
            )
        waveform = torch.mean(waveform, dim=0, keepdim=True)
        return waveform

In [None]:
# Checkpoint file passed to InferencePipeline below; the path is relative, so it
# resolves against the current working directory.
model_path: str = "./visual_asr.pth"

In [None]:
import os

# Sanity-check the checkpoint path before it is handed to the pipeline.
print("model_path:", model_path)

checkpoint_present = os.path.exists(model_path)
print("model file exists:", checkpoint_present)

# Report the size in bytes, or a placeholder when the file is missing.
if checkpoint_present:
    size_report = os.path.getsize(model_path)
else:
    size_report = "File not found"
print("model file size:", size_report)

model_path: ./visual_asr.pth
model file exists: True
model file size: 1001892616


In [None]:
# Select the visual branch, then build the pipeline from the checkpoint using the
# RetinaFace landmark detector.
args.modality = "video"
pipeline = InferencePipeline(
    args,
    model_path,
    detector="retinaface",
)

In [None]:
import os

folder_path = "data/video"
result_dir = "data/VSR_result"

# Create the output folder up front so the first write cannot fail on a missing directory.
os.makedirs(result_dir, exist_ok=True)

# One confidence score per processed file, in the (sorted) processing order.
confidence = []

# os.listdir returns entries in arbitrary order; sorting makes the order of the
# transcripts and of `confidence` reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        # The pipeline asserts on a regular file; skip sub-directories such as
        # .ipynb_checkpoints instead of aborting the whole batch.
        continue
    print("=============")
    print(filename)
    # splitext drops only the final extension, so "clip.v1.mp4" and "clip.v2.mp4"
    # no longer collapse onto the same "clip.txt".
    output_path = f"{result_dir}/{os.path.splitext(filename)[0]}.txt"
    print(output_path)
    predicted, confidence_score, nbest_hyps, best_hype = pipeline(file_path)
    # Mode "w" (not "x"): re-running the cell overwrites earlier transcripts rather
    # than raising FileExistsError after the inference has already been paid for.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(predicted)
    confidence.append(confidence_score)
    print(predicted)
    print(confidence_score)

quite_1.mp4
data/VSR_result/quite_1.txt
DO YOU LIKE LEARNING YES I LEARNED VERY MUCH I GOT THIS HAPPY WHEN I WAS 12 YEARS OLD
-5.134954452514648
quite_2.mp4
data/VSR_result/quite_2.txt
THERE'S NO 10 AT LC6 I JUST GOT BACK FROM SUMMER CAMP I WAS IN GREECE I GOT TO TRY SO MANY THINGS FOR THE FIRST TIME
-12.132772445678711
quite_3.mp4
data/VSR_result/quite_3.txt
SATYA DID YOU GET THE PERFUME I GIVE YOU INSTANT BUT I TELL YOU THE TRUTH I DON'T WANT TO PERFUME I'M SORRY I DON'T KNOW DADDY
-15.144222259521484
quite_4.mp4
data/VSR_result/quite_4.txt
I WOULD LIKE TO REGISTER FOR CLASS T NO PROBLEM WHAT CLASS WOULD YOU LIKE TO TAKE I WILL VERY MUCH ENJOY TAKING THE PSYCHOLOGY CLASS BECAUSE I'M CRAZY
-10.184162139892578
quite_5.mp4
data/VSR_result/quite_5.txt
HI BILL I SAW YOUR GRANDMA YESTERDAY OH WHERE WAS THAT I WAS WORKING AROUND THE TRACK AT MY COLLEGE AND SHE WAS WORKING AROUND THE SAME TRACK
-4.815734386444092
quite_6.mp4
data/VSR_result/quite_6.txt
ROY ALWAYS TRIES TO DEFINITELY GET LOUS

In [None]:
import csv

# Dump the collected confidence scores, one value per CSV row.
with open('VSR_confidence.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows([score] for score in confidence)

In [None]:
# List the contents of the current working directory.
!ls