# Use SVFAP conda env


In [1]:
import cv2
import os
import torchvision
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector
from fractions import Fraction

def save2vid(filename, vid, frames_per_second):
    """Write ``vid`` to ``filename`` as a video at the given frame rate.

    Args:
        filename: Destination path of the video file. Missing parent
            directories are created first; a bare file name without a
            directory component is accepted as-is.
        vid: Frame data, handed unchanged to ``torchvision.io.write_video``.
        frames_per_second: Frame rate, as any value ``fractions.Fraction``
            accepts (for example ``24.0``).
    """
    parent_dir = os.path.dirname(filename)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the destination actually has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # A float such as 29.97 has an exact binary value with a huge
    # denominator; limit_denominator() reduces it to a compact rational
    # (2997/100) before it is passed on as the frame rate.
    fps = Fraction(frames_per_second).limit_denominator()
    torchvision.io.write_video(filename, vid, fps)

def preprocess_video(src_filename, dst_filename, default_fps=25.0):
    """Detect landmarks in a video, preprocess it and save the result.

    The source video is run through the module-level ``landmarks_detector``
    and ``dataloader``; the loaded data is then written to ``dst_filename``
    with ``save2vid`` at the frame rate reported by OpenCV for the source.

    Args:
        src_filename: Path of the input video.
        dst_filename: Path of the video file to write.
        default_fps: Frame rate used when OpenCV cannot report a usable one
            (missing, zero, negative, NaN or infinite).
    """
    landmarks = landmarks_detector(src_filename)
    data = dataloader.load_data(src_filename, landmarks)

    capture = cv2.VideoCapture(src_filename)
    try:
        fps_raw = capture.get(cv2.CAP_PROP_FPS)
    finally:
        # Always free the underlying file/decoder handle.
        capture.release()

    # VideoCapture.get() reports 0 (not None) when the property is not
    # available, so the fallback must test the value itself. The chained
    # comparison is also False for NaN and for infinity.
    if fps_raw is not None and 0 < fps_raw < float("inf"):
        fps = float(fps_raw)
    else:
        fps = float(default_fps)
    print("FPS:", fps, "Type:", type(fps))  # Debugging

    save2vid(dst_filename, data, fps)


# Notebook-level helpers that preprocess_video() looks up by name.
dataloader = AVSRDataLoader(
    modality="video",
    speed_rate=1,
    transform=False,
    detector="mediapipe",
    convert_gray=False,
)
landmarks_detector = LandmarksDetector()

In [2]:
import torch
from pipelines.model import AVSR

class InferencePipeline(torch.nn.Module):
    """Bundle landmark detection, data loading and an AVSR model.

    Args:
        modality: ``"audio"``, ``"video"`` or ``"audiovisual"``.
        model_path: Path of the model weights, forwarded to ``AVSR``.
        model_conf: Path of the model configuration, forwarded to ``AVSR``.
        detector: Detector name forwarded to ``AVSRDataLoader``.
        face_track: When true and the modality includes video, a
            ``LandmarksDetector`` is created so landmarks can be computed
            from the input file.
        device: Device string used for the model and for the tensors passed
            to the encoder in ``extract_features``.
    """

    def __init__(self, modality, model_path, model_conf, detector="mediapipe", face_track=False, device="cuda:0"):
        super().__init__()
        self.device = device
        # modality configuration
        self.modality = modality
        self.dataloader = AVSRDataLoader(modality, detector=detector)
        self.model = AVSR(modality, model_path, model_conf, rnnlm=None, rnnlm_conf=None, penalty=0.0, ctc_weight=0.1, lm_weight=0.0, beam_size=40, device=device)
        if face_track and self.modality in ["video", "audiovisual"]:
            self.landmarks_detector = LandmarksDetector()
        else:
            self.landmarks_detector = None

    def process_landmarks(self, data_filename, landmarks_filename):
        """Return landmarks for ``data_filename``, or ``None`` if not needed.

        Args:
            data_filename: Path of the media file to run the detector on.
            landmarks_filename: Accepted for interface compatibility; this
                implementation does not read it.

        Returns:
            The detector output for ``"video"`` / ``"audiovisual"``
            modalities, ``None`` for any other modality.

        Raises:
            RuntimeError: If the modality needs landmarks but the pipeline
                was built without a detector (``face_track=False``).
        """
        if self.modality not in ("video", "audiovisual"):
            return None
        if self.landmarks_detector is None:
            # Without this guard the call below fails with the opaque
            # "'NoneType' object is not callable".
            raise RuntimeError(
                f"modality {self.modality!r} requires landmarks, but no landmarks "
                "detector is available; construct InferencePipeline with face_track=True."
            )
        return self.landmarks_detector(data_filename)

    def _load_data(self, data_filename, landmarks_filename):
        """Validate the input path, compute landmarks and load the data."""
        assert os.path.isfile(data_filename), f"data_filename: {data_filename} does not exist."
        landmarks = self.process_landmarks(data_filename, landmarks_filename)
        return self.dataloader.load_data(data_filename, landmarks)

    def forward(self, data_filename, landmarks_filename=None):
        """Load ``data_filename`` and return ``self.model.infer`` on it."""
        data = self._load_data(data_filename, landmarks_filename)
        transcript = self.model.infer(data)
        return transcript

    def extract_features(self, data_filename, landmarks_filename=None, extract_resnet_feats=False):
        """Load ``data_filename`` and return the model's encoder output.

        Args:
            data_filename: Path of the media file to encode.
            landmarks_filename: Forwarded to ``process_landmarks``.
            extract_resnet_feats: Passed through as the last positional
                argument of ``self.model.model.encode``.

        Returns:
            Whatever ``self.model.model.encode`` returns; the call runs under
            ``torch.no_grad()``.
        """
        data = self._load_data(data_filename, landmarks_filename)
        # The loader may return a 2-tuple or a single tensor-like object;
        # every element is moved to the target device before encoding.
        if isinstance(data, tuple):
            inputs = (data[0].to(self.device), data[1].to(self.device))
        else:
            inputs = (data.to(self.device),)
        with torch.no_grad():
            enc_feats = self.model.model.encode(*inputs, extract_resnet_feats)
        return enc_feats

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1746553848.020871  887977 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1746553848.039631  887984 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [3]:
# Preprocess clip.mp4 and write the result to /clip_roi.mp4.
preprocess_video(
    src_filename="clip.mp4",
    dst_filename="/clip_roi.mp4",
)

FPS: 24.0 Type: <class 'float'>


In [4]:
# Model selection for the inference pipeline.
modality = "video"
model_path = "LRS3_V_WER19.1/model.pth"
model_conf = "LRS3_V_WER19.1/model.json"

# face_track=True makes the pipeline create its own landmarks detector.
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

Namespace(a_upsample_ratio=1, accum_grad=2, adim=768, aheads=12, apply_uttmvn=True, aux_lsm_weight=0.0, backend='pytorch', badim=320, batch_bins=0, batch_count='auto', batch_frames_in=0, batch_frames_inout=0, batch_frames_out=0, bdropout_rate=0.0, beam_size=4, blayers=2, bnmask=2, bprojs=300, btype='blstmp', bunits=300, cnn_module_kernel=31, config2=None, config3=None, context_residual=False, criterion='acc', ctc_type='warpctc', ctc_weight=0.3, debugmode=1, dec_init=None, dec_init_mods=['att.', ' dec.'], dict='data/lang_1char/units.txt', dlayers=6, dropout_rate=0.1, dunits=3072, early_stop_criterion='validation/main/acc', elayers=12, enc_init=None, enc_init_mods=['enc.enc.'], eps=1e-08, eps_decay=0.01, eunits=3072, fbank_fmax=None, fbank_fmin=0.0, fbank_fs=16000, grad_clip=5.0, grad_noise=False, labels_type='unigram5000', lm_weight=0.1, lsm_weight=0.1, macaron_style=1, maxlen_in=220, maxlen_out=220, maxlenratio=0.0, minibatches=0, minlenratio=0.0, model_module='espnet.nets.pytorch_back

W0000 00:00:1746553863.902075  888282 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [5]:
# Encode the preprocessed clip and report the shape of the encoder output.
features = pipeline.extract_features(data_filename="/clip_roi.mp4")
print(features.shape)

W0000 00:00:1746553863.921940  888289 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


torch.Size([178, 768])


In [2]:
import numpy as np

In [11]:
# Inspect the stored 'visual' features of one validation sample.
features_npz_path = (
    "/data/audio-video-deepfake-2/ASR_features/LRS3_V_WER19.1/real+fake/val/"
    "id00046/9h3XRcuVI0s/00009/fake_video_real_audio.npz"
)
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code if the file is untrusted; drop it if every array in
# the archive is a plain numeric array.
x = np.load(features_npz_path, allow_pickle=True)
x['visual']

array([[-0.10826629,  0.10884785,  0.04951465, ..., -0.2946354 ,
        -0.29910976, -0.3329704 ],
       [-0.18616872,  0.1710677 , -0.00855133, ..., -0.03903882,
        -0.21815234, -0.5626292 ],
       [-0.22646238,  0.42673042,  0.02385088, ...,  0.126887  ,
        -0.15350495, -0.3092311 ],
       ...,
       [-0.02050836, -0.00163938,  0.8248512 , ..., -0.01400121,
         0.02664605,  0.00908441],
       [-0.00380427, -0.00457797,  0.8197377 , ..., -0.01151075,
         0.00412359, -0.00309835],
       [ 0.00163391,  0.01081297,  0.79678965, ..., -0.00742658,
         0.00794817,  0.02704286]], dtype=float32)