# Setup

In [None]:
# Directory that holds the input videos. Leave as None to let the next cell
# pick a default based on the runtime and the current working directory.
videos_dir = None
# Root directory for the generated audio output. Leave as None to default to
# "<current working directory>/Audio" in the next cell.
save_dir_all = None

In [None]:
import os

dir_path = os.getcwd()

# Only fill in the video directory when the config cell left it unset.
if not videos_dir:
    if any(tag in str(get_ipython()) for tag in ('google.colab', 'kaggle')):
        # Hosted runtimes (Colab / Kaggle): dataset sits next to the notebook.
        # Update this path as necessary
        videos_dir = f'{dir_path}/AIC_Video'
    else:
        # Local run: dataset lives in "<parent of cwd>/dataset".
        parent_dir_path = os.path.dirname(dir_path)
        videos_dir = f'{parent_dir_path}/dataset/AIC_Video'

# Output root defaults to an "Audio" folder under the working directory.
if not save_dir_all:
    save_dir_all = f'{dir_path}/Audio'

In [None]:
! pip install ffmpeg-python
! pip install pyannote.audio



In [None]:
import os
import ffmpeg
from tqdm import tqdm
import json
from pyannote.audio import Pipeline
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Parse data path

In [None]:
def parse_video_info(videos_dir):
    """Build an index of video files from the dataset directory layout.

    Every entry of ``videos_dir`` is split on ``'_'`` and its last token is
    taken as the data-part name. Videos for a part are then read from
    ``<videos_dir>/Videos_<part>/video``. Entries whose video folder does not
    exist (stray files, ``.ipynb_checkpoints``, archives, ...) are skipped
    instead of raising ``FileNotFoundError``.

    Args:
        videos_dir: Root directory of the video dataset.

    Returns:
        dict: ``{data_part: {video_id: video_path}}`` where ``video_id`` is the
        last ``'_'``-separated token of the file name with ``.mp4`` removed and
        ``video_path`` is ``<videos_dir>/Videos_<part>/video/<file name>``.
        A part whose video folder exists but is empty maps to ``{}``.

    Raises:
        OSError: If ``videos_dir`` itself cannot be listed.
    """
    all_video_paths = {}

    for entry in sorted(os.listdir(videos_dir)):
        data_part = entry.split('_')[-1]
        if data_part in all_video_paths:
            # Several entries can map to the same part; index it only once.
            continue

        data_part_path = f'{videos_dir}/Videos_{data_part}/video'
        if not os.path.isdir(data_part_path):
            # Not a dataset part (no video folder for it) -- ignore the entry.
            continue

        part_videos = {}
        for video_file in sorted(os.listdir(data_part_path)):
            video_id = video_file.replace('.mp4', '').split('_')[-1]
            part_videos[video_id] = f'{data_part_path}/{video_file}'
        all_video_paths[data_part] = part_videos

    return all_video_paths

# VOICE ACTIVITY DETECTION (VAD)

In [None]:
def initialize_vad_pipeline():
    """Load the pyannote voice-activity-detection pipeline.

    The Hugging Face access token is read from the ``HUGGINGFACE_TOKEN``
    environment variable. When the variable is unset or empty, ``None`` is
    passed as ``use_auth_token`` so that credentials stored outside the
    notebook (e.g. via ``huggingface-cli login``) can be used instead.

    Returns:
        The loaded pipeline, moved to CUDA when ``torch.cuda.is_available()``
        and to CPU otherwise, or ``None`` if anything went wrong while loading
        (the error is printed).
    """
    try:
        # SECURITY: never embed an access token in the notebook source. Any
        # token that was ever committed here must be treated as leaked and
        # revoked in the Hugging Face account settings.
        token = os.getenv("HUGGINGFACE_TOKEN") or None
        if token is None:
            print("HUGGINGFACE_TOKEN is not set; falling back to cached Hugging Face credentials.")

        # Attempt to load the pipeline
        pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                            use_auth_token=token)

        # from_pretrained can hand back None instead of raising (e.g. when
        # access to the model is denied), so check explicitly.
        if pipeline is None:
            raise ValueError("Pipeline returned None. Please check your token and model name.")

        # Move the pipeline to the appropriate device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pipeline = pipeline.to(device)

        return pipeline

    except Exception as e:
        print(f"Error initializing VAD pipeline: {e}")
        return None

def get_video_frame_rate(video_path):
    """Return the average frame rate of the first video stream of a file.

    Args:
        video_path: Path of the video file, passed to ``ffmpeg.probe``.

    Returns:
        float: The ``avg_frame_rate`` of the first stream whose
        ``codec_type`` is ``'video'``, in frames per second.

    Raises:
        ValueError: If the file has no video stream, or if the reported
            ``avg_frame_rate`` is missing, malformed, or not positive
            (e.g. ``"0/0"``).
    """
    # Function-scope import so the block does not depend on the import cell.
    from fractions import Fraction

    probe = ffmpeg.probe(video_path)
    video_streams = [stream for stream in probe['streams'] if stream.get('codec_type') == 'video']
    if not video_streams:
        raise ValueError("No video stream found in the file.")

    raw_rate = video_streams[0].get('avg_frame_rate')
    # The rate arrives as a rational string such as "30000/1001". Parse it with
    # Fraction rather than eval(): the text comes from file metadata and must
    # never be executed as Python code.
    try:
        frame_rate = float(Fraction(str(raw_rate).strip()))
    except (ValueError, ZeroDivisionError) as exc:
        raise ValueError(f"Invalid avg_frame_rate {raw_rate!r} for {video_path}") from exc

    # A zero rate would cause a division by zero when callers convert frame
    # indices back to timestamps.
    if frame_rate <= 0:
        raise ValueError(f"Non-positive avg_frame_rate {raw_rate!r} for {video_path}")
    return frame_rate
    
def run_vad_on_audio(pipeline, audio_path):
    """Apply a VAD pipeline to an audio file and collect the speech spans.

    Args:
        pipeline: Callable VAD pipeline; invoked as ``pipeline(audio_path)``.
        audio_path: Path of the audio file handed to the pipeline.

    Returns:
        list[tuple]: One ``(start, end)`` pair for each segment in the support
        of the pipeline output's timeline, in iteration order.
    """
    timeline = pipeline(audio_path).get_timeline()
    return [(segment.start, segment.end) for segment in timeline.support()]

# Extract audio and split it into speech segments

In [None]:
def extract_audio(video_path, output_audio_path):
    """Write the audio of a video file to ``output_audio_path`` using ffmpeg.

    Args:
        video_path: Path of the source video.
        output_audio_path: Destination path; an existing file is overwritten.

    Returns:
        ``output_audio_path`` on success, or ``None`` when any exception is
        raised while running ffmpeg (the error is printed, not re-raised).
    """
    try:
        source = ffmpeg.input(video_path)
        job = source.output(output_audio_path)
        job.run(overwrite_output=True)
    except Exception as e:
        print(f"Error extracting audio from {video_path}: {e}")
        return None
    return output_audio_path

def _split_video_audio(vad_pipeline, video_id, video_path, video_save_dir):
    """Extract one video's audio, run VAD on it and export each speech segment.

    Writes ``shot_<start>-<end>.mp3`` files and ``<video_id>_speech_frames.json``
    into ``video_save_dir``. The temporary ``full_audio.wav`` is always removed,
    including when a later step raises.

    Returns:
        The list of ``[start_frame, end_frame]`` pairs written to the JSON
        file, or ``None`` when audio extraction failed.
    """
    full_audio_path = os.path.join(video_save_dir, "full_audio.wav")
    try:
        # extract_audio reports its own error and returns None on failure;
        # running VAD on a missing file would only raise a confusing error.
        if extract_audio(video_path, full_audio_path) is None:
            return None

        # Run VAD on full audio
        vad_output = vad_pipeline(full_audio_path)

        # Get video frame rate
        frame_rate = get_video_frame_rate(video_path)

        # Convert each detected speech turn to a [start_frame, end_frame] pair.
        speech_segments = []
        for speech_turn, _, _ in vad_output.itertracks(yield_label=True):
            start_frame = int(speech_turn.start * frame_rate)
            end_frame = int(speech_turn.end * frame_rate)
            speech_segments.append([start_frame, end_frame])

        # Cut on the frame-aligned timestamps so that each audio clip matches
        # the frame range encoded in its file name.
        for start_frame, end_frame in speech_segments:
            start_time = start_frame / frame_rate
            end_time = end_frame / frame_rate

            segment_audio_path = os.path.join(video_save_dir, f"shot_{start_frame}-{end_frame}.mp3")

            # Extract and save the speech segment
            (
                ffmpeg
                .input(full_audio_path, ss=start_time, to=end_time)
                .output(segment_audio_path)
                .run(overwrite_output=True)
            )

        # Save frame ranges to JSON
        json_path = os.path.join(video_save_dir, f"{video_id}_speech_frames.json")
        with open(json_path, 'w') as f:
            json.dump(speech_segments, f)

        return speech_segments
    finally:
        # Remove the full audio file even when a step above failed.
        if os.path.exists(full_audio_path):
            os.remove(full_audio_path)


def process_videos_with_vad(all_video_paths, save_dir_all):
    """Process each video with VAD and split its audio based on speech segments.

    For every video a folder ``<save_dir_all>/<part>/<video_id>`` is created and
    filled with one ``shot_<start_frame>-<end_frame>.mp3`` per detected speech
    segment plus ``<video_id>_speech_frames.json`` listing the frame ranges.

    A video that fails (audio extraction, VAD, probing or cutting) is reported
    and skipped so that the remaining videos are still processed.

    Args:
        all_video_paths: ``{data_part: {video_id: video_path}}`` mapping.
        save_dir_all: Root output directory; created (with parents) if missing.

    Returns:
        None. Returns early, after printing a message, when the VAD pipeline
        cannot be initialized.
    """
    vad_pipeline = initialize_vad_pipeline()

    if vad_pipeline is None:
        print("Failed to initialize the VAD pipeline. Please check your configuration.")
        return

    # makedirs also creates missing parent directories and tolerates an
    # existing directory, unlike exists() + mkdir().
    os.makedirs(save_dir_all, exist_ok=True)

    for key in all_video_paths.keys():
        save_dir = f"{save_dir_all}/{key}"
        os.makedirs(save_dir, exist_ok=True)

        if not all_video_paths[key]:
            print(f"Skipping empty AIC_Video subdirectory: {key}")
            continue

        video_paths_dict = all_video_paths[key]
        video_ids = sorted(video_paths_dict.keys())
        for video_id in tqdm(video_ids):
            video_path = video_paths_dict[video_id]

            # Create a folder for this video_id
            video_save_dir = os.path.join(save_dir, video_id)
            os.makedirs(video_save_dir, exist_ok=True)

            try:
                _split_video_audio(vad_pipeline, video_id, video_path, video_save_dir)
            except Exception as e:
                # One broken video must not abort a long batch run.
                print(f"Error processing video {video_id} ({video_path}): {e}")

In [None]:
# Index the videos found under videos_dir (grouped by data part and video id).
all_video_paths = parse_video_info(videos_dir)
# Run VAD on every indexed video and write the results under save_dir_all.
process_videos_with_vad(all_video_paths, save_dir_all)

Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


  0%|          | 0/2 [00:00<?, ?it/s]ffmpeg version 6.1.1-3ubuntu5+esm1 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 13 (Ubuntu 13.2.0-23ubuntu4)
  configuration: --prefix=/usr --extra-version=3ubuntu5+esm1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --disable-omx --enable-gnutls --enable-libaom --enable-libass --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libharfbuzz --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-