### create venv

In [4]:
import subprocess
import tempfile

def open_mp4_with_ffmpeg(file_path, chunk_duration=8):
    """
    Decode the audio track of a media file with ffmpeg and stream it in chunks.

    ffmpeg is asked for headerless PCM (``-f s16le``) rather than a WAV
    container, so every yielded byte is sample data. A WAV header written by
    ffmpeg is not guaranteed to be exactly 44 bytes (extra metadata chunks may
    precede the ``data`` chunk), which is why no fixed-size header is skipped.

    Args:
        file_path: Path to the MP4 file
        chunk_duration: Duration of each audio chunk in seconds (default: 8).
            Fractional values are allowed and are rounded down to a whole
            number of samples.

    Yields:
        bytes: Raw PCM audio (16 kHz, mono, signed 16-bit little-endian, no
        header). Each chunk holds ``chunk_duration`` seconds of audio except
        the last one, which may be shorter.

    Raises:
        ValueError: If ``chunk_duration`` is shorter than one sample.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            after the stream has been read to its end; ``stderr`` carries the
            error output ffmpeg produced.
    """
    sample_rate = 16000
    bytes_per_sample = 2  # pcm_s16le, mono

    # Work in whole samples so a chunk never ends in the middle of a sample
    # and the read size is always an int, even for fractional durations.
    samples_per_chunk = int(sample_rate * chunk_duration)
    if samples_per_chunk <= 0:
        raise ValueError("chunk_duration must cover at least one audio sample")
    bytes_per_chunk = samples_per_chunk * bytes_per_sample

    command = [
        'ffmpeg',
        '-nostdin',  # Never wait for interactive input
        '-loglevel', 'error',  # Only report real errors on stderr
        '-i', file_path,
        '-vn',  # No video
        '-acodec', 'pcm_s16le',  # PCM 16-bit little-endian
        '-ar', str(sample_rate),  # 16kHz sample rate
        '-ac', '1',  # Mono
        '-f', 's16le',  # Headerless raw PCM
        'pipe:1'  # Output to stdout
    ]

    # stderr goes to a temporary file instead of a pipe: a pipe that nobody
    # reads can fill up and block ffmpeg while we are blocked on stdout.
    with tempfile.TemporaryFile() as stderr_file:
        process = subprocess.Popen(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=stderr_file,
        )
        exhausted = False
        try:
            # Read and yield chunks until ffmpeg closes its stdout
            while True:
                chunk = process.stdout.read(bytes_per_chunk)
                if not chunk:
                    break
                yield chunk
            exhausted = True
        finally:
            # Reached on normal completion, on errors, and when the consumer
            # stops iterating early (generator closed or garbage collected).
            if not exhausted:
                process.kill()
            process.stdout.close()
            returncode = process.wait()

        if returncode != 0:
            stderr_file.seek(0)
            raise subprocess.CalledProcessError(
                returncode, command, stderr=stderr_file.read()
            )

In [5]:
import numpy as np
from IPython.display import Audio, display

def play_audio_chunk(audio_bytes, sample_rate=16000):
    """
    Render one block of raw PCM audio as an autoplaying notebook player.

    The bytes are interpreted as signed 16-bit samples, scaled to float32
    by dividing by 32768, and handed to IPython's ``Audio`` widget.

    Args:
        audio_bytes: Raw audio data in bytes (16-bit PCM samples)
        sample_rate: Playback rate in Hz (default: 16000)
    """
    # Reinterpret the byte buffer as int16 samples (no copy is made here)
    pcm_samples = np.frombuffer(audio_bytes, dtype=np.int16)

    # Scale by the int16 magnitude so full-scale input maps to about [-1, 1]
    scaled_samples = np.divide(pcm_samples.astype(np.float32), 32768.0)

    player = Audio(scaled_samples, rate=sample_rate, autoplay=True)
    display(player)

def play_audio_from_generator(audio_generator, sample_rate=16000):
    """
    Feed every chunk produced by an iterable to ``play_audio_chunk``.

    Chunks are consumed lazily, one at a time, in the order they are
    produced; one player is displayed per chunk.

    Args:
        audio_generator: Iterable (typically a generator) of raw audio chunks
        sample_rate: Sample rate passed through for each chunk (default: 16000)
    """
    for pcm_chunk in audio_generator:
        play_audio_chunk(pcm_chunk, sample_rate)

In [6]:
# Listening check: decode 'example.mp4' in 6-second chunks and play only the
# first two of them, then stop pulling from the generator.
for i, chunk in enumerate(open_mp4_with_ffmpeg('example.mp4', chunk_duration=6)):  
    # One notebook audio player is displayed per chunk.
    play_audio_chunk(chunk)
    if i >= 1:  # i=0 is first chunk, i=1 is second chunk
        break

In [13]:
import wave

# Save a single decoded chunk as a standalone WAV file.
# The standard-library `wave` writer emits the RIFF/fmt/data header and fills
# in the size fields when the file is closed, so no manual header patching
# with seek() is needed and the header stays valid even if the loop raises.
save_path = 'extracted_audio.wav'
with wave.open(save_path, 'wb') as wav_file:
    # Must match the PCM layout produced by open_mp4_with_ffmpeg
    wav_file.setnchannels(1)  # NumChannels (mono)
    wav_file.setsampwidth(2)  # Bytes per sample (16-bit PCM)
    wav_file.setframerate(16000)  # SampleRate

    for i, chunk in enumerate(open_mp4_with_ffmpeg('example.mp4', chunk_duration=4)):
        # skip first chunk for demo purposes
        if i == 0:
            continue
        wav_file.writeframes(chunk)
        # Only the second 4-second chunk (i == 1) is saved; stop afterwards
        if i >= 1:
            break

In [3]:
# Fetch the Q5_0-quantized GGML weights of the French Whisper large-v3 model
# from the Hugging Face Hub into ./models/whisper-large-v3-french.
# The call is the cell's last expression, so its return value is displayed.
from huggingface_hub import hf_hub_download

hf_hub_download(
    repo_id='bofenghuang/whisper-large-v3-french',
    filename='ggml-model-q5_0.bin',
    local_dir='./models/whisper-large-v3-french',
)


  from .autonotebook import tqdm as notebook_tqdm


'models/whisper-large-v3-french/ggml-model-q5_0.bin'