Use the av library instead!

Use the "av" library instead, which "just makes sense" since ```faster-library``` already uses it anyway. This removes the ```pyaudio``` dependency as well as the dependencies on ```scipy``` and ```numpy```. It also means users no longer need to know how to install ffmpeg themselves, since ```av``` is basically a wrapper for ffmpeg.
collabora · Mar 31, 2024 · fc6e93d · fc6e93d
1 parent 0dfbdb2
commit fc6e93d
Showing 1 changed file with 31 additions and 26 deletions.
diff --git a/whisper_live/utils.py b/whisper_live/utils.py
@@ -1,22 +1,18 @@
 import os
 import textwrap
-import scipy
-import ffmpeg
-import numpy as np
-
+import av
+from pathlib import Path
 
 def clear_screen():
     """Clears the console screen."""
     os.system("cls" if os.name == "nt" else "clear")
 
-
 def print_transcript(text):
     """Prints formatted transcript text."""
     wrapper = textwrap.TextWrapper(width=60)
     for line in wrapper.wrap(text="".join(text)):
         print(line)
 
-
 def format_time(s):
     """Convert seconds (float) to SRT time format."""
     hours = int(s // 3600)
@@ -25,27 +21,23 @@ def format_time(s):
     milliseconds = int((s - int(s)) * 1000)
     return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
 
-
 def create_srt_file(segments, output_file):
+    """Creates an SRT file from the given segments."""
     with open(output_file, 'w', encoding='utf-8') as srt_file:
         segment_number = 1
         for segment in segments:
             start_time = format_time(float(segment['start']))
             end_time = format_time(float(segment['end']))
             text = segment['text']
-
             srt_file.write(f"{segment_number}\n")
             srt_file.write(f"{start_time} --> {end_time}\n")
             srt_file.write(f"{text}\n\n")
-
             segment_number += 1
 
-
 def resample(file: str, sr: int = 16000):
     """
-    # https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/audio.py#L22
     Open an audio file and read as mono waveform, resampling as necessary,
-    save the resampled audio
+    save the resampled audio using the av library.
 
     Args:
         file (str): The audio file to open
@@ -54,18 +46,31 @@ def resample(file: str, sr: int = 16000):
     Returns:
         resampled_file (str): The resampled audio file
     """
-    try:
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        out, _ = (
-            ffmpeg.input(file, threads=0)
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
-        )
-    except ffmpeg.Error as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-    np_buffer = np.frombuffer(out, dtype=np.int16)
+    container = av.open(file)
+    stream = next(s for s in container.streams if s.type == 'audio')
+
+    resampler = av.AudioResampler(
+        format='s16',
+        layout='mono',
+        rate=sr,
+    )
+
+    output_file = Path(file).stem + "_resampled.wav"
+    output_container = av.open(output_file, mode='w')
+    output_stream = output_container.add_stream('pcm_s16le', rate=sr)
+    output_stream.layout = 'mono'
+
+    for frame in container.decode(audio=0):
+        frame.pts = None
+        resampled_frames = resampler.resample(frame)
+        if resampled_frames is not None:
+            for resampled_frame in resampled_frames:
+                for packet in output_stream.encode(resampled_frame):
+                    output_container.mux(packet)
+
+    for packet in output_stream.encode(None):
+        output_container.mux(packet)
+
+    output_container.close()
 
-    resampled_file = f"{file.split('.')[0]}_resampled.wav"
-    scipy.io.wavfile.write(resampled_file, sr, np_buffer.astype(np.int16))
-    return resampled_file
+    return output_file