Skip to content

Commit

Permalink
Use the av library instead!
Browse files Browse the repository at this point in the history
Use the "av" library instead, which "just makes sense" since ```faster-library``` already uses it anyway.  This removes the ```pyaudio``` dependency as well as those on ```scipy``` and ```numpy```.  It also means users no longer need to know how to install ffmpeg, since ```av``` is basically a wrapper for ffmpeg.
  • Loading branch information
BBC-Esq committed Mar 31, 2024
1 parent 0dfbdb2 commit fc6e93d
Showing 1 changed file with 31 additions and 26 deletions.
57 changes: 31 additions & 26 deletions whisper_live/utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
import os
import textwrap
import scipy
import ffmpeg
import numpy as np

import av
from pathlib import Path

def clear_screen():
    """Clears the console screen.

    Runs ``cls`` when ``os.name`` is ``"nt"`` and ``clear`` otherwise.
    """
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)


def print_transcript(text):
    """Prints formatted transcript text.

    The pieces in ``text`` are concatenated without a separator and the
    result is word-wrapped to at most 60 columns, one printed line per
    wrapped line.

    Args:
        text: An iterable of strings (a single string also works).
    """
    joined = "".join(text)
    for wrapped_line in textwrap.wrap(joined, width=60):
        print(wrapped_line)


def format_time(s):
"""Convert seconds (float) to SRT time format."""
hours = int(s // 3600)
Expand All @@ -25,27 +21,23 @@ def format_time(s):
milliseconds = int((s - int(s)) * 1000)
return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def create_srt_file(segments, output_file):
    """Creates an SRT file from the given segments.

    Args:
        segments: Iterable of mappings with ``'start'``, ``'end'`` and
            ``'text'`` keys. ``'start'`` and ``'end'`` are converted with
            ``float()`` and rendered through ``format_time``.
        output_file: Path of the file to write; it is opened in ``'w'`` mode
            with UTF-8 encoding.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        # SRT cues are numbered from 1.
        for index, entry in enumerate(segments, start=1):
            begin = format_time(float(entry['start']))
            finish = format_time(float(entry['end']))
            caption = entry['text']

            # Cue layout: number, time range, text, then a blank separator line.
            handle.writelines((
                f"{index}\n",
                f"{begin} --> {finish}\n",
                f"{caption}\n\n",
            ))


def _real_frames(frames):
    """Normalize an ``AudioResampler.resample`` result to a list of actual frames."""
    if frames is None:
        return []
    if not isinstance(frames, (list, tuple)):
        frames = [frames]
    # Drop ``None`` entries so a flush call can never hand ``None`` to the
    # encoder, which would flush the encoder prematurely.
    return [frame for frame in frames if frame is not None]


def resample(file: str, sr: int = 16000):
    """
    Open an audio file and read as mono waveform, resampling as necessary,
    save the resampled audio using the av library.

    The first audio stream of ``file`` is decoded, converted to signed 16-bit
    mono at ``sr`` Hz and written as a ``pcm_s16le`` stream.

    Args:
        file (str): The audio file to open
        sr (int): The sample rate of the resampled audio

    Returns:
        resampled_file (str): The resampled audio file. Its name is the stem
            of ``file`` followed by ``_resampled.wav``; since only the stem is
            used, the path is relative to the current working directory.

    Raises:
        RuntimeError: If ``file`` contains no audio stream.
    """
    # Both containers are context-managed so they are closed even when
    # decoding or encoding fails part-way through.
    with av.open(file) as container:
        if not container.streams.audio:
            raise RuntimeError(f"Failed to load audio: no audio stream found in {file}")

        resampler = av.AudioResampler(
            format='s16',
            layout='mono',
            rate=sr,
        )

        output_file = Path(file).stem + "_resampled.wav"
        with av.open(output_file, mode='w') as output_container:
            output_stream = output_container.add_stream('pcm_s16le', rate=sr)
            output_stream.layout = 'mono'

            def write(frame):
                # Passing ``None`` flushes whatever the encoder still buffers.
                for packet in output_stream.encode(frame):
                    output_container.mux(packet)

            for frame in container.decode(audio=0):
                # NOTE(review): presumably cleared so source timestamps are not
                # carried into the resampled output - confirm against PyAV docs.
                frame.pts = None
                for resampled_frame in _real_frames(resampler.resample(frame)):
                    write(resampled_frame)

            # Drain samples still buffered inside the resampler; without this
            # the tail of the audio can be missing from the output.
            for resampled_frame in _real_frames(resampler.resample(None)):
                write(resampled_frame)

            write(None)

    return output_file

0 comments on commit fc6e93d

Please sign in to comment.