Skip to content

Commit

Permalink
801 add large v2 v3 models (#803)
Browse files Browse the repository at this point in the history
  • Loading branch information
raivisdejus committed Jun 18, 2024
1 parent 82da36f commit 900f6c9
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
22 changes: 18 additions & 4 deletions buzz/model_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,17 @@ class WhisperModelSize(str, enum.Enum):
SMALL = "small"
MEDIUM = "medium"
LARGE = "large"
LARGEV2 = "large-v2"
LARGEV3 = "large-v3"

def to_faster_whisper_model_size(self) -> str:
if self == WhisperModelSize.LARGE:
return "large-v2"
return "large-v1"
return self.value

def to_whisper_cpp_model_size(self) -> str:
if self == WhisperModelSize.LARGE:
return "large-v1"
return self.value

def __str__(self):
Expand Down Expand Up @@ -201,7 +208,9 @@ def get_local_model_path(self) -> Optional[str]:
"base": "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe",
"small": "1be3a9b2063867b937e64e2ec7483364a79917e157fa98c5d94b5c1fffea987b",
"medium": "6c14d5adee5f86394037b4e4e8b59f1673b6cee10e3cf0b11bbdbee79c156208",
"large": "64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2",
"large-v1": "7d99f41a10525d0206bddadd86760181fa920438b6b33237e3118ff6c83bb53d",
"large-v2": "9a423fe4d40c82774b6af34115b8b935f34152246eb19e80e376071d3f999487",
"large-v3": "64d182b440b98d5203c4f9bd541544d84c605196c4f7b845dfa11fb23594d1e2",
}


Expand Down Expand Up @@ -318,7 +327,12 @@ def download_faster_whisper_model(
% (size, ", ".join(faster_whisper.utils._MODELS))
)

repo_id = "guillaumekln/faster-whisper-%s" % size
logging.debug("Downloading Faster Whisper model: %s", size)

if size == WhisperModelSize.LARGEV3:
repo_id = "Systran/faster-whisper-large-v3"
else:
repo_id = "guillaumekln/faster-whisper-%s" % size

allow_patterns = [
"model.bin", # largest by size first
Expand Down Expand Up @@ -357,7 +371,7 @@ def __init__(self, model: TranscriptionModel):

def run(self) -> None:
if self.model.model_type == ModelType.WHISPER_CPP:
model_name = self.model.whisper_model_size.value
model_name = self.model.whisper_model_size.to_whisper_cpp_model_size()
url = huggingface_hub.hf_hub_url(
repo_id="ggerganov/whisper.cpp",
filename=f"ggml-{model_name}.bin",
Expand Down
8 changes: 7 additions & 1 deletion docs/docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,10 @@ sidebar_position: 5
Relevant tools:
- Mac OS - [BlackHole](https://github.com/ExistentialAudio/BlackHole).
- Windows - [VB CABLE](https://vb-audio.com/Cable/)
- Linux - [PulseAudio Volume Control](https://wiki.ubuntu.com/record_system_sound)
- Linux - [PulseAudio Volume Control](https://wiki.ubuntu.com/record_system_sound)

4. **What model should I use?**

The model size to use depends on your hardware and use case. Smaller models run faster but produce more inaccuracies. Larger models are more accurate but require more powerful hardware or take longer to transcribe.

When choosing among the large models, consider the following. "Large" is the first released, older model. "Large-V2" is a later, updated model with better accuracy; for some languages it is considered the most robust and stable. "Large-V3" is the latest model, with the best accuracy in many cases, but it can sometimes hallucinate or invent words that were never in the audio. The only sure way to know which model best suits your needs is to test them all in your language.

0 comments on commit 900f6c9

Please sign in to comment.