In [2]:
# Setup (run once per environment, e.g. in its own cell):
#   %pip install transformers datasets evaluate jiwer accelerate librosa soundfile


In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Pick the execution target: first CUDA GPU in half precision when one is
# available, otherwise the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the speech-to-text checkpoint from safetensors weights in the chosen
# dtype, then move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor that
# the pipeline below is wired up with.
processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline configured for long-form audio: 30-second chunks, batches of
# 16, timestamps returned with every transcription.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)

# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]

# result = pipe(sample)
# print(result["text"])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
# Load the validation split of the "clean" configuration and keep the audio
# entry of its first example as the clip to transcribe.
dataset = load_dataset(
    path="distil-whisper/librispeech_long",
    name="clean",
    split="validation",
)
sample = dataset[0]["audio"]


In [23]:
# Transcribe the clip with timestamps, forwarding the language to generation.
result = pipe(
    sample,
    generate_kwargs={"language": "english"},
    return_timestamps=True,
)


In [20]:
# Show the full transcript first, then the timestamped chunks on the next line.
print(result["text"], result["chunks"], sep="\n")

[{'timestamp': (0.0, 5.28), 'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'}, {'timestamp': (6.34, 10.1), 'text': " Nor is Mr. Quilter's manner less interesting than his matter."}, {'timestamp': (10.92, 17.6), 'text': ' He tells us that at this festive season of the year, with Christmas and roast beef looming before us,'}, {'timestamp': (18.44, 22.58), 'text': ' similes drawn from eating and its results occur most readily to the mind.'}, {'timestamp': (23.16, 28.66), 'text': " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all,"}, {'timestamp': (29.1, 32.48), 'text': ' and can discover in it but little of rocky Ithaca.'}, {'timestamp': (33.62, 37.86), 'text': " Linnell's pictures are a sort of Upguards and Adam paintings,"}, {'timestamp': (37.86, 42.88), 'text': " and Mason's exquisite idylls are as national as a jingo poem."}, {'timestamp': (44.56, 45.78), 'text': " Mr. Burkett Foster's"}, {'timestamp

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
from datasets import load_dataset

# Speculative (assisted) decoding: a small assistant model drafts tokens that
# the main Whisper model then verifies.

# Pick the execution target: first CUDA GPU in half precision when one is
# available, otherwise the CPU in full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# The assistant is loaded decoder-only (AutoModelForCausalLM), so during
# generation it consumes the *main* model's encoder outputs. It therefore has
# to share the encoder and the tokenizer of the main model.
# "openai/whisper-tiny" does not qualify for whisper-large-v3 (different
# hidden size and vocabulary); distil-large-v3 is the checkpoint published
# for this purpose.
assistant_model_id = "distil-whisper/distil-large-v3"

assistant_model = AutoModelForCausalLM.from_pretrained(
    assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
assistant_model.to(device)

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# No batch_size is passed here: assisted generation is run one sample at a
# time, so the pipeline default is kept.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    generate_kwargs={"assistant_model": assistant_model},
    torch_dtype=torch_dtype,
    device=device,
)

# Small dummy LibriSpeech split; the first example's audio is the test clip.
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])


In [None]:
# Phoneme transcription: see the Wav2Vec2Phoneme model documentation:
# https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme