reference: https://huggingface.co/openai/whisper-large-v3

pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio]

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

import string

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/regal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/regal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(device)

cuda:0


In [3]:
#model_id = "openai/whisper-large-v3"
model_id = "openai/whisper-tiny"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

In [4]:
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps="word",
    torch_dtype=torch_dtype,
    device=device
)

In [None]:
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

In [5]:
#result = pipe(sample)
result = pipe("/home/regal/devel/ws_cacti/src/hri_cacti_xr/speech_recognition/speech_to_text_research/openai/whisper_large_v3/recorded_audio.wav",
              generate_kwargs={"language": "english"},
              return_timestamps="word")

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


In [6]:
print(result["text"])

 I will tell you right now, I hope that I am recording my voice. Do you think my voice is being recorded? I don't know. Should it be recorded?


In [8]:
import re
print(f"Command details:")

# Remove punctuation from the list of tokens
tokens = nltk.word_tokenize(result["text"])
#tokens = [word for word in nltk.word_tokenize(result["text"]) if word not in string.punctuation]
pos = nltk.pos_tag(tokens)
print(pos)

print()
out = []

for token, tag in pos:
    print(token)

for i, word in enumerate(result["chunks"]):
    print(word[0])
        # out.append(word['text'])
        # print(f"#{i}:{word['text']} ; dur: '{duration:.2f}'; pos: '{pos[i][1]}'")

print(out)


Command details:
[('I', 'PRP'), ('will', 'MD'), ('tell', 'VB'), ('you', 'PRP'), ('right', 'RB'), ('now', 'RB'), (',', ','), ('I', 'PRP'), ('hope', 'VBP'), ('that', 'IN'), ('I', 'PRP'), ('am', 'VBP'), ('recording', 'VBG'), ('my', 'PRP$'), ('voice', 'NN'), ('.', '.'), ('Do', 'VBP'), ('you', 'PRP'), ('think', 'VB'), ('my', 'PRP$'), ('voice', 'NN'), ('is', 'VBZ'), ('being', 'VBG'), ('recorded', 'VBN'), ('?', '.'), ('I', 'PRP'), ('do', 'VBP'), ("n't", 'RB'), ('know', 'VB'), ('.', '.'), ('Should', 'VB'), ('it', 'PRP'), ('be', 'VB'), ('recorded', 'VBN'), ('?', '.')]

I
will
tell
you
right
now
,
I
hope
that
I
am
recording
my
voice
.
Do
you
think
my
voice
is
being
recorded
?
I
do
n't
know
.
Should
it
be
recorded
?


KeyError: 0

In [None]:
# Tokenize the text into individual words
#tokens = nltk.word_tokenize(result["text"])
tokens = nltk.word_tokenize("go right it")

# Perform POS tagging on the list of tokens
tagged_tokens = nltk.pos_tag(tokens)

# Print the tokens and their POS tags
for token, tag in tagged_tokens:
    print(token, tag)