In [2]:
# pip install transformers datasets evaluate jiwer accelerate librosa soundfile


In [3]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]

# result = pipe(sample)
# print(result["text"])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]


In [23]:
result = pipe(sample, return_timestamps=True, generate_kwargs={"language": "english"})


In [20]:
print(result['text'])
print(result['chunks'])

[{'timestamp': (0.0, 5.28), 'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'}, {'timestamp': (6.34, 10.1), 'text': " Nor is Mr. Quilter's manner less interesting than his matter."}, {'timestamp': (10.92, 17.6), 'text': ' He tells us that at this festive season of the year, with Christmas and roast beef looming before us,'}, {'timestamp': (18.44, 22.58), 'text': ' similes drawn from eating and its results occur most readily to the mind.'}, {'timestamp': (23.16, 28.66), 'text': " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all,"}, {'timestamp': (29.1, 32.48), 'text': ' and can discover in it but little of rocky Ithaca.'}, {'timestamp': (33.62, 37.86), 'text': " Linnell's pictures are a sort of Upguards and Adam paintings,"}, {'timestamp': (37.86, 42.88), 'text': " and Mason's exquisite idylls are as national as a jingo poem."}, {'timestamp': (44.56, 45.78), 'text': " Mr. Burkett Foster's"}, {'timestamp

In [86]:
from transformers import pipeline, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

assistant_model_id = "openai/whisper-tiny"

assistant_model = AutoModelForCausalLM.from_pretrained(
    assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
assistant_model.to(device)

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    generate_kwargs={"assistant_model": assistant_model},
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.whisper.configuration_whisper.WhisperConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MptConfig, MusicgenConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, TransfoXLConfig, TrOCRConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

In [None]:
#phoneme transcription

#wav2vec-large -> Learns the phonemes
https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme

In [87]:
#wav2vec2phoneme
from transformers import Wav2Vec2PhonemeCTCTokenizer


#same architecture as wav2vec2 -> https://huggingface.co/docs/transformers/model_doc/wav2vec2

# Load model directly
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("ramimmo/wav2vec2phoneme" )


proc = Wav2Vec2PhonemeCTCTokenizer(

#use this phonemizer - is default-> https://github.com/bootphon/phonemizer#readme
model = AutoModelForCTC.from_pretrained("ramimmo/wav2vec2phoneme")

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


In [88]:
#import torch
#sample_array = dataset[0]["audio"]["array"]
sample=dataset[0]["audio"].keys()

#sample_rate=16000
input_values = processor(dataset[0]["audio"]["array"], sampling_rate=dataset[0]["audio"]["sampling_rate"], return_tensors="pt").input_values
#input_values = processor(sample, return_tensors="pt").input_values



In [90]:
logits = model(input_values).logits

In [91]:
logits.shape
predicted_ids = torch.argmax(logits, dim=-1)

In [49]:
type(predicted_ids)

torch.Tensor

In [53]:
predicted_ids.shape

torch.Size([1, 3122])

In [92]:

transcription = processor.decode(predicted_ids[0])
print(transcription)

#clean paddings
clean_transcription = transcription.replace("[PAD]", "").strip()
print(clean_transcription)

word_list_phonemes=clean_transcription.split(" ")
print(word_list_phonemes)


h#[PAD]m[PAD]eh[PAD]sh#t[PAD]erh#k w[PAD]eh[PAD]lh#t[PAD]er[PAD]eh[PAD]sth[PAD]ey[PAD]ehh#p[PAD]aa[PAD]s[PAD] l[PAD]h#[PAD]ah[PAD]fth[PAD]ah[PAD]m[PAD]eh[PAD]t[PAD] l[PAD]h#k l[PAD]eh[PAD]s[PAD]eh[PAD]sh#[PAD]ehn[PAD] [PAD]w[PAD]ey[PAD]erh#g[PAD] l[PAD]eh[PAD]h#t[PAD]ah[PAD]w[PAD]eh[PAD]lh#k[PAD]ah[PAD]m[PAD]hh[PAD]eh[PAD]sh#g[PAD]aa[PAD]sh#p[PAD]awl[PAD]h#[PAD]n[PAD]aaer[PAD]eh[PAD]sh#[PAD]m[PAD]eh[PAD]sh#t[PAD]er[PAD]h#k [PAD]eh[PAD]lh#t[PAD]er[PAD]s [PAD]m[PAD]eh[PAD]n[PAD]er[PAD]h#l[PAD]eh[PAD]s[PAD]h#[PAD]eh[PAD]n[PAD]h#t[PAD]er[PAD]eh[PAD]sh#t[PAD]ehng[PAD]th[PAD]eh[PAD]n[PAD]hh[PAD]eh[PAD]sh#[PAD]m[PAD]eh[PAD]h#t[PAD]er[PAD]h#hh[PAD]ey[PAD]h#t[PAD]eh[PAD]l [PAD]s[PAD]ah[PAD]sh#[PAD]th[PAD]eh[PAD]h#[PAD]ehh#th[PAD]eh[PAD]s [PAD]f[PAD]eh[PAD]sh#[PAD]eh[PAD]f[PAD] s[PAD]ey[PAD]s[PAD] n[PAD]ah[PAD]f[PAD]th[PAD]ey[PAD]y[PAD]eh[PAD]er[PAD]h#[PAD]w[PAD]eh[PAD]th[PAD]h#k[PAD]er[PAD]eh[PAD]sh#[PAD]m[PAD]eh[PAD]n[PAD]eh[PAD]s[PAD]ehn[PAD]er[PAD]aw[PAD]sh#b[PAD]ey[PAD]f[PAD] l[PAD]uw[PAD]m

In [84]:
for word_phones in word_list_phonemes:
    word_phones_str = ' '.join(word_phones.split('#')).upper()
    #print(phoneme_str)
    print(word_phones_str)

    #check dictionary
    return ipa_dict.get(phoneme_str, None)

h#mehsh#terh#k
wehlh#terehstheyehh#paas
lh#ahfthahmeht
lh#k
lehsehsh#ehn
weyerh#g
lehh#tahwehlh#kahmhhehsh#gaash#pawlh#naaerehsh#mehsh#terh#k
ehlh#ters
mehnerh#lehsh#ehnh#terehsh#tehngthehnhhehsh#mehh#terh#hheyh#tehl
sahsh#thehh#ehh#thehs
fehsh#ehf
seys
nahftheyyeherh#wehthh#kerehsh#mehnehsehnerawsh#beyf
luwmehngh#beyfaaerahsh#sehmahleysh#deraan
fahmh#eyteyngh#ehnehh#serehsahlh#sh#ahh#kermawsh#ereht
leyh#tuwthahmeynh#h#hheyhhehsh#gereyh#dawh#sh#wehtherh#serferehh#derehh#k
leyh#n
sh#werh#kehsh#erehteyh#gereyh#kh#ehfh#terh#aalh#ehnh#kehnh#dehsh#kahferh#ehnehh#bahh#leht
lh#ahfh#eraah#keyh#ehthehh#kahh#lehnehl
sh#pehh#chersh#erh#ahsaaertahfh#ahh#gaaerh#sahnehtahmh#peynh#teyngsh#h#ehn
meys
n
sh#ehh#k
sh#k
wehsehh#eht
l
sh#aaerahsh#nehshh#nahlh#ehsahh#jhehnggawh#pawahmh#mehsh#terh#berh#kehh#faash#ters
lehn
sh#keyh#p
sh#h#sh#meylehh#wahnmahh#chnthehseym
weythehh#mehsh#terh#kaaerh#kerh#yuwsh#tehf
lehshehsh#teythh#ehn
mehsh#terh#jhaanh#kaaleyerh#gehf
sehsehterh#ehh#chyeherf
l
s
lehh#pahnh#behh#

In [85]:
# Assuming 'ipa_dict' is your dictionary mapping ARPABET phoneme sequences to words
# For example: ipa_dict = {'HH AH L OW': 'hello', 'W ER L D': 'world', ...}

# Here's a list of ARPABET phoneme sequences to check
phoneme_sequences = [
    'h#mehsh#terh#k', 'wehlh#terehstheyehh#paas', 'lh#ahfthahmeht', #...
    # Add all your phoneme sequences here
]

word_phones_str = ' '.join(word_phones.split('#')).upper()
#print(phoneme_str)
print(word_phones_str)

#check dictionary
word = ipa_dict.get(phoneme_str, None)

if word != None:
    print('word is', word)
else:
    print('none')



# # Function to check if a sequence of phonemes forms a valid word
# def is_valid_phoneme_sequence(phoneme_sequence, ipa_dict):
#     # Normalize the sequence to match the dictionary keys
#     phoneme_str = ' '.join(phoneme_sequence.split('#')).upper()
    
#     # Look up the phoneme sequence in the dictionary
#     return ipa_dict.get(phoneme_str, None)

# # Check each phoneme sequence
# for seq in phoneme_sequences:
#     word = is_valid_phoneme_sequence(seq, ipa_dict)
#     if word:
#         print(f"Phoneme sequence '{seq}' corresponds to the word: {word}")
#     else:
#         print(f"Phoneme sequence '{seq}' does not correspond to a valid word.")


SH MEHNH 
none


In [12]:
#uses https://github.com/open-dict-data/ipa-dict.git
#ipa_dict_entries=file.text
file_path = './ipa-dict/data/en_US.txt'

with open(file_path, 'r') as file:
    ipa_dict_entries = file.read()


# Split the string into lines
lines = ipa_dict_entries.strip().split('\n')

# Initialize an empty dictionary to hold the word-phoneme mappings
ipa_dict = {}

phonemes_to_words={}

# Parse each line and add to the dictionary
for line in lines:
    # Split the line into word and phonemes, if phonemes are provided
    if '/' in line:
        parts = line.split('/', 1)  # Only split at the first '/'

        
        word = parts[0].strip().strip("'").lower() 

        
        phonemes = parts[1].strip()
        

        print('line', line, 'word', word, 'phonemes', phonemes)
        



        
        ipa_dict[word] = phonemes.strip().split(', ') 
        break
    else:
        word = line.strip().strip("'").lower()
        ipa_dict[word] = []





def is_valid_phoneme_sequence(phoneme_sequence, ipa_dict):
    # Join the sequence into a single string
    phoneme_str = ' '.join(phoneme_sequence)
    phoneme_str=phoneme_str.lower()
    
    # Search for the sequence in the dictionary values
    for word, phonemes_list in ipa_dict.items():
        if phoneme_str in phonemes_list:
            return True, word
    return False, None


# Example usage
word = "example"
phoneme_representation = ipa_dict[word]
print('phones are', phoneme_representation)


#phoneme_sequence = ['b', 'aʊ', 't']  # Phonemes for "bout"
is_valid, word = is_valid_phoneme_sequence(phoneme_representation, ipa_dict)
print(f"Is valid: {is_valid}, Word: {word}")


line 'bout	/ˈbaʊt/ word bout phonemes ˈbaʊt/


KeyError: 'example'

In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
 
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values

# retrieve logits
with torch.no_grad():
    logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
# => should give ['m ɪ s t ɚ k w ɪ l t ɚ ɹ ɪ z ð ɪ ɐ p ɑː s əl ʌ v ð ə m ɪ d əl k l æ s ᵻ z æ n d w iː ɑːɹ ɡ l æ d t ə w ɛ l k ə m h ɪ z ɡ ɑː s p əl']


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [None]:
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

In [4]:
print(transcription)

# word_phones_str = ' '.join(word_phones.split('#')).upper()
# #print(phoneme_str)
# print(word_phones_str)

# #check dictionary
# word = ipa_dict.get(phoneme_str, None)

['ɐ m æ n s ɛ d t ə ð ə j uː n ɪ v ɚ s s ɚ aɪ ɛ ɡ z ɪ s t']


In [7]:
ipa_dict

{'bout': ['ˈbaʊt/'],
 'cause': ['ˈkɑz/', '/ˈkɔz/'],
 'course': ['ˈkɔɹs/'],
 'cuse': ['ˈkjuz/'],
 'em': ['ˈɛm/'],
 'frisco': ['ˈfɹiskoʊ/'],
 'gain': ['ˈɡeɪn/'],
 'kay': ['ˈkeɪ/'],
 'm': ['ˈɛm/'],
 'n': ['ˈɛn/'],
 'round': ['ˈɹaʊnd/'],
 's': ['ˈɛs/'],
 'til': ['ˈtɪɫ/'],
 'tis': ['ˈtɪz/'],
 'twas': ['ˈtwəz/'],
 'a': ['ˈeɪ/', '/ə/'],
 'a.': ['ˈeɪ/'],
 "a.'s": ['ˈeɪz/'],
 'a.d.': ['ˌeɪˈdi/'],
 'a.m.': ['ˌeɪˈɛm/'],
 'a.s': ['ˈeɪz/'],
 "a's": ['ˈeɪz/'],
 'aaa': ['ˌtɹɪpəˈɫeɪ/'],
 'aaberg': ['ˈɑbɝɡ/'],
 'aachen': ['ˈɑkən/'],
 'aachener': ['ˈɑkənɝ/'],
 'aaker': ['ˈɑkɝ/'],
 'aaliyah': ['ˌɑˈɫiˌɑ/'],
 'aalseth': ['ˈɑɫsɛθ/'],
 'aamodt': ['ˈɑmət/'],
 'aancor': ['ˈɑnˌkɔɹ/'],
 'aardema': ['ɑɹˈdɛmə/'],
 'aardvark': ['ˈɑɹdˌvɑɹk/'],
 'aardvarks': ['ˈɑɹdˌvɑɹks/'],
 'aargh': ['ˈɑɹɡ/'],
 'aarhus': ['ˌɑˈhus/'],
 'aaron': ['ˈɛɹən/'],
 "aaron's": ['ˈɛɹənz/'],
 'aarons': ['ˈɛɹənz/'],
 'aaronson': ['ˈɑɹənsən/', '/ˈɛɹənsən/'],
 "aaronson's": ['ˈɑɹənsənz/', '/ˈɛɹənsənz/'],
 'aarti': ['ˈɑɹˌti/'],
 'aase': ['ˈɑs/'],
