In [10]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import speech_recognition as sr
import io
from pydub import AudioSegment
# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
    
r = sr.Recognizer()

with sr.Microphone(sample_rate = 16000) as source:
    print('Start Speaking now...')
    while True:
        audio = r.listen(source) #pyaudio object
        data = io.BytesIO(audio.get_wav_data()) #list of bytes
        clip = AudioSegment.from_file(data) #numpy array
        x = torch.FloatTensor(clip.get_array_of_samples()) #Tensor
        
        inputs = processor(x , samping_rate = 16000 ,return_tensors="pt", padding = 'longest').input_values
        logits = model(inputs).logits
        tokens = torch.argmax(logits, axis = 1) #get the 
        text   = processor.batch_decode(tokens) #tokens into a string

        print('You said',str(text).lower())

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start Speaking now...




FileNotFoundError: [WinError 2] The system cannot find the file specified

## Usage

In [8]:
# !pip install transformers
# !pip install datasets
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# load pretrained model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")


librispeech_samples_ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# load audio
audio_input, sample_rate = sf.read(librispeech_samples_ds[0]["file"])

# pad input values and return pt tensor
input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values

# INFERENCE

# retrieve logits & take argmax
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)

# transcribe
transcription = processor.decode(predicted_ids[0])

# FINE-TUNE

target_transcription = "A MAN SAID TO THE UNIVERSE I EXIST"

# encode labels
with processor.as_target_processor():
  labels = processor(target_transcription, return_tensors="pt").input_ids

# compute loss by passing labels
loss = model(input_values, labels=labels).loss
loss.backward()