In [None]:
import os

import soundfile as sf
import speech_recognition as sr
import torch
from datasets import load_dataset
from transformers import Wav2Vec2Processor, HubertForCTC, Wav2Vec2ForCTC

In [None]:
# --- One-off recording experiment: capture a phrase and save it as a WAV file ---

# Where the recording is written (the directory string ends with a separator,
# so plain concatenation below yields a valid path).
pathSave = 'C:\\Users\\chushengtan\\Desktop\\'
filename = 'audio_file_test.wav'

# Values forwarded to Recognizer.listen() as `timeout` and `phrase_time_limit`.
timeout = 0.5
waiting_time = 10

r = sr.Recognizer()

# Microphone #1 is opened explicitly here, sampled at 16 kHz.
with sr.Microphone(device_index=1, sample_rate=16000) as mic:
    r.adjust_for_ambient_noise(mic)
    print('請開始說話.....')  # prompt: "please start speaking"
    audio = r.listen(mic, timeout=timeout, phrase_time_limit=waiting_time)
    print('錄音結束.....')  # prompt: "recording finished"

# Dump the captured audio to disk as WAV bytes.
with open(pathSave + filename, 'wb') as wav_file:
    wav_file.write(audio.get_wav_data())

# Finished version: the next cell wraps the recording step above into a reusable function.

In [None]:
def speech2Wave(pathSave,filename,sample_rate = 16000,timeout = 0.5,waiting_time = 10):
    """
    Record one phrase from the microphone and save it as a WAV file.

    Parameters
    ----------
    pathSave : str
        Directory the recording is written to. A trailing path separator is
        optional: the directory and file name are combined with
        ``os.path.join`` rather than plain string concatenation.
    filename : str
        Name of the WAV file to create inside ``pathSave``; an existing file
        with the same name is overwritten.
    sample_rate : int
        Sample rate handed to ``sr.Microphone``.
    timeout : float
        Forwarded to ``Recognizer.listen`` as ``timeout``. Per the
        SpeechRecognition docs this is how long to wait for speech to start;
        ``sr.WaitTimeoutError`` is raised if nothing is heard in time.
    waiting_time : float
        Forwarded to ``Recognizer.listen`` as ``phrase_time_limit``, i.e. the
        maximum length of the recorded phrase.

    Notes
    -----
    No ``device_index`` is given, so the library chooses the input device.
    ``sr.Microphone.list_microphone_names()`` lists the available devices.

    ref : https://realpython.com/python-speech-recognition/#working-with-microphones
    ref : https://github.com/Uberi/speech_recognition
    ref : https://github.com/pytorch/fairseq/tree/main/examples/hubert
    """
    r = sr.Recognizer()
    with sr.Microphone(sample_rate = sample_rate) as source:
        # Calibrate the energy threshold against background noise first.
        r.adjust_for_ambient_noise(source)
        print('請開始說話.....')  # prompt: "please start speaking"
        audio = r.listen(source,
                         timeout = timeout,
                         phrase_time_limit = waiting_time)
        print('錄音結束.....')  # prompt: "recording finished"

    # os.path.join inserts the separator only when it is missing, so both
    # 'C:\\dir\\' and 'C:\\dir' end up writing to 'C:\\dir\\<filename>'.
    wav_path = os.path.join(pathSave, filename)
    with open(wav_path,'wb') as file:
        file.write(audio.get_wav_data())

        
"""
------------------- loading model ------------------- 
"""        

# Already-loaded (processor, model) pairs, keyed by the names they were requested with.
# Re-running this cell resets the cache.
_LOADED_MODELS = {}

def Load_Model(processor_name,model_name):
    """
    Load (or reuse) the processor and the HuBERT CTC model.

    The first call for a given ``(processor_name, model_name)`` pair
    instantiates both objects with ``from_pretrained``; later calls with the
    same pair return the very same objects instead of loading the checkpoint
    again. The loaded pair therefore stays in memory until this cell is
    re-run or the kernel is restarted.

    Parameters
    ----------
    processor_name : str
        Identifier passed to ``Wav2Vec2Processor.from_pretrained``.
    model_name : str
        Identifier passed to ``HubertForCTC.from_pretrained``.

    Returns
    -------
    tuple
        ``(processor, model)``.
    """
    cache_key = (processor_name, model_name)
    if cache_key not in _LOADED_MODELS:
        processor = Wav2Vec2Processor.from_pretrained(processor_name)
        model = HubertForCTC.from_pretrained(model_name)
        _LOADED_MODELS[cache_key] = (processor, model)
    return _LOADED_MODELS[cache_key]

def Speech2Text(audio_path,processor_name,model_name):
    """
    Transcribe a speech recording to text with a HuBERT CTC model.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to transcribe (``.wav``), read with
        ``soundfile.read``.
    processor_name : str
        Processor identifier, forwarded to ``Load_Model``.
    model_name : str
        Model identifier, forwarded to ``Load_Model``.

    Returns
    -------
    The value of ``processor.decode`` for the first (only) sequence in the
    batch, i.e. the transcription.

    Raises
    ------
    ValueError
        Raised by the Hugging Face feature extractor when the file's sample
        rate differs from the rate the processor was configured for; such
        audio used to be transcribed silently and incorrectly.
    """
    processor , model = Load_Model(processor_name,model_name)

    speech, sample_rate = sf.read(audio_path)
    # soundfile returns (frames, channels) for multi-channel files; the model
    # takes a single waveform, so average the channels down to mono.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Telling the processor the real sample rate lets it reject mismatched audio.
    input_values = processor(speech,
                             sampling_rate=sample_rate,
                             return_tensors='pt',
                             padding='longest').input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token at every frame.
    predicted_ids = torch.argmax(logits,dim=-1)
    return processor.decode(predicted_ids[0])

In [None]:
%%time
pathSave = 'C:\\Users\\chushengtan\\Desktop\\'
filename = 'audio_file_test.wav'
speech2Wave(pathSave=pathSave,filename=filename)

In [None]:
%%time
processor_name = 'facebook/hubert-xlarge-ls960-ft'
model_name = 'facebook/hubert-xlarge-ls960-ft'
audio_path = 'C:\\Users\\chushengtan\\Desktop\\audio_file_test.wav'
Speech2Text(audio_path = audio_path,
            processor_name = processor_name,
            model_name = model_name)