In [1]:
import pandas as pd
import numpy as np 
import json

import warnings as w 
w.filterwarnings('ignore')

In [2]:
data = pd.read_json('data/manifest.jsonl', lines=True)

data

Unnamed: 0,id,audio_filepath,text,duration
0,e632f7d39c15e7edfc665b91e6f2071f,files/e632f7d39c15e7edfc665b91e6f2071f.wav,афина воспроизведи музыку вперемешку,4.900000
1,5db5df8bb9e3b6660b2a04b34d4a355d,files/5db5df8bb9e3b6660b2a04b34d4a355d.wav,найти сериал григорий р,3.652000
2,2c471aedc6979109f28cd53c58f8c4fb,files/2c471aedc6979109f28cd53c58f8c4fb.wav,прямой эфир апл манчестер юнайтед тоттенхэм,4.341750
3,756a137ee9debde4a008adc4a4121dc7,files/756a137ee9debde4a008adc4a4121dc7.wav,ильвиром ивановичем ворончихиным,3.900000
4,1ee3b00170123a6723a40e129b2f6bce,files/1ee3b00170123a6723a40e129b2f6bce.wav,можешь показать киношку исходный код,3.320000
...,...,...,...,...
9989,73bb9272fe3724eb6212e33186f63152,files/73bb9272fe3724eb6212e33186f63152.wav,рудню,2.260000
9990,c872c6f50709a2696b0927725dc86ded,files/c872c6f50709a2696b0927725dc86ded.wav,найти мульт щенячий патруль,3.020000
9991,03b92263995462b2e09655a8e29d4d0b,files/03b92263995462b2e09655a8e29d4d0b.wav,стол из эпоксидной смолы,3.598688
9992,09ac1f9718c6da18380b4a01d92d2561,files/09ac1f9718c6da18380b4a01d92d2561.wav,шесть четыреста семьдесят шесть четыреста двад...,11.795312


In [3]:
i = np.random.randint(0, data.shape[0])

audio_id = data['id'].iloc[i]
audio_path = 'data/' + data['audio_filepath'].iloc[i]
audio_text = data['text'].iloc[i]

def compare_and_dump(result, real, audio_id):
    result_str = str(result).lower()
    real_str = str(real).lower()
    similarity = result_str == real_str
    
    print(f'Recognized is "{result_str}",',  
          f'\nwhile real is "{real_str}".',  
          f'\nSimilarity is {similarity}')
    
    result_path = f'data/texts/{audio_id}_text.txt'
    
    with open(result_path, 'w') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    print('\nResult has been saved into '+result_path)

Speech Recognition (google)

In [4]:
import speech_recognition as sr 

In [5]:
recog = sr.Recognizer()

with sr.AudioFile(audio_path) as audio:
    recog.adjust_for_ambient_noise(audio)
    rec = recog.record(audio)
    
result = recog.recognize_google(audio_data=rec, language='ru-RU') 

audio_id_google = audio_id + '_google'
compare_and_dump(result, audio_text, audio_id_google)

Recognized is "катастрофа", 
while real is "включи катастрофу". 
Similarity is False

Result has been saved into data/texts/f4dd5758b36ea972c330ccb806385140_google_text.txt


In [7]:
import os
from pocketsphinx import LiveSpeech, get_model_path

model_path = get_model_path()

speech = LiveSpeech(
    verbose=True,
    sampling_rate=16000,
    buffer_size=2048,
    no_search=False,
    full_utt=False,
    hmm=os.path.join(model_path, 'zero_ru.cd_cont_4000'),
    lm=os.path.join(model_path, 'ru.lm'),
    dic=os.path.join(model_path, 'ru.dic')
)

print("Say something!")

for phrase in speech:
    print(phrase)

RuntimeError: Failed to initialize PocketSphinx

Vosk

In [None]:
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment 

In [None]:
model = Model("vosk_model")
recog = KaldiRecognizer(model, 16000)
recog.SetWords(True)

# Используя библиотеку pydub делаем предобработку аудио
wav = (AudioSegment.from_wav(audio_path)
       .set_channels(1)
       .set_frame_rate(16000))

# Преобразуем вывод в json
recog.AcceptWaveform(wav.raw_data)
result = recog.Result()
text = json.loads(result)["text"]

audio_id_vosk = audio_id + '_vosk'
compare_and_dump(text, audio_text, audio_id_vosk)

Recognized is "фильм бал", 
while real is "фильм башня из слоновой кости". 
Similarity is False

Result has been saved into data/texts/5cd3e9faa0a4f25593f522565ec79083_vosk_text.txt
