In [7]:
from os import path
import io
import os

import json
import pandas as pd
from pydub import AudioSegment
import requests
from tqdm.auto import tqdm
from vosk import Model, KaldiRecognizer
from nltk.translate.bleu_score import sentence_bleu

# Чтение файлов

In [26]:
df_valid = pd.read_csv('../data/valid.csv')
df_train = pd.read_csv('../data/train.csv')

# Yandex Cloud STT

In [4]:
# brew install ffmpeg

import sys
sys.path.append('/path/to/ffmpeg')

In [55]:
with open('../secrets/yandex-traslate-apikey.txt', 'r') as file:
    apikey = file.read().replace('\n', '')
    
def yandex_stt(filename: str, lang:str = 'ru-RU', sampleRateHertz: int = 8000) -> str:

    fn = f'../data/speech/{filename}'
    if os.path.getsize(fn) < 500:
        return ''
    
    url = f"https://stt.api.cloud.yandex.net/speech/v1/stt:recognize?lang={lang}&format=lpcm&sampleRateHertz={sampleRateHertz}"

    sound = AudioSegment.from_mp3(fn)
    sound = sound.set_frame_rate(sampleRateHertz)
    buffer = io.BytesIO()
    sound.export(buffer, format='wav')
    files = {'file': buffer.read()[:1_000_000]}

    headers = {
        'Authorization': f'Api-Key {apikey}',
    }
    response = requests.request("POST", url, headers=headers, files=files)
    return response.json()['result']


In [59]:
tqdm.pandas()

def apply_fn(speech_fn):
    return yandex_stt(speech_fn)

df_valid['yandex_tts'] = df_valid['speech_fn'].progress_apply(apply_fn)
df_valid.to_csv('../data/yandex_stt_valid.csv')

100%|██████████| 200/200 [02:50<00:00,  1.17it/s]


In [73]:
df_valid.apply(lambda row: sentence_bleu([row['text_ru']],row['yandex_tts']), axis=1).mean()


0.6407131801825868

# VOSK

модели можно скачать тут (содержимое распаковать в соотвествующую папку)
https://alphacephei.com/vosk/models


In [19]:
def vosk_stt(model, filename: str) -> str:

    fn = f'../data/speech/{filename}'
    if os.path.getsize(fn) < 500:
        return ''
    

    sound = AudioSegment.from_mp3(fn)
    rec.AcceptWaveform(sound.raw_data)
    result = rec.Result()
    text = json.loads(result)["text"]

    return text

Small russian vosk

In [20]:
tqdm.pandas()

model = Model("../vosk-model-small-ru-0.22/")
rec = KaldiRecognizer(model, 48000)
rec.SetWords(True)

def apply_fn(speech_fn):
    return vosk_stt(model, speech_fn)

df_valid['vosksmall_tts'] = df_valid['speech_fn'].progress_apply(apply_fn)
df_valid.to_csv('../data/vosksmall_stt_valid.csv')

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from ../vosk-model-small-ru-0.22//ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from ../vosk-model-small-ru-0.22//graph/HCLr.fst ../vosk-model-small-ru-0.22//graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo ../vosk-model-small-ru-0.22//graph/phones/word_boundary.int
100%|██████████| 200/200 [01:06<00:00,  3.02it/s]


In [22]:
df_valid.apply(lambda row: sentence_bleu([row['text_ru']],row['vosksmall_tts']), axis=1).mean()


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.5767803833812252

Large russian vosk

In [23]:
tqdm.pandas()

model = Model("../vosk-model-ru-0.42/")
rec = KaldiRecognizer(model, 48000)
rec.SetWords(True)

def apply_fn(speech_fn):
    return vosk_stt(model, speech_fn)

df_valid['vosk_tts'] = df_valid['speech_fn'].progress_apply(apply_fn)
df_valid.to_csv('../data/vosks_stt_valid.csv')

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 1 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 2 orphan components.
LOG (VoskAPI:Collapse():nnet-utils.cc:1488) Added 1 components, removed 2
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from ../vosk-model-ru-0.42//ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from ../vosk-model-ru-0.42//graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from ../vosk-model-ru-0.42//graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo ../vosk-model-ru-0.42//graph/phones/word

In [24]:
df_valid.apply(lambda row: sentence_bleu([row['text_ru']],row['vosk_tts']), axis=1).mean()


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.49000397511625976

Сделаем файл текста для трейна (на всякий случай)

In [27]:
tqdm.pandas()

model = Model("../vosk-model-small-ru-0.22/")
rec = KaldiRecognizer(model, 48000)
rec.SetWords(True)

def apply_fn(speech_fn):
    return vosk_stt(model, speech_fn)

df_train['vosksmall_tts'] = df_train['speech_fn'].progress_apply(apply_fn)
df_train.to_csv('../data/vosksmall_stt_train.csv')

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from ../vosk-model-small-ru-0.22//ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from ../vosk-model-small-ru-0.22//graph/HCLr.fst ../vosk-model-small-ru-0.22//graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo ../vosk-model-small-ru-0.22//graph/phones/word_boundary.int
100%|██████████| 600/600 [03:39<00:00,  2.73it/s]


# Выводы
- 0.64 - Yandex SpeechKit (online) - базовое значение очень хорошей модели
- **0.57** - Vosk RU Small (offline) - хороший результат
- 0.49 - Vosk RU Large (offline) - больше не значит лучше, хотя может быть вызвано грязными данными