In [6]:
!pip install vosk



In [28]:
from vosk import Model, KaldiRecognizer, SpkModel
import IPython.display as ipd
import numpy as np
import wave
import json
import sys
import re
import os


def find_person(array):
    database = np.array(list(users.values()))
    distances = []
    
    for i in database:
        distances.append(cosine_dist(array, i))
    min_dist = min(distances)
    
    if min_dist > 0.12:
        return 'UNKNOWN PERSON'
    
    key = distances.index(min_dist)
    return list(users.keys())[key]


def punctuation(text):
    text = re.sub(' –∞ ', ', –∞ ', text)
    text = re.sub(' –Ω–æ ', ', –Ω–æ ', text)
    text = re.sub(' —á—Ç–æ ', ', —á—Ç–æ ', text)
    text = re.sub(' —á—Ç–æ–±—ã ', ', —á—Ç–æ–±—ã ', text)
    text = re.sub(' –Ω–∞–ø—Ä–∏–º–µ—Ä ', ', –Ω–∞–ø—Ä–∏–º–µ—Ä,  ', text)
    text = re.sub(' –¥–∞ ', ', –¥–∞  ', text)
    text = re.sub(' –∫–∞–∫ ', ', –∫–∞–∫  ', text)
    text = re.sub(' –∑–∞—Ç–æ ', ', –∑–∞—Ç–æ  ', text)
    text = re.sub(' –∫—Ä–æ–º–µ —Ç–æ–≥–æ ', ', –∫—Ä–æ–º–µ —Ç–æ–≥–æ,  ', text)
    text = re.sub(' –∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ ', ' –∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ,  ', text)
    return text.capitalize() + '.'


def cosine_dist(x, y):
    nx = np.array(x)
    ny = np.array(y)
    return 1 - np.dot(nx, ny) / np.linalg.norm(nx) / np.linalg.norm(ny)



def init_weights(persons):
    """ –§—É–Ω–∫—Ü–∏—è –ø—Ä–∏—Å–≤–∞–∏–≤–∞–µ—Ç –∫–∞–∂–¥–æ–º—É —á–µ–ª–æ–≤–µ–∫—É –≤–µ–∫—Ç–æ—Ä –Ω—É–ª–µ–π"""
    users = {}
    for i in persons:
        users.update({i: np.zeros(128)})
    return users

persons = ['UNKNOWN PERSON', 'timur', 'danya', 'anton']
users = init_weights(persons)

# –º–æ–¥–µ–ª—åüôà, –¥–µ–ª–∞—é—â–∞—è —Ä–∞–∑–±–∏–µ–Ω–∏–µ —Ä–µ—á–∏ –Ω–∞ —Ç–µ–∫—Å—Ç
model_path = "vosk-model-ru-0.10"
# –º–æ–¥–µ–ª—å üôâ, –¥–µ–ª–∞—é—â–∞—è —Ä–∞–∑–±–∏–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞ –Ω–∞ –¥–∏–∞–ª–æ–≥–∏
spk_model_path = "vosk-model-spk-0.3"
# –ø—É—Ç—å –∫ –≤–æ–π—Å—É
PATH = 'sample/ffmpeg_output_3.wav'


wf = wave.open(PATH, "rb")
model = Model(model_path)
spk_model = SpkModel(spk_model_path)
rec = KaldiRecognizer(model, spk_model, wf.getframerate())


while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print ("{}: ".format(find_person(res['spk'])), punctuation(res['text']))


res = json.loads(rec.FinalResult())
print ("{}: ".format(find_person(res['spk'])), res['text'])




UNKNOWN PERSON:  –Ø –±—É–¥—É —É–±–æ—Ä—â–∏—Ü–µ–π.
UNKNOWN PERSON:  –ö–æ—Ä–æ—á–µ —è –±—É–¥—É –æ—Ç—Å—á–∏—Ç—ã–≤–∞—Ç—å —É–ø–æ—Ä —è —É–±–æ—Ä—â–∏—Ü–∞.
UNKNOWN PERSON:  –ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ –∫–ª–∞–≤–∞ –º–∏—Ö–∞–π–ª–æ–≤–Ω–∞ —è –≤–∞—Å –ø—Ä–∏–≥–ª–∞—Å–∏–ª, —á—Ç–æ–±—ã –ø–æ–≥–æ–≤–æ—Ä–∏—Ç—å —Å –≤–∞–º–∏ –Ω–∞ —Ç–µ–º—É —Ç–æ–≥–æ, –∫–∞–∫  –ø–ª–æ—Ö–æ –≤—ã –ø–æ–º—ã–ª–∏ –ø–æ–ª—ã –≤ —Ç—É–∞–ª–µ—Ç–µ –æ–±—ä—è—Å–Ω–∏—Ç–µ –ø–æ–∂–∞–ª—É–π—Å—Ç–∞ –ø–æ—á–µ–º—É –≤—ã –Ω–µ –≤—ã—Ç–µ—Ä–ª–∏.
UNKNOWN PERSON:  –ê —è, —á—Ç–æ —è –Ω–∏—á—Ç–æ —è —Å–∏–¥–µ–ª–∞ —Å–µ–±–µ —Å–ø–æ–∫–æ–π–Ω–æ —Å–º–æ—Ç—Ä–µ–ª–∞ —Å–≤–æ–∏ —Å–µ—Ä–∏–∞–ª—ã.
UNKNOWN PERSON:  –í–æ—Ç –Ω—É –≤ —Å–º—ã—Å–ª–µ –Ω–µ —Å–µ—Ä–∏–∞–ª—ã —è —Å–º–æ—Ç—Ä–µ–ª–∞ –≤ –≤–æ–¥—É –≤ —É–Ω–∏—Ç–∞–∑–µ –∏ –≤—Å–µ –±—ã–ª–æ —Ö–æ—Ä–æ—à–æ –≥–∞–¥–∞—Ç—å –Ω–∞ –∫–æ—Ñ–µ–π–Ω–æ–π –≥—É—â–µ.
UNKNOWN PERSON:  –ù—É —Ç–∞–º –¥—Ä—É–≥–∞—è –≥—É—â–µ.
UNKNOWN PERSON:  –¢–∞–∫ –ª–∞–¥–Ω–æ —è –¥—É–º–∞—é —ç—Ç–æ –ø–æ–π—Ç–∏.
UNKNOWN PERSON:  


In [29]:
# —Å–∞–º –¥–∏–∞–ª–æ–≥
ipd.Audio(PATH)

### –ö–∞–∫ –≤–∏–¥–∏–º, –Ω–∏ –æ–¥–∏–Ω –∏–∑ —É—á–∞—Å—Ç–Ω–∏–∫–æ–≤ –¥–∏–∞–ª–æ–≥–∞ –Ω–µ —Ä–∞—Å–ø–æ–∑–Ω–∞–Ω. üôâ

–û–±—É—á–∏–º –Ω–∞ –æ—Ç–¥–µ–ª—å–Ω–æ –≤–∑—è—Ç—ã—Ö –∑–∞–ø–∏—Å—è—Ö:

In [25]:
def train_on_records(users):
    matrices = []
    for i in list(users.keys()):
        print(f'\n–û–±—É—á–µ–Ω–∏–µ –Ω–∞ –≥–æ–ª–æ—Å–µ {i}')
        print('='*10)
        print()
        PATH = 'sample/'+i+'-2.wav'
        wf = wave.open(PATH, "rb")
        model = Model(model_path)
        spk_model = SpkModel(spk_model_path)
        rec = KaldiRecognizer(model, spk_model, wf.getframerate())
        
        matrices = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                res = json.loads(rec.Result())
                
                print ("{}: ".format(find_person(res['spk'])), punctuation(res['text']))
                matrices.append(res['spk'])

        res = json.loads(rec.FinalResult())

        print ("{}: ".format(find_person(res['spk'])), res['text'])
        matrices = np.array(matrices)
        users[i] = matrices.mean(axis=0)

persons = ['timur', 'danya', 'anton']
users = init_weights(persons)
train_on_records(users)


–û–±—É—á–µ–Ω–∏–µ –Ω–∞ –≥–æ–ª–æ—Å–µ timur





timur:  –¢—ã.
timur:  –î–µ–º–æ–∫—Ä–∞—Ç–∏—á–µ—Å–∫–∞—è —Ä–æ—Å—Å–∏—è –≤–æ –≥–ª–∞–≤–µ —Å –Ω–µ –º–µ–Ω–µ–µ –¥–µ–º–æ–∫—Ä–∞—Ç–∏—á–µ—Å–∫–∏–º –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–º –∏ –µ–≥–æ –≤–µ—Ç–∫–∞–º–∏ —É–º–µ–ª–æ –∏—Å–ø–æ–ª—å–∑—É—é—Ç –≤–æ –≤–Ω—É—Ç—Ä–µ–Ω–Ω–µ–π –∏ –≤–Ω–µ—à–Ω–µ–π –ø–æ–ª–∏—Ç–∏–∫–∏ —Å—Ç—Ä–∞–Ω—ã –ø–æ–ª–∏—Ç–∏–∫—É —Ç–∞–π–Ω–æ–π –¥–∏–ø–ª–æ–º–∞—Ç–∏–∏ –ø–æ–ª–∏—Ç–∏–∫–∞.
timur:  –ö–æ—Ç–æ—Ä—ã–π –Ω–∞—á–∞–ª –ø—Ä–∏ –≤–∏–¥–µ –∫—Ä—é—á–æ–∫ –µ—â—ë –≤–æ –≤—Ä–µ–º–µ–Ω–∞ –ø–µ—Ä–µ—Å—Ç—Ä–æ–π–∫–∏ –ø—Ä–∏—Å—Ç—Ä–æ–π–∫–∞ –ø—Ä–µ–ø–æ–¥–Ω–æ—Å–∏–ª–∞—Å—å —Å–æ–≤–µ—Ç—Å–∫–æ–≥–æ –Ω–∞—Ä–æ–¥–∞, –∫–∞–∫  –∑–∞–ø—É—Ç–∞–Ω–æ –Ω–µ–º –¥–æ–±—ã—Ç–æ–≥–æ –ø—Ä–æ—Å—Ç–æ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–∏–µ–º –±–æ–ª—å—à–∏—Ö —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏—Ö –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç–µ–π, –∫–∞–∫  —Ä–∞—Å—à–∏—Ä–µ–Ω–∏–µ –¥–µ–º–æ–∫—Ä–∞—Ç–∏—á–µ—Å–∫–∏—Ö –ø—Ä–∞–≤ –∏ —Å–≤–æ–±–æ–¥ –≤—Å—è –≥–æ—Ä–±–∞—á–µ–≤—Å–∫–∞—è —Ñ—Ä–∞–∑–µ–æ–ª–æ–≥–∏–∏ —Ç–∞–∫–∞—è, –∫–∞–∫  —É–≥–ª—É–±–∏—Ç—å —Ä–∞—Å—à–∏—Ä–∏—Ç—å –≥–ª–∞–∑ —Ç–æ –µ—Å—Ç—å –Ω–æ–≤—ã–µ –º—ã—à–∏.
timur:  –ú—ã—à–ª–µ–Ω–∏–µ –µ–≥–æ –≥–∞–ª–∫–∞ –≥–

In [None]:
# —Å–∞–º –¥–∏–∞–ª–æ–≥
ipd.Audio(PATH)

### –ü–æ—Å–ª–µ –æ–±—É—á–µ–Ω–∏—è, –∑–∞–ø—É—Å—Ç–∏–º –µ—â–µ —Ä–∞–∑ –∞–ª–≥–æ—Ä–∏—Ç–º —Ä–∞—Å–ø–æ–∑–Ω–∞–≤–∞–Ω–∏—è –ø–æ –≥–æ–ª–æ—Å—É

In [26]:
# –º–æ–¥–µ–ª—åüôà, –¥–µ–ª–∞—é—â–∞—è —Ä–∞–∑–±–∏–µ–Ω–∏–µ —Ä–µ—á–∏ –Ω–∞ —Ç–µ–∫—Å—Ç
model_path = "vosk-model-ru-0.10"
# –º–æ–¥–µ–ª—å üôâ, –¥–µ–ª–∞—é—â–∞—è —Ä–∞–∑–±–∏–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞ –Ω–∞ –¥–∏–∞–ª–æ–≥–∏
spk_model_path = "vosk-model-spk-0.3"
# –ø—É—Ç—å –∫ –≤–æ–π—Å—É
PATH = 'sample/ffmpeg_output_3.wav'


wf = wave.open(PATH, "rb")
model = Model(model_path)
spk_model = SpkModel(spk_model_path)
rec = KaldiRecognizer(model, spk_model, wf.getframerate())


while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        res = json.loads(rec.Result())
        print ("{}: ".format(find_person(res['spk'])), punctuation(res['text']))


timur:  –Ø –±—É–¥—É —É–±–æ—Ä—â–∏—Ü–µ–π.
danya:  –ö–æ—Ä–æ—á–µ —è –±—É–¥—É –æ—Ç—Å—á–∏—Ç—ã–≤–∞—Ç—å —É–ø–æ—Ä —è —É–±–æ—Ä—â–∏—Ü–∞.
danya:  –ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ –∫–ª–∞–≤–∞ –º–∏—Ö–∞–π–ª–æ–≤–Ω–∞ —è –≤–∞—Å –ø—Ä–∏–≥–ª–∞—Å–∏–ª, —á—Ç–æ–±—ã –ø–æ–≥–æ–≤–æ—Ä–∏—Ç—å —Å –≤–∞–º–∏ –Ω–∞ —Ç–µ–º—É —Ç–æ–≥–æ, –∫–∞–∫  –ø–ª–æ—Ö–æ –≤—ã –ø–æ–º—ã–ª–∏ –ø–æ–ª—ã –≤ —Ç—É–∞–ª–µ—Ç–µ –æ–±—ä—è—Å–Ω–∏—Ç–µ –ø–æ–∂–∞–ª—É–π—Å—Ç–∞ –ø–æ—á–µ–º—É –≤—ã –Ω–µ –≤—ã—Ç–µ—Ä–ª–∏.
timur:  –ê —è, —á—Ç–æ —è –Ω–∏—á—Ç–æ —è —Å–∏–¥–µ–ª–∞ —Å–µ–±–µ —Å–ø–æ–∫–æ–π–Ω–æ —Å–º–æ—Ç—Ä–µ–ª–∞ —Å–≤–æ–∏ —Å–µ—Ä–∏–∞–ª—ã.
timur:  –í–æ—Ç –Ω—É –≤ —Å–º—ã—Å–ª–µ –Ω–µ —Å–µ—Ä–∏–∞–ª—ã —è —Å–º–æ—Ç—Ä–µ–ª–∞ –≤ –≤–æ–¥—É –≤ —É–Ω–∏—Ç–∞–∑–µ –∏ –≤—Å–µ –±—ã–ª–æ —Ö–æ—Ä–æ—à–æ –≥–∞–¥–∞–Ω–∏–µ –Ω–∞ –∫–æ—Ñ–µ–π–Ω–æ–π –≥—É—â–µ.
timur:  –ù—É —Ç–∞–º –¥—Ä—É–≥–∞—è –≥—É—â–µ.
UNKNOWN PERSON:  –¢–∞–∫ –ª–∞–¥–Ω–æ —è –¥—É–º–∞—é —ç—Ç–æ –ø–æ–π—Ç–∏.


In [None]:
ipd.Audio(PATH)

### –í—ã–≤–æ–¥—ã: