In [1]:
import os
import re
import sys
import shutil
import librosa
import pandas as pd

from tqdm import tqdm
from collections import Counter
from string import ascii_lowercase

In [2]:
sys.path.append("..")

In [3]:
from utils import transform_audio_file, recursive_search, create_dir, delete_dir

In [4]:
# Resolve the data directories relative to the project root, which is taken to be
# the parent of the current working directory.
prj_path = os.path.dirname(os.getcwd())
data_path = os.path.join(prj_path, 'data')
voxforge_data_path, librivox_data_path = (
    os.path.join(data_path, corpus_name) for corpus_name in ('voxforge', 'librivox')
)

## Prepare data

### Audio data

In [5]:
# Index every LibriVox audio file, then attach its duration (as reported by librosa)
# and its on-disk size in bytes.
audio_librivox = recursive_search(os.path.join(librivox_data_path, 'audio'))
librivox_paths = audio_librivox['file_path']
audio_librivox['duration'] = librivox_paths.map(lambda path: librosa.get_duration(filename=path))
audio_librivox['file_size'] = librivox_paths.map(os.path.getsize)

In [6]:
print(audio_librivox.shape)
print(f"Total duration: {audio_librivox['duration'].sum()/60/60:.2f} hours")
print(f"Total size: {audio_librivox['file_size'].sum()/1024/1024:.2f} mb")
audio_librivox.sample(5)

(34193, 4)
Total duration: 83.23 hours
Total size: 4573.50 mb


Unnamed: 0,file_name,file_path,duration,file_size
18515,sumska__mykola_djerya_s000815.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,13.609875,217802
25017,miskun__15YO_Capitan_s004171.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,5.39575,86376
3062,sumska__kaydasheva_s001183.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,6.809875,109002
15827,obruchov__fata_morgana_s000213.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,4.429875,70922
17121,shepel__zvirobij_s007204.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,5.380875,86138


In [7]:
# Index every VoxForge audio file, then attach its duration (as reported by librosa)
# and its on-disk size in bytes.
audio_voxforge = recursive_search(os.path.join(voxforge_data_path, 'audio'))
voxforge_paths = audio_voxforge['file_path']
audio_voxforge['duration'] = voxforge_paths.map(lambda path: librosa.get_duration(filename=path))
audio_voxforge['file_size'] = voxforge_paths.map(os.path.getsize)

In [8]:
print(audio_voxforge.shape)
print(f"Total duration: {audio_voxforge['duration'].sum()/60/60:.2f} hours")
print(f"Total size: {audio_voxforge['file_size'].sum()/1024/1024:.2f} mb")
audio_voxforge.sample(5)

(390, 4)
Total duration: 1.00 hours
Total size: 55.10 mb


Unnamed: 0,file_name,file_path,duration,file_size
140,anonymous-20131222-ats__uk_0021.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,7.375,118044
289,Vasyl-20160403-phd__uk_0030.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,6.75,108044
93,anonymous-20131130-ybm__uk_0012.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,7.75,124044
249,anonymous-20131220-whf__uk_0040.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,13.5,216044
236,anonymous-20131222-mtm__uk_0036.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,13.25,212044


### gender

In [9]:
# Stack both corpora; the speaker id is the part of the file name that precedes
# the first double underscore.
audio_df = pd.concat([audio_librivox, audio_voxforge])
audio_df['speaker_id'] = audio_df['file_name'].map(lambda name: name.partition("__")[0])

In [10]:
# Every speaker defaults to 'm'; the speakers listed below are overridden with 'f'.
female_speakers = ['Anna-20160402-kxf', 'sumska', 'Darrr-20170412-jau']

speaker_gender = dict.fromkeys(audio_df['speaker_id'].unique(), 'm')
speaker_gender.update(dict.fromkeys(female_speakers, 'f'))

audio_df['gender'] = audio_df['speaker_id'].map(speaker_gender)

In [11]:
print(audio_df.shape)
audio_df.sample(5)

(34583, 6)


Unnamed: 0,file_name,file_path,duration,file_size,speaker_id,gender
5316,pysariev__transformations_s000084.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,13.239875,211882,pysariev,m
11311,pysariev__transformations_s000335.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,4.839875,77482,pysariev,m
13198,obruchov__dorogoyu_tsinoyu_s000210.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,10.559875,169002,obruchov,m
18676,obruchov__dorogoyu_tsinoyu_s000649.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,5.249875,84042,obruchov,m
4248,obruchov__voly_14_f000304.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,9.389875,150282,obruchov,m


### text

In [12]:
# VoxForge prompts: each line is split at the first space into two fields
# (file name, transcript); a constant third column records the source corpus.
# The transcripts are Cyrillic, so the encoding is pinned to UTF-8 rather than
# left to the platform default.
with open(os.path.join(voxforge_data_path, 'prompts.txt'), encoding='utf-8') as file_:
    lst = [item.split(" ", 1) for item in file_]
text_voxforge = pd.DataFrame(lst)
text_voxforge[2] = 'voxforge'

In [13]:
# LibriVox prompts: each line is split at the first space into two fields
# (file name, transcript); a constant third column records the source corpus.
# The transcripts are Cyrillic, so the encoding is pinned to UTF-8 rather than
# left to the platform default.
with open(os.path.join(librivox_data_path, 'prompts.txt'), encoding='utf-8') as file_:
    lst = [item.split(" ", 1) for item in file_]
text_librivox = pd.DataFrame(lst)
text_librivox[2] = 'librivox'

In [14]:
# Combine both prompt tables and give the positional columns 0/1/2 descriptive names.
text_df = pd.concat([text_librivox, text_voxforge])
text_df = text_df.rename(columns=dict(enumerate(['file_name', 'transcript', 'source'])))

In [15]:
print(text_df.shape)
text_df.head()

(34579, 3)


Unnamed: 0,file_name,transcript,source
0,obruchov__tini_zabutyh_predkiv_s000001.wav,"Я ще заграю до танцю,- бадьорив він чугайстра ...",librivox
1,obruchov__tini_zabutyh_predkiv_s000002.wav,"Вони, здається, гойдалися з нею ще у колисці, ...",librivox
2,obruchov__tini_zabutyh_predkiv_s000003.wav,"І вона співанками косичила їх розлучення, Їй б...",librivox
3,obruchov__tini_zabutyh_predkiv_s000004.wav,"Іду, Марічко! - билась в Іванових грудях одпов...",librivox
4,obruchov__tini_zabutyh_predkiv_s000005.wav,Засідали за мережаний стіл. тяжкі в своїм овеч...,librivox


#### preprocess text

In [16]:
# Latin lowercase letters mapped to Cyrillic letters; `regex_token` applies these
# substitutions to every transcript.
# NOTE(review): the pairs look like visually similar glyphs, presumably to repair
# transcripts that mix the two alphabets — confirm.
letters_to_replace = {"i": "і", "a": "а", "o": "о", "y": "у", "e": "е", 
                      "p": "р", "n": "п", "c": "с", "x": "х", "r": "г",
                      "m": "м", "h": "н", "b": "в", "t": "т"}

In [17]:
def regex_token(x, replacements=None):
    """Normalise a raw transcript into lowercase, single-space-separated tokens.

    Steps, in order:
      1. keep only runs of word characters and lowercase them;
      2. remove digit runs and collapse the whitespace they leave behind;
      3. replace one known French phrase with its Cyrillic transliteration;
      4. substitute single characters according to ``replacements``.

    Parameters
    ----------
    x : str
        Raw transcript text.
    replacements : dict[str, str] or None
        Character substitutions applied last. Defaults to the module-level
        ``letters_to_replace`` table when omitted.

    Returns
    -------
    str
        The cleaned transcript; may be empty (e.g. for digits-only input).
    """
    if replacements is None:
        replacements = letters_to_replace

    x = " ".join(re.findall(r"\w+", x)).lower()
    # Dropping a standalone number would otherwise leave a doubled (or leading /
    # trailing) space in the result, so the tokens are re-joined afterwards.
    x = " ".join(re.sub(r'\d+', '', x).split())
    x = x.replace('laissez donc le domestique ecoute', 'люсі дунк лю домєстік екют')

    for k, v in replacements.items():
        x = x.replace(k, v)
    return x

In [18]:
text_df['transcript_clean'] = text_df.transcript.map(regex_token)

In [19]:
# Flatten the whitespace-tokenised transcripts into one list of word occurrences.
words = [
    token
    for tokens in tqdm(text_df["transcript_clean"].str.split().values)
    for token in tokens
]

100%|██████████| 34579/34579 [00:00<00:00, 2332087.25it/s]


In [20]:
Counter(words).most_common(10)

[('і', 12672),
 ('не', 12373),
 ('на', 11503),
 ('що', 8342),
 ('з', 8047),
 ('в', 7817),
 ('й', 7468),
 ('а', 6457),
 ('до', 5383),
 ('та', 5327)]

In [21]:
unique_words = list(set(words))
print(len(unique_words))

72530


In [22]:
# Collect every unique word that still contains at least one ASCII lowercase letter
# after cleaning; each such word is recorded once, in `unique_words` order.
latin_letters = set(ascii_lowercase)
bad_word = [item for item in unique_words if not latin_letters.isdisjoint(item)]
len(bad_word)

6

In [23]:
bad_word

['хv', 'аvапті', 'v', 'кj', 'vоlу__f', 'wаv']

In [24]:
# Flag every transcript that contains one of the bad words as a substring,
# then keep only the transcripts that were not flagged.
mask = text_df.transcript_clean.apply(lambda text: any(word in text for word in bad_word))
text_df = text_df[~mask]

In [25]:
print(text_df.shape)
text_df.head()

(34574, 4)


Unnamed: 0,file_name,transcript,source,transcript_clean
0,obruchov__tini_zabutyh_predkiv_s000001.wav,"Я ще заграю до танцю,- бадьорив він чугайстра ...",librivox,я ще заграю до танцю бадьорив він чугайстра й ...
1,obruchov__tini_zabutyh_predkiv_s000002.wav,"Вони, здається, гойдалися з нею ще у колисці, ...",librivox,вони здається гойдалися з нею ще у колисці хлю...
2,obruchov__tini_zabutyh_predkiv_s000003.wav,"І вона співанками косичила їх розлучення, Їй б...",librivox,і вона співанками косичила їх розлучення їй бу...
3,obruchov__tini_zabutyh_predkiv_s000004.wav,"Іду, Марічко! - билась в Іванових грудях одпов...",librivox,іду марічко билась в іванових грудях одповідь ...
4,obruchov__tini_zabutyh_predkiv_s000005.wav,Засідали за мережаний стіл. тяжкі в своїм овеч...,librivox,засідали за мережаний стіл тяжкі в своїм овечі...


### Final df

In [26]:
df = audio_df.merge(text_df, on='file_name')

In [27]:
print(df.shape)
df.head()

(34574, 9)


Unnamed: 0,file_name,file_path,duration,file_size,speaker_id,gender,transcript,source,transcript_clean
0,miskun__15YO_Capitan_s003791.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,7.879875,126122,miskun,m,"Вітер чимдалі дужчав, однак не змінював свого ...",librivox,вітер чимдалі дужчав однак не змінював свого н...
1,loboda__chorna_rada_s002224.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,3.417875,54730,loboda,m,"Ну, прощайте ж, братці, навіки!\n",librivox,ну прощайте ж братці навіки
2,loboda__zahar_berkut_s000726.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,4.769875,76362,loboda,m,котра ген-ген сходилася з долиною Стрия.\n,librivox,котра ген ген сходилася з долиною стрия
3,obruchov__dorogoyu_tsinoyu_s000477.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,15.389875,246282,obruchov,m,"Напоєний незабаром зіллям, із перев'язаною ран...",librivox,напоєний незабаром зіллям із перев язаною рано...
4,shepel__zvirobij_s003180.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,8.661875,138634,shepel,m,"Ніхто не знав, за яких обставин її влучено: ма...",librivox,ніхто не знав за яких обставин її влучено мабу...


## Kaldi project requirements

In [28]:
proj_name = 'stt_uk'

# set here correct path when necessary
kaldi_path = '/home/{}/kaldi'.format(os.environ.get("USER"))
# The recipe lives under <kaldi>/egs/<project>/s5.
kaldi_proj_path = os.path.join(kaldi_path, 'egs', proj_name, 's5')

In [29]:
kaldi_proj_path

'/home/dima/kaldi/egs/stt_uk/s5'

In [30]:
if not os.path.exists(kaldi_path):
    print("KALDI IS ABSENT!!!!!")

In [31]:
DELETE_ON_CREATING = False

# Sub-directories created inside the Kaldi project directory, parents listed before children.
folders = [
    "audio", "audio/train", "audio/test",
    "data", "data/train", "data/test", "data/local", "data/local/dict",
    "conf",
]

# Optionally remove the whole project directory before (re)creating its layout.
if DELETE_ON_CREATING:
    delete_dir(kaldi_proj_path)

for folder in folders:
    create_dir(os.path.join(kaldi_proj_path, folder))

#### audio data

In [35]:
# train, test = df.loc[df.source != 'voxforge'], df.loc[df.source == 'voxforge']
train, test = df, df.loc[df.source == 'voxforge']

print(train.shape[0], test.shape[0])

34574 390


In [36]:
# test = test[test.speaker_id == 'Anna-20160402-kxf']
# NOTE: this rebinds `train` to the very same DataFrame object as `test`
# (the voxforge-only rows), so every "train" output produced below is built
# from the test data and any in-place change to one name is visible through the other.
train = test

In [37]:
# TODO: fix test on train

In [38]:
# Copy every utterance to <kaldi_proj_path>/audio/<split>/<speaker_id>/<file_name>.
# The loop variable is deliberately NOT called `df`: rebinding that name here would
# leave `df` pointing at the last subset for every later cell that reads the merged frame.
for subset, destination in [(train, 'train'), (test, 'test')]:
    for spkr in tqdm(subset.speaker_id.unique()):
        spkr_dir = os.path.join(kaldi_proj_path, 'audio', destination, spkr)
        create_dir(spkr_dir)

        # copy2 also preserves file metadata (timestamps) of the source audio.
        for row in subset.loc[subset.speaker_id == spkr].itertuples(index=False):
            shutil.copy2(row.file_path, os.path.join(spkr_dir, row.file_name))

100%|██████████| 39/39 [00:00<00:00, 287.73it/s]
100%|██████████| 39/39 [00:00<00:00, 304.20it/s]


#### spk2gender

In [39]:
# spk2gender: one "<speaker_id> <gender>" line per distinct speaker, written per split.
for split_name, split_df in (('train', train), ('test', test)):
    split_df[["speaker_id", "gender"]].drop_duplicates().to_csv(
        os.path.join(kaldi_proj_path, 'data', split_name, 'spk2gender'),
        sep=" ", index=False, header=None)

#### wav.scp

In [40]:
# Location of each copied audio file inside the Kaldi project, per split.
# `assign` returns a NEW frame for each split: writing the column in place would
# (a) modify a slice of `df` and raise SettingWithCopyWarning, and (b) when `train`
# and `test` are the same object, let the test paths overwrite the train paths.
train = train.assign(
    path=f"{kaldi_proj_path}/audio/train/" + train["speaker_id"] + "/" + train["file_name"])
test = test.assign(
    path=f"{kaldi_proj_path}/audio/test/" + test["speaker_id"] + "/" + test["file_name"])

In [41]:
# wav.scp: one "<file_name> <path>" line per utterance, written per split.
for split_name, split_df in (('train', train), ('test', test)):
    split_df[["file_name", "path"]].to_csv(
        os.path.join(kaldi_proj_path, 'data', split_name, 'wav.scp'),
        sep=" ", index=False, header=None)

#### text

In [42]:
# text: one "<file_name> <cleaned transcript>" line per utterance, written per split.
# The transcripts are Cyrillic, so the output encoding is pinned to UTF-8 instead of
# depending on the platform default.
for split_name, split_df in (('train', train), ('test', test)):
    with open(os.path.join(kaldi_proj_path, 'data', split_name, 'text'), "w", encoding="utf-8") as file_:
        for line in (split_df["file_name"] + " " + split_df["transcript_clean"]).values:
            # strip() drops the trailing space left by an empty transcript.
            file_.write(line.strip() + "\n")

#### utt2spk

In [43]:
# utt2spk: one "<file_name> <speaker_id>" line per utterance, written per split.
for split_name, split_df in (('train', train), ('test', test)):
    split_df[["file_name", "speaker_id"]].to_csv(
        os.path.join(kaldi_proj_path, 'data', split_name, 'utt2spk'),
        sep=" ", index=False, header=None)

#### corpus.txt

In [44]:
# corpus.txt: one distinct cleaned transcript per line; transcripts of at most one
# character are left out. Written as UTF-8 explicitly because the text is Cyrillic.
# NOTE(review): make sure `df` still refers to the merged audio/text frame when this
# cell runs — confirm no earlier cell has rebound the name.
with open(os.path.join(kaldi_proj_path, 'data', 'local', 'corpus.txt'), "w", encoding="utf-8") as file_:
    for line in df.loc[df["transcript_clean"].str.len() > 1, "transcript_clean"].unique():
        file_.write(line.strip() + "\n")

#### lexicon.txt

In [45]:
sil_phones = [("!SIL", "sil"), ("<UNK>", "spn")]

Note: the mechanism used to prepare the phonemes may be different.

In [46]:
# words = []

# for item in tqdm(text_df["transcript_clean"].str.split().values):
#     words.extend(item)
# unique_words = list(set(words))

# with open('lexicon_prep.txt', "w") as file_:
#     for line in unique_words:
#         file_.write(line.strip() + "\n")

In [47]:
# Read the prepared lexicon; each line is split at the first space into
# (word, space-separated phones). The file holds Cyrillic text, so it is decoded
# as UTF-8 explicitly instead of using the platform default.
with open(os.path.join(data_path, 'lexicon_prep_out.txt'), encoding='utf-8') as file_:
    lexicon = file_.readlines()

lexicon_phones = [tuple(item.strip().split(" ", 1)) for item in lexicon]

In [48]:
lexicon_phones[:10]

[('позабризкувала', 'п о з а б р и з к у в а л а'),
 ('виховання', "в и х о в а н': а"),
 ('затишшя', "з а т и ш': а"),
 ('задихавшись', "з а д и х а ў ш и с'"),
 ('одинокий', 'о д и н о к и й'),
 ('знакомих', 'з н а к о м и х'),
 ('жалюгідна', "ж а л' у г' і д н а"),
 ('легкої', 'л е х к о й і'),
 ('вмілі', "ў м' і л' і"),
 ('наїдуть', "н а й і д у т'")]

In [49]:
# lexicon.txt: the silence/unknown entries first, then every "<word> <phones>" pair.
# Written as UTF-8 explicitly because words and phones are Cyrillic. `phones` is
# already a space-separated string, so it is written as-is.
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'lexicon.txt'), "w", encoding="utf-8") as file_:
    for word, phones in sil_phones + lexicon_phones:
        file_.write(word + " " + phones + "\n")

#### nonsilence_phones.txt

In [50]:
# Gather the distinct phone symbols used by the lexicon pronunciations;
# "а" is always part of the set. The result is a sorted list.
phone_set = {"а"}
for _word, pron in lexicon_phones:
    phone_set.update(pron.split(" "))

unique_phones = sorted(phone_set)
len(unique_phones)

79

In [51]:
# nonsilence_phones.txt: one phone per line, in sorted order. The phone symbols are
# Cyrillic, so the file is written as UTF-8 explicitly.
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'nonsilence_phones.txt'), "w", encoding="utf-8") as file_:
    for phone in sorted(unique_phones):
        file_.write(phone + "\n")

#### silence_phones.txt

In [52]:
# silence_phones.txt: the distinct phones of the `sil_phones` entries, sorted, one per line.
silence_phones = sorted({phone for item in sil_phones for phone in item[1].split()})

with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'silence_phones.txt'), "w") as file_:
    for phone in silence_phones:
        file_.write(phone + "\n")

#### optional_silence.txt

In [53]:
# optional_silence.txt: a single line containing the "sil" phone.
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'optional_silence.txt'), "w") as file_:
    file_.writelines(phone + "\n" for phone in ["sil"])

### copy all required files to project dir

In [54]:
misc_path = os.path.join(prj_path, 'kaldi', 'misc')
conf_path = os.path.join(misc_path, 'conf')

In [55]:
# Copy every entry of the repository's conf directory into the project's conf folder.
conf_target = os.path.join(kaldi_proj_path, 'conf')
for item in os.listdir(conf_path):
    source_file = os.path.join(conf_path, item)
    shutil.copy2(source_file, os.path.join(conf_target, item))

In [56]:
# Copy the top-level recipe scripts from the repository's misc directory into the project root.
for item in ('cmd.sh', 'run.sh', 'path.sh'):
    source_file = os.path.join(misc_path, item)
    shutil.copy2(source_file, os.path.join(kaldi_proj_path, item))

In [57]:
! cp -R /home/$USER/kaldi/egs/stt_uk/s5/data/local/dict/* /home/$USER/kaldi/egs/stt_uk/s5/data/local/

Make sure the project path used in the shell commands below is correct.

In [58]:
# ! rm -rf /home/$USER/kaldi/egs/stt_uk/s5/steps /home/$USER/kaldi/egs/stt_uk/s5/utils /home/$USER/kaldi/egs/stt_uk/s5/local/

! cp -R /home/$USER/kaldi/egs/babel/s5d/steps /home/$USER/kaldi/egs/stt_uk/s5/steps
! cp -R /home/$USER/kaldi/egs/babel/s5d/utils /home/$USER/kaldi/egs/stt_uk/s5/utils
! cp -R /home/$USER/kaldi/egs/babel/s5d/local/ /home/$USER/kaldi/egs/stt_uk/s5/local/

In [59]:
! cp -R /home/$USER/kaldi/egs/wsj/s5/local/score.sh /home/$USER/kaldi/egs/stt_uk/s5/local/score.sh