In [76]:
import os
import re
import sys
import shutil

from tqdm import tqdm
from collections import Counter
from string import ascii_lowercase

In [9]:
sys.path.append("..")

In [50]:
from utils import transform_audio_file, recursive_search, create_dir, delete_dir

In [5]:
prj_path = os.path.dirname(os.getcwd())
data_path = os.path.join(prj_path, 'data')
voxforge_data_path = os.path.join(data_path, 'voxforge')
librivox_data_path = os.path.join(data_path, 'librivox')

## Prepare data

### Audio data

In [11]:
audio_librivox = recursive_search(os.path.join(librivox_data_path, 'audio'))
audio_librivox['duration'] = audio_librivox['file_path'].map(lambda x: librosa.get_duration(filename=x))
audio_librivox['file_size'] = audio_librivox['file_path'].map(lambda x: os.path.getsize(x))

In [12]:
print(audio_librivox.shape)
print(f"Total duration: {audio_librivox['duration'].sum()/60/60:.2f} hours")
print(f"Total size: {audio_librivox['file_size'].sum()/1024/1024:.2f} mb")
audio_librivox.sample(5)

(34193, 4)
Total duration: 83.23 hours
Total size: 4573.50 mb


Unnamed: 0,file_name,file_path,duration,file_size
8519,shepel__zvirobij_s006618.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,8.399875,134442
32184,obruchov__fata_morgana_s001095.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,13.329875,213322
26575,shepel__zvirobij_7_s005786.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,7.429875,118922
22793,miskun__15YO_Capitan_s004703.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,5.719875,91562
4301,miskun__15YO_Capitan_s002690.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,8.649875,138442


In [13]:
audio_voxforge = recursive_search(os.path.join(voxforge_data_path, 'audio'))
audio_voxforge['duration'] = audio_voxforge['file_path'].map(lambda x: librosa.get_duration(filename=x))
audio_voxforge['file_size'] = audio_voxforge['file_path'].map(lambda x: os.path.getsize(x))

In [14]:
print(audio_voxforge.shape)
print(f"Total duration: {audio_voxforge['duration'].sum()/60/60:.2f} hours")
print(f"Total size: {audio_voxforge['file_size'].sum()/1024/1024:.2f} mb")
audio_voxforge.sample(5)

(390, 4)
Total duration: 1.00 hours
Total size: 55.10 mb


Unnamed: 0,file_name,file_path,duration,file_size
243,anonymous-20131219-pxl__uk_0035.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,13.75,220044
223,Anna-20160402-kxf__uk_0014.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,9.75,156044
63,anonymous-20131024-atf__uk_0024.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,8.96,143404
52,Taras-20160318-mxu__uk_0006.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,7.375,118044
360,anonymous-20140818-gwg__uk_0045.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,7.625,122044


### gender

In [15]:
audio_df = pd.concat([audio_librivox, audio_voxforge])
audio_df['speaker_id'] = audio_df['file_name'].map(lambda x: x.split("__", 1)[0])

In [16]:
speaker_gender = {item: 'm' for item in audio_df['speaker_id'].unique()}

for item in ['Anna-20160402-kxf', 'sumska', 'Darrr-20170412-jau']:
    speaker_gender[item] = 'f'

audio_df['gender'] = audio_df['speaker_id'].map(speaker_gender)

In [17]:
print(audio_df.shape)
audio_df.sample(5)

(34583, 6)


Unnamed: 0,file_name,file_path,duration,file_size,speaker_id,gender
2304,obruchov__voly_23_f000250.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,3.419875,54762,obruchov,m
16648,shepel__zvirobij_s007565.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,9.009875,144202,shepel,m
3305,sumska__mykola_djerya_s001444.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,11.807875,188970,sumska,f
29185,sumska__mykola_djerya_s001156.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,3.65075,58456,sumska,f
23236,shepel__zvirobij_s007555.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,9.339875,149482,shepel,m


### text

In [18]:
with open(os.path.join(voxforge_data_path, 'prompts.txt')) as file_:
    lst = [item.split(" ", 1) for item in file_.readlines()]
text_voxforge = pd.DataFrame(lst)
text_voxforge[2] = 'voxforge'

In [19]:
with open(os.path.join(librivox_data_path, 'prompts.txt')) as file_:
    lst = [item.split(" ", 1) for item in file_.readlines()]
text_librivox = pd.DataFrame(lst)
text_librivox[2] = 'librivox'

In [20]:
text_df = pd.concat([text_librivox, text_voxforge])
text_df.columns = ['file_name', 'transcript', 'source']

In [21]:
print(text_df.shape)
text_df.head()

(34579, 3)


Unnamed: 0,file_name,transcript,source
0,obruchov__tini_zabutyh_predkiv_s000001.wav,"Я ще заграю до танцю,- бадьорив він чугайстра ...",librivox
1,obruchov__tini_zabutyh_predkiv_s000002.wav,"Вони, здається, гойдалися з нею ще у колисці, ...",librivox
2,obruchov__tini_zabutyh_predkiv_s000003.wav,"І вона співанками косичила їх розлучення, Їй б...",librivox
3,obruchov__tini_zabutyh_predkiv_s000004.wav,"Іду, Марічко! - билась в Іванових грудях одпов...",librivox
4,obruchov__tini_zabutyh_predkiv_s000005.wav,Засідали за мережаний стіл. тяжкі в своїм овеч...,librivox


#### preprocess text

In [23]:
letters_to_replace = {"i": "і", "a": "а", "o": "о", "y": "у", "e": "е", 
                      "p": "р", "n": "п", "c": "с", "x": "х", "r": "г",
                      "m": "м", "h": "н", "b": "в", "t": "т"}

In [24]:
def regex_token(x):
    x = " ".join(re.findall(r"\w+", x)).lower()
    x = re.sub(r'\d+', '', x)
    x = x.replace('laissez donc le domestique ecoute', 'люсі дунк лю домєстік екют')

    for k, v in letters_to_replace.items():
        x = x.replace(k, v)
    return x

In [25]:
text_df['transcript_clean'] = text_df.transcript.map(regex_token)

In [26]:
words = []

for item in tqdm(text_df["transcript_clean"].str.split().values):
    words.extend(item)

100%|██████████| 34579/34579 [00:00<00:00, 2352399.49it/s]


In [27]:
Counter(words).most_common(10)

[('і', 12672),
 ('не', 12373),
 ('на', 11503),
 ('що', 8342),
 ('з', 8047),
 ('в', 7817),
 ('й', 7468),
 ('а', 6457),
 ('до', 5383),
 ('та', 5327)]

In [31]:
unique_words = list(set(words))
print(len(unique_words))

72530


In [32]:
skip = False
bad_word = []

for item in unique_words:
    skip = False
    for letter in ascii_lowercase:
        if skip:
            break
        if letter in item:
            bad_word.append(item)
            skip = True
len(bad_word)

6

In [33]:
bad_word

['v', 'vоlу__f', 'wаv', 'хv', 'кj', 'аvапті']

In [36]:
mask = text_df.transcript_clean.apply(lambda x: any(item for item in bad_word if item in x))
text_df = text_df[~mask]

In [37]:
print(text_df.shape)
text_df.head()

(34574, 4)


Unnamed: 0,file_name,transcript,source,transcript_clean
0,obruchov__tini_zabutyh_predkiv_s000001.wav,"Я ще заграю до танцю,- бадьорив він чугайстра ...",librivox,я ще заграю до танцю бадьорив він чугайстра й ...
1,obruchov__tini_zabutyh_predkiv_s000002.wav,"Вони, здається, гойдалися з нею ще у колисці, ...",librivox,вони здається гойдалися з нею ще у колисці хлю...
2,obruchov__tini_zabutyh_predkiv_s000003.wav,"І вона співанками косичила їх розлучення, Їй б...",librivox,і вона співанками косичила їх розлучення їй бу...
3,obruchov__tini_zabutyh_predkiv_s000004.wav,"Іду, Марічко! - билась в Іванових грудях одпов...",librivox,іду марічко билась в іванових грудях одповідь ...
4,obruchov__tini_zabutyh_predkiv_s000005.wav,Засідали за мережаний стіл. тяжкі в своїм овеч...,librivox,засідали за мережаний стіл тяжкі в своїм овечі...


### Final df

In [38]:
df = audio_df.merge(text_df, on='file_name')

In [39]:
print(df.shape)
df.head()

(34574, 9)


Unnamed: 0,file_name,file_path,duration,file_size,speaker_id,gender,transcript,source,transcript_clean
0,miskun__15YO_Capitan_s003791.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,7.879875,126122,miskun,m,"Вітер чимдалі дужчав, однак не змінював свого ...",librivox,вітер чимдалі дужчав однак не змінював свого н...
1,loboda__chorna_rada_s002224.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,3.417875,54730,loboda,m,"Ну, прощайте ж, братці, навіки!\n",librivox,ну прощайте ж братці навіки
2,loboda__zahar_berkut_s000726.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,4.769875,76362,loboda,m,котра ген-ген сходилася з долиною Стрия.\n,librivox,котра ген ген сходилася з долиною стрия
3,obruchov__dorogoyu_tsinoyu_s000477.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,15.389875,246282,obruchov,m,"Напоєний незабаром зіллям, із перев'язаною ран...",librivox,напоєний незабаром зіллям із перев язаною рано...
4,shepel__zvirobij_s003180.wav,/home/dima/Projects/stt_uk/data/librivox/audio...,8.661875,138634,shepel,m,"Ніхто не знав, за яких обставин її влучено: ма...",librivox,ніхто не знав за яких обставин її влучено мабу...


## Kaldi project requirements

In [232]:
proj_name = 'stt_uk/s5'

kaldi_path = f'/home/{os.environ.get("USER")}/kaldi'
# set here correct path when necessary
kaldi_proj_path_true = os.path.join(kaldi_path, 'egs', proj_name)
# kaldi_proj_path = os.path.join(prj_path, 'kaldi', 'egs', proj_name)
kaldi_proj_path = kaldi_proj_path_true

In [233]:
print(kaldi_proj_path)

/home/dima/kaldi/egs/stt_uk/s5


In [234]:
if not os.path.exists(kaldi_path):
    print("KALDI IS ABSENT!!!!!")

In [235]:
DELETE_ON_CREATING = False

folders = ["audio", "audio/train", "audio/test", 
           "data", "data/train", "data/test", "data/local", "data/local/dict", 
           "local", "conf"]

if DELETE_ON_CREATING:
    delete_dir(kaldi_proj_path)

for folder in folders:
    create_dir(os.path.join(kaldi_proj_path, folder))

#### audio data

In [236]:
# train, test = df.loc[df.source != 'voxforge'], df.loc[df.source == 'voxforge']
train, test = df, df.loc[df.source == 'voxforge']

print(train.shape[0], test.shape[0])

390 390


In [237]:
for df, destination in [(test, 'train'), (test, 'test')]:
    for spkr in tqdm(df.speaker_id.unique()):
        create_dir(os.path.join(kaldi_proj_path, 'audio', destination, spkr))
        
        for row in df.loc[df.speaker_id == spkr].iterrows():
            shutil.copy2(row[1].file_path, 
                         os.path.join(kaldi_proj_path, 'audio', destination, spkr, row[1].file_name))

100%|██████████| 39/39 [00:00<00:00, 286.37it/s]
100%|██████████| 39/39 [00:00<00:00, 290.70it/s]


#### spk2gender

In [238]:
train[["speaker_id", "gender"]].drop_duplicates().to_csv(
    os.path.join(kaldi_proj_path, 'data', 'train', 'spk2gender'), 
    sep=" ", index=False, header=None)
test[["speaker_id", "gender"]].drop_duplicates().to_csv(
    os.path.join(kaldi_proj_path, 'data', 'test', 'spk2gender'), 
    sep=" ", index=False, header=None)

#### wav.scp

In [239]:
train["path"] = train.apply(lambda x: f"{kaldi_proj_path_true}/audio/train/{x.speaker_id}/{x.file_name}", 1)
test["path"] = test.apply(lambda x: f"{kaldi_proj_path_true}/audio/test/{x.speaker_id}/{x.file_name}", 1)

In [240]:
train[["file_name", "path"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'train', 'wav.scp'), 
                                    sep=" ", index=False, header=None)
test[["file_name", "path"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'test', 'wav.scp'), 
                                   sep=" ", index=False, header=None)

#### text

In [241]:
with open(os.path.join(kaldi_proj_path, 'data', 'train', 'text'), "w") as file_:
    for line in (train["file_name"] + " " + train["transcript_clean"]).values:
        file_.write(line.strip() + "\n")
        
with open(os.path.join(kaldi_proj_path, 'data', 'test', 'text'), "w") as file_:
    for line in (test["file_name"] + " " + test["transcript_clean"]).values:
        file_.write(line.strip() + "\n")

#### utt2spk

In [242]:
train[["file_name", "speaker_id"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'train', 'utt2spk'), 
                                          sep=" ", index=False, header=None)
test[["file_name", "speaker_id"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'test', 'utt2spk'), sep=" ", index=False, header=None)

#### corpus.txt

In [243]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', 'corpus.txt'), "w") as file_:
    for line in df.loc[df["transcript_clean"].str.len() > 1]["transcript_clean"].unique():
        file_.write(line.strip() + "\n")

#### lexicon.txt

In [244]:
sil_phones = [("!SIL", "sil"), ("<UNK>", "spn")]

phoneme preparation mechanism may be different

In [245]:
# words = []

# for item in tqdm(text_df["transcript_clean"].str.split().values):
#     words.extend(item)
# unique_words = list(set(words))

# with open('lexicon_prep.txt', "w") as file_:
#     for line in unique_words:
#         file_.write(line.strip() + "\n")

In [246]:
with open(os.path.join(data_path, 'lexicon_prep_out.txt')) as file_:
    lexicon = file_.readlines()
    
lexicon_phones = [tuple(item.strip().split(" ", 1)) for item in lexicon]

In [247]:
lexicon_phones[:10]

[('позабризкувала', 'п о з а б р и з к у в а л а'),
 ('виховання', "в и х о в а н': а"),
 ('затишшя', "з а т и ш': а"),
 ('задихавшись', "з а д и х а ў ш и с'"),
 ('одинокий', 'о д и н о к и й'),
 ('знакомих', 'з н а к о м и х'),
 ('жалюгідна', "ж а л' у г' і д н а"),
 ('легкої', 'л е х к о й і'),
 ('вмілі', "ў м' і л' і"),
 ('наїдуть', "н а й і д у т'")]

In [248]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'lexicon.txt'), "w") as file_:
    for word, phones in sil_phones + lexicon_phones:
        file_.write(word + " " + " ".join(phones.split(" ")) + "\n")

#### nonsilence_phones.txt

In [249]:
unique_phones = ["а"]

for k, v in lexicon_phones:
    unique_phones.extend(v.split(" "))

unique_phones = list(sorted(set(unique_phones)))
len(unique_phones)

79

In [250]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'nonsilence_phones.txt'), "w") as file_:
    for phone in sorted(unique_phones):
        file_.write(phone + "\n")

#### silence_phones.txt

In [251]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'silence_phones.txt'), "w") as file_:
    for phone in sorted(list(set(" ".join([item[1] for item in sil_phones]).split()))):
        file_.write(phone + "\n")

#### optional_silence.txt

In [252]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'optional_silence.txt'), "w") as file_:
    for phone in ["sil"]:
        file_.write(phone + "\n")

### copy all required files to project dir

In [253]:
conf_path = os.path.join(prj_path, 'kaldi', 'misc', 'conf')
run_script_path = os.path.join(prj_path, 'kaldi', 'misc', 'run.sh')

In [254]:
for item in os.listdir(conf_path):
    shutil.copy2(os.path.join(conf_path, item), 
                 os.path.join(kaldi_proj_path, 'conf', item))

In [255]:
for item in ['cmd.sh', 'run.sh', 'path.sh']:
    shutil.copy2(run_script_path, os.path.join(kaldi_proj_path, item))

In [256]:
! cp -R /home/$USER/kaldi/egs/stt_uk/s5/data/local/dict/* /home/$USER/kaldi/egs/stt_uk/s5/data/local/

make sure that here is a correct project path

In [257]:
! cp -R /home/$USER/kaldi/egs/wsj/s5/steps /home/$USER/kaldi/egs/stt_uk/s5/steps
! cp -R /home/$USER/kaldi/egs/wsj/s5/utils /home/$USER/kaldi/egs/stt_uk/s5/utils

In [258]:
! cp -R /home/$USER/kaldi/egs/babel/s5d/local/ /home/$USER/kaldi/egs/stt_uk/s5/local/
! cp -R /home/$USER/kaldi/egs/wsj/s5/local/score.sh /home/$USER/kaldi/egs/stt_uk/s5/local/score.sh