In [1]:
import os
import librosa

import pandas as pd

In [2]:
from utils.misc import recursive_search

In [3]:
DATA_PATH = os.path.join(os.getcwd(), "data", "voxforge")
audio_converted_path = os.path.join(DATA_PATH, "audio_converted")

In [4]:
audio_converted_path

'/home/dima/Projects/stt_uk/data/voxforge/audio_converted'

## Audio data

In [5]:
audio_df = recursive_search(audio_converted_path)
audio_df['duration'] = audio_df['file_path'].map(lambda x: librosa.get_duration(filename=x))
audio_df['file_size'] = audio_df['file_path'].map(lambda x: os.path.getsize(x))

In [6]:
print(audio_df.shape)
print(f"Total duration: {audio_df['duration'].sum()/60/60:.2f} hours")
print(f"Total size: {audio_df['file_size'].sum()/1024/1024:.2f} mb")
audio_df.sample(5)

(50, 4)
Total duration: 0.13 hours
Total size: 13.79 mb


Unnamed: 0,file_name,file_path,duration,file_size
37,Alias_uk_0028.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,8.75,280044
21,Alias_uk_0023.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,5.25,168044
26,Neverice_uk_0038.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,5.75,184044
15,Anna_uk_0005.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,6.125,196044
9,Taras_uk_0050.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,6.5,208044


## text and gender

In [7]:
text_df = pd.read_csv(os.path.join(DATA_PATH, "voxforge_transcript.csv"))
text_df['speaker_id'] = text_df.file_name.map(lambda x: x.split("_", 1)[0])

In [8]:
print(text_df.shape)
text_df.head()

(50, 4)


Unnamed: 0,file_name,transcript,gender,speaker_id
0,Neverice_uk_0031.wav,"Водночас у тексті є багато думок, на які хочет...",m,Neverice
1,Neverice_uk_0032.wav,"Тяжкою працею, з допомогою щоразу більшої груп...",m,Neverice
2,Neverice_uk_0033.wav,"Досі не бралася за цю тему, вважаючи, що це зр...",m,Neverice
3,Neverice_uk_0034.wav,На початку його каденції ці рішення не були по...,m,Neverice
4,Neverice_uk_0035.wav,"Розподіл української єпархії між галичанами, я...",m,Neverice


### Prepare text

In [9]:
import re

from tqdm import tqdm
from collections import Counter

In [10]:
def regex_token(x):
    return " ".join(re.findall(r"\w+", x)).lower()

def clean_text(x):
    letter_mapping = {"i": "і"}
    for en, uk in letter_mapping.items():
        x = x.replace(en, uk)
    return x

In [11]:
text_df['transcript_clean'] = text_df.transcript.map(regex_token)
text_df['transcript_clean'] = text_df.transcript_clean.map(clean_text)

In [12]:
words = []

for item in tqdm(text_df["transcript_clean"].str.split().values):
    words.extend(item)

100%|██████████| 50/50 [00:00<00:00, 191171.56it/s]


In [13]:
Counter(words).most_common(10)

[('не', 32),
 ('і', 27),
 ('що', 21),
 ('на', 12),
 ('та', 10),
 ('те', 8),
 ('щось', 8),
 ('все', 8),
 ('є', 7),
 ('з', 7)]

## Kaldi requirements

In [14]:
import shutil

In [15]:
def mkdir(path):
    try:
        os.mkdir(path)
    except FileExistsError:
        pass

In [16]:
kaldi_proj_path = os.path.join(os.getcwd(), "kaldi_baseline")
kaldi_audio = os.path.join(kaldi_proj_path, 'audio')
kaldi_data = os.path.join(kaldi_proj_path, 'data')

In [17]:
folders = ["audio", "audio/train", "audio/test", "data", "data/train", "data/test", "data/local", "data/local/dict", "local", "conf"]


for folder in folders:
    mkdir(os.path.join(kaldi_proj_path, folder))

In [18]:
df = audio_df.merge(text_df, on='file_name')

In [19]:
print(df.shape)
df.head()

(50, 8)


Unnamed: 0,file_name,file_path,duration,file_size,transcript,gender,speaker_id,transcript_clean
0,Darrr_uk_0045.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,6.75,216044,"Щодо ракових недуг, то необхідно дотримуватися...",f,Darrr,щодо ракових недуг то необхідно дотримуватися ...
1,Alias_uk_0024.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,9.75,312044,Скільки коштуватиме організаторам дійство за у...,m,Alias,скільки коштуватиме організаторам дійство за у...
2,Neverice_uk_0034.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,7.261312,232406,На початку його каденції ці рішення не були по...,m,Neverice,на початку його каденції ці рішення не були по...
3,Anna_uk_0006.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,7.875,252044,"жах не в тому, що щось змінеться, - жах у тому...",f,Anna,жах не в тому що щось змінеться жах у тому що ...
4,Darrr_uk_0001.wav,/home/dima/Projects/stt_uk/data/voxforge/audio...,13.875,444044,Три наріччя української мови виділялися не зав...,f,Darrr,три наріччя української мови виділялися не зав...


In [20]:
df.speaker_id.value_counts().to_dict()

{'Anna': 10, 'Alias': 10, 'Taras': 10, 'Neverice': 10, 'Darrr': 10}

In [21]:
train, test = df.loc[df.speaker_id != 'Taras'], df.loc[df.speaker_id == 'Taras']
print(train.shape[0], test.shape[0])

40 10


#### audio_data

In [22]:
for spkr in tqdm(train.speaker_id.unique()):
    mkdir(os.path.join(kaldi_proj_path, 'audio', 'train', spkr))
    for file_ in train.loc[train.speaker_id == spkr].file_name.values:
        shutil.copy2(os.path.join(audio_converted_path, file_), 
                     os.path.join(kaldi_proj_path, 'audio', 'train', spkr))

100%|██████████| 4/4 [00:00<00:00, 238.69it/s]


In [23]:
for spkr in tqdm(test.speaker_id.unique()):
    mkdir(os.path.join(kaldi_proj_path, 'audio', 'test', spkr))
    for file_ in test.loc[test.speaker_id == spkr].file_name.values:
        shutil.copy2(os.path.join(audio_converted_path, file_), 
                     os.path.join(kaldi_proj_path, 'audio', 'test', spkr))

100%|██████████| 1/1 [00:00<00:00, 214.87it/s]


In [24]:
train[["speaker_id", "gender"]].drop_duplicates()

Unnamed: 0,speaker_id,gender
0,Darrr,f
1,Alias,m
2,Neverice,m
3,Anna,f


#### spk2gender

In [25]:
train[["speaker_id", "gender"]].drop_duplicates().to_csv(os.path.join(kaldi_proj_path, 'data', 'train', 'spk2gender'), sep=" ", index=False, header=None)
test[["speaker_id", "gender"]].drop_duplicates().to_csv(os.path.join(kaldi_proj_path, 'data', 'test', 'spk2gender'), sep=" ", index=False, header=None)

#### wav.scp

In [26]:
project_path = "/home/dima/kaldi/egs/stt_uk/"

In [27]:
train["path"] = train.apply(lambda x: f"{project_path}audio/train/{x.speaker_id}/{x.file_name}", 1)
test["path"] = test.apply(lambda x: f"{project_path}audio/test/{x.speaker_id}/{x.file_name}", 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
train[["file_name", "path"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'train', 'wav.scp'), 
                                    sep=" ", index=False, header=None)
test[["file_name", "path"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'test', 'wav.scp'), 
                                   sep=" ", index=False, header=None)

#### text

In [29]:
with open(os.path.join(kaldi_proj_path, 'data', 'train', 'text'), "w") as file_:
    for line in (train["file_name"] + " " + train["transcript_clean"]).values:
        file_.write(line.strip() + "\n")
        
with open(os.path.join(kaldi_proj_path, 'data', 'test', 'text'), "w") as file_:
    for line in (test["file_name"] + " " + test["transcript_clean"]).values:
        file_.write(line.strip() + "\n")

#### utt2spk

In [30]:
train[["file_name", "speaker_id"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'train', 'utt2spk'), 
                                          sep=" ", index=False, header=None)
test[["file_name", "speaker_id"]].to_csv(os.path.join(kaldi_proj_path, 'data', 'test', 'utt2spk'), sep=" ", index=False, header=None)

#### corpus.txt

In [31]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', 'corpus.txt'), "w") as file_:
    for line in df.loc[df["transcript_clean"].str.len() > 1]["transcript_clean"].unique():
        file_.write(line.strip() + "\n")

#### lexicon.txt

In [32]:
from utils.g2p import word2phonemes

In [33]:
sil_phones = [("!SIL", "sil"), ("<UNK>", "spn")]

In [34]:
unique_words = list(set(words))

In [35]:
lexicon = []

for word in tqdm(sorted(unique_words)):
    lexicon.append((word, word2phonemes(word)))

100%|██████████| 424/424 [00:00<00:00, 224713.79it/s]


In [36]:
lexicon[:10]

[('а', 'а'),
 ('але', 'а л е'),
 ('американської', "а м е р и к а н с' к о й і"),
 ('аналогічних', "а н а л о г' і ч н и х"),
 ('архітектурно', "а р х' і т е к т у р н о"),
 ('багато', 'б а г а т о'),
 ('батька', "б а т' к а"),
 ('бо', 'б о'),
 ('боляче', "б о л' а ч е"),
 ('бралася', "б р а л а с' а")]

In [37]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'lexicon.txt'), "w") as file_:
    for word, phones in sil_phones + lexicon:
        file_.write(word + " " + " ".join(phones.split(" ")) + "\n")

#### nonsilence_phones.txt

In [38]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'nonsilence_phones.txt'), "w") as file_:
    for phone in sorted(list(set(" ".join([item[1] for item in lexicon]).split()))):
        file_.write(phone + "\n")

#### silence_phones.txt

In [39]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'silence_phones.txt'), "w") as file_:
    for phone in sorted(list(set(" ".join([item[1] for item in sil_phones]).split()))):
        file_.write(phone + "\n")

#### optional_silence.txt

In [40]:
with open(os.path.join(kaldi_proj_path, 'data', 'local', "dict", 'optional_silence.txt'), "w") as file_:
    for phone in ["sil"]:
        file_.write(phone + "\n")

### Copy prepared project to kaldi

In [41]:
! rm -rf /home/dima/kaldi/egs/stt_uk

In [42]:
! cp -R kaldi_baseline /home/dima/kaldi/egs/stt_uk

In [43]:
! cp -R /home/dima/kaldi/egs/wsj/s5/steps /home/dima/kaldi/egs/stt_uk/steps
! cp -R /home/dima/kaldi/egs/wsj/s5/utils /home/dima/kaldi/egs/stt_uk/utils
! cp -R /home/dima/kaldi/egs/voxforge/s5/local/score.sh /home/dima/kaldi/egs/stt_uk/local/score.sh