In [11]:

import os

import matplotlib.font_manager as mfm
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from display import Disp
from kss_df import KssDf

# Font used to render Korean text in matplotlib labels.
# The location is machine-specific, so it can be overridden with the
# KSS_FONT_PATH environment variable; the original path stays as the default
# so existing setups keep working unchanged.
font_path = os.environ.get(
    'KSS_FONT_PATH',
    '/Users/dev/Fonts/Noto_Sans_KR/NotoSansKR-Regular.otf',
)
k_prop = mfm.FontProperties(fname=font_path)
plt.style.use('dark_background')

# Display handler with some nice helpers.
# NOTE(review): `display` is not imported in this notebook; this relies on the
# name being provided by the IPython/Jupyter environment — confirm.
disp = Disp(display)
# disp.code('display.py', label='display.py ')


## Generate Features from PRAAT TextGrid

In [12]:
# Load the pipe-delimited KSS transcript. Column names are supplied explicitly
# via `names`, so every row of the file is read as data.
transcript_file = "data/korean-single-speaker/transcript.v.1.4.txt"
tr_df = pd.read_csv(
    transcript_file,
    names=["audio_file", "tr_w_num", "tr_syl", "tr_char", "dur", "en"],
    sep="|",
)


Inspecting the transcript, we find three different transcription columns for each utterance:

| <!-- -->    | <!-- -->    |
|-------------|-------------|
|tr_w_num|Keeps numerics in number form|
|tr_syl|String encoding is syllable by syllable|
|tr_char|String encoding is character by character|



In [13]:
# Since we are not interested in numerics as numbers, we drop that column.
# errors='ignore' keeps this cell idempotent: tr_df is reassigned from itself,
# so re-running the cell after the column is already gone would otherwise
# raise a KeyError.
tr_df = tr_df.drop(columns=['tr_w_num'], errors='ignore')
tr_df

Unnamed: 0,audio_file,tr_syl,tr_char,dur,en
0,1/1_0000.wav,그는 괜찮은 척하려고 애쓰는 것 같았다.,그는 괜찮은 척하려고 애쓰는 것 같았다.,3.5,He seemed to be pretending to be okay.
1,1/1_0001.wav,그녀의 사랑을 얻기 위해 애썼지만 헛수고였다.,그녀의 사랑을 얻기 위해 애썼지만 헛수고ᄋ...,4.0,I tried in vain to win her love.
2,1/1_0002.wav,용돈을 아껴 써라.,용돈을 아껴 써라.,1.8,Save your pocket money.
3,1/1_0003.wav,그는 아내를 많이 아낀다.,그는 아내를 많이 아낀다.,2.3,He cherishes his wife.
4,1/1_0004.wav,그 애 전화번호 알아?,그 애 전화번호 알아?,1.3,Do you know his number?
...,...,...,...,...,...
12849,4/4_5627.wav,깨진 유리 조각에 손가락을 벴어요.,깨진 유리 조각에 손가락을 벴어요.,3.9,I cut my finger on a piece of broken glass.
12850,4/4_5628.wav,아드님이 친구들과 야구를 하다 유리창을 깼어요.,아드님이 친구들과 야구를 하다 유리창을 ᄁ...,5.4,Your son was playing baseball with his friends...
12851,4/4_5629.wav,일찍 신청하는 사람이 유리할 거야.,일찍 신청하는 사람이 유리할 거야.,3.7,Early applicants will be given advantages.
12852,4/4_5630.wav,이 두 소설은 줄거리가 유사해요.,이 두 소설은 줄거리가 유사해요.,3.5,These two novels are much alike in their synop...


## Audio Analysis

Typical phoneme durations, for reference:
* Consonant phonemes: 25–50 ms
* Vowel phonemes: 50–100+ ms


In [14]:
# /kaggle/input/korean-single-speaker-speech-dataset/transcript.v.1.4.txt
# /kaggle/input/korean-single-speaker-speech-dataset/kss/1/1_0845.wav (example)

# from comet_ml import Experiment
import librosa
import librosa.display
from scipy.io import wavfile as wav
import IPython

In [15]:
import textgrid

from kss_textgrid import KssTextGrid
# TODO: Refactor to use KssDf
# from kss_chunk import KssChunk, KssChunkType

# (self, df, kss_id, type, base_dir=None):

In [20]:
# Load the TextGrid annotations of a single KSS utterance through KssDf and
# show the file paths it works with.
# The commented-out lines below use KssTextGrid/KssChunk instead — presumably
# the earlier approach that KssDf is replacing; kept for reference.
# tg_dir = 'data/korean-single-speaker/kss'
# Identifier of the utterance to load; passed to KssDf below.
kss_id = '1_0006'
# chunk_dir = 'data/korean-single-speaker/kss_chunks'
# chunk_char_path = f'{chunk_dir}/chunk_char.csv'
# chunk_syl_path = f'{chunk_dir}/chunk_syl.csv'
# tg = KssTextGrid(kss_id)
# kss_char = KssChunk(tg.chars_df(), kss_id, KssChunkType.CHAR)
# kss_syl = KssChunk(tg.syls_df(), kss_id, KssChunkType.SYL)
# disp.obj(kss_char.df, label=f'KSS {kss_id} Character Dataframe')
# disp.obj(kss_syl.df, label=f'KSS {kss_id} Syllable Dataframe')

kss = KssDf(kss_id)


# Show the TextGrid path and the CSV paths exposed by the KssDf instance.
disp.obj(kss.tg_path, label='TextGrid path')
disp.obj(kss.csv_paths, label='CSV paths')

# Setting save=True will save the TextGrid data to a CSV file.
# NOTE(review): the recorded output of this cell shows KssDf "Saving: ..." log
# lines even though save=False is passed here — the output may be stale (the
# cell's execution count is out of order) or load_tg may save regardless; confirm.
# `dfs` is presumably the loaded character/syllable data — verify against KssDf.
dfs = kss.load_tg(save=False)

TextGrid path

'./data/korean-single-speaker/kss/1/1_0006.TextGrid'

CSV paths

{'char': './data/korean-single-speaker/kss-csv/1_0006_char.csv',
 'syl': './data/korean-single-speaker/kss-csv/1_0006_syl.csv'}

KssDf[INFO]: Saving: ./data/korean-single-speaker/kss-csv/1_0006_char.csv
KssDf[INFO]: Saving: ./data/korean-single-speaker/kss-csv/1_0006_syl.csv


## Statistics

In [17]:
# disp.obj((kss_syl.df['stop'] - kss_syl.df['start']).describe(), 'Interval Statistics (using describe)')

In [18]:

# # WRITE to CSV
# kss_char.save()
# kss_syl.save()


In [19]:
# chunk_dir = 'data/korean-single-speaker/kss_chunks'
#
# def k_ax(ax, **kwargs):
#     ax.set_xticklabels(ax.get_xticklabels(), fontproperties=k_prop, rotation=90, **kwargs)
#
# def filter_counts(df, val):
#     counts = df.value_counts()
#     return counts[counts>val]
#
#
# all_chunk_char_df, all_chunk_syl_df = KssChunk.load_all()
#
# # Plot histogram of characters and syllables
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,4))
# fig.suptitle('Horizontally stacked subplots')
# k_ax(ax1, size=16)
# k_ax(ax2, size=12)
# filter_counts(all_chunk_char_df['char'], 4).plot(kind='bar', ax=ax1)
# filter_counts(all_chunk_syl_df['syl'], 2).plot(kind='bar', ax=ax2)
# disp.obj(all_chunk_char_df, 'Character Interval Dataframe')
# disp.obj(all_chunk_char_df['audio'].unique(), 'Audio Files')
# disp.obj(all_chunk_syl_df, 'Syllable Interval Dataframe')
# disp.obj(all_chunk_syl_df['audio'].unique(), 'Audio Files')