In [3]:
from collections import Counter

import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

from config import Project
from src.display import Disp

# Display handler with some nice helpers.
# NOTE(review): `display` is not imported in this cell; presumably it is the
# builtin that IPython injects into notebook kernels — confirm before running
# this code outside a notebook.
disp = Disp(display)
# Uncomment to see the source code
# disp.code('display.py', label='display.py ')


# disp examples
# NOTE(review): the helpers named below (`disp_file_source`, `disp_audio_file`)
# are not defined anywhere in the visible cells — verify them against
# src/display.py before uncommenting.
# disp(IPython.display.Audio('./data/korean-single-speaker/kss/1/1_0000.wav'))
# disp_file_source('marker_df.py', label='marker_df')
# disp_audio_file('./data/korean-single-speaker/kss/1/1_0000.wav', label='1_0000.wav')

## Analyze KSS transcriptions

In [4]:
import re

def to_chars(sentence, split_pattern):
    """Split ``sentence`` on ``split_pattern`` and flatten the pieces into characters.

    Args:
        sentence: Text to break up.
        split_pattern: Regular expression (string or compiled pattern) handed
            to ``re.split``; it acts as the separator between pieces.

    Returns:
        A list of single-character strings, in order of appearance, drawn from
        every piece that ``re.split`` returns.
    """
    # Flatten in one pass: concatenating lists inside a loop would copy the
    # whole accumulator again for every word.
    return [ch for word in re.split(split_pattern, sentence) for ch in word]

# Read the pipe-delimited transcript file, supplying the column names explicitly.
transcript_columns = ["audio_file", "tr_w_num", "tr_syl", "tr_char", "dur", "en"]
transcript_file = Project.path.TRANSCRIPT
tr_df = pd.read_csv(
    transcript_file,
    sep="|",
    names=transcript_columns,
)

# Sanity check: show the `tr_char` value of the first row.
first_transcription = tr_df['tr_char'][0]
disp.obj(first_transcription, label='First Transcription Entry')


from nltk.tokenize import RegexpTokenizer
# Tokeniser that emits one token per character, skipping whitespace and the
# characters '.', '!' and '?'. Any other punctuation (e.g. ',') is still kept
# as a token.
# The pattern is a raw string: in a plain literal '\s' is an invalid escape
# sequence, which newer Python versions warn about.
tokeniser = RegexpTokenizer(r'[^\s.!?]')

# Per-row token lists for the `tr_char` and `tr_syl` transcript columns.
tr_df['tr_char_tok'] = tr_df['tr_char'].apply(tokeniser.tokenize)
tr_df['tr_syl_tok'] = tr_df['tr_syl'].apply(tokeniser.tokenize)


First Transcription Entry

'그는 괜찮은 척하려고 애쓰는 것 같았다.'

In [5]:
def list_to_freq_dict(a):
    """Return a dict mapping each distinct item of ``a`` to its occurrence count.

    Args:
        a: Iterable of hashable items (e.g. a list of tokens).

    Returns:
        A plain ``dict`` of ``{item: count}`` whose keys appear in
        first-occurrence order.
    """
    # Counter tallies in a single pass; calling ``a.count(item)`` per element
    # would rescan the whole list every time.
    return dict(Counter(a))

def _count_tokens(token_lists):
    """Tally token occurrences across an iterable of token lists into one Counter."""
    totals = Counter()
    for tokens in token_lists:
        # Update in place; ``totals = totals + Counter(...)`` would build a
        # brand-new Counter for every row.
        totals.update(tokens)
    return totals


# Corpus-wide token frequencies for each tokenised transcript column.
c_dict_char = _count_tokens(tr_df['tr_char_tok'])
c_dict_syl = _count_tokens(tr_df['tr_syl_tok'])

### Counts by Character and Syllable

In [6]:
# Show the highest-count entries of each corpus-wide tally.
top_chars = c_dict_char.most_common(20)
disp.obj(top_chars, label='20 most common characters')

top_syls = c_dict_syl.most_common(10)
disp.obj(top_syls, label='10 most common syllables')

20 most common characters

[('ᄋ', 49336),
 ('ᅡ', 39344),
 ('ᅵ', 27039),
 ('ᅳ', 22833),
 ('ᆫ', 22224),
 ('ᄀ', 20349),
 ('ᅥ', 20172),
 ('ᆯ', 16219),
 ('ᄌ', 15104),
 ('ᄉ', 15042),
 ('ᄂ', 13965),
 ('ᅩ', 13423),
 ('ᄃ', 12996),
 ('ᄒ', 11132),
 ('ᅮ', 10761),
 ('ᄅ', 10298),
 ('ᅦ', 9771),
 ('ᄆ', 8413),
 ('ᆼ', 8236),
 ('ᅢ', 7893)]

10 most common syllables

[('이', 6482),
 ('요', 5905),
 ('다', 5351),
 ('는', 4918),
 ('어', 4773),
 ('에', 4315),
 ('가', 3626),
 ('을', 3169),
 ('은', 2794),
 ('아', 2587)]

- Consonant phoneme duration: 25–50 msec
- Vowel phoneme duration: 50–100+ msec






