In [27]:
import os
import pandas as pd

In [28]:
mcv_dir = '/mnt/data/mozilla_common_voice/en_1488h_2019-12-10'
train = os.path.join(mcv_dir, 'train.tsv')
dev = os.path.join(mcv_dir, 'dev.tsv')
test = os.path.join(mcv_dir, 'test.tsv')

In [29]:
train_df = pd.read_csv(train, sep='\t')
train_df['partition'] = 'train'

dev_df = pd.read_csv(dev, sep='\t')
dev_df['partition'] = 'dev'

test_df = pd.read_csv(test, sep='\t')
test_df['partition'] = 'test'

all_df = pd.concat([train_df, dev_df, test_df])

### Average utterances per speaker

In [30]:
all_utts = len(train_df)
speakers = len(train_df['client_id'].unique())
accent_utts = len(train_df[~train_df['accent'].isnull()])
accent_speakers = len(train_df[~train_df['accent'].isnull()]['client_id'].unique())

print(f"""Total train utts: {all_utts}
Utts with accent labels: {accent_utts}

Unique speakers: {speakers}
Speakers with accent labels: {accent_speakers}""")

Total train utts: 232975
Utts with accent labels: 135391

Unique speakers: 10013
Speakers with accent labels: 3220


In [31]:
all_df.groupby('partition').agg(
    {'client_id': lambda x:len(x) / len(x.unique())}
)

Unnamed: 0_level_0,client_id
partition,Unnamed: 1_level_1
dev,3.551566
test,1.639675
train,23.267253


In [32]:
dev_df['client_id'].value_counts().value_counts().sort_index(ascending=False)

5     871
4    2146
3     460
2     316
1     580
Name: client_id, dtype: int64

In [33]:
test_df['client_id'].value_counts().value_counts().sort_index(ascending=False)

3    1546
2    2967
1    4959
Name: client_id, dtype: int64

Many speakers in train set with lots of utterances.

In [34]:
(train_df['client_id'].value_counts() > 100).value_counts()

False    9654
True      359
Name: client_id, dtype: int64

In [35]:
train_df['client_id'].value_counts().value_counts().sort_index(ascending=False).head()

6441    1
4234    1
2772    1
2693    1
2681    1
Name: client_id, dtype: int64

In [36]:
train_df['client_id'].value_counts().head()

ab72e9ab22713aec03a3189202a0713e56016ea07569b3041fcd65d77c9eb3f8c692f7ccaec75c16bbc62476d528f434d829193994cf07fcde9e3b1e4a2fa93a    6441
7963691c43c8cc498c58f117527522bf772c76c38530570bc55ef04834f67fb7a9227bd0fa1f13e64e8de1cde6594f3501e172ab86559697c08726cac26f4c6f    4234
7eff9a54bdb0619deffda7609d5b8565278e3328de99e63908be2a37c1ad16516240b9c92bc6b68d8130b20dc556f57005dc053f4874a49589f5971a31b97e98    2772
e6dbbe39377aa0fe2851852b2c695c9c31094c80c9ccd244bac8dfdc8a523b2a114362d4c717bd9d628499fe1d3c11971221be5ec4dadce15c9ff8c5254ae368    2693
29b8505586cd43382cd695da6b943f401104be710a5b60e814ac5fe7e06b39459cf8fe1701ca83f8154b3ccd749df7c2aef33ff23950bb1a135b1e1c393dbcf6    2681
Name: client_id, dtype: int64

In [37]:
small_train = train_df.groupby('client_id').apply(lambda x: x.sample(min(len(x), 100))).reset_index(drop=True)

In [38]:
small_train[['accent', 'partition']].pivot_table(
    index='accent', columns='partition',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

partition,train,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1
All,76429,76429
us,39701,39701
england,11672,11672
indian,8641,8641
canada,5598,5598
australia,4697,4697
african,1192,1192
ireland,1084,1084
scotland,987,987
newzealand,969,969


In [39]:
small_utts = len(small_train)
small_speakers = len(small_train['client_id'].unique())
small_accent_utts = len(small_train[~small_train['accent'].isnull()])
small_accent_speakers = len(small_train[~small_train['accent'].isnull()]['client_id'].unique())

print(f"""Total train utts: {small_utts}
Utts with accent labels: {small_accent_utts}

Unique speakers: {small_speakers}
Speakers with accent labels: {small_accent_speakers}""")

Total train utts: 152892
Utts with accent labels: 76429

Unique speakers: 10013
Speakers with accent labels: 3220


In [40]:
small_train.groupby('accent').agg(
    {'client_id': lambda x:len(x) / len(x.unique())}
)

Unnamed: 0_level_0,client_id
accent,Unnamed: 1_level_1
african,24.326531
australia,29.727848
bermuda,7.6
canada,26.784689
england,23.437751
hongkong,8.111111
indian,22.328165
ireland,27.1
malaysia,14.434783
newzealand,27.685714


### Accent utterance counts per partition

In [136]:
len(train_df[~train_df['accent'].isnull()]) / len(train_df)

0.5811396072539972

In [138]:
all_df[['accent', 'partition']].pivot_table(
    index='accent', columns='partition',
    aggfunc=len, fill_value=0, margins=True,
).sort_values('All', ascending=False)

partition,dev,test,train,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,2100,1398,135391,138889
us,1048,669,72392,74109
england,304,153,19204,19661
australia,85,28,12208,12321
indian,324,314,11395,12033
canada,114,63,9150,9327
scotland,7,14,3686,3707
newzealand,31,13,1949,1993
ireland,33,18,1866,1917
african,40,26,1326,1392


### Accent and gender counts per partition

In [42]:
train_df[['accent', 'gender']].pivot_table(
    index='accent', columns='gender',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

gender,female,male,other,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,29444,97992,6340,133776
us,16593,53118,1238,70949
england,3346,14978,811,19135
australia,942,6953,4269,12164
indian,1609,9776,0,11385
canada,2588,6545,5,9138
scotland,2950,736,0,3686
newzealand,261,1658,0,1919
ireland,562,1294,10,1866
african,238,1088,0,1326


In [43]:
dev_df[['accent', 'gender']].pivot_table(
    index='accent', columns='gender',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

gender,female,male,other,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,414,1648,7,2069
us,246,783,4,1033
indian,9,311,0,320
england,43,254,3,300
canada,36,74,0,110
australia,21,64,0,85
african,5,35,0,40
newzealand,8,23,0,31
wales,14,17,0,31
ireland,4,25,0,29


In [44]:
test_df[['accent', 'gender']].pivot_table(
    index='accent', columns='gender',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

gender,female,male,other,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,227,1141,18,1386
us,150,509,5,664
indian,18,294,0,312
england,18,130,4,152
canada,7,49,6,62
other,2,37,1,40
african,5,21,0,26
australia,8,17,0,25
ireland,3,13,2,18
philippines,3,12,0,15


### Transcriptions

In [45]:
import string

train_split_text = train_df['sentence'].str.lower().apply(
    lambda x: x.translate(str.maketrans('', '', string.punctuation))
).str.split(' ')

In [46]:
tokens = train_split_text.agg(len).sum()
types = set()
train_split_text.apply(lambda x: types.update(x))
print(f"""Total tokens: {tokens}
Total types: {len(types)}""")

Total tokens: 2379928
Total types: 124677


In [47]:
# overlap with dev/test

### Audio metrics

In [50]:

audio_dir = os.path.join(mcv_dir, 'clips')

# avg utterance length
# summed per speaker
# per accent
# per partition

In [54]:
import glob

audio_files = glob.glob(f"{audio_dir}/*")

In [78]:
all_df['path'].apply(lambda x: os.path.exists(os.path.join(audio_dir, x))).sum() == len(all_df)

True

In [98]:
import sys
from mutagen.mp3 import MP3

def get_mp3_dur(f, audio_dir=None):
    """Return duration in seconds of mp3 file"""
    if audio_dir is not None:
        f = os.path.join(audio_dir, f)
    try:
        a = MP3(f)
        return a.info.length
    except Exception as e:
        return repr(e)

In [65]:
train_df['path'].head().apply(lambda x: get_mp3_dur(x, audio_dir)).sum()

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 17.2 µs


27.456

In [99]:
%%time
x = test_df['path'].head()
x[0] = 'blah.mp3'
y = x.apply(lambda x: get_mp3_dur(x, audio_dir))

CPU times: user 2.33 ms, sys: 275 µs, total: 2.6 ms
Wall time: 3.87 ms


In [102]:
%%time
all_df['audio_dur'] = all_df['path'].apply(lambda x: get_mp3_dur(x, audio_dir))

CPU times: user 1min 19s, sys: 7.61 s, total: 1min 27s
Wall time: 3min 58s


In [152]:
# seem like empty files
# these are all recordings from a single client_id, consecutive filenames => single session?
# that person has 650 other recordings so no worries just to drop these
all_df[all_df['audio_dur'].map(type) != float]

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,partition,audio_dur
185141,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411965.mp3,They possessed middle and rear back vertebrae ...,2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185142,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411966.mp3,Chairman of the Gateway Group till his death.,2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185143,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411967.mp3,The team played its home games at Fenway Park.,2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185144,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411968.mp3,"This team consists of Fireball, Kuei, Phantasm...",2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185145,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411974.mp3,"Murray tried to restrain the MacDonalds, but t...",2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185146,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411975.mp3,"It may be the remnant of the core of a larger,...",2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185147,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411976.mp3,This enzyme is extremely important in the proc...,2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185148,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411977.mp3,"Josceline's son was Reginald, bishop of Bath.",2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185149,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411978.mp3,Many of the students are either international ...,2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"
185150,dc3db1a53620d85b9664bf89a66c12905ad39c9c4cd2bf...,common_voice_en_19411984.mp3,She told them that there is no reality in signs.,2,0,thirties,other,england,train,"HeaderNotFoundError(""can't sync to MPEG frame"")"


In [124]:
train_df['audio_dur'] = train_df['path'].apply(lambda x: get_mp3_dur(x, audio_dir))
dev_df['audio_dur'] = dev_df['path'].apply(lambda x: get_mp3_dur(x, audio_dir))
test_df['audio_dur'] = test_df['path'].apply(lambda x: get_mp3_dur(x, audio_dir))

In [132]:
print(f"Train data: {train_df[train_df['audio_dur'].map(type) == float]['audio_dur'].sum() / 3600.:.1f}h")
print(f"Dev data: {dev_df[dev_df['audio_dur'].map(type) == float]['audio_dur'].sum() / 3600.:.1f}h")
print(f"Test data: {test_df[test_df['audio_dur'].map(type) == float]['audio_dur'].sum() / 3600.:.1f}h")

Train data: 364.4h
Dev data: 26.1h
Test data: 24.7h


In [142]:
# hours of audio per accent per partition

all_df[all_df['audio_dur'].map(type) == float].pivot_table(
    index='accent', columns='partition', values='audio_dur',
    aggfunc=sum, fill_value=0, margins=True,
) / 3600

partition,dev,test,train,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
african,0.062347,0.0444,2.1264,2.233147
australia,0.14184,0.040553,18.603627,18.78602
bermuda,0.007393,0.01274,0.06532,0.085453
canada,0.1841,0.13018,14.184707,14.498987
england,0.50472,0.239787,28.513049,29.257556
hongkong,0.029373,0.017587,0.11854,0.1655
indian,0.53528,0.499613,18.324633,19.359527
ireland,0.048613,0.026333,2.924753,2.9997
malaysia,0.013693,0.0167,0.529267,0.55966
newzealand,0.054793,0.022213,3.203592,3.280598


In [183]:
import datetime

def pretty_time_agg(series):
    s = int(series.sum())
    m = s // 60
    h = m // 60
    d = h // 24
    h = h % 24
    m = m % 60
    s = s % 60
    if d > 0:
        return "{0}d {1}".format(d, datetime.time(h, m, s).strftime("%Hh %Mm %Ss"))
    else:
        return datetime.time(h, m, s).strftime("%Hh %Mm %Ss")

all_df[all_df['audio_dur'].map(type) == float].pivot_table(
    index='accent', columns='partition', values='audio_dur',
    aggfunc=pretty_time_agg, fill_value=0, margins=True,
)

partition,dev,test,train,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
african,00h 03m 44s,00h 02m 39s,02h 07m 35s,02h 13m 59s
australia,00h 08m 30s,00h 02m 25s,18h 36m 13s,18h 47m 09s
bermuda,00h 00m 26s,00h 00m 45s,00h 03m 55s,00h 05m 07s
canada,00h 11m 02s,00h 07m 48s,14h 11m 04s,14h 29m 56s
england,00h 30m 16s,00h 14m 23s,1d 04h 30m 46s,1d 05h 15m 27s
hongkong,00h 01m 45s,00h 01m 03s,00h 07m 06s,00h 09m 55s
indian,00h 32m 07s,00h 29m 58s,18h 19m 28s,19h 21m 34s
ireland,00h 02m 55s,00h 01m 34s,02h 55m 29s,02h 59m 58s
malaysia,00h 00m 49s,00h 01m 00s,00h 31m 45s,00h 33m 34s
newzealand,00h 03m 17s,00h 01m 19s,03h 12m 12s,03h 16m 50s
