In [2]:
import os
import pandas as pd

In [5]:
mcv_path = os.getcwd()
train = os.path.join(mcv_path, 'train.tsv')
dev = os.path.join(mcv_path, 'dev.tsv')
test = os.path.join(mcv_path, 'test.tsv')

In [6]:
train_df = pd.read_csv(train, sep='\t')
train_df['partition'] = 'train'

dev_df = pd.read_csv(dev, sep='\t')
dev_df['partition'] = 'dev'

test_df = pd.read_csv(test, sep='\t')
test_df['partition'] = 'test'

all_df = pd.concat([train_df, dev_df, test_df])

### Average utterances per speaker

In [7]:
all_utts = len(train_df)
speakers = len(train_df['client_id'].unique())
accent_utts = len(train_df[~train_df['accent'].isnull()])
accent_speakers = len(train_df[~train_df['accent'].isnull()]['client_id'].unique())

print(f"""Total train utts: {all_utts}
Utts with accent labels: {accent_utts}

Unique speakers: {speakers}
Speakers with accent labels: {accent_speakers}""")

Total train utts: 232975
Utts with accent labels: 135391

Unique speakers: 10013
Speakers with accent labels: 3220


In [72]:
all_df.groupby('partition').agg(
    {'client_id': lambda x:len(x) / len(x.unique())}
)

Unnamed: 0_level_0,client_id
partition,Unnamed: 1_level_1
dev,3.551566
test,1.639675
train,23.267253


In [78]:
dev_df['client_id'].value_counts().value_counts().sort_index(ascending=False)

5     871
4    2146
3     460
2     316
1     580
Name: client_id, dtype: int64

In [79]:
test_df['client_id'].value_counts().value_counts().sort_index(ascending=False)

3    1546
2    2967
1    4959
Name: client_id, dtype: int64

Many speakers in train set with lots of utterances.

In [99]:
(train_df['client_id'].value_counts() > 100).value_counts()

False    9654
True      359
Name: client_id, dtype: int64

In [74]:
train_df['client_id'].value_counts().value_counts().sort_index(ascending=False).head()

6441    1
4234    1
2772    1
2693    1
2681    1
Name: client_id, dtype: int64

In [86]:
train_df['client_id'].value_counts().head()

ab72e9ab22713aec03a3189202a0713e56016ea07569b3041fcd65d77c9eb3f8c692f7ccaec75c16bbc62476d528f434d829193994cf07fcde9e3b1e4a2fa93a    6441
7963691c43c8cc498c58f117527522bf772c76c38530570bc55ef04834f67fb7a9227bd0fa1f13e64e8de1cde6594f3501e172ab86559697c08726cac26f4c6f    4234
7eff9a54bdb0619deffda7609d5b8565278e3328de99e63908be2a37c1ad16516240b9c92bc6b68d8130b20dc556f57005dc053f4874a49589f5971a31b97e98    2772
e6dbbe39377aa0fe2851852b2c695c9c31094c80c9ccd244bac8dfdc8a523b2a114362d4c717bd9d628499fe1d3c11971221be5ec4dadce15c9ff8c5254ae368    2693
29b8505586cd43382cd695da6b943f401104be710a5b60e814ac5fe7e06b39459cf8fe1701ca83f8154b3ccd749df7c2aef33ff23950bb1a135b1e1c393dbcf6    2681
Name: client_id, dtype: int64

In [93]:
small_train = train_df.groupby('client_id').apply(lambda x: x.sample(min(len(x), 100))).reset_index(drop=True)

In [94]:
small_train[['accent', 'partition']].pivot_table(
    index='accent', columns='partition',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

partition,train,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1
All,76429,76429
us,39701,39701
england,11672,11672
indian,8641,8641
canada,5598,5598
australia,4697,4697
african,1192,1192
ireland,1084,1084
scotland,987,987
newzealand,969,969


In [109]:
small_utts = len(small_train)
small_speakers = len(small_train['client_id'].unique())
small_accent_utts = len(small_train[~small_train['accent'].isnull()])
small_accent_speakers = len(small_train[~small_train['accent'].isnull()]['client_id'].unique())

print(f"""Total train utts: {small_utts}
Utts with accent labels: {small_accent_utts}

Unique speakers: {small_speakers}
Speakers with accent labels: {small_accent_speakers}""")

Total train utts: 152892
Utts with accent labels: 76429

Unique speakers: 10013
Speakers with accent labels: 3220


In [108]:
small_train.groupby('accent').agg(
    {'client_id': lambda x:len(x) / len(x.unique())}
)

Unnamed: 0_level_0,client_id
accent,Unnamed: 1_level_1
african,24.326531
australia,29.727848
bermuda,7.6
canada,26.784689
england,23.437751
hongkong,8.111111
indian,22.328165
ireland,27.1
malaysia,14.434783
newzealand,27.685714


### Accent utterance counts per partition

In [60]:
all_df[['accent', 'partition']].pivot_table(
    index='accent', columns='partition',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

partition,dev,test,train,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,2100,1398,135391,138889
us,1048,669,72392,74109
england,304,153,19204,19661
australia,85,28,12208,12321
indian,324,314,11395,12033
canada,114,63,9150,9327
scotland,7,14,3686,3707
newzealand,31,13,1949,1993
ireland,33,18,1866,1917
african,40,26,1326,1392


### Accent and gender counts per partition

In [42]:
train_df[['accent', 'gender']].pivot_table(
    index='accent', columns='gender',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

gender,female,male,other,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,29444,97992,6340,133776
us,16593,53118,1238,70949
england,3346,14978,811,19135
australia,942,6953,4269,12164
indian,1609,9776,0,11385
canada,2588,6545,5,9138
scotland,2950,736,0,3686
newzealand,261,1658,0,1919
ireland,562,1294,10,1866
african,238,1088,0,1326


In [44]:
dev_df[['accent', 'gender']].pivot_table(
    index='accent', columns='gender',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

gender,female,male,other,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,414,1648,7,2069
us,246,783,4,1033
indian,9,311,0,320
england,43,254,3,300
canada,36,74,0,110
australia,21,64,0,85
african,5,35,0,40
newzealand,8,23,0,31
wales,14,17,0,31
ireland,4,25,0,29


In [45]:
test_df[['accent', 'gender']].pivot_table(
    index='accent', columns='gender',
    aggfunc=len, fill_value=0, margins=True
).sort_values('All', ascending=False)

gender,female,male,other,All
accent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
All,227,1141,18,1386
us,150,509,5,664
indian,18,294,0,312
england,18,130,4,152
canada,7,49,6,62
other,2,37,1,40
african,5,21,0,26
australia,8,17,0,25
ireland,3,13,2,18
philippines,3,12,0,15


### Transcriptions

In [123]:
import string

train_split_text = train_df['sentence'].str.lower().apply(
    lambda x: x.translate(str.maketrans('', '', string.punctuation))
).str.split(' ')

In [137]:
tokens = train_split_text.agg(len).sum()
types = set()
train_split_text.apply(lambda x: types.update(x))
print(f"""Total tokens: {tokens}
Total types: {len(types)}""")

Total tokens: 2379928
Total types: 124677


In [138]:
# overlap with dev/test

### Audio metrics

In [139]:
# avg utterance length
# summed per speaker
# per accent
# per partition