# Analysis of tokens and phoneme labels

Data sources:
- Audio: CommonVoice (Punjabi & Hindi), retrieved from https://commonvoice.mozilla.org/en/datasets
- TextGrids (grapheme-to-phoneme + forced alignment): VoxCommunis (Punjabi & Hindi), retrieved from https://osf.io/t957v/files/osfstorage

In [1]:
import ATDS
import pandas as pd

from pathlib import Path

# Frame-level timing table: one row per emitted code, tagged with the language
# taken from the parquet file name (the text before the first "_" / "-").
embeddings_dir = Path("/workspace/data/artefacts/ATDS/embeddings_xlsr-128/indic-CV/")
timing_frames = []
for parquet_path in sorted(embeddings_dir.glob("*embeddings.parquet")):
    timing_frame = pd.read_parquet(parquet_path, columns=["wav_file", "code_start_time", "code_end_time"])
    timing_frames.append(timing_frame.assign(lang=parquet_path.name.split("_")[0].split("-")[0]))
code_times_df = pd.concat(timing_frames)

# Cluster assignments live in a parallel directory; files are read in the same
# sorted order so that rows line up with the timing table above.
clustered_dir = Path("/workspace/data/artefacts/ATDS/embeddings_xlsr-128_clustered/indic-CV/")
cluster_frames = [
    pd.read_parquet(parquet_path, columns=["cluster_id"])
    for parquet_path in sorted(clustered_dir.glob("*embeddings-clustered.parquet"))
]
codes_df = pd.concat(cluster_frames)

# Column-wise join of timings and cluster ids
code_times_df = pd.concat([code_times_df, codes_df], axis=1)

# Each cluster id is represented by the single character chr(cluster_id + char_offset),
# so cluster 0 maps to U+0022.
# NOTE(review): presumably the offset skips the leading control/space code points — confirm.
char_offset = 34
code_times_df["cluster_char"] = list(map(chr, code_times_df.cluster_id + char_offset))

code_times_df

Unnamed: 0,wav_file,code_start_time,code_end_time,lang,cluster_id,cluster_char
0,common_voice_hi_25983327.wav,1.425657,1.445737,hindi,90,|
1,common_voice_hi_25983327.wav,1.445737,1.465817,hindi,433,Ǔ
2,common_voice_hi_25983327.wav,1.465817,1.485896,hindi,366,Ɛ
3,common_voice_hi_25983327.wav,1.485896,1.505976,hindi,84,v
4,common_voice_hi_25983327.wav,1.505976,1.526056,hindi,121,
...,...,...,...,...,...,...
436553,common_voice_pa-IN_23355474.wav,2.632627,2.652723,punjabi,361,Ƌ
436554,common_voice_pa-IN_23355474.wav,2.652723,2.672819,punjabi,46,P
436555,common_voice_pa-IN_23355474.wav,2.672819,2.692916,punjabi,361,Ƌ
436556,common_voice_pa-IN_23355474.wav,2.692916,2.713012,punjabi,361,Ƌ


In [2]:
# Collapse the frame-level code table into an utterance-level table.
# NOTE(review): ATDS.make_all_utts_df is defined outside this notebook; judging by the
# displayed result it returns one row per wav_file with the codes joined into a single
# `cluster_char` string — confirm against the ATDS module.
all_utts_df = ATDS.make_all_utts_df(code_times_df)

all_utts_df

Unnamed: 0,lang,wav_file,cluster_char
0,hindi,common_voice_hi_23795238.wav,^ÛƐǺľęŜtƅħOǹp#ǀƺĴÈőƖǭ<stśȌħj­ƻǂŵłƖǭ8ÕÜƓ*®ţƵ...
1,hindi,common_voice_hi_23795241.wav,àYǮ£ĮǠŁġƅŪǜOÀįŋƵ;Ƶŀ{ǷKQſſƩÌSŇýǴhŠȀ±ǤŹř{Ƿnĸ©Ǧ...
2,hindi,common_voice_hi_23795246.wav,Yī£ęġǩÀ(ĚÈÓŠƠ±ǤÌgȅĂÜƝ®Ńȅǁ#ƢtƉǌVõĚǗ.¾Ǿ´żǁǪǡdįŭ...
3,hindi,common_voice_hi_23795248.wav,ŽȐġÃŪǨ3ǝǅşºōĿÝƃ\ģŌĊƺȅǁƢȌĶ ǗÓŠŸǪȎſƩSŇǴšª...
4,hindi,common_voice_hi_23795250.wav,YǮĮǠŁġtƅśħOÀįÀŐƺƃƨęïǤÌæǪǡƩƩř{ǷKQſŹgÇǴčǭŠǾ...
...,...,...,...
6926,punjabi,common_voice_pa-IN_37044799.wav,ƚYǮĮŁ(¶ŰůBƼǭÂĆƫƇūîŚ·ÜƫƇãi»ĭcƍŉňþÙƾĿŰůÈTǭ¼íǩÀ...
6927,punjabi,common_voice_pa-IN_37044801.wav,YǮĮűƀRĆǻƫŤƇi»ƼŉÂĆƝãBØƼcĵ8íƑƾ¦z{ǐǷKĀŹȂhǥĵƣƕKǳǍ...
6928,punjabi,common_voice_pa-IN_37044803.wav,ǓĢǻN{Ƿǉ&ÍƥǇƂǎ½(ȅ#ƢǡƩïřľy?ǠµŁt@ħOjõpý9ćSz...
6929,punjabi,common_voice_pa-IN_37044804.wav,ǓľƨǅæÕƫƇĴƨćǤţƿƎäǎƢ@ħÀðƃǟ[ƍǶǀĊiZǥƣǣķŏƝãi»ǥ...


In [3]:
import sentencepiece as spm

# A frozen, pre-trained sub-word model is loaded (rather than training a new one)
# because piece ids are assigned arbitrarily at training time; re-using the same
# model keeps the ids stable from one run of this analysis to the next.
pa_spm = spm.SentencePieceProcessor(model_file='/workspace/data/artefacts/ATDS/punjabi-CV_10k-sentencepiece.model')


def _encode_as_pieces(code_string):
    """Tokenise one utterance's code string into sub-word piece strings."""
    return pa_spm.encode(code_string, out_type=str)


def _encode_as_piece_ids(code_string):
    """Tokenise one utterance's code string into integer piece ids."""
    return pa_spm.encode(code_string, out_type=int)


all_utts_df["utt_pieces"] = all_utts_df.cluster_char.apply(_encode_as_pieces)
all_utts_df["utt_piece_ids"] = all_utts_df.cluster_char.apply(_encode_as_piece_ids)

all_utts_df

Unnamed: 0,lang,wav_file,cluster_char,utt_pieces,utt_piece_ids
0,hindi,common_voice_hi_23795238.wav,^ÛƐǺľęŜtƅħOǹp#ǀƺĴÈőƖǭ<stśȌħj­ƻǂŵłƖǭ8ÕÜƓ*®ţƵ...,"[▁^, ÛƐ, , Ǻ, ľ, , ęŜt, ƅ, , ħOǹp, #, ǀ, ƺĴ...","[365, 1088, 71, 79, 189, 9, 1612, 430, 1, 3787..."
1,hindi,common_voice_hi_23795241.wav,àYǮ£ĮǠŁġƅŪǜOÀįŋƵ;Ƶŀ{ǷKQſſƩÌSŇýǴhŠȀ±ǤŹř{Ƿnĸ©Ǧ...,"[▁àYǮ, £, ĮǠŁġƅŪ, ǜO, Àį, ŋ, Ƶ, ;, Ƶ, ŀ, {, Ƿ,...","[1198, 127, 3580, 3774, 2699, 62, 145, 40, 145..."
2,hindi,common_voice_hi_23795246.wav,Yī£ęġǩÀ(ĚÈÓŠƠ±ǤÌgȅĂÜƝ®Ńȅǁ#ƢtƉǌVõĚǗ.¾Ǿ´żǁǪǡdįŭ...,"[▁Yī, £, ęġ, ǩÀ, (, Ě, ÈÓŠ, Ơ, ±, Ǥ, Ìgȅ, , Ă...","[537, 127, 3221, 705, 74, 201, 2527, 170, 20, ..."
3,hindi,common_voice_hi_23795248.wav,ŽȐġÃŪǨ3ǝǅşºōĿÝƃ\ģŌĊƺȅǁƢȌĶ ǗÓŠŸǪȎſƩSŇǴšª...,"[▁ŽȐ, ġÃŪ, , Ǩ, 3, ǝ, ǅ, şº, ōĿ, , Ý, ƃ, \,...","[2734, 5160, 71, 420, 64, 130, 149, 462, 1211,..."
4,hindi,common_voice_hi_23795250.wav,YǮĮǠŁġtƅśħOÀįÀŐƺƃƨęïǤÌæǪǡƩƩř{ǷKQſŹgÇǴčǭŠǾ...,"[▁YǮ, ĮǠŁġ, t, ƅś, ħO, Àį, À, Ő, ƺ, ƃ, , ƨ, ...","[410, 4012, 1079, 2895, 1199, 2699, 385, 309, ..."
...,...,...,...,...,...
6926,punjabi,common_voice_pa-IN_37044799.wav,ƚYǮĮŁ(¶ŰůBƼǭÂĆƫƇūîŚ·ÜƫƇãi»ĭcƍŉňþÙƾĿŰůÈTǭ¼íǩÀ...,"[▁ƚYǮ, ĮŁ, (, ¶, ŰůBƼ, ǭ, ÂĆƫ, ƇūîŚ, ·, ÜƫƇ, ...","[4061, 7688, 232, 54, 5902, 231, 4107, 5660, 2..."
6927,punjabi,common_voice_pa-IN_37044801.wav,YǮĮűƀRĆǻƫŤƇi»ƼŉÂĆƝãBØƼcĵ8íƑƾ¦z{ǐǷKĀŹȂhǥĵƣƕKǳǍ...,"[▁YǮĮű, ƀR, ĆǻƫŤ, Ƈi, », ƼŉÂĆƝã, BØ, Ƽcĵ, 8, ...","[6921, 829, 6943, 1185, 4, 8616, 1067, 7382, 2..."
6928,punjabi,common_voice_pa-IN_37044803.wav,ǓĢǻN{Ƿǉ&ÍƥǇƂǎ½(ȅ#ƢǡƩïřľy?ǠµŁt@ħOjõpý9ćSz...,"[▁, ǓĢǻN, {, Ƿ, ǉ, &, ÍƥǇƂǎ, ½(, ȅ, #, Ƣ, ǡ...","[105, 4104, 42, 182, 323, 3968, 9304, 944, 250..."
6929,punjabi,common_voice_pa-IN_37044804.wav,ǓľƨǅæÕƫƇĴƨćǤţƿƎäǎƢ@ħÀðƃǟ[ƍǶǀĊiZǥƣǣķŏƝãi»ǥ...,"[▁Ǔ, , ľƨǅ, , æÕ, , ƫƇĴ, , ƨ, ć, Ǥ, ţ, ƿƎä...","[521, 94, 2599, 59, 1853, 48, 6123, 72, 283, 3..."


In [53]:
from tqdm import tqdm
from itertools import chain
from pympi.Praat import TextGrid

# Pseudo-label every VoxCommunis TextGrid with three additional tiers and write a copy:
#   "code"       - one interval per frame-level cluster id
#   "dedup-code" - runs of identical consecutive cluster ids merged into one interval
#   "piece"      - one interval per sub-word token occurrence, labelled
#                  "<piece id> (<cluster ids joined by '-'>)"

# Make sure the destination exists so that the first write does not fail.
Path("/workspace/data/artefacts/ATDS/VoxCommunis_pseudolabeled").mkdir(parents=True, exist_ok=True)

# Positional row numbers of each wav file, computed once instead of scanning the
# whole frame-level table again for every utterance.
code_rows_by_wav = code_times_df.groupby("wav_file", sort=False).indices

for _, file_data in tqdm(all_utts_df.iterrows(), total=len(all_utts_df)):

    tg_folder = Path(f"/workspace/data/VoxCommunis/{file_data.lang}_textgrids")
    tg_file = (tg_folder / file_data.wav_file).with_suffix(".TextGrid")

    if not tg_file.exists():
        # Some files in CV don't actually exist in VoxCommunis for some reason
        continue

    tg_data = TextGrid(tg_file)

    file_times = code_times_df.iloc[code_rows_by_wav.get(file_data.wav_file, [])].reset_index(drop=True)

    # Append code tier (i.e. at wav2vec 49 Hz emission rate)
    code_tier = tg_data.add_tier("code")

    for j, r in file_times.iterrows():
        code_tier.add_interval(r.code_start_time, r.code_end_time, str(int(r.cluster_id)))

    # A new run starts wherever the cluster id differs from the previous frame
    code_consecutives = file_times['cluster_id'].diff().ne(0).cumsum().rename('dedup_cluster_number')

    # Append de-duplicated code tier: each run spans from its first frame's start
    # to its last frame's end
    dedup_code_df = file_times[["code_start_time", "code_end_time", "cluster_id"]] \
        .groupby(code_consecutives) \
        .agg(
            code_start_time=("code_start_time", "min"),
            code_end_time=("code_end_time", "max"),
            cluster_id=("cluster_id", "first"),
        )

    dedup_code_tier = tg_data.add_tier("dedup-code")

    for k, r in dedup_code_df.iterrows():
        dedup_code_tier.add_interval(r.code_start_time, r.code_end_time, str(int(r.cluster_id)))

    # Add token tier. The current row already carries this utterance's piece ids.
    file_pieces = file_data.utt_piece_ids

    # Map every piece back to the cluster ids it is made of (inverse of the
    # chr(cluster_id + char_offset) encoding); the '▁' word-boundary marker is not a code.
    pieces_expanded = [ [ ord(p) - char_offset for p in pa_spm.id_to_piece(i) if p != '▁' ] for i in file_pieces ]

    # One entry per de-duplicated code. `piece_number` is the token's position in the
    # utterance, so that two consecutive occurrences of the same piece id stay
    # separate intervals instead of being merged into one.
    piece_number = []
    piece_int = []
    piece_str = []

    for piece_idx, (p_id, p_comps) in enumerate(zip(file_pieces, pieces_expanded)):
        label = f"{p_id} ({'-'.join(str(c) for c in p_comps)})"
        piece_number.extend([ piece_idx ] * len(p_comps))
        piece_int.extend([ p_id ] * len(p_comps))
        piece_str.extend([ label ] * len(p_comps))

    if len(piece_int) != len(dedup_code_df):
        raise ValueError(
            f"{file_data.wav_file}: token pieces expand to {len(piece_int)} codes "
            f"but the de-duplicated code tier has {len(dedup_code_df)} intervals"
        )

    dedup_code_df["piece_number"] = piece_number
    dedup_code_df["piece_int"] = piece_int
    dedup_code_df["piece_str"] = piece_str

    dedup_piece_df = dedup_code_df[["code_start_time", "code_end_time", "piece_str", "piece_number"]] \
        .groupby("piece_number") \
        .agg(
            code_start_time=("code_start_time", "min"),
            code_end_time=("code_end_time", "max"),
            piece_str=("piece_str", "first"),
        )

    dedup_piece_tier = tg_data.add_tier("piece")

    for _, r in dedup_piece_df.iterrows():
        dedup_piece_tier.add_interval(r.code_start_time, r.code_end_time, r.piece_str)

    # Write out new file
    out_file = f"/workspace/data/artefacts/ATDS/VoxCommunis_pseudolabeled/{file_data.wav_file}".replace(".wav", ".TextGrid")
    tg_data.to_file(out_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6931/6931 [04:07<00:00, 27.95it/s]


Read newly-written TextGrids back in

In [54]:
# Read every pseudo-labelled TextGrid back in and pair each token ("piece") interval
# with the phone interval that contains the token's temporal midpoint.
tgs_for_analysis = sorted(Path("/workspace/data/artefacts/ATDS/VoxCommunis_pseudolabeled/").glob("*.TextGrid"))

all_tg_dfs = []

for tg_file in tqdm(tgs_for_analysis):
    tg_data = TextGrid(tg_file)

    # First tier whose name contains 'phones'; intervals with blank labels are dropped
    phones_tier = [ t for t in tg_data.get_tiers() if 'phones' in t.name ][0]
    phones_df = pd.DataFrame([ p for p in phones_tier.get_intervals() if p[2].strip() != '' ], columns=["phone_start_s", "phone_end_s", "phone"])

    pieces_df = pd.DataFrame([ p for p in tg_data.get_tier('piece').get_intervals() if p[2].strip() != '' ], columns=["piece_start_s", "piece_end_s", "piece_str"])
    # Labels look like "<piece id> (<cluster ids>)"; keep the leading id as an integer
    pieces_df["piece_int"] = [ int(p.split(" (")[0]) for p in pieces_df.piece_str ]

    # Every phone x piece combination, filtered down to the pairs where the piece's
    # midpoint falls inside the phone. The phone interval is half-open
    # [phone_start_s, phone_end_s) so that a midpoint lying exactly on the boundary
    # between two adjacent phones is counted once, not for both phones.
    merged_df = phones_df.merge(pieces_df, how='cross') \
        .assign(piece_mid_s = lambda df: df.piece_start_s + (df.piece_end_s - df.piece_start_s) / 2) \
        .query("piece_mid_s >= phone_start_s and piece_mid_s < phone_end_s") \
        .drop(columns="piece_mid_s") \
        .sort_values("phone_start_s")

    merged_df["tg_file"] = tg_file.name

    all_tg_dfs.append(merged_df)

all_tg_df = pd.concat(all_tg_dfs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4353/4353 [00:21<00:00, 199.86it/s]


In [55]:
# Language code is read off the TextGrid file name ("hi" or "pa-IN")
tg_lang_codes = all_tg_df.tg_file.str.extract("(hi|pa-IN)", expand=False)
all_tg_df['lang'] = tg_lang_codes

In [56]:
# Show the combined phone/token alignment table (one row per matched phone-piece pair)
all_tg_df

Unnamed: 0,phone_start_s,phone_end_s,phone,piece_start_s,piece_end_s,piece_str,piece_int,tg_file,lang
58,1.37,1.41,u,1.365084,1.385159,365 (60),365,common_voice_hi_23795238.TextGrid,hi
59,1.37,1.41,u,1.385159,1.425308,1088 (185-366),1088,common_voice_hi_23795238.TextGrid,hi
118,1.41,1.50,m,1.425308,1.445383,71 (121),71,common_voice_hi_23795238.TextGrid,hi
119,1.41,1.50,m,1.445383,1.465458,79 (472),79,common_voice_hi_23795238.TextGrid,hi
120,1.41,1.50,m,1.465458,1.485533,189 (284),189,common_voice_hi_23795238.TextGrid,hi
...,...,...,...,...,...,...,...,...,...
744,2.70,2.75,p,2.705838,2.745924,955 (318-382),955,common_voice_pa-IN_37029497.TextGrid,pa-IN
792,2.75,2.79,ə,2.745924,2.765968,128 (337),128,common_voice_pa-IN_37029497.TextGrid,pa-IN
793,2.75,2.79,ə,2.765968,2.786011,93 (146),93,common_voice_pa-IN_37029497.TextGrid,pa-IN
841,2.79,2.83,ɾ,2.786011,2.846141,2858 (383-346-432),2858,common_voice_pa-IN_37029497.TextGrid,pa-IN


In [57]:
# Persist the row-level phone/token alignments (index column omitted)
alignments_csv = "/workspace/data/artefacts/ATDS/VoxCommunis_token-phonemes-alignments.csv"
all_tg_df.to_csv(alignments_csv, index=False)

Calculate conditional probabilities

In [58]:
# Number of times each phone co-occurs with each token, per language
phone_piece_counts = all_tg_df \
    .groupby(['lang', 'piece_int', 'piece_str', 'phone']) \
    .size() \
    .to_frame("count_phone_and_piece") \
    .reset_index()

# Total occurrences of each token within a language, broadcast back onto every row
# of that token (vectorised; avoids building and concatenating one frame per token)
piece_totals = phone_piece_counts \
    .groupby(['lang', 'piece_int', 'piece_str'])["count_phone_and_piece"] \
    .transform("sum")

# Share of a token's occurrences that coincide with each phone, listed
# most-probable-first within every (lang, piece) group
probs_df = phone_piece_counts \
    .assign(prob_phone_and_piece_given_piece = phone_piece_counts.count_phone_and_piece / piece_totals) \
    .sort_values(['lang', 'piece_int', 'prob_phone_and_piece_given_piece'], ascending=[True, True, False])

probs_df

Unnamed: 0,lang,piece_int,piece_str,phone,count_phone_and_piece,prob_phone_and_piece_given_piece
1,hi,1,1 (104),aː,412,0.439232
20,hi,1,1 (104),r,151,0.160981
15,hi,1,1 (104),n,68,0.072495
37,hi,1,1 (104),ɦ,48,0.051173
13,hi,1,1 (104),l,36,0.038380
...,...,...,...,...,...,...
59046,pa-IN,9996,9996 (409-136-216-337),ə,1,1.000000
59047,pa-IN,9997,9997 (488-228-455),ə,2,1.000000
59048,pa-IN,9998,9998 (136-249-402-228),s,2,1.000000
59049,pa-IN,9999,9999 (337-300-383-346-219),ɾ,1,1.000000


What are the phones most commonly associated with piece 1 in both Punjabi and Hindi? Looks like mainly low vowels aː and ɑ

In [62]:
# Three highest-probability phones for piece 1 in each language
# (probs_df is already ordered by descending probability within each piece)
piece_1_probs = probs_df.query("piece_int==1")
piece_1_probs.groupby('lang').head(3)

Unnamed: 0,lang,piece_int,piece_str,phone,count_phone_and_piece,prob_phone_and_piece_given_piece
1,hi,1,1 (104),aː,412,0.439232
20,hi,1,1 (104),r,151,0.160981
15,hi,1,1 (104),n,68,0.072495
34621,pa-IN,1,1 (104),ɑ,401,0.687822
34625,pa-IN,1,1 (104),ə,112,0.19211
34626,pa-IN,1,1 (104),ə̃,17,0.02916


What are the phones most commonly associated with piece 2 in both Punjabi and Hindi? Looks like mainly back rounded vowels (o, u, uː, ʊ)...

In [61]:
# Three highest-probability phones for piece 2 in each language
# (probs_df is already ordered by descending probability within each piece)
piece_2_probs = probs_df.query("piece_int==2")
piece_2_probs.groupby('lang').head(3)

Unnamed: 0,lang,piece_int,piece_str,phone,count_phone_and_piece,prob_phone_and_piece_given_piece
53,hi,2,2 (100),o,216,0.55102
58,hi,2,2 (100),uː,53,0.135204
57,hi,2,2 (100),u,29,0.07398
34637,pa-IN,2,2 (100),o,238,0.563981
34640,pa-IN,2,2 (100),u,76,0.180095
34647,pa-IN,2,2 (100),ʊ,37,0.087678


In [63]:
# Persist the per-token phone probabilities (index column omitted)
aggregated_csv = "/workspace/data/artefacts/ATDS/VoxCommunis_token-phonemes-aggregated.csv"
probs_df.to_csv(aggregated_csv, index=False)