In [1]:
import jiwer
import pandas as pd

from pathlib import Path

List all reference transcript files (generated by fairseq's `examples/speech_recognition/infer.py`):

In [2]:
# Collect the reference transcripts found one directory level below the
# results folder, sorted so the listing order is deterministic.
asr_results_dir = Path("/workspace/data/artefacts/asr-results")
asr_results_refs = sorted(asr_results_dir.glob("*/ref.word-*.txt"))

asr_results_refs[:10]

[PosixPath('/workspace/data/artefacts/asr-results/galician/ref.word-xls-r_cpt-galician-10h_ft-gl-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/galician/ref.word-xls-r_cpt-galician-10h_portuguese-60h_ft-gl-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/galician/ref.word-xls-r_cpt-galician-10h_spanish-60h_ft-gl-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/galician/ref.word-xlsr2_300m_ft-gl-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/iban/ref.word-xls-r_cpt_iban-7h_ft-iba-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/iban/ref.word-xls-r_cpt_iban-7h_indonesian-60h_ft-iba-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/iban/ref.word-xls-r_cpt_iban-7h_malay-60h_ft-iba-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/iban/ref.word-xls-r_ft-iba-1h.pt-test.txt'),
 PosixPath('/workspace/data/artefacts/asr-results/punjabi/ref.word-xls-r_cpt_punjabi

Each reference line consists of the text followed by a speaker-utterance ID, e.g. `(None-1234)`; the speaker is `None` because we didn't provide any speaker info.

In [3]:
!head /workspace/data/artefacts/asr-results/galician/ref.word-xls-r_cpt-galician-10h_ft-gl-1h.pt-test.txt

a organización política ou estrutura constitucional que caracteriza unha federación coñécese como federalismo (None-1286)
nalgúns países existen placas provisionais mentres non se completaron os trámites de matriculación (None-3334)
nas publicacións coincidían exiliados de diferentes ideoloxías que fuxían do conformismo e do folclorismo (None-4953)
a estrela de seis puntas representa o seis condados de irlanda do norte (None-1785)
úsase como alternativa para mulleres sensibles ó estróxeno (None-281)
estes sistemas xurdiron das tradicións anteriores dos sistemas de símbolos nos primeiros tempos neolíticos (None-5314)
completamente incomodado amadeo tornou a torino onde recibiu o título de duque de aosta (None-2816)
saliéntase a construción de templos e hipoxeos (None-6617)
a súa cerámica estaba pouco decorada (None-2439)
o escocés tivo a oportunidade de pilotar o monopraza durante varios adestramentos libres (None-2648)


Gather the references and hypotheses (one hypothesis file for each fine-tuned model) into a single data frame:

In [4]:
def _read_transcripts(txt_path, text_col):
    """Parse a transcript file whose lines look like `<text> (None-<utt_id>)`.

    Args:
        txt_path: path of the `ref.word-*` or `hypo.word-*` file to read.
        text_col: name to give the text column (e.g. "ref_text").

    Returns:
        DataFrame with columns `[text_col, "utt_id"]`, one row per non-blank line.
    """
    rows = []
    # The transcripts contain non-ASCII text, so do not depend on the locale's default encoding.
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing one)
            if line.endswith(")"):
                # Drop the ")" that closes the ID, also on a last line without "\n".
                line = line[:-1]
            # Split on the *last* marker so text containing " (None-" keeps the row at two fields.
            rows.append(line.rsplit(" (None-", 1))
    return pd.DataFrame(rows, columns=[text_col, "utt_id"])


eval_frames = []

for ref_txt in asr_results_refs:
    # Swap the prefix in the file name only, so a "ref." elsewhere in the path is left alone.
    hyp_txt = ref_txt.with_name(ref_txt.name.replace("ref.", "hypo.", 1))

    ref_df = _read_transcripts(ref_txt, "ref_text")
    hyp_df = _read_transcripts(hyp_txt, "hyp_text")

    # e.g. "ref.word-xls-r_ft-iba-1h.pt-test.txt" -> "xls-r_ft-iba-1h"
    checkpoint_name = ref_txt.name.split(".pt")[0].replace("ref.word-", "")

    # Left join keeps every reference utterance; utt_id is the only column the two frames share.
    eval_df = ref_df.merge(hyp_df, how="left", on="utt_id").assign(checkpoint=checkpoint_name)

    eval_frames.append(eval_df)

# Per-file row indices are kept (no ignore_index), so index values repeat across checkpoints.
all_eval_dfs = pd.concat(eval_frames)

all_eval_dfs

Unnamed: 0,ref_text,utt_id,hyp_text,checkpoint
0,a organización política ou estrutura constituc...,1286,a organización política ou estrutura constituc...,xls-r_cpt-galician-10h_ft-gl-1h
1,nalgúns países existen placas provisionais men...,3334,nalgúns países existen placas provisionais men...,xls-r_cpt-galician-10h_ft-gl-1h
2,nas publicacións coincidían exiliados de difer...,4953,nas publicacións coincidían exiliados de difer...,xls-r_cpt-galician-10h_ft-gl-1h
3,a estrela de seis puntas representa o seis con...,1785,a estrela de seis puntas representa os seis co...,xls-r_cpt-galician-10h_ft-gl-1h
4,úsase como alternativa para mulleres sensibles...,281,úsase como alternativa para mulleres sensibles...,xls-r_cpt-galician-10h_ft-gl-1h
...,...,...,...,...
2083,kedi,846,kedi,xls-r_ft-tsn-1h
2084,sport,1069,sport,xls-r_ft-tsn-1h
2085,neo,1854,neo,xls-r_ft-tsn-1h
2086,tirelo,558,tirelo,xls-r_ft-tsn-1h


Calculate per-checkpoint word error rates

In [5]:
# Build one single-row frame per checkpoint; jiwer.wer receives all of that
# checkpoint's reference and hypothesis sentences in one call.
per_checkpoint_wer = []

for cp_name, eval_data in all_eval_dfs.groupby('checkpoint'):
    references = eval_data.ref_text.to_list()
    hypotheses = eval_data.hyp_text.to_list()

    per_checkpoint_wer.append(
        pd.DataFrame({
            'checkpoint' : [ cp_name ],
            'test_wer' : jiwer.wer(references, hypotheses)
        })
    )

wer_df = pd.concat(per_checkpoint_wer).reset_index(drop=True)

wer_df

Unnamed: 0,checkpoint,test_wer
0,xls-r_cpt-galician-10h_ft-gl-1h,0.149782
1,xls-r_cpt-galician-10h_portuguese-60h_ft-gl-1h,0.139209
2,xls-r_cpt-galician-10h_spanish-60h_ft-gl-1h,0.137354
3,xls-r_cpt_iban-7h_ft-iba-1h,0.166334
4,xls-r_cpt_iban-7h_indonesian-60h_ft-iba-1h,0.164337
5,xls-r_cpt_iban-7h_malay-60h_ft-iba-1h,0.158698
6,xls-r_cpt_punjabi-10h_10k_ft-pa-1h,0.249594
7,xls-r_cpt_punjabi-10h_bengali-60h-seed-1_ft-pa-1h,0.250503
8,xls-r_cpt_punjabi-10h_bengali-60h-seed-2_ft-pa-1h,0.2518
9,xls-r_cpt_punjabi-10h_bengali-60h-seed-3_ft-pa-1h,0.251865


In [8]:
def parse_cpt_data(checkpoint_str):
    """Return the CPT data tag embedded in a checkpoint name.

    The `xls-r_cpt_` / `xls-r_cpt-` prefix is removed and everything from the
    first `_ft-` onwards is dropped, e.g.
    `"xls-r_cpt_iban-7h_malay-60h_ft-iba-1h"` -> `"iban-7h_malay-60h"`.

    Args:
        checkpoint_str: checkpoint name.

    Returns:
        The tag as a string, or `"N/A"` when the name does not contain "cpt".
    """
    # Guard clause: names without "cpt" carry no CPT tag.
    if "cpt" not in checkpoint_str:
        return "N/A"

    without_prefix = checkpoint_str.replace("xls-r_cpt_", "").replace("xls-r_cpt-", "")
    cpt_tag, *_ = without_prefix.split("_ft-")
    return cpt_tag

# Maps the fine-tuning tag found in a checkpoint name to the target language.
lang_mapping = {
    # ft-X-1h = fine-tuned on 1 hour of target language
    'ft-gl-1h': "Galician",
    "ft-iba-1h": "Iban",
    "ft-pa-1h" : "Punjabi",
    "ft-tsn-1h": "Setswana"
}

# Derive the extraction pattern from the mapping keys so the two cannot drift apart.
ft_tag_pattern = "(" + "|".join(lang_mapping) + ")"

# The list lookup fails fast (KeyError) if a checkpoint name has no known tag.
wer_df["target_lang"] = [ lang_mapping[tag] for tag in wer_df.checkpoint.str.extract(ft_tag_pattern)[0] ]

wer_df["CPT"] = wer_df.checkpoint.str.contains("cpt")

# Raw string: "\d" inside a plain literal is an invalid escape sequence and warns on recent Pythons.
# The substitution removes a "_<digits>k" suffix (e.g. "_10k") from the CPT tag.
wer_df["CPT_data"] = wer_df.checkpoint.apply(parse_cpt_data).str.replace(r"_\d+k", "", regex=True)

wer_df = wer_df.sort_values(['target_lang', 'CPT']).reset_index(drop=True)

wer_df

Unnamed: 0,checkpoint,test_wer,target_lang,CPT,CPT_data
0,xlsr2_300m_ft-gl-1h,0.154093,Galician,False,
1,xls-r_cpt-galician-10h_ft-gl-1h,0.149782,Galician,True,galician-10h
2,xls-r_cpt-galician-10h_portuguese-60h_ft-gl-1h,0.139209,Galician,True,galician-10h_portuguese-60h
3,xls-r_cpt-galician-10h_spanish-60h_ft-gl-1h,0.137354,Galician,True,galician-10h_spanish-60h
4,xls-r_ft-iba-1h,0.213556,Iban,False,
5,xls-r_cpt_iban-7h_ft-iba-1h,0.166334,Iban,True,iban-7h
6,xls-r_cpt_iban-7h_indonesian-60h_ft-iba-1h,0.164337,Iban,True,iban-7h_indonesian-60h
7,xls-r_cpt_iban-7h_malay-60h_ft-iba-1h,0.158698,Iban,True,iban-7h_malay-60h
8,xlsr2_300m_ft-pa-1h,0.308181,Punjabi,False,
9,xls-r_cpt_punjabi-10h_10k_ft-pa-1h,0.249594,Punjabi,True,punjabi-10h


Compile Punjabi ASR results (multiple seeds per donor)

In [31]:
def get_werr_df(target_df, baseline):
    """Add percentage WER and relative WER reduction columns.

    Args:
        target_df: DataFrame with a `checkpoint` column and a `test_wer` column
            (`test_wer` is multiplied by 100 here to obtain a percentage).
        baseline: value of `checkpoint` identifying the row whose WER is the
            reference point; the first matching row is used.

    Returns:
        A new DataFrame (the input is left unmodified) with two extra columns:
        `percent_wer`  - `test_wer * 100`, rounded to 1 decimal;
        `percent_werr` - `(baseline_wer - percent_wer) / baseline_wer * 100`,
        rounded to 1 decimal, so positive values mean a lower WER than the baseline.

    Raises:
        ValueError: if no row of `target_df` has `checkpoint == baseline`.
    """
    baseline_rows = target_df.loc[target_df["checkpoint"] == baseline, "test_wer"]
    if baseline_rows.empty:
        raise ValueError("baseline checkpoint {!r} not found in target_df".format(baseline))

    # NOTE: both WERs are rounded to 1 decimal *before* the relative reduction is
    # computed, so percent_werr is derived from the rounded percentages.
    baseline_wer = round(float(baseline_rows.iloc[0]) * 100, 1)
    percent_wer = (target_df["test_wer"] * 100).round(1)
    percent_werr = ((baseline_wer - percent_wer) / baseline_wer * 100).round(1)

    # assign() returns a copy, so the caller's frame is never mutated.
    return target_df.assign(percent_wer=percent_wer, percent_werr=percent_werr)

In [42]:
# Punjabi rows only; reductions are measured against the
# `xls-r_cpt_punjabi-10h_10k_ft-pa-1h` checkpoint.
punjabi_rows = wer_df.loc[wer_df["target_lang"] == "Punjabi"].copy()
punjabi_werr = get_werr_df(punjabi_rows, "xls-r_cpt_punjabi-10h_10k_ft-pa-1h")

punjabi_werr

Unnamed: 0,checkpoint,test_wer,target_lang,CPT,CPT_data,percent_wer,percent_werr
8,xlsr2_300m_ft-pa-1h,0.308181,Punjabi,False,,30.8,-23.2
9,xls-r_cpt_punjabi-10h_10k_ft-pa-1h,0.249594,Punjabi,True,punjabi-10h,25.0,0.0
10,xls-r_cpt_punjabi-10h_bengali-60h-seed-1_ft-pa-1h,0.250503,Punjabi,True,punjabi-10h_bengali-60h-seed-1,25.1,-0.4
11,xls-r_cpt_punjabi-10h_bengali-60h-seed-2_ft-pa-1h,0.2518,Punjabi,True,punjabi-10h_bengali-60h-seed-2,25.2,-0.8
12,xls-r_cpt_punjabi-10h_bengali-60h-seed-3_ft-pa-1h,0.251865,Punjabi,True,punjabi-10h_bengali-60h-seed-3,25.2,-0.8
13,xls-r_cpt_punjabi-10h_gujarati-60h-seed-1_ft-p...,0.244209,Punjabi,True,punjabi-10h_gujarati-60h-seed-1,24.4,2.4
14,xls-r_cpt_punjabi-10h_gujarati-60h-seed-2_ft-p...,0.24408,Punjabi,True,punjabi-10h_gujarati-60h-seed-2,24.4,2.4
15,xls-r_cpt_punjabi-10h_gujarati-60h-seed-3_ft-p...,0.242004,Punjabi,True,punjabi-10h_gujarati-60h-seed-3,24.2,3.2
16,xls-r_cpt_punjabi-10h_hindi-60h-seed-1_ft-pa-1h,0.234413,Punjabi,True,punjabi-10h_hindi-60h-seed-1,23.4,6.4
17,xls-r_cpt_punjabi-10h_hindi-60h-seed-2_ft-pa-1h,0.238046,Punjabi,True,punjabi-10h_hindi-60h-seed-2,23.8,4.8


In [64]:
# Keep only the rows whose CPT tag contains "seed".
is_seeded_run = punjabi_werr["CPT_data"].str.contains("seed")

# Strip the "-seed-N" suffix so that all seeds of one donor share a CPT_data value.
# Raw string avoids the invalid "\d" escape; "\d+" also handles multi-digit seeds.
punjabi_werr_donors = punjabi_werr[is_seeded_run].assign(
    CPT_data=lambda df: df["CPT_data"].str.replace(r"-seed-\d+", "", regex=True)
)

# Summarise across seeds per donor, best median relative reduction first.
punjabi_werr_donors.groupby('CPT_data').agg(
    median_wer = ('percent_wer', 'median'),
    median_werr = ('percent_werr', 'median'),
    min_wer = ('percent_wer', 'min'),
    max_wer = ('percent_wer', 'max')
).reset_index().sort_values('median_werr', ascending=False)

Unnamed: 0,CPT_data,median_wer,median_werr,min_wer,max_wer
2,punjabi-10h_hindi-60h,23.5,6.0,23.4,23.8
1,punjabi-10h_gujarati-60h,24.4,2.4,24.2,24.4
7,punjabi-10h_urdu-60h,24.4,2.4,24.3,24.5
4,punjabi-10h_marathi-60h,24.6,1.6,24.5,24.7
5,punjabi-10h_odia-60h,25.0,0.0,25.0,25.2
3,punjabi-10h_malayalam-60h,25.1,-0.4,25.0,25.3
6,punjabi-10h_tamil-60h,25.1,-0.4,25.0,25.4
0,punjabi-10h_bengali-60h,25.2,-0.8,25.1,25.2


In [36]:
# Galician rows, compared against the `xls-r_cpt-galician-10h_ft-gl-1h` checkpoint.
galician_rows = wer_df.loc[wer_df["target_lang"] == "Galician"].copy()

get_werr_df(galician_rows, "xls-r_cpt-galician-10h_ft-gl-1h")

Unnamed: 0,checkpoint,test_wer,target_lang,CPT,CPT_data,percent_wer,percent_werr
0,xlsr2_300m_ft-gl-1h,0.154093,Galician,False,,15.4,-2.7
1,xls-r_cpt-galician-10h_ft-gl-1h,0.149782,Galician,True,galician-10h,15.0,0.0
2,xls-r_cpt-galician-10h_portuguese-60h_ft-gl-1h,0.139209,Galician,True,galician-10h_portuguese-60h,13.9,7.3
3,xls-r_cpt-galician-10h_spanish-60h_ft-gl-1h,0.137354,Galician,True,galician-10h_spanish-60h,13.7,8.7


In [37]:
# Iban rows, compared against the `xls-r_cpt_iban-7h_ft-iba-1h` checkpoint.
iban_rows = wer_df.loc[wer_df["target_lang"] == "Iban"].copy()

get_werr_df(iban_rows, "xls-r_cpt_iban-7h_ft-iba-1h")

Unnamed: 0,checkpoint,test_wer,target_lang,CPT,CPT_data,percent_wer,percent_werr
4,xls-r_ft-iba-1h,0.213556,Iban,False,,21.4,-28.9
5,xls-r_cpt_iban-7h_ft-iba-1h,0.166334,Iban,True,iban-7h,16.6,0.0
6,xls-r_cpt_iban-7h_indonesian-60h_ft-iba-1h,0.164337,Iban,True,iban-7h_indonesian-60h,16.4,1.2
7,xls-r_cpt_iban-7h_malay-60h_ft-iba-1h,0.158698,Iban,True,iban-7h_malay-60h,15.9,4.2


In [38]:
# Setswana rows, compared against the `xls-r_cpt_setswana-10h_ft-tsn-1h` checkpoint.
setswana_rows = wer_df.loc[wer_df["target_lang"] == "Setswana"].copy()

get_werr_df(setswana_rows, "xls-r_cpt_setswana-10h_ft-tsn-1h")

Unnamed: 0,checkpoint,test_wer,target_lang,CPT,CPT_data,percent_wer,percent_werr
35,xls-r_ft-tsn-1h,0.208321,Setswana,False,,20.8,-65.1
36,xls-r_cpt_setswana-10h_ft-tsn-1h,0.125713,Setswana,True,setswana-10h,12.6,0.0
37,xls-r_cpt_setswana-10h_sepedi-56h_ft-tsn-1h,0.119512,Setswana,True,setswana-10h_sepedi-56h,12.0,4.8
38,xls-r_cpt_setswana-10h_sesotho-56h_ft-tsn-1h,0.115812,Setswana,True,setswana-10h_sesotho-56h,11.6,7.9
