# GMU Evaluation Data
This notebook runs several speech-to-IPA transcription models on sentences from the [Speech Accent Archive](https://accent.gmu.edu) and evaluates each model's output against gold-standard transcriptions.

In [1]:
from pathlib import Path

from datasets import load_dataset, Audio, Dataset
import pandas as pd
from transformers import pipeline

import multipa
import multipa.data_utils
import multipa.evaluate

# Device handed to every transformers pipeline created in this notebook.
# NOTE(review): hard-coded, so the notebook requires a CUDA GPU as written.
device = "cuda"

# Folder that holds the GMU wav recordings and the gold transcription CSV
gmu_data_dir = Path("../data/gmu")
transcriptions_path = gmu_data_dir / "gold_transcriptions.csv"

# Column headers for transcriptions csv file
gmu_og = "GMU OG"
gmu_mod = "GMU MOD"

# Models under comparison. Each identifier is passed to `pipeline(model=...)`
# and also becomes the name of that model's prediction column.
taguchi_1k = "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns"
# NOTE(review): this one is a snapshot path inside a user-specific Hugging Face
# cache rather than a hub id, so it only resolves on the original machine.
taguchi_2k = "/home/vcpartridge_umass_edu/.cache/huggingface/hub/models--ctaguchi--wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000/snapshots/92cfe1211b01f9026ffca191c84ce06161926f45"
our_model = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the module from /home/vcpartridge_umass_edu/.cache/huggingface/modules/evaluate_modules/metrics/ginic--phone_errors/6e56791b592b56908011d38628c25fd7a753442e543c55e5cade54bc2f1ee58d (last modified on Tue Nov 19 03:45:59 2024) since it couldn't be found locally at ginic--phone_errors, or remotely on the Hugging Face Hub.


In [2]:
# Sanity check: collect the wav files sitting directly inside the GMU data folder
gmu_wavs = [wav_path for wav_path in gmu_data_dir.glob("*.wav")]
print("Total wav files:", len(gmu_wavs))

Total wav files: 66


In [3]:
def add_filename(dataset_entry):
    """Store the base name of an entry's audio file under the "filename" key.

    Args:
        dataset_entry: Mapping whose ``["audio"]["path"]`` holds the path of
            the audio file.

    Returns:
        The same ``dataset_entry`` object, updated in place with a
        ``"filename"`` key containing the final path component.
    """
    audio_path = dataset_entry["audio"]["path"]
    dataset_entry["filename"] = Path(audio_path).name
    return dataset_entry

In [4]:
# Read wavs as huggingface audio dataset, with the audio column cast to a 16 kHz sampling rate
audiofolder_splits = load_dataset("audiofolder", data_dir=gmu_data_dir)
audio_dataset = audiofolder_splits.cast_column("audio", Audio(sampling_rate=16_000))["train"]
print("Length audio dataset:", len(audio_dataset))

# Record each clip's file name so predictions can be joined to the gold transcriptions later
audio_dataset = audio_dataset.map(add_filename)
print("First audio file:", audio_dataset[0])

# Add predictions as new columns in dataset (one column per model, named by the model identifier)
for model in (our_model, taguchi_1k, taguchi_2k):
    print("Running", model)
    pipe = pipeline("automatic-speech-recognition", model=model, device=device)
    raw_outputs = pipe(audio_dataset["audio"])
    print(raw_outputs[0])

    # Run every pipeline output through multipa's clean_text (is_remove_space=True)
    # and keep only the resulting text.
    transcriptions = []
    for raw_output in raw_outputs:
        cleaned = multipa.data_utils.clean_text(raw_output, is_remove_space=True, text_key="text")
        transcriptions.append(cleaned["text"])

    audio_dataset = audio_dataset.add_column(name=model, column=transcriptions)
    print(audio_dataset[0])

Resolving data files: 100%|██████████| 67/67 [00:00<00:00, 101340.92it/s]
Using custom data configuration default-375cfec279f10a6d
Found cached dataset audiofolder (/home/vcpartridge_umass_edu/.cache/huggingface/datasets/audiofolder/default-375cfec279f10a6d/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)
100%|██████████| 1/1 [00:00<00:00, 167.04it/s]
Loading cached processed dataset at /home/vcpartridge_umass_edu/.cache/huggingface/datasets/audiofolder/default-375cfec279f10a6d/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-4e514f8397b03833.arrow


Length audio dataset: 66
First audio file: {'audio': {'path': None, 'array': array([-0.00402832, -0.00717163, -0.00552368, ..., -0.00326538,
       -0.00241089,  0.        ]), 'sampling_rate': 16000}, 'filename': 'finnish1.wav'}
Running ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa
{'text': 'plizkolstɪlʌæskhɹ̩tɪlpɹɪŋθizθɪŋzwizhɹ̩fɹʌmðɪstoʊlsikspunoʌfɹɪʃnoʊpisfaɪvθikstlɛpsʌflutʃisɛnmeɪbiɛʃnɛkfʊ̩hɹ̩bɹʌðɹ̩ʌbʌpwiɔlsoʊniɾɪsmɔlplɛsɪkneɪkɛmbiktɪljɪfoʊkfoʊldtʊkitʃʃikɪnskupθizθɪŋɪnθɹiɹɛdbɛksɛnwiwɪl̩ɡoʊmithɹ̩wɛnʃteɪæðɪtɹeɪnʃteɪʃʌn'}
{'audio': {'path': None, 'array': array([-0.00402832, -0.00717163, -0.00552368, ..., -0.00326538,
       -0.00241089,  0.        ]), 'sampling_rate': 16000}, 'filename': 'finnish1.wav', 'ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa': 'plizkolstɪlʌæskhɹ̩tɪlpɹɪŋθizθɪŋzwizhɹ̩fɹʌmðɪstoʊlsikspunoʌfɹɪʃnoʊpisfaɪvθikstlɛpsʌflutʃisɛnmeɪbiɛʃnɛkfʊ̩hɹ̩bɹʌðɹ̩ʌbʌpwiɔlsoʊniɾɪsmɔlplɛsɪkneɪkɛmbiktɪljɪfoʊkfoʊldtʊkitʃʃikɪnskupθizθɪŋɪnθɹiɹɛdbɛksɛnwiwɪl̩ɡoʊmithɹ̩

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)
Some weights of the model checkpoint at ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.o

{'text': 'pliskɔːstɛwaɛskxɔrtɔpʂɨɲtɨjstŋxujtxɔrfɔndɛʂtɔsikspunɔfwɛʂnɔpxisfajvtɨikslajapsɔfluwt͡ʂizɛnmɛjbijɛʐnajɛɡfɔxɛbradɛrbabujɔlsɔnirɛsmɔwɔplasɨksnɛikɛmbɛjɡt͡sʂɔjfɔkɔtɔkirʂikɛnskuptɨjstiŋɡsint͡srɨjwɛdːɛɡsɛnwiwɨɡɛ̃wmitxɔwɛntʂtɛjɛtːɛtrɛ̃ɕtajʂɛ'}
{'audio': {'path': None, 'array': array([-0.00402832, -0.00717163, -0.00552368, ..., -0.00326538,
       -0.00241089,  0.        ]), 'sampling_rate': 16000}, 'filename': 'finnish1.wav', 'ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa': 'plizkolstɪlʌæskhɹ̩tɪlpɹɪŋθizθɪŋzwizhɹ̩fɹʌmðɪstoʊlsikspunoʌfɹɪʃnoʊpisfaɪvθikstlɛpsʌflutʃisɛnmeɪbiɛʃnɛkfʊ̩hɹ̩bɹʌðɹ̩ʌbʌpwiɔlsoʊniɾɪsmɔlplɛsɪkneɪkɛmbiktɪljɪfoʊkfoʊldtʊkitʃʃikɪnskupθizθɪŋɪnθɹiɹɛdbɛksɛnwiwɪl̩ɡoʊmithɹ̩wɛnʃteɪæðɪtɹeɪnʃteɪʃʌn', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns': 'pliskɔːstɛwaɛskxɔrtɔpʂɨɲtɨjstŋxujtxɔrfɔndɛʂtɔsikspunɔfwɛʂnɔpxisfajvtɨikslajapsɔfluwt͡ʂizɛnmɛjbijɛʐnajɛɡfɔxɛbradɛrbabujɔlsɔnirɛsmɔwɔplasɨksnɛikɛmbɛjɡt͡sʂɔjfɔkɔtɔkirʂikɛnskuptɨjstiŋɡsint͡srɨjwɛdːɛɡsɛnwiwɨɡɛ̃wmitxɔ

Some weights of the model checkpoint at /home/vcpartridge_umass_edu/.cache/huggingface/hub/models--ctaguchi--wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000/snapshots/92cfe1211b01f9026ffca191c84ce06161926f45 were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /home/vcpartridge_umass_edu/.cache/huggingface/hub/models--ctaguchi--wav2vec2-large-xlsr-japlmthufie

{'text': 'pliskɔs tɛwa ɛskxartɔprɨŋtis tɨŋkswuj tʂɛr fɔmdɛstɔɔ sikspuwn ɔ fwɛʂ snɔwpis fajv tɨjk slapsɔv bluwt͡ʂiz ɛn mɛjvijɛʂnak fɔxɛr pradɛr bɔb ujɔlsɔ nidɛs mɔː plastɨksnɛkʲɛmbik t͡ʂɔjfɔk vɔt͡ʂɛkʲjtʂ xɕikʲɛn skuptɨj stɨŋks inθrujrɛt tɛks anwɨwɨ ɡɛw mitʂɛrwɛnt͡ʂt͡ʂɛj ɛz tɛ tʂɛ̃ʂtɛʂt͡ɕɛ'}
{'audio': {'path': None, 'array': array([-0.00402832, -0.00717163, -0.00552368, ..., -0.00326538,
       -0.00241089,  0.        ]), 'sampling_rate': 16000}, 'filename': 'finnish1.wav', 'ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa': 'plizkolstɪlʌæskhɹ̩tɪlpɹɪŋθizθɪŋzwizhɹ̩fɹʌmðɪstoʊlsikspunoʌfɹɪʃnoʊpisfaɪvθikstlɛpsʌflutʃisɛnmeɪbiɛʃnɛkfʊ̩hɹ̩bɹʌðɹ̩ʌbʌpwiɔlsoʊniɾɪsmɔlplɛsɪkneɪkɛmbiktɪljɪfoʊkfoʊldtʊkitʃʃikɪnskupθizθɪŋɪnθɹiɹɛdbɛksɛnwiwɪl̩ɡoʊmithɹ̩wɛnʃteɪæðɪtɹeɪnʃteɪʃʌn', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns': 'pliskɔːstɛwaɛskxɔrtɔpʂɨɲtɨjstŋxujtxɔrfɔndɛʂtɔsikspunɔfwɛʂnɔpxisfajvtɨikslajapsɔfluwt͡ʂizɛnmɛjbijɛʐnajɛɡfɔxɛbradɛrbabujɔlsɔnirɛsmɔwɔplasɨksnɛikɛmbɛjɡt͡sʂɔjfɔkɔtɔkirʂik

In [5]:
# Move the predictions into pandas so they can be merged with the gold standard
audio_df = audio_dataset.to_pandas()

# The ipa-plus-2000 predictions were stored under the identifier held in
# `taguchi_2k` (a local snapshot path). Relabel that column with the hub-style
# model name used in the rest of the analysis. The key is taken from the
# variable rather than a repeated literal: `rename` silently ignores keys that
# match no column, so a stale copy of the path would leave the column unrenamed.
audio_df = audio_df.rename(columns={
    taguchi_2k: "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000"
})
audio_df.head()

Unnamed: 0,audio,filename,ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa,ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns,ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000
0,{'bytes': b'RIFF\xb6q\x0c\x00WAVEfmt \x10\x00\...,finnish1.wav,plizkolstɪlʌæskhɹ̩tɪlpɹɪŋθizθɪŋzwizhɹ̩fɹʌmðɪst...,pliskɔːstɛwaɛskxɔrtɔpʂɨɲtɨjstŋxujtxɔrfɔndɛʂtɔs...,pliskɔstɛwaɛskxartɔprɨŋtistɨŋkswujtʂɛrfɔmdɛstɔ...
1,{'bytes': b'RIFF$\xc4\t\x00WAVEfmt \x10\x00\x0...,finnish2.wav,pliskuʊlstɪlʌæsɡoʌtʌbɹɪŋðisθɪŋzwɪtsoʊfʌmɪstoɹs...,plisɡɔɔfstɔwaaskøɛtbrindijɛstiɲɡzwitxɛlfɔmbɨst...,plisɡɔɔstɔwaɛskørɔtbrindijɛstiːnswiːtfɛlfondis...
2,{'bytes': b'RIFF\xfeY\n\x00WAVEfmt \x10\x00\x0...,finnish3.wav,plizkɔlstɪlʌæskhɹ̩tubɹɪŋðizθɪŋzwɪθhɹ̩fʌmðʌstoɔ...,pliːskɔwstɛlaaskɛ̃rt͡ɕbwindeːste̞ŋzwɨthalfamdɔ...,pliskɔɔstɛlaaskjat͡ɕbɻiɲdeːstɛŋkswɨtarfamdastɔ...
3,{'bytes': b'RIFF\xf0\xcf\x0c\x00WAVEfmt \x10\x...,finnish4.wav,pliskulstɪlɑskhaʊʌtsbɹɪŋðizθɪŋwɪθʌfɹʌmðʌstʊlsi...,pliskɔstɛlaskɛwat͡spriŋɡispiɲd͡ʑswɨɲaxɛfɔndɨst...,pliskɔstɛlaaskɛwat͡sbriŋvispiŋvswɨtawɛfɔndstɔs...
4,{'bytes': b'RIFF\xb2~\x0c\x00WAVEfmt \x10\x00\...,finnish5.wav,plizkɔlstɛlʌæskɹ̩ɾʌbɹɪŋðisθɪŋzwʌɹ̩fɹʌmʌstoʊɹsɪ...,plizkɑɔstɛlæskødbriŋðisteŋɡzøθøfɑmnæsitooɾsɨks...,pliːskɑstɛwaskørbriɲðistɛŋksæθørfɒmnɛɕtolsɨksp...


In [6]:
# Read and clean gold standard transcriptions
gold_standard_df = pd.read_csv(transcriptions_path)
print("Shape:", gold_standard_df.shape)

# Remove every whitespace character from both gold transcription columns.
# Series.map is applied per column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; the cleaned values are identical.
for gold_column in (gmu_mod, gmu_og):
    gold_standard_df[gold_column] = gold_standard_df[gold_column].map(lambda x: "".join(x.split()))

gold_standard_df.head()


Shape: (66, 3)


  gold_standard_df[[gmu_mod, gmu_og]] = gold_standard_df[[gmu_mod, gmu_og]].applymap(lambda x: "".join(x.split()))


Unnamed: 0,AudioFileName,GMU OG,GMU MOD
0,finnish1.wav,pliz̥kɔlstɪləæ̝skhɜɹt̪ŭb̥ɹɪ̃ŋðisθɪ̃ŋz̥wɪθhɜɹf...,plizkɔlstɪlʌæskhɜɹtubɹɪ̃ŋðisθɪŋzwɪθhɜɹfɹʌ̃mðʌs...
1,finnish2.wav,pliːskɑlstɛləæ̝skɜtəbɹiŋd̪iztiŋswɪthɜfɹʌmðəstɔ...,pliskɑlstɛlʌæskɜtʌbɹiŋdiztiŋswɪthɜfɹʌmðʌstɔɹsɪ...
2,finnish3.wav,pʰliz̥kʰɔlstɛləæskhɚtŭbɹɪ̃ŋðiz̥θɪ̃ŋzwɪθhɚfɹʌ̃...,plizkɔlstɛlʌæskhɚtubɹɪ̃ŋðizθɪ̃ŋzwɪθhɹ̩fɹʌ̃mdʌs...
3,finnish4.wav,pʰliz̥kɔlstɛlăæskhɜtŭbɹɪ̃ŋd̪iz̥θɪ̃ŋswɪθhɜfɹʌ...,plizkɔlstɛlaæskhɜtubɹɪ̃ŋdizθɪ̃ŋswɪθhɜfɹʌ̃mdʌst...
4,finnish5.wav,pʰliz̥kʰɑlstɛləæskəɹɾəbɹɪ̃ŋðiz̥θɪ̃ŋz̥wɪθhɚfɹʌ̃...,plizkɑlstɛlʌæskʌɹɾʌbɹɪ̃ŋðizθɪ̃ŋzwɪθhɹ̩fɹʌ̃mðʌs...


In [7]:
# Join results and gold standard on audio file name.
# No `how` is given, so this is pandas' default inner join: only file names
# present in both frames are kept.
merged_df = gold_standard_df.merge(audio_df, left_on="AudioFileName", right_on="filename")

# The audio column and the duplicate file name key are dropped from the comparison table
full_comparison_df = merged_df.drop(columns=["audio", "filename"])
full_comparison_df.head()

Unnamed: 0,AudioFileName,GMU OG,GMU MOD,ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa,ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns,ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000
0,finnish1.wav,pliz̥kɔlstɪləæ̝skhɜɹt̪ŭb̥ɹɪ̃ŋðisθɪ̃ŋz̥wɪθhɜɹf...,plizkɔlstɪlʌæskhɜɹtubɹɪ̃ŋðisθɪŋzwɪθhɜɹfɹʌ̃mðʌs...,plizkolstɪlʌæskhɹ̩tɪlpɹɪŋθizθɪŋzwizhɹ̩fɹʌmðɪst...,pliskɔːstɛwaɛskxɔrtɔpʂɨɲtɨjstŋxujtxɔrfɔndɛʂtɔs...,pliskɔstɛwaɛskxartɔprɨŋtistɨŋkswujtʂɛrfɔmdɛstɔ...
1,finnish2.wav,pliːskɑlstɛləæ̝skɜtəbɹiŋd̪iztiŋswɪthɜfɹʌmðəstɔ...,pliskɑlstɛlʌæskɜtʌbɹiŋdiztiŋswɪthɜfɹʌmðʌstɔɹsɪ...,pliskuʊlstɪlʌæsɡoʌtʌbɹɪŋðisθɪŋzwɪtsoʊfʌmɪstoɹs...,plisɡɔɔfstɔwaaskøɛtbrindijɛstiɲɡzwitxɛlfɔmbɨst...,plisɡɔɔstɔwaɛskørɔtbrindijɛstiːnswiːtfɛlfondis...
2,finnish3.wav,pʰliz̥kʰɔlstɛləæskhɚtŭbɹɪ̃ŋðiz̥θɪ̃ŋzwɪθhɚfɹʌ̃...,plizkɔlstɛlʌæskhɚtubɹɪ̃ŋðizθɪ̃ŋzwɪθhɹ̩fɹʌ̃mdʌs...,plizkɔlstɪlʌæskhɹ̩tubɹɪŋðizθɪŋzwɪθhɹ̩fʌmðʌstoɔ...,pliːskɔwstɛlaaskɛ̃rt͡ɕbwindeːste̞ŋzwɨthalfamdɔ...,pliskɔɔstɛlaaskjat͡ɕbɻiɲdeːstɛŋkswɨtarfamdastɔ...
3,finnish4.wav,pʰliz̥kɔlstɛlăæskhɜtŭbɹɪ̃ŋd̪iz̥θɪ̃ŋswɪθhɜfɹʌ...,plizkɔlstɛlaæskhɜtubɹɪ̃ŋdizθɪ̃ŋswɪθhɜfɹʌ̃mdʌst...,pliskulstɪlɑskhaʊʌtsbɹɪŋðizθɪŋwɪθʌfɹʌmðʌstʊlsi...,pliskɔstɛlaskɛwat͡spriŋɡispiɲd͡ʑswɨɲaxɛfɔndɨst...,pliskɔstɛlaaskɛwat͡sbriŋvispiŋvswɨtawɛfɔndstɔs...
4,finnish5.wav,pʰliz̥kʰɑlstɛləæskəɹɾəbɹɪ̃ŋðiz̥θɪ̃ŋz̥wɪθhɚfɹʌ̃...,plizkɑlstɛlʌæskʌɹɾʌbɹɪ̃ŋðizθɪ̃ŋzwɪθhɹ̩fɹʌ̃mðʌs...,plizkɔlstɛlʌæskɹ̩ɾʌbɹɪŋðisθɪŋzwʌɹ̩fɹʌmʌstoʊɹsɪ...,plizkɑɔstɛlæskødbriŋðisteŋɡzøθøfɑmnæsitooɾsɨks...,pliːskɑstɛwaskørbriɲðistɛŋksæθørfɒmnɛɕtolsɨksp...


In [9]:
# Compute performance metrics and write results
evaluated_models = [our_model, taguchi_1k, "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa-plus-2000"]
metric_names = ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]

for gold in (gmu_og, gmu_mod):
    # A fresh evaluator per gold column; it is written to its own CSV once all models are scored
    model_eval = multipa.evaluate.ModelEvaluator()
    for model in evaluated_models:
        predictions = full_comparison_df[model]
        references = full_comparison_df[gold]
        metrics = model_eval.eval_non_empty_transcriptions(model, predictions, references)
        # Keep the returned values next to the transcriptions, one new column per metric.
        # NOTE(review): assumes each metrics[m] lines up with the rows of full_comparison_df - confirm.
        for m in metric_names:
            full_comparison_df[f"{m} {gold} VS {model}"] = metrics[m]
    model_eval.to_csv(f"{gold}_aggregate_results.csv")

# Write the full table: gold transcriptions, model predictions and the metric columns
full_comparison_df.to_csv("full_gmu_model_comparison.csv", index=False)