# Evaluation of Additional Modeling Pipelines
We should also compare performance on the evaluation data (Buckeye test split) with other readily available phonetic transcription options, to determine whether fine-tuning your own model is worth the effort. 
The three options we consider here are: 
- [Allosaurus](https://github.com/xinjli/allosaurus) is a pre-trained universal phone recognizer that claims to recognize phones in more than 2000 languages. 
- [Whisper](https://openai.com/index/whisper/) is the state-of-the-art sequence-to-sequence speech recognition model released by OpenAI. Details about the different model releases are available at https://github.com/openai/whisper/blob/main/model-card.md. There are multilingual and English fine-tuned versions. We follow these models with grapheme-to-phoneme conversion using Epitran.
- [excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k](https://huggingface.co/excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k) is a wav2vec2 model fine-tuned on TIMIT data. Because it uses the original TIMIT phonemes, we post-process using [phonecodes](https://pypi.org/project/phonecodes/) to convert predictions to IPA. 

These evaluations only need to be run and computed once. 

## Additional installation step for Epitran
To use Epitran for English, you also need to install Flite (https://github.com/festvox/flite). See the Epitran note at https://github.com/dmort27/epitran?tab=readme-ov-file#installation-of-flite-for-english-g2p. I installed Flite on my Mac:

```bash
$ git clone http://github.com/festvox/flite
$ cd flite
$ ./configure && make
$ sudo make install
$ cd testsuite
$ make lex_lookup
$ sudo cp lex_lookup /usr/local/bin
```



In [1]:
import itertools
import time
from pathlib import Path

import allosaurus.app
import allosaurus.bin.download_model
import datasets
import epitran
from phonecodes import phonecodes
import transformers
from tqdm import tqdm

from multipa.data_utils import load_buckeye_split, clean_text
from multipa.evaluation import ModelEvaluator, preprocess_test_data, write_detailed_prediction_results, DETAILED_PREDICTIONS_CSV_SUFFIX, PREDICTION_KEY, clean_model_name

# Output locations (relative paths) for the evaluation artifacts written by this notebook.
VERBOSE_RESULTS_DIR = Path("../data/evaluation_results/detailed_predictions")
AGGREGATE_METRICS_CSV = Path("../data/evaluation_results/aggregate_metrics/epitran_allosaurus_eval.csv")
EDIT_DIST_DIR = Path("../data/evaluation_results/edit_distances/")

# Passed as `is_remove_space` to clean_text when post-processing model predictions.
IS_REMOVE_SPACES = True
NUM_PROC = 8 # For HuggingFace dataset map and filter
# Passed as `device` to transformers.pipeline.
DEVICE = 0

# These imports were indented at module level, which is an IndentationError;
# they are dedented here so the cell can be parsed.
import pynvml  # type: ignore[import]
import pkg_resources


In [2]:
def allosaurus_predict(test_dataset, model="eng2102", phone_inventory="ipa"):
    """Transcribe each audio file in ``test_dataset`` with an Allosaurus recognizer.

    Args:
        test_dataset: Dataset whose "audio" column yields dicts with a "path" entry.
        model: Allosaurus model name handed to ``allosaurus.app.read_recognizer``.
        phone_inventory: Second argument passed to the recognizer's ``recognize`` call.

    Returns:
        A ``datasets.Dataset`` with one row per audio file, holding the prediction
        under ``PREDICTION_KEY`` after ``clean_text`` has been mapped over it.
    """
    print("Evaluating allosaurus. Model:", model, "Phone inventory:", phone_inventory)
    recognizer = allosaurus.app.read_recognizer(model)

    # Only the recognition loop is timed; loading the recognizer happens before the clock starts.
    started = time.time()
    raw_rows = [
        {PREDICTION_KEY: recognizer.recognize(sample["path"], phone_inventory)}
        for sample in tqdm(test_dataset["audio"])
    ]
    finished = time.time()
    print("Eval time in seconds:", finished-started)

    def _clean(example):
        return clean_text(example, text_key=PREDICTION_KEY, is_remove_space=IS_REMOVE_SPACES)

    return datasets.Dataset.from_list(raw_rows).map(_clean, num_proc=NUM_PROC)

def hf_model_to_epitran_predict(model_name, test_dataset):
    """Run a HuggingFace ASR model, then convert its text output with Epitran.

    Args:
        model_name: HuggingFace model identifier. Names ending in ".en" are loaded
            without extra generation arguments; all others are given
            ``generate_kwargs={"language": "english"}``.
        test_dataset: Dataset whose "audio" column is fed to the ASR pipeline.

    Returns:
        A ``datasets.Dataset`` holding the Epitran transliteration of each ASR
        output under ``PREDICTION_KEY``, after ``clean_text`` has been mapped over it.
    """
    print("Building pipeline and downloading model")
    pipeline_options = {"model": model_name, "device": DEVICE}
    if not model_name.endswith(".en"):
        pipeline_options["generate_kwargs"] = {"language": "english"}
    asr = transformers.pipeline("automatic-speech-recognition", **pipeline_options)

    print("Predicting with", model_name)
    # The timer covers both speech recognition and the Epitran transliteration step.
    started = time.time()
    transcripts = [output["text"] for output in asr(test_dataset["audio"])]
    g2p = epitran.Epitran('eng-Latn')
    print("Transliterating with Epitran")
    ipa_rows = [{PREDICTION_KEY: g2p.transliterate(transcript)} for transcript in tqdm(transcripts)]
    finished = time.time()
    print("Eval time in seconds:", finished-started)

    def _clean(example):
        return clean_text(example, text_key=PREDICTION_KEY, is_remove_space=IS_REMOVE_SPACES)

    return datasets.Dataset.from_list(ipa_rows).map(_clean, num_proc=NUM_PROC)


def phonecodes_convert_batch(batch: dict, in_code="timit", out_code="ipa"):
    """
    Phonecodes conversion usable as a function for ``Dataset.map``.

    Reads the string stored under ``in_code``, converts it with
    ``phonecodes.convert(..., in_code, out_code)``, stores the result under
    ``out_code`` on the same dict, and returns that dict.
    """
    batch[out_code] = phonecodes.convert(batch[in_code], in_code, out_code)
    return batch


def hf_to_phonecodes(
    test_dataset, 
    model_name="excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k", 
    in_code="timit", out_code="ipa"):
    """Predict with a HuggingFace ASR model and convert its output using phonecodes.

    Args:
        test_dataset: Dataset whose "audio" column is fed to the ASR pipeline.
        model_name: HuggingFace model identifier to load.
        in_code: Phone code of the model's raw output; also the column name used for it.
        out_code: Phone code to convert to; also the column name used for it
            before it is renamed.

    Returns:
        A ``datasets.Dataset`` with the converted and cleaned prediction under
        ``PREDICTION_KEY`` and the raw model output under
        ``f"{in_code}_{PREDICTION_KEY}"``.
    """
    recognizer = transformers.pipeline("automatic-speech-recognition", model=model_name, device=DEVICE)
    raw_rows = [{in_code: output["text"]} for output in recognizer(test_dataset["audio"])]

    def _convert(example):
        return phonecodes_convert_batch(example, in_code, out_code)

    def _clean(example):
        return clean_text(example, text_key=out_code, is_remove_space=IS_REMOVE_SPACES)

    # Convert to the output phone code first, then clean the converted text.
    converted = datasets.Dataset.from_list(raw_rows).map(_convert, num_proc=NUM_PROC)
    cleaned = converted.map(_clean, num_proc=NUM_PROC)

    # Expose the converted text under the shared prediction key and keep the raw output alongside it.
    renamed = cleaned.rename_column(out_code, PREDICTION_KEY)
    return renamed.rename_column(in_code, f"{in_code}_{PREDICTION_KEY}")

In [3]:
input_data = load_buckeye_split("../data/buckeye", "test")
# Snippet of transcriptions
# Note that there don't appear to be any empty transcriptions
# (the "without speech" subset printed below had 0 rows in the recorded run),
# so this notebook skips looking at hallucinations
print("Data Preview")
print(input_data)
print(input_data[0])

# Use the notebook-wide IS_REMOVE_SPACES setting so the references are
# preprocessed with the same space handling as the model predictions.
non_empty_test_data, empty_test_data = preprocess_test_data(
    input_data, is_remove_space=IS_REMOVE_SPACES, num_proc=NUM_PROC
)

print("Test data with speech transcriptions")
print(non_empty_test_data)
print(non_empty_test_data[0])
print("Test data without speech")
print(empty_test_data)

# Shared evaluator: every evaluation cell reports to it and the final cell writes it to CSV.
model_evaluator = ModelEvaluator()

Resolving data files:   0%|          | 0/18783 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/5606 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/5080 [00:00<?, ?it/s]

Data Preview
Dataset({
    features: ['utterance_id', 'duration', 'buckeye_transcript', 'text', 'ipa', 'speaker_id', 'speaker_gender', 'speaker_age_range', 'interviewer_gender', 'file_path', 'audio'],
    num_rows: 5079
})
{'utterance_id': 's2501a_Utt0', 'duration': 0.925981, 'buckeye_transcript': 'f ao r f ay v', 'text': 'four five', 'ipa': 'f ɔ ɹ f aɪ v', 'speaker_id': 'S25', 'speaker_gender': 'f', 'speaker_age_range': 'o', 'interviewer_gender': 'm', 'file_path': 'data/buckeye/test/s2501a_Utt0.wav', 'audio': {'bytes': None, 'path': '/work/pi_vcpartridge_umass_edu/multipa/data/buckeye/test/s2501a_Utt0.wav'}}


Map (num_proc=8):   0%|          | 0/5079 [00:00<?, ? examples/s]

Filter (num_proc=8):   0%|          | 0/5079 [00:00<?, ? examples/s]

Filter (num_proc=8):   0%|          | 0/5079 [00:00<?, ? examples/s]

Test data with speech transcriptions
Dataset({
    features: ['utterance_id', 'duration', 'buckeye_transcript', 'text', 'ipa', 'speaker_id', 'speaker_gender', 'speaker_age_range', 'interviewer_gender', 'file_path', 'audio'],
    num_rows: 5079
})
{'utterance_id': 's2501a_Utt0', 'duration': 0.925981, 'buckeye_transcript': 'f ao r f ay v', 'text': 'four five', 'ipa': 'fɔɹfaɪv', 'speaker_id': 'S25', 'speaker_gender': 'f', 'speaker_age_range': 'o', 'interviewer_gender': 'm', 'file_path': 'data/buckeye/test/s2501a_Utt0.wav', 'audio': {'path': '/work/pi_vcpartridge_umass_edu/multipa/data/buckeye/test/s2501a_Utt0.wav', 'array': array([-0.00997925, -0.01052856, -0.00958252, ...,  0.00085449,
        0.00061035,  0.00042725], shape=(14816,)), 'sampling_rate': 16000}}
Test data without speech
Dataset({
    features: ['utterance_id', 'duration', 'buckeye_transcript', 'text', 'ipa', 'speaker_id', 'speaker_gender', 'speaker_age_range', 'interviewer_gender', 'file_path', 'audio'],
    num_rows: 0
})

In [None]:
# Whisper checkpoints to evaluate; each one's text output is converted to IPA with Epitran.
models = [
    "openai/whisper-large-v3-turbo",
    #"openai/whisper-large-v3",
    "openai/whisper-medium.en",
]
for whisper_model in models:
    # Speech recognition followed by Epitran transliteration
    ipa_predictions = hf_model_to_epitran_predict(whisper_model, non_empty_test_data)

    # "/" in the HuggingFace model id is replaced before the name is used for output files.
    run_name = f"{whisper_model.replace('/', '_')}_to_epitran"

    run_metrics = model_evaluator.eval_non_empty_transcriptions(
        run_name,
        ipa_predictions[PREDICTION_KEY],
        non_empty_test_data["ipa"],
    )
    write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR, run_name, non_empty_test_data, ipa_predictions, run_metrics
    )
    model_evaluator.write_edit_distance_results(run_name, EDIT_DIST_DIR)

In [None]:
# Allosaurus models and phone inventories to evaluate.
# Other combinations that can be enabled:
# allosaurus_models = ["uni2005", "eng2102"]
# phone_inventory = ["ipa", "eng"]

allosaurus_models = ["eng2102"]
phone_inventory = ["eng"]

# Fetch every model before running any predictions
for allosaurus_model in allosaurus_models:
    allosaurus.bin.download_model.download_model(allosaurus_model)

# Evaluate each (model, phone inventory) combination against the gold standard
for allosaurus_model in allosaurus_models:
    for inventory in phone_inventory:
        predictions = allosaurus_predict(non_empty_test_data, allosaurus_model, inventory)
        run_name = f"allosaurus_{allosaurus_model}_{inventory}"
        run_metrics = model_evaluator.eval_non_empty_transcriptions(
            run_name, predictions[PREDICTION_KEY], non_empty_test_data["ipa"]
        )
        write_detailed_prediction_results(
            VERBOSE_RESULTS_DIR, run_name, non_empty_test_data, predictions, run_metrics
        )
        model_evaluator.write_edit_distance_results(run_name, EDIT_DIST_DIR)


In [None]:
# Each entry is (HuggingFace model id, phone code of the model output, phone code to convert to).
hf_to_phonecodes_models = [("excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k", "timit", "ipa")]

for model_name, in_code, out_code in hf_to_phonecodes_models: 
    model_predictions = hf_to_phonecodes(non_empty_test_data, model_name, in_code, out_code)
    print(model_predictions)
    metrics = model_evaluator.eval_non_empty_transcriptions(
        model_name, 
        model_predictions[PREDICTION_KEY], 
        non_empty_test_data["ipa"])
    write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR, clean_model_name(model_name), non_empty_test_data, model_predictions, metrics
    )
    # write_edit_distance_results is called on the evaluator, as in the other
    # evaluation cells; the bare function name is never imported or defined here.
    # NOTE(review): model_name still contains "/" at this point - confirm that the
    # evaluator cleans the name before using it in an output path.
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)


Some weights of the model checkpoint at excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.par

In [None]:
# Write all results to file for comparison
# (model_evaluator is the shared ModelEvaluator that each evaluation cell above reported to)
model_evaluator.to_csv(AGGREGATE_METRICS_CSV)