# By-phone error rates: Buckeye test set
This notebook computes and compares phone-level error rates across different models. This helps us understand which phones are most frequently mistaken by the models we're interested in. Conversely, we can also see which phones the models perform well on.

In [None]:
import collections
from pathlib import Path
import re

import ipatok
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from phonecodes import phonecode_tables
import seaborn as sns

from multipa.evaluation import ModelEvaluator, PREDICTION_KEY

# Show complete dataframes in notebook output instead of truncating them
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

# Directory containing one detailed-prediction CSV per evaluated model
DETAILED_EVAL_PATH = Path("../data/evaluation_results/detailed_predictions/")

# File name ending shared by all of the wav2vec2 models fine-tuned on Buckeye
_BUCKEYE_FINETUNED_SUFFIX = "wav2vec2-large-xlsr-53-buckeye-ipa_detailed_predictions.csv"

# These are the models we're actually interested in evaluating in detail
DETAILED_EVAL_CSVS = [
    # Models trained on full 20K samples (full dataset but gender balanced), runs 1-5
    *(f"data_models_train_duration_20000_samples_{run}_{_BUCKEYE_FINETUNED_SUFFIX}" for run in range(1, 6)),
    # Models trained on entire dataset, runs 1-5
    *(f"ginic_full_dataset_train_{run}_{_BUCKEYE_FINETUNED_SUFFIX}" for run in range(1, 6)),
    # Third party comparison models
    #"openai_whisper-large-v3-turbo_to_epitran_detailed_predictions.csv",
    "openai_whisper-medium.en_to_epitran_detailed_predictions.csv",
    "allosaurus_eng2102_eng_detailed_predictions.csv",
    #"facebook_wav2vec2-xlsr-53-espeak-cv-ft_detailed_predictions.csv",
    "facebook_wav2vec2-lv-60-espeak-cv-ft_detailed_predictions.csv",
    "ctaguchi_wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns_detailed_predictions.csv",
]

# Column of the detailed-prediction CSVs that is used as the reference transcription
REFERENCE_COL = "ipa"

# Every IPA symbol that the Buckeye-to-IPA mapping can produce
VALID_BUCKEYE_PHONES = set(phonecode_tables._buckeye2ipa.values())

# Vowel inventory used to split the phone set into vowels and consonants for plotting.
# This doesn't include all the nasalized vowels, only the ones that we computed the Pillai scores for
# NOTE: the nasalized flap "ɾ̃" is intentionally NOT listed here. It is a consonant, so it must
# fall into BUCKEYE_CONSONANTS (and therefore onto the consonant plots, not the vowel plots).
BUCKEYE_VOWELS = {"ɑ", "æ", "ʌ", "ɔ", "aʊ", "aɪ", "ɛ", "ɹ̩", "eɪ", "ɪ", "i", "oʊ", "ɔɪ", "ʊ", "u", "æ̃", "ɔ̃",
                  "ə̃", "ĩ", "ẽɪ̃", "õʊ̃", "ãɪ̃", "ɑ̃", "ũ", "ə", "ɛ̃", "ʊ̃", "ãʊ̃", "ʌ̃", "ɪ̃", "ɹ̩̃", "ɔ̃ɪ̃"}

# Everything in the Buckeye inventory that is not listed as a vowel
BUCKEYE_CONSONANTS = VALID_BUCKEYE_PHONES - BUCKEYE_VOWELS

# Fixed ordering of the model groups so that each one keeps the same color across plots
HUE_ORDER = [
    "full_dataset_train",
    "train_duration_20000_samples",
    "openai_whisper-medium.en_to_epitran",
    "allosaurus_eng2102_eng",
    "facebook_wav2vec2-lv-60-espeak-cv-ft",
    "ctaguchi_wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
]

# Position of every model group within HUE_ORDER
HUE_ORDER_INDEX = dict(zip(HUE_ORDER, range(len(HUE_ORDER))))

PALETTE = "colorblind"

# Make the chosen palette the seaborn default for the plots below
sns.set_palette(PALETTE)

def diphthong_merge(t1, t2, dipthongs):
    """Decide whether two adjacent tokens from ipatok.tokenise should be merged.

    Returns True when the concatenation ``t1 + t2`` is a member of
    ``dipthongs`` and False otherwise.
    """
    return (t1 + t2) in dipthongs

# Merge callback for Buckeye tokenization, kept as a two-argument function for convenience
def buckeye_merge_func(x, y):
    """Return True when ``x + y`` is one of VALID_BUCKEYE_PHONES."""
    return diphthong_merge(x, y, VALID_BUCKEYE_PHONES)

def get_model_group(model_name):
    """Map a model name onto the experiment group it belongs to.

    Names starting with one of the known experiment prefixes are collapsed
    to that prefix; any other name is returned unchanged, so it forms its
    own group.
    """
    known_groups = ("train_duration_20000_samples", "full_dataset_train")
    matching_groups = (group for group in known_groups if model_name.startswith(group))
    return next(matching_groups, model_name)

def compute_error_rate_confidence_intervals_df(error_rate_df, count_df, error_rate_join_key, count_join_key, error_rate_col, count_col, interval_const = 1.96):
    """Attach a confidence-interval half-width to every error rate.

    The error-rate table is inner-joined with the count table, then a
    "confidence_interval" column is added holding
    ``interval_const * sqrt(p * (1 - p) / n)``, where ``p`` is the value in
    ``error_rate_col`` and ``n`` the value in ``count_col``. See
    https://machinelearningmastery.com/report-classifier-performance-confidence-intervals/
    The default ``interval_const`` of 1.96 gives a 95% confidence interval.

    Args:
        error_rate_df: Dataframe containing the error rates.
        count_df: Dataframe containing the number of observations behind each rate.
        error_rate_join_key: Join column(s) in ``error_rate_df``.
        count_join_key: Join column(s) in ``count_df``.
        error_rate_col: Name of the error-rate column.
        count_col: Name of the count column.
        interval_const: Multiplier applied to the standard error.

    Returns:
        The joined dataframe with the extra "confidence_interval" column.
        Rows without a partner in the other table are dropped by the inner join.
    """
    merged = error_rate_df.merge(
        count_df,
        how="inner",
        left_on=error_rate_join_key,
        right_on=count_join_key,
    )
    rates = merged[error_rate_col]
    sample_sizes = merged[count_col]
    standard_error = np.sqrt((rates * (1 - rates)) / sample_sizes)
    merged["confidence_interval"] = interval_const * standard_error
    return merged

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load every model's detailed predictions and re-generate the comparison metrics
MODEL_EVALUATOR = ModelEvaluator(
    tokenise_options={"diphthongs": True, "merge": buckeye_merge_func}
)

# (file name prefix, regex extracting the short run name) for our own fine-tuned models
_RUN_NAME_PATTERNS = (
    ("data_models_train_duration", r'train_duration_20000_samples_[1-5]'),
    ("ginic_full_dataset_train", r'full_dataset_train_[1-5]'),
)

for csv in DETAILED_EVAL_CSVS:
    print("Processing", csv)
    for _file_prefix, _run_pattern in _RUN_NAME_PATTERNS:
        if csv.startswith(_file_prefix):
            model_name = re.search(_run_pattern, csv).group()
            break
    else:
        # Any other model is named after its file, minus the shared suffix
        model_name = csv.removesuffix("_detailed_predictions.csv")

    # Read predictions and references as plain strings; empty cells stay "" instead of NaN
    model_results = pd.read_csv(
        DETAILED_EVAL_PATH / csv,
        keep_default_na=False,
        dtype={PREDICTION_KEY: str, REFERENCE_COL: str},
    )
    # After the loop this still holds the reference column of the last CSV read;
    # the token-count cell further down tokenises it
    latest_ref_col = model_results[REFERENCE_COL]
    MODEL_EVALUATOR.eval_edit_distances(
        model_name,
        model_results[PREDICTION_KEY],
        latest_ref_col,
        compute_by_token_error_rates=True,
    )


In [None]:
# Sanity check token counts against actual vocabulary
# The invalid token warnings are harmless, it's just removing some "NOISE" and disfluency markers that snuck through
# data preprocessing, but don't affect training
tokens = [
    token
    for ref in latest_ref_col
    for token in ipatok.tokenise(ref, diphthongs=True, merge=buckeye_merge_func)
]
test_token_counts = collections.Counter(tokens)

# Report every token outside the Buckeye inventory; these are left out of the final counts
for token in test_token_counts:
    if token not in VALID_BUCKEYE_PHONES:
        print("REMOVING INVALID TOKEN:", token, token.encode("unicode-escape"))

# Every valid phone gets an entry, with a count of 0 when it never occurs in the references
final_token_counts = {phone: test_token_counts[phone] for phone in VALID_BUCKEYE_PHONES}

token_counts_df = pd.DataFrame.from_records(
    list(final_token_counts.items()),
    columns=["phone", "counts"],
).sort_values("counts", ascending=False)

display(token_counts_df)

In [None]:
# Flatten the evaluator's per-model results into (model, phone, error rate) rows for analysis
records = []
for model_name, eval_results in MODEL_EVALUATOR.results_to_write.items():
    by_phone_error_rates = eval_results[MODEL_EVALUATOR.by_token_error_rates]
    # Only phones from the Buckeye inventory are kept
    records.extend(
        (model_name, phone, err_rate)
        for phone, err_rate in by_phone_error_rates.items()
        if phone in VALID_BUCKEYE_PHONES
    )

all_error_rates_df = pd.DataFrame.from_records(
    records,
    columns=["model_name", "phone", "err_rate"],
)
all_error_rates_df["model_group"] = all_error_rates_df["model_name"].apply(get_model_group)

# Join in the reference token counts and add the confidence-interval column
all_error_rates_df = compute_error_rate_confidence_intervals_df(
    error_rate_df=all_error_rates_df,
    count_df=token_counts_df,
    error_rate_join_key="phone",
    count_join_key="phone",
    error_rate_col="err_rate",
    count_col="counts",
)

In [None]:
# Quick look at the first rows of the combined error-rate table
display(all_error_rates_df.head(n=5))

In [None]:
def plot_error_rates_by_phone_and_model(dataframe, groupby_key, xlabel="Phone", ylabel="Normalized Error Rate", title="Error rates by phone on the Buckeye test set", fontsize=14, use_confidence_intervals=False, phone_col="phone", err_rate_col="err_rate", confidence_interval_col="confidence_interval", figsize=(25, 4), legend_title="Experiment group/Model", palette=PALETTE, hue_order = HUE_ORDER):
    """Plot per-phone error rates as one line per model or model group.

    Phones are placed along the x-axis in ascending order of their minimum
    error rate over all rows of ``dataframe``; phones with the same minimum
    are ordered by the phone symbol so the order is identical for every line.

    Args:
        dataframe: Table holding at least ``phone_col``, ``err_rate_col`` and
            ``groupby_key`` (plus ``confidence_interval_col`` when
            ``use_confidence_intervals`` is True). It is not modified.
        groupby_key: Column whose values define the individual lines.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
        title: Plot title.
        fontsize: Font size used for labels, title, ticks and legend.
        use_confidence_intervals: If True, draw every group in ``hue_order``
            with matplotlib as a line plus a shaded band spanning
            error rate +/- confidence interval. If False, draw with
            ``seaborn.lineplot`` using ``groupby_key`` for hue and style.
        phone_col: Name of the phone column.
        err_rate_col: Name of the error-rate column.
        confidence_interval_col: Name of the confidence-interval column.
        figsize: Figure size passed to matplotlib.
        legend_title: Title of the legend.
        palette: Palette passed to ``sns.color_palette`` / ``sns.lineplot``.
        hue_order: Values of ``groupby_key`` in the order colors are assigned.

    Returns:
        None. The plot is drawn on a newly created figure.
    """
    # Lowest error rate seen for each phone decides its position on the x-axis
    group_order = dataframe.groupby(phone_col)[err_rate_col].min().sort_values()
    tmp_df = dataframe.copy(deep=True)
    tmp_df["sort_order"] = tmp_df[phone_col].map(group_order)
    # Tie-break on the phone itself: without it, phones sharing the same minimum
    # could come out in a different relative order for different groups, which
    # makes the per-group lines double back on the categorical x-axis
    tmp_df = tmp_df.sort_values(["sort_order", phone_col])

    if use_confidence_intervals:
        tmp_df["upper"] = tmp_df[err_rate_col] + tmp_df[confidence_interval_col]
        tmp_df["lower"] = tmp_df[err_rate_col] - tmp_df[confidence_interval_col]
        _, g = plt.subplots(figsize=figsize)
        # Ask for exactly one color per group; seaborn cycles the palette when it
        # is shorter than hue_order, so lookups can never run past the end
        group_colors = sns.color_palette(palette, n_colors=len(hue_order))
        for color, group in zip(group_colors, hue_order):
            group_df = tmp_df[tmp_df[groupby_key] == group]
            x = group_df[phone_col]
            g.plot(x, group_df[err_rate_col], label=group, color=color)
            g.plot(x, group_df["lower"], color=color, alpha=0.2)
            g.plot(x, group_df["upper"], color=color, alpha=0.2)
            # Pass the color explicitly so the band always matches its line,
            # regardless of matplotlib's current default color cycle
            g.fill_between(x, group_df["lower"], group_df["upper"], color=color, alpha=0.2)
    else:
        plt.figure(figsize=figsize)
        g = sns.lineplot(data=tmp_df, y=err_rate_col, x=phone_col, hue=groupby_key, style=groupby_key, palette=palette, hue_order=hue_order)

    g.set_xlabel(xlabel, fontsize=fontsize)
    g.set_ylabel(ylabel, fontsize=fontsize)
    g.set_title(title, fontsize=fontsize)
    g.tick_params(labelsize=fontsize)
    g.set_ylim(0, 1)
    # Place the legend outside the plotting area, to the right of the axes
    g.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=fontsize, title=legend_title, title_fontsize=fontsize)



In [None]:
# Consonant error rates for all models, one line per model group
consonant_df = all_error_rates_df.loc[all_error_rates_df["phone"].isin(BUCKEYE_CONSONANTS)]
plot_error_rates_by_phone_and_model(
    consonant_df,
    "model_group",
    title="Error rates for consonants on the Buckeye test set",
)

In [None]:
# Vowel error rates for all models, one line per model group
vowel_df = all_error_rates_df.loc[all_error_rates_df["phone"].isin(BUCKEYE_VOWELS)]
plot_error_rates_by_phone_and_model(
    vowel_df,
    "model_group",
    title="Error rates for vowels on the Buckeye test set",
)

In [None]:
# Restrict the table to the two experiment groups and list their runs in plotting order
_our_model_groups = ["train_duration_20000_samples", "full_dataset_train"]
our_models = all_error_rates_df[all_error_rates_df["model_group"].isin(_our_model_groups)]
model_orders = [f"{group}_{run}" for group in _our_model_groups for run in range(1, 6)]

In [None]:
# Consonants for the two experiment groups only: one line per run, with confidence bands
our_models_consonants = our_models.loc[our_models["phone"].isin(BUCKEYE_CONSONANTS)]
plot_error_rates_by_phone_and_model(
    our_models_consonants,
    "model_name",
    title="AutoIPA model error rates for consonants on the Buckeye test set\nwith confidence intervals",
    use_confidence_intervals=True,
    hue_order=model_orders,
)

In [None]:
# Same consonant data, one line per run, drawn without the confidence bands
plot_error_rates_by_phone_and_model(
    our_models_consonants,
    groupby_key="model_name",
    title="AutoIPA model error rates for consonants on the Buckeye test set",
    hue_order=model_orders,
)

In [None]:
# Vowels for the two experiment groups only: one line per run, with confidence bands
our_models_vowels = our_models.loc[our_models["phone"].isin(BUCKEYE_VOWELS)]
plot_error_rates_by_phone_and_model(
    our_models_vowels,
    "model_name",
    title="AutoIPA model error rates for vowels on the Buckeye test set\nwith confidence intervals",
    use_confidence_intervals=True,
    hue_order=model_orders,
)

In [None]:
# Same vowel data, one line per run, drawn without the confidence bands
plot_error_rates_by_phone_and_model(
    our_models_vowels,
    groupby_key="model_name",
    title="AutoIPA model error rates for vowels on the Buckeye test set",
    hue_order=model_orders,
)

In [None]:
# Consonants again, but with the runs pooled into one line per experiment group
plot_error_rates_by_phone_and_model(
    our_models_consonants,
    groupby_key="model_group",
    title="Error rates for consonants on the Buckeye test\naveraged across AutoIPA experiment groups",
    hue_order=["full_dataset_train", "train_duration_20000_samples"],
)

In [None]:
# Vowels again, but with the runs pooled into one line per experiment group
plot_error_rates_by_phone_and_model(
    our_models_vowels,
    groupby_key="model_group",
    title="Error rates for vowels on the Buckeye test\naveraged across AutoIPA experiment groups",
    hue_order=["full_dataset_train", "train_duration_20000_samples"],
)