## Notebook to evaluate different models:
- Ours Multipa
- Ctaguchi Model
- Allosaraus Model
- ZIPA Model (Pending Env Issues and IceFall, K2 not on mac?)

### Pending: Zipa Model

### Additional installation step for Epitran

```bash
$ git clone http://github.com/festvox/flite
$ cd flite
$ ./configure && make
$ sudo make install
$ cd testsuite
$ make lex_lookup
$ sudo cp lex_lookup /usr/local/bin
```

In [66]:
from pathlib import Path
from datasets import Audio, Dataset
import pandas as pd
from transformers import pipeline

import allosaurus.app
import allosaurus.bin.download_model

import multipa
import multipa.data_utils
import multipa.evaluate

import numpy as np
import soundfile as sf
import tempfile
import torch

device = -1  # -1 for CPU, or set GPU index if available

# Paths For TIMIT Database and TIMIT IPA
timit_data_dir = Path("/Users/parthbhangla/Desktop/Multipa_Datasets/TIMIT/COMPLETE")
transcriptions_path = Path("/Users/parthbhangla/Desktop/Multipa_Datasets/TIMIT/complete_ipa.csv")

# Models Evaluating
our_model = "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa"
taguchi_1k = "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns"

In [67]:
timit_wavs = [p for p in timit_data_dir.rglob("*") if p.suffix.lower() == ".wav"]
print("Total WAV files found:", len(timit_wavs))

data = [
    {"audio": {"path": str(p)}, "filename": "/" + str(p.relative_to(timit_data_dir.parent)).lower()}
    for p in timit_wavs
]

audio_dataset = Dataset.from_list(data)
audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=16_000))

# Test with a small subset if wanted
audio_subset = audio_dataset.select([i for i in list(range(10))])

Total WAV files found: 6300


In [68]:
def run_allosaurus(audio_dataset, model="eng2102", phone_inventory="ipa"):
    print(f"Running Allosaurus with model={model}, phone_inventory={phone_inventory}")
    recog = allosaurus.app.read_recognizer(model)
    predictions = []

    for f in audio_dataset["filename"]:
        wav_path = timit_data_dir.parent / f.lstrip("/")

        data, sr = sf.read(wav_path)
        with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
            sf.write(tmp.name, data, sr, format="WAV", subtype="PCM_16")
            pred = recog.recognize(tmp.name, phone_inventory)

        predictions.append(pred.replace(" ", ""))

    return predictions

allosaurus.bin.download_model.download_model("eng2102")

In [69]:
models = [our_model, taguchi_1k]

for model_name in models:
    print(f"Running ASR for model: {model_name}")
    
    asr_pipe = pipeline("automatic-speech-recognition", model=model_name, device=device)
    
    predictions = asr_pipe(audio_subset["audio"])
    
    cleaned_predictions = [
        multipa.data_utils.clean_text(x, is_remove_space=True, text_key="text")["text"]
        for x in predictions
    ]

    audio_subset = audio_subset.add_column(name=model_name, column=cleaned_predictions)

Running ASR for model: ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa
Running ASR for model: ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns


Some weights of the model checkpoint at ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.we

In [70]:
allosaurus_predictions = run_allosaurus(audio_subset, model="eng2102", phone_inventory="ipa")
audio_subset = audio_subset.add_column("allosaurus_eng2102_ipa", allosaurus_predictions)

Running Allosaurus with model=eng2102, phone_inventory=ipa


  model_state_dict = torch.load(str(path), map_location=torch.device('cpu'))


In [71]:
gold_standard_df = pd.read_csv(transcriptions_path)

gold_standard_df["clean_ipa"] = gold_standard_df["ipa_transcription"].apply(
    lambda x: "".join(str(x).split())
)

gold_standard_df["filename"] = gold_standard_df["audio_filename"].str.lower()

audio_subset = audio_subset.map(lambda x: {**x, "filename": x["filename"].lower()})

predictions_df = audio_subset.to_pandas()

full_comparison_df = pd.merge(
    gold_standard_df,
    predictions_df,
    on="filename"
)

full_comparison_df = full_comparison_df.drop(
    columns=["audio_filename", "ipa_transcription", "audio"]
)

full_comparison_df = full_comparison_df[[
    "filename", 
    "clean_ipa", 
    "ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa", 
    "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns",
    "allosaurus_eng2102_ipa"
]]

print(full_comparison_df.head())

output_path = Path("timit_subset_with_actual_and_predictions.csv")
full_comparison_df.to_csv(output_path, index=False)
print(f"Merged dataset saved to {output_path.resolve()}")

Map: 100%|██████████| 10/10 [00:00<00:00, 50.20 examples/s]


                         filename  \
0     /complete/dr4/mmdm0/sa1.wav   
1     /complete/dr4/mmdm0/sa2.wav   
2  /complete/dr4/mmdm0/si1311.wav   
3  /complete/dr4/mmdm0/si1941.wav   
4   /complete/dr4/mmdm0/si681.wav   

                                           clean_ipa  \
0                   ʃiædjɚdɑɹksʉɾɨngɹiziwɔʃwɑɾɚɔljɪɹ   
1                     doʊɾ̃æsmiɾɨkɪɹiɛɾ̃ɔliɹæglʌkðæt   
2  sɔlɹidɛpɨzɪʃɨnɨzɨvɑjuweɪɾɨdbaɪwɔʃɨŋklinswɑtʃɨz...   
3                                wl̩tɔkoʊvɚɨtjɝɑfə̥s   
4                     wɨdsʌtʃɨnæktɨvɹɨfjʉʒl̩bijʉsfl̩   

  ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa  \
0                ʃiædjɹ̩dɑɹksuɾɪnɡɹiziwʌʃwɑɾɹ̩aʊljiɹ          
1                   doʊɾ̃æskmiɾɪkɛɹiɛnoʊliɹæɡlaɪkðæʔ          
2  soʊlɹidɛpʊzɪʃɪnhɪzɪvæjueɪɾɪdbaɪwɑʃɪŋklinswɑtʃɪ...          
3                                wʊltɔkoʊvɹ̩ɪtjɹ̩fɪs          
4                     wɪθsʌtʃɪnæktʌvɹɪfjuzl̩bijusfl̩          

  ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns  \
0

In [72]:
model_eval = multipa.evaluate.ModelEvaluator()
gold_col = "clean_ipa"
model_names = [our_model, taguchi_1k, "allosaurus_eng2102_ipa"]

def extract_dialect(path_str):
    path = Path(path_str)
    parts = [p for p in path.parts if p.lower().startswith("dr")]
    return parts[0].upper() if parts else "UNKNOWN"

full_comparison_df["dialect"] = full_comparison_df["filename"].apply(extract_dialect)
print("Dialect groups found:", full_comparison_df["dialect"].unique())

summary_data = {}
dialect_results = []

for model_name in model_names:
    print(f"Evaluating model: {model_name}")
    
    predictions = full_comparison_df[model_name].tolist()
    references = full_comparison_df[gold_col].tolist()
    
    metrics = model_eval.eval_non_empty_transcriptions(model_name, predictions, references)

    for metric_name in ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]:
        col_name = f"{metric_name} VS {model_name}"
        full_comparison_df[col_name] = metrics[metric_name]

    summary_data[model_name] = {
        metric_name: float(np.mean(metrics[metric_name]))
        for metric_name in ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]
    }

    for dialect, df_group in full_comparison_df.groupby("dialect"):
        result_row = {
            "dialect": dialect,
            "model": model_name,
        }
        for metric_name in ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]:
            col_name = f"{metric_name} VS {model_name}"
            result_row[metric_name] = df_group[col_name].mean()
        dialect_results.append(result_row)


summary_df = pd.DataFrame(summary_data).T
summary_df = summary_df[["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]]
summary_df = summary_df.reset_index()
summary_df = summary_df.rename(columns={"index": "model"})
summary_df.to_csv("timit_model_evaluation_summary.csv", index=False)
print("Average evaluation metrics per model saved to timit_model_evaluation_summary.csv")


dialect_summary_df = pd.DataFrame(dialect_results)
dialect_summary_df.to_csv("timit_dialect_model_comparison.csv", index=False)
print("Dialect evaluation complete. Results saved to timit_dialect_model_comparison.csv")

Dialect groups found: ['DR4']
Evaluating model: ginic/data_seed_bs64_4_wav2vec2-large-xlsr-53-buckeye-ipa
Evaluating model: ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns
Evaluating model: allosaurus_eng2102_ipa
Average evaluation metrics per model saved to timit_model_evaluation_summary.csv
Dialect evaluation complete. Results saved to timit_dialect_model_comparison.csv
