# Notebook to evaluate the Wav2Vec2-Large-XLS-R trained Model

**IMPORTANT**: Please run the notebook cells from top to bottom, following the instructions at each step.

## Step 1: Install all necessary libraries

In [1]:
%%capture
!pip install datasets
!pip install transformers
!pip install huggingface_hub
!pip install torchaudio
!pip install jiwer
!pip install pyctcdecode
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install evaluate

## Step 2: Download the `ZambeziVoice` dataset

In [2]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
from datasets import load_dataset, DatasetDict, Audio
from transformers import AutoFeatureExtractor, pipeline
from typing import Dict
from glob import glob
from tqdm import tqdm

import pandas as pd
import numpy as np
import torchaudio
import librosa
import torch
import os
import re
import argparse
import warnings
#warnings.filterwarnings("error")

In [3]:
from evaluate import load
# Word-error-rate and character-error-rate metrics from the `evaluate` library.
# Both are module-level globals: the scoring helpers defined later read `wer`
# (and `cer`) directly from notebook scope.
wer, cer = load("wer"), load("cer")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Confirm that the *.csv files have been generated.

In [4]:
# Clone the dataset repository into ./bigcgen (audio files plus split lists).
# The recorded run of this cell received about 7.4 GiB, so expect a long download.
!git clone https://github.com/csikasote/bigcgen.git

Cloning into 'bigcgen'...
remote: Enumerating objects: 35681, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 35681 (delta 1), reused 8 (delta 1), pack-reused 35671 (from 4)[K
Receiving objects: 100% (35681/35681), 7.40 GiB | 80.42 MiB/s, done.
Resolving deltas: 100% (89/89), done.
Updating files: 100% (35640/35640), done.


In [5]:
# Locations inside the cloned repository, relative to the current working directory.
# Both keep their trailing slash on purpose: later cells build file names by plain
# string concatenation / f-strings (e.g. audio_path + <file name>, f"{csv_path}*/...").
audio_path = os.path.join(os.getcwd(),"bigcgen/audio/")
csv_path = os.path.join(os.getcwd(),"bigcgen/splits/")

In [6]:
# Delete any *_processed.tsv files left under the split directories by an earlier
# run, so that the following cells start from the raw split files only.
processed_files = glob(f"{csv_path}*/*_processed.tsv")
len(processed_files)  # value is discarded: it is not the last expression of the cell
for f in processed_files:
  os.remove(f)

In [7]:
from glob import glob
def prepare_data(audio_path, csv_path):
    """Write a `<name>_processed.tsv` next to every raw split file.

    For each of the split directories `male`, `female`, `balanced`, `combined`
    and `test` under `csv_path`, every `*.tsv` file is read, its `audio` column
    is prefixed with `audio_path`, rows without an audio entry are dropped, and
    the columns `audio` and `sentence` are written to
    `<split dir>/<name>_processed.tsv`. One line with the output name and its
    row count is printed per file.

    Files that already end in `_processed.tsv` are skipped, so calling this
    function repeatedly is safe: outputs of an earlier call are never treated
    as inputs (which would prefix the audio path a second time and create
    `*_processed_processed.tsv` files).

    Args:
        audio_path: Prefix prepended to each `audio` value by string
            concatenation; it must therefore end with a path separator.
        csv_path: Directory that contains the split sub-directories. A trailing
            separator is accepted but not required.
    """
    processed_suffix = "_processed.tsv"
    split_list = ["male", "female", "balanced", "combined", "test"]
    for split in split_list:
        split_dir = os.path.join(csv_path, split)
        # sorted() makes the processing (and printing) order deterministic.
        for csv_file in sorted(glob(os.path.join(split_dir, "*.tsv"))):
            if csv_file.endswith(processed_suffix):
                continue  # output of a previous run, not a raw split file
            split_file = os.path.basename(csv_file).split(".")[0]
            df = pd.read_csv(csv_file, sep="\t")
            # Concatenation keeps missing audio entries as NaN so dropna can remove them.
            df["audio"] = audio_path + df["audio"]
            df = df.dropna(subset=["audio"])[["audio", "sentence"]]
            df.to_csv(os.path.join(split_dir, f"{split_file}{processed_suffix}"), sep="\t", index=False)
            print(f"{split_file}_processed : ", len(df))

In [8]:
# Write a *_processed.tsv next to every split file; one row count is printed per file.
prepare_data(audio_path, csv_path)

train_male_20hrs_file_processed :  10284
train_male_10hrs_file_processed :  5171
validation_male_file_processed :  441
test_native_and_nonnative_male_file_processed :  461
train_male_5hrs_file_processed :  2577
train_male_15hrs_file_processed :  7749
train_male_30hrs_file_processed :  15463
train_male_25hrs_file_processed :  12857
train_female_30hrs_file_processed :  16036
train_female_15hrs_file_processed :  8008
train_female_25hrs_file_processed :  13347
train_female_10hrs_file_processed :  5352
validation_female_file_processed :  475
train_female_20hrs_file_processed :  10674
train_female_5hrs_file_processed :  2675
test_native_and_nonnative_female_file_processed :  472
validation_balanced_file_processed :  442
train_balanced_file_processed :  2631
train_combined_10hrs_file_processed :  5252
train_combined_5hrs_file_processed :  2631
train_combined_40hrs_file_processed :  20958
train_combined_25hrs_file_processed :  13129
train_combined_30hrs_file_processed :  15757
train_combined_5

In [9]:
# Characters stripped from every transcript before scoring.
# Written as a raw string so the backslashes reach the regex engine unchanged;
# the previous non-raw literal relied on invalid string escapes such as "\," and
# "\?", which recent Python versions report as a SyntaxWarning.
chars_to_remove_regex = r'[\,\_\?\.\$\&\(\)!\-\;\:\"\“\%\‘\”\�\']'

def speech_file_to_array_fn(batch):
    """Normalise the transcript of one example and attach its raw waveform.

    Args:
        batch: A single example with a `sentence` string and an `audio` file path.

    Returns:
        The same mapping with `sentence` stripped of the characters in
        `chars_to_remove_regex` and lower-cased, plus a new `speech` entry
        holding the first channel of the audio file as a NumPy array.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    # The sampling rate reported by torchaudio is not used; no resampling happens here.
    speech_array, _sampling_rate = torchaudio.load(batch["audio"])
    batch["speech"] = speech_array[0].numpy()
    return batch

In [10]:
def prepare_dataset(batch):
    """Turn one decoded audio example into model inputs and label ids.

    Relies on the notebook-level global `processor`.

    Args:
        batch: A single example whose `audio` entry is a mapping with `array`
            and `sampling_rate`, and whose `sentence` entry is the transcript.

    Returns:
        The same mapping with three added entries: `input_values` (processor
        output for the waveform), `input_length` (its length) and `labels`
        (token ids of the transcript).
    """
    audio = batch["audio"]

    # The processor returns a batch of size one; [0] un-batches it.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # `text=` sends the transcript to the tokenizer. It replaces the deprecated
    # `with processor.as_target_processor():` context manager.
    batch["labels"] = processor(text=batch["sentence"]).input_ids
    return batch

In [11]:
def evaluate(batch):
    """Run greedy CTC decoding on a batch of prepared examples.

    Relies on the notebook-level globals `processor` and `model`.

    Args:
        batch: A batched mapping whose `input_values` entry holds one sequence
            per example (as produced by `prepare_dataset`).

    Returns:
        The same mapping with a new `pred_strings` entry: one decoded string
        per example, obtained from the arg-max token at every frame.
    """
    # padding=True brings the variable-length sequences to a common length.
    inputs = processor(batch["input_values"], sampling_rate=16_000, return_tensors="pt", padding=True)

    # Follow the device the model actually lives on instead of assuming "cuda".
    device = model.device
    input_values = inputs.input_values.to(device)
    # Not every feature extractor returns an attention mask; pass None if absent.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

In [12]:
def create_subgroup_wers(result, file_name):
  """Score every utterance on its own and save the scores to `<file_name>_wer.csv`.

  Relies on the notebook-level global `wer` metric.

  Args:
    result: Mapping with parallel sequences `sentence` (references) and
      `pred_strings` (predictions).
    file_name: Prefix of the output file; `_wer.csv` is appended.

  The CSV has a single column `wer` with one percentage per utterance, rounded
  to two decimals. It is written once after all utterances are scored, and it
  is written even when there are no utterances, so a file left by an earlier
  run is never mistaken for the current result.
  """
  references = result["sentence"]
  predictions = result["pred_strings"]
  wers = [
      round(100 * wer.compute(references=[ref], predictions=[pred]), 2)
      for ref, pred in zip(references, predictions)
  ]
  pd.DataFrame(wers, columns=['wer']).to_csv(f'{file_name}_wer.csv', index=False)

In [13]:
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu
def compute_Kruskal():
  """Print the Kruskal-Wallis H-test comparing male and female per-utterance WERs.

  Reads `male_wer.csv` and `female_wer.csv` from the current working directory
  and prints the H statistic and its p-value.
  """
  # Flatten to 1-D samples: the test then returns plain scalars, and SciPy
  # versions that only accept one-dimensional samples work as well.
  male_wers = pd.read_csv("male_wer.csv").to_numpy().ravel()
  female_wers = pd.read_csv("female_wer.csv").to_numpy().ravel()
  kruskal_results = kruskal(male_wers, female_wers)
  print("Kruskal-Wallis Results:")
  # Kruskal-Wallis yields an H statistic (it was previously labelled "F").
  print("H:", kruskal_results[0])
  print("P:", kruskal_results[1])

def compute_OneWayANOVA():
  """Print a one-way ANOVA comparing male and female per-utterance WERs.

  Reads `male_wer.csv` and `female_wer.csv` from the current working directory.
  Each sample is handed to `f_oneway` as a list of one-element rows, so the
  statistic and p-value come back (and are printed) as length-1 arrays.
  The result is flagged significant when the p-value is below 0.05.
  """
  samples = [
      pd.read_csv(csv_name).values.tolist()
      for csv_name in ("male_wer.csv", "female_wer.csv")
  ]
  anova_results = f_oneway(*samples)
  f_statistic, p_values = anova_results[0], anova_results[1]
  sig_value = 'True' if p_values[0] < 0.05 else 'False'
  print("One-Way ANOVA:")
  print("F Statistic:", f_statistic)
  print("P value:", p_values, ":Significant:", sig_value)


def compute_MannWhitneyU():
  """Print the Mann-Whitney U test comparing male and female per-utterance WERs.

  Reads `male_wer.csv` and `female_wer.csv` from the current working directory
  and prints the U statistic of the male sample and the two-sided p-value.
  """
  # Flatten to 1-D samples so the test returns plain scalars.
  male_wers = pd.read_csv("male_wer.csv").to_numpy().ravel()
  female_wers = pd.read_csv("female_wer.csv").to_numpy().ravel()
  # method="auto": SciPy's exact method applies no correction for ties, and
  # rounded WER percentages contain many tied values. "auto" only picks the
  # exact distribution for small, tie-free samples and otherwise uses the
  # tie-corrected asymptotic method.
  U1, p = mannwhitneyu(male_wers, female_wers, method="auto")
  print("Mann Whitney U:")
  print("U:", U1)
  print("P:", p)

### Model and Processor

In [14]:
def load_model(model_id, device=None):
  """Load a CTC model and its processor from the Hugging Face Hub or a local path.

  Args:
    model_id: Identifier passed to `from_pretrained` for both the model and
      the processor.
    device: Device the model is moved to. When omitted, "cuda" is used if a
      CUDA device is available and "cpu" otherwise.

  Returns:
    A `(model, processor)` tuple. The model id is also printed.
  """
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
  model = Wav2Vec2ForCTC.from_pretrained(model_id).to(device)
  processor = Wav2Vec2Processor.from_pretrained(model_id)
  print(model_id)
  return model, processor

In [15]:
def run_model(dataset, split_name, list_splits):
  """Evaluate the currently loaded model on each test split and print its WERs.

  Relies on the notebook-level globals `wer`, `cer`, `model` and `processor`
  (the latter two through `prepare_dataset` and `evaluate`).

  For every name in `list_splits` the file
  `<cwd>/<dataset>/splits/test/test_<name>_file_processed.tsv` is loaded,
  transcribed, and scored. Per-utterance WERs are saved to `<name>_wer.csv`
  by `create_subgroup_wers`. The corpus-level WER of each split is printed
  under the split's own name. When both "male" and "female" were evaluated,
  the bias (male WER minus female WER) is printed and the one-way ANOVA on the
  per-utterance files is run; otherwise both are skipped.

  Args:
    dataset: Name of the dataset directory below the current working directory.
    split_name: Unused; kept so that existing positional calls keep working.
    list_splits: Names of the test splits to evaluate, e.g. ["male", "female"].
  """
  wer_by_split = {}
  cer_by_split = {}  # computed alongside WER; not printed by this function
  for split in list_splits:
    csv_file_tsv = os.path.join(os.getcwd(),f"{dataset}/splits/test/test_{split}_file_processed.tsv")
    test_file = load_dataset("csv", data_files={"test": csv_file_tsv}, delimiter="\t")["test"]
    test_file = test_file.map(speech_file_to_array_fn)
    # Casting makes `datasets` decode the audio column at 16 kHz for prepare_dataset.
    test_file = test_file.cast_column("audio", Audio(sampling_rate=16_000))
    test_file = test_file.map(prepare_dataset)
    result = test_file.map(evaluate, batched=True, batch_size=8)
    create_subgroup_wers(result, split)
    references, predictions = result["sentence"], result["pred_strings"]
    wer_by_split[split] = round(100 * wer.compute(references=references, predictions=predictions), 2)
    cer_by_split[split] = round(100 * cer.compute(references=references, predictions=predictions), 2)

  print(" ")
  # Label every figure with the split it belongs to; this works for any number
  # of splits instead of assuming exactly male, female, combined by position.
  for split in list_splits:
    print("{}: {:.2f}".format(split.capitalize(), wer_by_split[split]))
  if "male" in wer_by_split and "female" in wer_by_split:
    print("Bias: {:.2f}".format(wer_by_split["male"] - wer_by_split["female"]))
    # The ANOVA reads male_wer.csv and female_wer.csv written above.
    compute_OneWayANOVA()

In [16]:
# Evaluation settings for the base-model run.
# dataset     : directory (relative to the working directory) that holds splits/test/
# list_splits : test subsets to score, in the order they are reported
# model_id    : checkpoint handed to load_model()
dataset="bigcgen"
#split_name = "test"
list_splits = ["male", "female", "combined"]
model_id="facebook/mms-1b-all"

In [17]:
# `model` and `processor` must be bound at notebook scope: prepare_dataset() and
# evaluate() read them as globals rather than taking them as arguments.
model, processor = load_model(model_id)

config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

facebook/mms-1b-all


In [19]:
# The second argument (split_name) is passed as None: run_model does not use the
# value supplied for it; the splits to score come from list_splits.
run_model(dataset, None, list_splits)

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/461 [00:00<?, ? examples/s]

Map:   0%|          | 0/461 [00:00<?, ? examples/s]



Map:   0%|          | 0/461 [00:00<?, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Map:   0%|          | 0/472 [00:00<?, ? examples/s]



Map:   0%|          | 0/472 [00:00<?, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/933 [00:00<?, ? examples/s]

Map:   0%|          | 0/933 [00:00<?, ? examples/s]



Map:   0%|          | 0/933 [00:00<?, ? examples/s]

 
Male: 81.710000
Female: 84.360000
Combined: 83.050000
Bias: -2.650000
One-Way ANOVA:
F Statistic: [1.09350506]
P value: [0.29596732] :Significant: False


In [None]:
#!rm -rf /content/toigen

In [None]:
# Evaluation settings for the fine-tuned checkpoints.
# NOTE(review): run_model() reads <cwd>/nyagen/splits/test/test_<split>_file_processed.tsv,
# but no visible cell clones or prepares a `nyagen` dataset - confirm it is available
# before running this section on a fresh kernel.
dataset="nyagen"
#split_name = "test"
list_splits = ["male", "female"]
model_list = ["csikasote/mms-1b-nyagen-male-model",
              "csikasote/mms-1b-nyagen-female-model",
              "csikasote/mms-1b-nyagen-balanced-model",
              "csikasote/mms-1b-nyagen-combined-model"]

In [None]:
# Evaluate each checkpoint in turn. `model` and `processor` are rebound at notebook
# scope on every iteration because prepare_dataset() and evaluate() read them as globals.
for model_id in model_list:
  print(model_id)
  model, processor = load_model(model_id)
  run_model(dataset, None, list_splits)
  print(" - ")  # visual separator between checkpoints

csikasote/mms-1b-nyagen-male-model


config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]



Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]



Map:   0%|          | 0/170 [00:00<?, ? examples/s]

 
Male: 20.970000
Female: 39.440000
Bias: -18.470000
One-Way ANOVA:
F Statistic: [110.5313611]
P value: [9.41658188e-23] :Significant: True
 - 
csikasote/mms-1b-nyagen-female-model


config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]



Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

 
Male: 26.310000
Female: 25.880000
Bias: 0.430000
One-Way ANOVA:
F Statistic: [0.35986493]
P value: [0.54895451] :Significant: False
 - 
csikasote/mms-1b-nyagen-balanced-model


config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]



Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

 
Male: 24.420000
Female: 32.190000
Bias: -7.770000
One-Way ANOVA:
F Statistic: [22.33914018]
P value: [3.27064596e-06] :Significant: True
 - 
csikasote/mms-1b-nyagen-combined-model


config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/393 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]



Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

 
Male: 22.860000
Female: 29.790000
Bias: -6.930000
One-Way ANOVA:
F Statistic: [15.71565003]
P value: [8.86238239e-05] :Significant: True
 - 
