In [1]:
import pandas as pd
import pickle
df_19 = pd.read_csv("../data/cv-corpus-19.0-delta-2024-09-13/en/validated.tsv", delimiter="\t")
df_21 = pd.read_csv("../data/cv-corpus-21.0-delta-2025-03-14/en/validated.tsv", delimiter="\t")
df_17 = pd.read_csv('../data/cv-corpus-17.0-delta-2024-03-15/en/validated.tsv', delimiter="\t")

embeddings_17 = {}
with open('../data/cv_17.pkl', 'rb') as file:
    embeddings_17 = pickle.load(file)
embeddings_19 = {}
with open('../data/cv_19.pkl', 'rb') as file:
    embeddings_19 = pickle.load(file)
embeddings_21 = {}
with open('../data/cv_21.pkl', 'rb') as file:
    embeddings_21 = pickle.load(file)


In [2]:
from transformers import AutoProcessor, Wav2Vec2ForCTC

model_checkpoint = "facebook/wav2vec2-base-960h"
model = Wav2Vec2ForCTC.from_pretrained(model_checkpoint)
processor = AutoProcessor.from_pretrained(model_checkpoint)
lm_head = model.lm_head
vocab = processor.tokenizer.vocab.keys()



  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import torch
from evaluate import load
import re
from tqdm import tqdm 
corpora = [17, 19, 21]
dfs = [df_17, df_19, df_21]
embedding_dicts = [embeddings_17, embeddings_19, embeddings_21]
rows = []
for df_idx, df in enumerate(dfs):
    paths = df.path.to_list()
    genders = df.gender.to_list()
    sentences = df.sentence.to_list()
    curr_dict = embedding_dicts[df_idx]
    for idx, path in tqdm(enumerate(paths)):
        if genders[idx] in ["male_masculine", "female_feminine"]:
            embedding = curr_dict[path.split(".")[0]]
            predictions = lm_head.forward(embedding)
            predicted_ids = torch.argmax(predictions, dim=-1)
            transcription = processor.batch_decode(predicted_ids)
            truth = sentences[idx]
            #clean
            truth = re.sub(r'[^\w\s]', '', truth)
            truth = truth.lower()
            wer = load("wer")
            cer = load("cer")
            prediction = transcription[0].lower()
            wer_score = wer.compute(predictions=[prediction], references=[truth])
            cer_score = cer.compute(predictions=[prediction], references=[truth])
            rows.append([path, genders[idx], prediction, truth, wer_score, cer_score, corpora[df_idx]])
df = pd.DataFrame(rows, columns=["path", "gender", "prediction", "truth", "wer", "cer", "cv_corpus"])
df.to_csv("predictions.csv", index=False)

378it [02:18,  1.15it/s]Using the latest cached version of the module from /Users/rafkiep1/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--cer/9cb90b752d5f15fb41161efdbefd13570adb3f32fa157290d8a55093c47428e1 (last modified on Fri Apr 18 12:20:21 2025) since it couldn't be found locally at evaluate-metric--cer, or remotely on the Hugging Face Hub.
1877it [28:29,  1.10it/s]
137it [02:00,  1.13it/s]
249it [03:29,  1.19it/s]


In [12]:
df_female = df[df["gender"] == "female_feminine"]
df_male = df[df["gender"] == "male_masculine"]
df_male.wer.describe(), df_male.cer.describe()


(count    1366.000000
 mean        0.207599
 std         0.226451
 min         0.000000
 25%         0.000000
 50%         0.153846
 75%         0.285714
 max         1.600000
 Name: wer, dtype: float64,
 count    1366.000000
 mean        0.070853
 std         0.101548
 min         0.000000
 25%         0.000000
 50%         0.037037
 75%         0.088235
 max         0.652174
 Name: cer, dtype: float64)

In [13]:
df_female.wer.describe(), df_female.cer.describe()

(count    318.000000
 mean       0.276328
 std        0.252172
 min        0.000000
 25%        0.083333
 50%        0.222222
 75%        0.428571
 max        1.500000
 Name: wer, dtype: float64,
 count    318.000000
 mean       0.100528
 std        0.106244
 min        0.000000
 25%        0.017395
 50%        0.067232
 75%        0.163288
 max        0.500000
 Name: cer, dtype: float64)