Install all required dependencies.

In [None]:
# Install the packages imported by this notebook: pywhispercpp, jiwer and
# sentence_transformers. spaCy and its en_core_web_sm model are left commented
# out, so they are not installed by this cell.
! pip3 install pywhispercpp jiwer sentence_transformers # spacy
# ! python3 -m spacy download en_core_web_sm

Collect the base names of all available audio files in `samples/audio`. Each audio file is assumed to have a matching `.txt` file in `samples/truth`.

In [None]:
import os

# Directory layout used by every cell in this notebook. The trailing slashes
# matter: later cells build paths by plain string concatenation.
audio_dir = "samples/audio/"
transcription_dir = "samples/transcription/"
truth_dir = "samples/truth/"

files = os.listdir(audio_dir)

# Base names (no extension) of the audio samples to process.
# Only ".wav" entries are kept because the transcription cell opens
# f"{audio_dir}{name}.wav"; anything else in the directory (hidden files such
# as .DS_Store, sub-directories, other formats) would produce a path that does
# not exist. The match is case-sensitive on purpose so that the rebuilt
# "<name>.wav" path is exactly the file that was listed.
# Sorting makes the processing order deterministic, since os.listdir returns
# entries in arbitrary order.
filenames = sorted(
    name
    for name, ext in (os.path.splitext(entry) for entry in files)
    if ext == ".wav"
)

print(filenames)

Use the Python bindings for whisper.cpp to run Whisper inference for speech recognition.

In [None]:
from pywhispercpp.model import Model

# Whisper model to load, passed to pywhispercpp together with the thread count
# and the local directory used for model files.
desired_whisper_model = "base.en"
model = Model(desired_whisper_model, n_threads=6, models_dir="./models")

# Create the output directory up front so the first write below cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(transcription_dir, exist_ok=True)

# Transcribe every sample and store the result as
# "<transcription_dir><name>.txt".
for filename in filenames:
    audio_file = f"{audio_dir}{filename}.wav"
    print(audio_file)
    segments = model.transcribe(audio_file, speed_up=True)

    # Join the text of all segments once; an empty result yields "".
    transcript = " ".join(segment.text for segment in segments)

    transcript_file = f"{transcription_dir}{filename}.txt"
    # Explicit UTF-8 so non-ASCII characters in a transcript do not depend on
    # the platform's default encoding.
    with open(transcript_file, "w", encoding="utf-8") as f:
        f.write(transcript)

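Calculate Word Error Rate (WER) and Word Information Lost (WIL). WER measures word-level accuracy: the number of word substitutions, deletions and insertions relative to the number of words in the truth. WIL estimates the proportion of word information lost between the truth and the transcript. Both metrics compare words, not meaning; semantic similarity is estimated separately, as described below.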

A sentence embedding model is used to compute the cosine similarity between the truth and the transcript.

In [None]:
from jiwer import wer, process_words
from sentence_transformers import SentenceTransformer, util

# import spacy

# Embedding model used for the (experimental) document-similarity score.
# It is bound to `embedding_model` rather than `model` so that it does not
# overwrite the Whisper `model` created by the transcription cell; otherwise
# re-running that cell after this one would fail.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# nlp = spacy.load("en_core_web_sm")


# Score every transcript against its ground-truth text.
for filename in filenames:
    transcript_file = f"{transcription_dir}{filename}.txt"
    with open(transcript_file, "r", encoding="utf-8") as f:
        transcript = f.read()

    truth_file = f"{truth_dir}{filename}.txt"
    with open(truth_file, "r", encoding="utf-8") as f:
        truth = f.read()

    # jiwer takes the reference first and the hypothesis second.
    output = process_words(truth, transcript)

    # Deliberately not named `wer`: that would shadow the `wer` function
    # imported from jiwer at the top of this cell.
    word_error_rate = output.wer
    word_information_lost = output.wil

    # Disabled experiment: per-sentence similarity using spaCy sentence
    # splitting. Kept for reference only.
    #
    # truth_sentences = [sent.text for sent in nlp(truth).sents]
    # transcript_sentences = [sent.text for sent in nlp(transcript).sents]

    # similarities = []
    # for sent1 in truth_sentences:
    #     embedding_1 = model.encode(sent1, convert_to_tensor=True)

    # for sent2 in transcript_sentences:
    #     embedding_2 = model.encode(sent2, convert_to_tensor=True)
    #     similarity = util.pytorch_cos_sim(embedding_1, embedding_2).item()
    #     similarities.append(similarity)

    # average_similarity = sum(similarities) / len(similarities)

    # print(
    #     f"[{filename}] (EXPERIMENTAL) Sentence Similarity: {average_similarity}"
    # )

    # Embed each document as a whole and compare the two vectors.
    # NOTE(review): the embedding model presumably truncates long inputs, so
    # for long documents only the beginning may be compared - confirm.
    truth_embedding = embedding_model.encode(truth, convert_to_tensor=True)
    transcript_embedding = embedding_model.encode(transcript, convert_to_tensor=True)

    document_similarity = util.pytorch_cos_sim(
        truth_embedding, transcript_embedding
    ).item()

    print(f"[{filename}] Word Error Rate: {word_error_rate}")
    print(f"[{filename}] Word Information Loss: {word_information_lost}")
    print(f"[{filename}] (EXPERIMENTAL) Document Similarity: {document_similarity}")