Install all required dependencies via the CLI:

```bash
pip3 install pywhispercpp jiwer sentence_transformers # spacy
# python3 -m spacy download en_core_web_sm
```

Choose the Whisper model identifier that you would like to use.

In [1]:
desired_whisper_model = input("Whisper model version (e.g., base.en, base):")

Read in all available audio files from `samples/audio`. Assumes that the audio files all have a matching file within `sample/truth`.

In [2]:
import os

audio_dir = "samples/audio/"
transcription_dir = "samples/transcription/"
truth_dir = "samples/truth/"

files = os.listdir(audio_dir)

filenames = []
for file in files:
    name, ext = os.path.splitext(file)
    filenames.append(name)

print(filenames)

['3min47sec', '13min56sec', '0min12sec']


Use the python binding for whisper.cpp to inference whisper for audio recognition.

In [3]:
from pywhispercpp.model import Model

model = Model(desired_whisper_model, n_threads=6, models_dir="./models")

for filename in filenames:
    audio_file = f"{audio_dir}{filename}.wav"
    print(audio_file)
    segments = model.transcribe(audio_file, speed_up=True)

    transcript = ""
    for segment in segments:
        transcript = " ".join([seg.text for seg in segments])

    transcript_file = f"{transcription_dir}{filename}.txt"
    with open(transcript_file, "w") as f:
        f.write(transcript)

Downloading Model small.en ...: 100%|██████████| 465M/465M [00:38<00:00, 12.7MiB/s] 

[2023-10-12 17:34:27,957] {utils.py:63} INFO - Model downloaded to /home/jlaw/defense-unicorns/whisper-cpu-api/models/ggml-small.en.bin
[2023-10-12 17:34:27,969] {model.py:221} INFO - Initializing the model ...



whisper_init_from_file_no_state: loading model from '/home/jlaw/defense-unicorns/whisper-cpu-api/models/ggml-small.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab       = 51864
whisper_model_load: n_audio_ctx   = 1500
whisper_model_load: n_audio_state = 768
whisper_model_load: n_audio_head  = 12
whisper_model_load: n_audio_layer = 12
whisper_model_load: n_text_ctx    = 448
whisper_model_load: n_text_state  = 768
whisper_model_load: n_text_head   = 12
whisper_model_load: n_text_layer  = 12
whisper_model_load: n_mels        = 80
whisper_model_load: ftype         = 1
whisper_model_load: type          = 3
whisper_model_load: mem required  =  743.00 MB (+   16.00 MB per decoder)
whisper_model_load: adding 1607 extra tokens
whisper_model_load: model ctx     =  464.56 MB
whisper_model_load: model size    =  464.44 MB
whisper_init_state: kv self size  =   15.75 MB
whisper_init_state: kv cross size =   52.73 MB


samples/audio/3min47sec.wav
[2023-10-12 17:34:29,444] {model.py:130} INFO - Transcribing ...
[2023-10-12 17:40:11,739] {model.py:133} INFO - Inference time: 342.295 s
samples/audio/13min56sec.wav


whisper_full_with_state: progress =   5%
whisper_full_with_state: progress =  10%
whisper_full_with_state: progress =  15%
whisper_full_with_state: progress =  20%
whisper_full_with_state: progress =  25%
whisper_full_with_state: progress =  30%
whisper_full_with_state: progress =  35%
whisper_full_with_state: progress =  40%
whisper_full_with_state: progress =  45%
whisper_full_with_state: progress =  50%
whisper_full_with_state: progress =  55%
whisper_full_with_state: progress =  60%
whisper_full_with_state: progress =  65%
whisper_full_with_state: progress =  70%
whisper_full_with_state: progress =  75%
whisper_full_with_state: progress =  80%
whisper_full_with_state: progress =  85%
whisper_full_with_state: progress =  90%
whisper_full_with_state: progress =  95%


[2023-10-12 17:40:11,956] {model.py:130} INFO - Transcribing ...
[2023-10-12 17:53:04,706] {model.py:133} INFO - Inference time: 772.750 s
samples/audio/0min12sec.wav
[2023-10-12 17:53:04,719] {model.py:130} INFO - Transcribing ...
[2023-10-12 17:54:00,651] {model.py:133} INFO - Inference time: 55.932 s


whisper_full_with_state: progress =   5%
whisper_full_with_state: progress =  10%
whisper_full_with_state: progress =  15%
whisper_full_with_state: progress =  20%
whisper_full_with_state: progress =  25%
whisper_full_with_state: progress =  30%
whisper_full_with_state: progress =  35%
whisper_full_with_state: progress =  40%
whisper_full_with_state: progress =  45%
whisper_full_with_state: progress =  50%
whisper_full_with_state: progress =  55%
whisper_full_with_state: progress =  60%
whisper_full_with_state: progress =  65%
whisper_full_with_state: progress =  70%
whisper_full_with_state: progress =  75%
whisper_full_with_state: progress =  80%
whisper_full_with_state: progress =  85%
whisper_full_with_state: progress =  90%
whisper_full_with_state: progress =  95%
whisper_full_with_state: progress =   5%
whisper_full_with_state: progress =  10%
whisper_full_with_state: progress =  15%
whisper_full_with_state: progress =  20%
whisper_full_with_state: progress =  25%
whisper_full_wit

Calculate Word Error Rate (WER) and Word Information Loss (WIL). WER measures word-level accuracy. WIL measures semantic fidelity. WER compares words. WIL compares meaning.

Sentence embedding model is used to perform cosine-similarity.

In [4]:
from jiwer import wer, process_words
from sentence_transformers import SentenceTransformer, util

# import spacy

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# nlp = spacy.load("en_core_web_sm")


for filename in filenames:
    truth = ""
    transcript = ""

    transcript_file = f"{transcription_dir}{filename}.txt"
    with open(transcript_file, "r") as f:
        transcript = f.read()

    truth_file = f"{truth_dir}{filename}.txt"
    with open(truth_file, "r") as f:
        truth = f.read()

    output = process_words(truth, transcript)

    wer = output.wer
    wil = output.wil

    # truth_sentences = [sent.text for sent in nlp(truth).sents]
    # transcript_sentences = [sent.text for sent in nlp(transcript).sents]

    # similarities = []
    # for sent1 in truth_sentences:
    #     embedding_1 = model.encode(sent1, convert_to_tensor=True)

    # for sent2 in transcript_sentences:
    #     embedding_2 = model.encode(sent2, convert_to_tensor=True)
    #     similarity = util.pytorch_cos_sim(embedding_1, embedding_2).item()
    #     similarities.append(similarity)

    # average_similarity = sum(similarities) / len(similarities)

    # print(
    #     f"[{filename}] (EXPERIMENTAL) Sentence Similarity: {average_similarity}"
    # )

    truth_embedding = model.encode(truth, convert_to_tensor=True)
    transcript_embedding = model.encode(transcript, convert_to_tensor=True)

    document_similarity = util.pytorch_cos_sim(
        truth_embedding, transcript_embedding
    ).item()

    print(f"[{filename}] Word Error Rate: {wer}")
    print(f"[{filename}] Word Information Loss: {wil}")
    print(f"[{filename}] (EXPERIMENTAL) Document Similarity: {document_similarity}")

  from .autonotebook import tqdm as notebook_tqdm


[2023-10-12 17:54:12,878] {SentenceTransformer.py:66} INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
[2023-10-12 17:54:14,203] {SentenceTransformer.py:105} INFO - Use pytorch device: cpu


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.41it/s]


[3min47sec] Word Error Rate: 0.07777777777777778
[3min47sec] Word Information Loss: 0.13204933778182182
[3min47sec] (EXPERIMENTAL) Document Similarity: 0.9903446435928345


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.76it/s]


[13min56sec] Word Error Rate: 0.0911880754055239
[13min56sec] Word Information Loss: 0.14883375136173949
[13min56sec] (EXPERIMENTAL) Document Similarity: 0.9443119764328003


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.25it/s]

[0min12sec] Word Error Rate: 0.22857142857142856
[0min12sec] Word Information Loss: 0.36883116883116873
[0min12sec] (EXPERIMENTAL) Document Similarity: 0.9875556230545044



