# Installing Whisper

The cells below download the PriMock57 dataset and install the Python packages needed to use Whisper models and evaluate the transcription results.

In [None]:
!git lfs install
!git clone https://github.com/babylonhealth/primock57.git

Git LFS initialized.
Cloning into 'primock57'...
remote: Enumerating objects: 424, done.[K
remote: Counting objects: 100% (424/424), done.[K
remote: Compressing objects: 100% (390/390), done.[K
remote: Total 424 (delta 53), reused 386 (delta 28), pack-reused 0 (from 0)[K
Receiving objects: 100% (424/424), 3.83 MiB | 12.74 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Filtering content: 100% (114/114), 1.85 GiB | 69.62 MiB/s, done.


In [None]:
import os
import shutil

def remove_checkpoints_and_readme(directory):
    """
    Delete notebook checkpoints and Markdown leftovers from a directory.

    The entries named ``.ipynb_checkpoints``, ``.md`` and ``README.md`` located
    directly inside ``directory`` are removed. Each entry is deleted whether it
    turns out to be a file or a folder; entries that do not exist are skipped.
    Nothing else in ``directory`` is touched.

    Args:
        directory: Path of the directory to clean.
    """
    for name in (".ipynb_checkpoints", ".md", "README.md"):
        target = os.path.join(directory, name)

        if os.path.isdir(target) and not os.path.islink(target):
            # Real folders need a recursive delete
            shutil.rmtree(target)
        elif os.path.lexists(target):
            # Plain files and symlinks are unlinked; rmtree would raise on them
            os.remove(target)

# Clean the audio folder first, then the transcripts folder
for _dataset_dir in ("/content/primock57/audio/", "/content/primock57/transcripts/"):
    remove_checkpoints_and_readme(_dataset_dir)


In [None]:
import os
import json
import re
!pip install textgrid
!pip install pydub
from textgrid import TextGrid
from pydub import AudioSegment

!pip install textgrid
!pip install pydub

def parse_textgrid(textgrid_path):
    """
    Read a TextGrid file and return its intervals as plain dictionaries.

    Every interval of every tier is included, in the order the tiers and
    intervals are iterated. Markup of the form ``<...>`` is removed from the
    interval text, and the whitespace left behind by removed tags is trimmed
    and collapsed to single spaces.

    Args:
        textgrid_path: Path to the TextGrid file.

    Returns:
        A list of dicts with the keys ``"xmin"`` and ``"xmax"`` (the
        interval's ``minTime`` and ``maxTime``) and ``"text"`` (the cleaned
        interval text, possibly empty).
    """
    tg = TextGrid.fromFile(textgrid_path)
    data = []

    for tier in tg:
        for interval in tier:
            without_tags = re.sub(r"<.*?>", "", interval.mark)
            # Normalise whitespace only after the tags are gone, so removing a
            # tag cannot leave leading, trailing or doubled spaces behind
            cleaned_text = " ".join(without_tags.split())
            data.append({
                "xmin": interval.minTime,
                "xmax": interval.maxTime,
                "text": cleaned_text
            })

    return data

def split_audio(audio_path, textgrid_data, output_dir):
    """
    Split one audio file into per-interval WAV segments.

    Entries whose text is empty or whitespace-only are skipped. Each remaining
    entry is cut from the audio between its ``"xmin"`` and ``"xmax"`` values
    (multiplied by 1000 to get milliseconds) and exported to ``output_dir`` as
    ``<audio base name>_segment_<index>.wav``. The index is the entry's
    position in ``textgrid_data``, so skipped entries leave gaps in the
    numbering.

    Args:
        audio_path: Path of the audio file to split.
        textgrid_data: List of dicts with ``"xmin"``, ``"xmax"`` and ``"text"``
            keys, as produced by ``parse_textgrid``.
        output_dir: Directory for the exported segments; created if missing.

    Returns:
        A list of dicts with ``"audio_filepath"`` (path of the exported
        segment) and ``"text"`` (the entry's text), one per exported segment.
    """
    audio = AudioSegment.from_file(audio_path)
    os.makedirs(output_dir, exist_ok=True)

    # Drop only the final extension so dotted names such as "a.b.wav" keep
    # their full base name and cannot collide with "a.c.wav"
    stem = os.path.splitext(os.path.basename(audio_path))[0]

    processed_entries = []
    for i, entry in enumerate(textgrid_data):
        # Skip entries with empty or whitespace-only text
        if not entry["text"].strip():
            continue

        start_time = entry["xmin"] * 1000  # Convert to milliseconds
        end_time = entry["xmax"] * 1000  # Convert to milliseconds
        segment = audio[start_time:end_time]

        segment_path = os.path.join(output_dir, f"{stem}_segment_{i}.wav")
        segment.export(segment_path, format="wav")
        processed_entries.append({
            "audio_filepath": segment_path,
            "text": entry["text"]
        })

    return processed_entries

def process_primock57_dataset(audio_dir, transcripts_dir, output_audio_dir, output_json_path):
    """
    Processes the entire Primock57 dataset:
    - Parses TextGrid files.
    - Splits audio files into segments.
    - Outputs a JSON dataset compatible with Wav2Vec2.

    Audio files (``*.wav``) and transcripts (``*.TextGrid``) are paired by
    base name rather than by position in the sorted listings, so a single
    missing file only drops its own pair instead of misaligning every pair
    that follows it. Files without a partner are reported and skipped.

    Args:
        audio_dir: Directory containing the ``.wav`` files.
        transcripts_dir: Directory containing the ``.TextGrid`` files.
        output_audio_dir: Directory that receives the exported segments.
        output_json_path: Path of the JSON file to write.

    Returns:
        The list of segment entries (as returned by ``split_audio``) for all
        processed pairs; the same list is written to ``output_json_path``.
    """
    all_processed_data = []

    # Ensure only valid audio and transcript files are considered
    audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.wav'))
    textgrid_files = sorted(f for f in os.listdir(transcripts_dir) if f.endswith('.TextGrid'))

    print(len(audio_files))
    print(len(textgrid_files))

    # Ensure that the number of audio files and textgrid files match
    if len(audio_files) != len(textgrid_files):
        print("Warning: Mismatch in the number of audio and transcript files.")

    # Index transcripts by base name so each audio file finds its own partner
    textgrid_by_stem = {os.path.splitext(f)[0]: f for f in textgrid_files}

    for audio_file in audio_files:
        stem = os.path.splitext(audio_file)[0]
        # pop() so that whatever remains afterwards is known to be unpaired
        textgrid_file = textgrid_by_stem.pop(stem, None)
        if textgrid_file is None:
            print(f"Skipping {audio_file}: no matching transcript found")
            continue

        audio_path = os.path.join(audio_dir, audio_file)
        textgrid_path = os.path.join(transcripts_dir, textgrid_file)

        # Parse the TextGrid file
        textgrid_data = parse_textgrid(textgrid_path)

        # Split the audio into segments, skipping empty ones
        processed_data = split_audio(audio_path, textgrid_data, output_audio_dir)
        all_processed_data.extend(processed_data)

    # Report transcripts that never found an audio partner
    for textgrid_file in textgrid_by_stem.values():
        print(f"Skipping {textgrid_file}: no matching audio file found")

    # Save the processed data as a JSON file
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(all_processed_data, f, indent=4)

    return all_processed_data

# Input and output locations for the preprocessing run
audio_dir = "/content/primock57/audio/"  # source recordings
transcripts_dir = "/content/primock57/transcripts/"  # TextGrid transcripts
output_audio_dir = "/content/processed_audio/"  # exported segments go here
output_json_path = "/content/primock57_dataset.json"  # segment manifest

# List the source recordings up front so the summary line can report their count
audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.wav'))

# Run the whole pipeline, then summarise what was produced
all_processed_data = process_primock57_dataset(
    audio_dir=audio_dir,
    transcripts_dir=transcripts_dir,
    output_audio_dir=output_audio_dir,
    output_json_path=output_json_path,
)
print(f"Processed {len(all_processed_data)} segments from {len(audio_files)} audio files.")


Collecting textgrid
  Downloading TextGrid-1.6.1.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: textgrid
  Building wheel for textgrid (setup.py) ... [?25l[?25hdone
  Created wheel for textgrid: filename=TextGrid-1.6.1-py3-none-any.whl size=10147 sha256=b995ad372bd3069e98e9dbe3c09e43ae613b4b5d1ca29a4470c1167a5a90d0fc
  Stored in directory: /root/.cache/pip/wheels/23/41/f2/e2ef1817bd163de3c21dd078966bdd71bd5c4455841f4ec016
Successfully built textgrid
Installing collected packages: textgrid
Successfully installed textgrid-1.6.1
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
114
114
Processed 6727 segments from 114 audio files.


In [None]:
# Install necessary libraries
!pip install git+https://github.com/openai/whisper.git
!pip install jiwer
!pip install librosa

import os
import numpy as np
import json
import torch
import pandas as pd
import whisper
import librosa
from tqdm.notebook import tqdm
from whisper.normalizers import EnglishTextNormalizer
import jiwer

# Default to the CPU and switch to the GPU only when PyTorch can see one
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Segment manifest written by the preprocessing step
json_file = "/content/primock57_dataset.json"

# Read every segment entry from the manifest
with open(json_file, "r") as manifest_file:
    data = json.load(manifest_file)

# Keep only the entries whose transcription contains non-whitespace text
filtered_data = list(filter(lambda entry: entry["text"].strip(), data))

# Define a simple dataset class for Primock
class PrimockDataset(torch.utils.data.Dataset):
    """
    Dataset over Primock segment entries that yields Whisper-ready inputs.

    Each item is a ``(mel, text)`` pair: the log-Mel spectrogram computed from
    the entry's audio file, and the entry's reference text.
    """

    def __init__(self, dataset, device=DEVICE):
        # dataset: indexable collection of dicts with "audio_filepath" and "text" keys
        self.device = device
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        record = self.dataset[idx]
        path, transcript = record["audio_filepath"], record["text"]

        # Load at 16 kHz; the sample rate librosa returns is not needed afterwards
        waveform, _ = librosa.load(path, sr=16000)
        waveform = torch.tensor(waveform).to(self.device)

        # Pad or trim the waveform, then compute its log-Mel spectrogram
        mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(waveform))
        return mel, transcript

# Wrap the filtered entries in the dataset and serve them in batches of 16
primock_dataset = PrimockDataset(dataset=filtered_data)
loader = torch.utils.data.DataLoader(dataset=primock_dataset, batch_size=16)

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-a4msx013
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-a4msx013
  Resolved https://github.com/openai/whisper.git to commit 173ff7dd1d9fb1c4fddea0d41d704cfefeb8908c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 105MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Model is English-only and has 71,825,408 parameters.


  0%|          | 0/421 [00:00<?, ?it/s]

In [None]:
# Fetch the "base.en" Whisper checkpoint
model = whisper.load_model("base.en")

# Report whether the model is multilingual and how many parameters it has
language_scope = "multilingual" if model.is_multilingual else "English-only"
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {language_scope} and has {parameter_count:,} parameters.")

# Decoding options: English, no timestamp tokens
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
)

In [None]:
# Decode every batch, collecting model outputs next to their reference texts
hypotheses, references = [], []

for mel_batch, text_batch in tqdm(loader):
    decoded = model.decode(mel_batch, options)
    hypotheses += [item.text for item in decoded]
    references.extend(text_batch)

# One row per segment: raw prediction and raw reference
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})

# Add a normalized copy of each text column for evaluation
normalizer = EnglishTextNormalizer()
for column in ("hypothesis", "reference"):
    data[f"{column}_clean"] = [normalizer(text) for text in data[column]]

In [None]:
# Audio was never imported anywhere in the notebook, so the playback call
# below raised NameError; import it (and display) explicitly here.
from IPython.display import Audio, display

# Calculate Word Error Rate (WER) on the normalized texts
wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")

# Display predictions vs. references.
# Row i of `data` corresponds to filtered_data[i] because the loader was
# created without shuffling, so the audio path can be looked up by index.
for i in range(min(10, len(data))):  # Display first 10 results
    print(f"Sample {i + 1}:")
    print(f"Audio File: {filtered_data[i]['audio_filepath']}")
    display(Audio(filtered_data[i]['audio_filepath']))
    print(f"Prediction: {data['hypothesis'].iloc[i]}")
    print(f"Reference: {data['reference'].iloc[i]}\n")
