In [None]:
!git lfs install
!git clone https://github.com/babylonhealth/primock57.git

Git LFS initialized.
Cloning into 'primock57'...
remote: Enumerating objects: 424, done.[K
remote: Counting objects: 100% (424/424), done.[K
remote: Compressing objects: 100% (390/390), done.[K
remote: Total 424 (delta 53), reused 386 (delta 28), pack-reused 0 (from 0)[K
Receiving objects: 100% (424/424), 3.83 MiB | 7.93 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Filtering content: 100% (114/114), 1.85 GiB | 41.83 MiB/s, done.


In [None]:
import os
import shutil

def remove_checkpoints_and_readme(directory):
    """Delete `.ipynb_checkpoints`, a `.md` folder and `README.md` from `directory`.

    Entries that are not present are skipped. Nothing is returned.
    """
    # The two folder entries are removed recursively, checkpoints first.
    for folder_name in (".ipynb_checkpoints", ".md"):
        folder = os.path.join(directory, folder_name)
        if os.path.exists(folder):
            shutil.rmtree(folder)

    # README.md is a regular file, so a plain unlink is used.
    readme = os.path.join(directory, "README.md")
    if os.path.exists(readme):
        os.remove(readme)

# Clean both dataset folders so that only audio / transcript files remain in them
for dataset_subdir in ("/content/primock57/audio/", "/content/primock57/transcripts/"):
    remove_checkpoints_and_readme(dataset_subdir)


In [None]:
import os
import json
import re
!pip install textgrid
!pip install pydub
from textgrid import TextGrid
from pydub import AudioSegment

!pip install textgrid
!pip install pydub

def parse_textgrid(textgrid_path):
    """Read a TextGrid file and return its intervals as plain dictionaries.

    Every interval of every tier is visited in iteration order. Markup of the
    form ``<...>`` is removed from each interval label, and the whitespace is
    normalised afterwards so that a removed tag leaves neither a leading or
    trailing blank nor a doubled space inside the label.

    Args:
        textgrid_path: Path of the ``.TextGrid`` file to parse.

    Returns:
        A list of dicts with the keys ``"xmin"`` / ``"xmax"`` (the interval's
        ``minTime`` / ``maxTime``) and ``"text"`` (the cleaned label).
        Intervals whose label is empty after cleaning are still included,
        with ``"text"`` set to ``""``.
    """
    tg = TextGrid.fromFile(textgrid_path)
    data = []

    for tier in tg:
        for interval in tier:
            # Tags are deleted (not replaced by a space) so that words wrapped
            # in a tag keep their adjacent punctuation.
            without_tags = re.sub(r"<.*?>", "", interval.mark or "")
            # Whitespace is normalised *after* tag removal; stripping first
            # would leave the blanks that surrounded a removed tag.
            cleaned_text = " ".join(without_tags.split())
            data.append({
                "xmin": interval.minTime,
                "xmax": interval.maxTime,
                "text": cleaned_text
            })

    return data

def split_audio(audio_path, textgrid_data, output_dir):
    """
    Cut one recording into per-interval WAV clips.

    NOTE: a later cell of this notebook defines another ``split_audio`` with
    the signature ``(audio_file_path, segment_duration_ms=10000)``. Once that
    cell has run, this definition is shadowed and this cell must be re-run
    before ``process_primock57_dataset`` is called again.

    Args:
        audio_path: Path of the source recording.
        textgrid_data: Sequence of dicts with ``"xmin"`` / ``"xmax"`` in
            seconds and ``"text"``, as produced by ``parse_textgrid``.
        output_dir: Directory for the clips; created if it does not exist.

    Returns:
        A list of ``{"audio_filepath": ..., "text": ...}`` dicts, one per
        exported clip. Entries whose text is empty or whitespace-only are
        skipped and produce no clip.
    """
    audio = AudioSegment.from_file(audio_path)
    os.makedirs(output_dir, exist_ok=True)

    # Strip only the final extension. Cutting at the first dot would map
    # "a.b.wav" and "a.c.wav" to the same clip names and overwrite clips.
    stem = os.path.splitext(os.path.basename(audio_path))[0]

    processed_entries = []
    for i, entry in enumerate(textgrid_data):
        # Skip entries with empty or whitespace-only text
        if not entry["text"].strip():
            continue

        # The audio segment is sliced in milliseconds, TextGrid times are in seconds
        start_time = entry["xmin"] * 1000
        end_time = entry["xmax"] * 1000
        segment = audio[start_time:end_time]

        # `i` is the position in textgrid_data (skipped entries included), so
        # clip numbers may have gaps but stay unique within one recording.
        segment_path = os.path.join(output_dir, f"{stem}_segment_{i}.wav")
        segment.export(segment_path, format="wav")
        processed_entries.append({
            "audio_filepath": segment_path,
            "text": entry["text"]
        })

    return processed_entries

def process_primock57_dataset(audio_dir, transcripts_dir, output_audio_dir, output_json_path):
    """
    Processes the entire Primock57 dataset:
    - Parses TextGrid files.
    - Splits audio files into segments.
    - Outputs a JSON dataset compatible with Wav2Vec2.

    Audio and transcript files are paired by file stem (the name without its
    extension), so one missing file only affects that recording and does not
    shift the pairing of every file after it.

    Args:
        audio_dir: Directory containing the ``.wav`` recordings.
        transcripts_dir: Directory containing the ``.TextGrid`` transcripts.
        output_audio_dir: Directory that receives the exported clips.
        output_json_path: Path of the JSON file to write.

    Returns:
        The list of ``{"audio_filepath", "text"}`` entries that was written
        to ``output_json_path``.
    """
    # Ensure only valid audio and transcript files are considered
    audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith('.wav'))
    textgrid_files = sorted(f for f in os.listdir(transcripts_dir) if f.endswith('.TextGrid'))

    print(len(audio_files))
    print(len(textgrid_files))

    # Ensure that the number of audio files and textgrid files match
    if len(audio_files) != len(textgrid_files):
        print("Warning: Mismatch in the number of audio and transcript files.")

    # Index transcripts by stem so each recording finds its own transcript
    # regardless of its position in the sorted listing.
    textgrid_by_stem = {os.path.splitext(f)[0]: f for f in textgrid_files}

    all_processed_data = []
    for audio_file in audio_files:
        stem = os.path.splitext(audio_file)[0]
        # pop() so that whatever is left afterwards is exactly the set of
        # transcripts that have no recording.
        textgrid_file = textgrid_by_stem.pop(stem, None)
        if textgrid_file is None:
            print(f"Skipping audio file without a transcript: {audio_file}")
            continue

        audio_path = os.path.join(audio_dir, audio_file)
        textgrid_path = os.path.join(transcripts_dir, textgrid_file)

        # Parse the TextGrid file
        textgrid_data = parse_textgrid(textgrid_path)

        # Split the audio into segments, skipping empty ones
        processed_data = split_audio(audio_path, textgrid_data, output_audio_dir)
        all_processed_data.extend(processed_data)

    for textgrid_file in textgrid_by_stem.values():
        print(f"Skipping transcript without an audio file: {textgrid_file}")

    # Save the processed data as a JSON file
    with open(output_json_path, "w") as f:
        json.dump(all_processed_data, f, indent=4)

    return all_processed_data

# Input folders of the cloned repository and output locations for the generated data
audio_dir = "/content/primock57/audio/"
transcripts_dir = "/content/primock57/transcripts/"
output_audio_dir = "/content/processed_audio/"
output_json_path = "/content/primock57_dataset.json"

# List the source recordings up front; the list is only used in the summary line below
audio_files = sorted(name for name in os.listdir(audio_dir) if name.endswith('.wav'))

# Segment every recording and write the JSON manifest
all_processed_data = process_primock57_dataset(
    audio_dir=audio_dir,
    transcripts_dir=transcripts_dir,
    output_audio_dir=output_audio_dir,
    output_json_path=output_json_path,
)
print(f"Processed {len(all_processed_data)} segments from {len(audio_files)} audio files.")


[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3070, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3061, in _parsed_pkg_info
    return self._pkg_info
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2863, in __getattr__
    raise AttributeError(attr)
AttributeError: _pkg_info. Did you mean: 'egg_info'?

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3

**Using Speech Recognition And PyDub**

In [None]:
!pip install jiwer
!pip install speech_recognition
!pip install SpeechRecognition PyDub pandas noisereduce

import os
import json
import numpy as np
import pandas as pd
from IPython.display import Audio, display
from jiwer import wer
import speech_recognition as sr
from pydub import AudioSegment
import noisereduce as nr

# Function to convert audio to WAV format (if required)
def convert_audio_to_wav(audio_file_path):
    """Re-encode an audio file as WAV and return the path of the new file.

    The WAV file is written next to the input, with the input's final
    extension replaced by ``.wav``.

    Args:
        audio_file_path: Path of the audio file to convert.

    Returns:
        Path of the exported ``.wav`` file.
    """
    audio = AudioSegment.from_file(audio_file_path)
    # splitext removes only the last extension, so dots in directory names or
    # inside the file stem no longer truncate the output path.
    wav_file = os.path.splitext(audio_file_path)[0] + ".wav"
    audio.export(wav_file, format="wav")
    return wav_file

# Function to reduce noise in audio
def reduce_noise(audio_file_path):
    """Denoise an audio file and write the result as ``<stem>_reduced.wav``.

    Args:
        audio_file_path: Path of the audio file to denoise.

    Returns:
        Path of the exported, noise-reduced WAV file (same directory and stem
        as the input, with ``_reduced.wav`` appended to the stem).
    """
    audio = AudioSegment.from_file(audio_file_path)
    samples = np.array(audio.get_array_of_samples())

    # pydub returns multi-channel audio as one interleaved stream
    # (L, R, L, R, ...). noisereduce accepts either (frames,) or
    # (channels, frames), so de-interleave instead of denoising the
    # interleaved stream as if it were mono.
    if audio.channels > 1:
        signal = samples.reshape(-1, audio.channels).T
    else:
        signal = samples

    reduced = np.asarray(nr.reduce_noise(y=signal, sr=audio.frame_rate))

    if audio.channels > 1:
        # Back to the interleaved layout expected by AudioSegment.
        reduced = reduced.T.reshape(-1)

    # The raw bytes must use exactly `sample_width` bytes per sample. If the
    # denoiser handed back another dtype (e.g. float), round/clip and cast
    # back to the original sample dtype before serialising.
    if reduced.dtype != samples.dtype:
        if np.issubdtype(samples.dtype, np.integer):
            limits = np.iinfo(samples.dtype)
            reduced = np.clip(np.rint(reduced), limits.min, limits.max)
        reduced = reduced.astype(samples.dtype)
    reduced_samples = np.ascontiguousarray(reduced)

    reduced_audio = AudioSegment(
        reduced_samples.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=audio.sample_width,
        channels=audio.channels
    )
    # splitext removes only the last extension, so dots elsewhere in the path
    # do not truncate the output name.
    reduced_audio_path = os.path.splitext(audio_file_path)[0] + "_reduced.wav"
    reduced_audio.export(reduced_audio_path, format="wav")
    return reduced_audio_path

# Function to split long audio into smaller segments
def split_audio(audio_file_path, segment_duration_ms=10000):
    """Cut an audio file into consecutive fixed-length WAV segments.

    NOTE: this definition shadows the earlier
    ``split_audio(audio_path, textgrid_data, output_dir)`` used by
    ``process_primock57_dataset``. After this cell has run, the earlier cell
    must be re-run before the dataset is processed again.

    Args:
        audio_file_path: Path of the audio file to split.
        segment_duration_ms: Length of each segment in milliseconds; the last
            segment may be shorter.

    Returns:
        List of paths of the exported segments, in playback order. Each
        segment is named ``<stem>_segment_<start offset in ms>.wav`` and
        written next to the input file.
    """
    audio = AudioSegment.from_file(audio_file_path)
    # splitext removes only the last extension, so dots in directory names or
    # inside the file stem do not truncate the segment paths.
    base_path = os.path.splitext(audio_file_path)[0]

    segments = []
    for i in range(0, len(audio), segment_duration_ms):
        segment = audio[i:i + segment_duration_ms]
        segment_path = f"{base_path}_segment_{i}.wav"
        segment.export(segment_path, format="wav")
        segments.append(segment_path)
    return segments

# Function to perform speech-to-text conversion
def speech_to_text(audio_file_path):
    """Transcribe one audio file with the recognizer's Google backend.

    Returns the transcript on success. Failures are reported as strings
    rather than raised: ``"Could not understand audio"`` for
    ``sr.UnknownValueError`` and ``"Error: <message>"`` for
    ``sr.RequestError``.
    """
    recognizer = sr.Recognizer()
    try:
        # Load the whole file into memory, then send it for recognition.
        with sr.AudioFile(audio_file_path) as source:
            recorded = recognizer.record(source)
        transcript = recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as err:
        return f"Error: {str(err)}"
    return transcript

# Load the JSON dataset
def load_primock_dataset(json_path, limit=12):
    """Parse the JSON file at `json_path` and return the slice `[:limit]` of its content."""
    with open(json_path, "r") as handle:
        entries = json.load(handle)
    # Only the leading entries are kept
    leading_entries = entries[:limit]
    return leading_entries

# Function to evaluate and display results
def _normalize_for_wer(text):
    """Lower-case `text`, replace punctuation (except apostrophes) by blanks and collapse whitespace."""
    import re  # local import: this cell's import list does not include `re`

    without_punctuation = re.sub(r"[^\w\s']", " ", text.lower())
    return " ".join(without_punctuation.split())


def evaluate_primock(json_path, output_csv_path):
    """Transcribe the first dataset entries, score them with WER and write a CSV report.

    For every entry returned by ``load_primock_dataset`` the audio is
    converted to WAV if needed, noise-reduced, split into fixed-length
    segments and transcribed segment by segment. The audio, the reference,
    the hypothesis and the WER are displayed and collected.

    The WER is computed on normalised text (lower-case, punctuation removed,
    whitespace collapsed) so that casing and punctuation in the reference are
    not counted as recognition errors. The report keeps the raw reference and
    hypothesis strings.

    Args:
        json_path: Path of the JSON dataset with ``"audio_filepath"`` and
            ``"text"`` per entry.
        output_csv_path: Path of the CSV report to write.

    Returns:
        ``output_csv_path``.
    """
    dataset = load_primock_dataset(json_path)
    results = []

    for idx, entry in enumerate(dataset):
        audio_path = entry["audio_filepath"]
        reference_text = entry["text"]

        # Convert to WAV if necessary
        if not audio_path.endswith(".wav"):
            audio_path = convert_audio_to_wav(audio_path)

        # Reduce noise
        audio_path = reduce_noise(audio_path)

        # Split long audio files
        audio_segments = split_audio(audio_path)

        # Combine predictions for all segments
        recognized_parts = []
        for segment in audio_segments:
            segment_prediction = speech_to_text(segment)
            if segment_prediction == "Could not understand audio":
                continue
            # speech_to_text reports request failures as "Error: <message>";
            # such strings are not transcripts and must not enter the hypothesis.
            if segment_prediction.startswith("Error: "):
                print(f"Warning: transcription failed for {segment}: {segment_prediction}")
                continue
            recognized_parts.append(segment_prediction)
        prediction_text = " ".join(recognized_parts).strip()

        print(f"Processing File {idx + 1}: {audio_path}")
        display(Audio(audio_path))  # Play the audio

        print(f"Reference Text: {reference_text}")
        print(f"Prediction (Hypothesis): {prediction_text}")

        # Calculate Word Error Rate (WER) on normalised text
        normalized_reference = _normalize_for_wer(reference_text)
        normalized_hypothesis = _normalize_for_wer(prediction_text)
        if normalized_reference:
            word_error_rate = wer(normalized_reference, normalized_hypothesis)
        else:
            # WER is undefined for a reference without words
            word_error_rate = float("nan")
        print(f"Word Error Rate (WER): {word_error_rate:.4f}\n")

        # Store results
        results.append({
            "Audio File": audio_path,
            "Reference": reference_text,
            "Hypothesis": prediction_text,
            "WER": word_error_rate,
        })

    # Save results as a DataFrame
    df = pd.DataFrame(results)
    df.to_csv(output_csv_path, index=False)
    print(f"Report saved to: {output_csv_path}")

    return output_csv_path

# Input: the segment manifest (same path the dataset-preparation cell writes to)
primock_json_path = "/content/primock57_dataset.json"

# Output: destination of the per-file evaluation table
output_csv_path = "/content/primock_evaluation_report.csv"

# Run the evaluation; the returned report path is reused by the download cell
report_path = evaluate_primock(
    json_path=primock_json_path,
    output_csv_path=output_csv_path,
)


[31mERROR: Could not find a version that satisfies the requirement speech_recognition (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for speech_recognition[0m[31m
Processing File 1: /content/processed_audio/day1_consultation01_doctor_segment_1_reduced.wav


Reference Text: Hello? Hi. Um, should we start? Yeah, okay. Hello how um. Good morning sir, how can I help you this morning?
Prediction (Hypothesis): hello full stop
Word Error Rate (WER): 1.0000

Processing File 2: /content/processed_audio/day1_consultation01_doctor_segment_3_reduced.wav


Reference Text:  Sorry to hear that. Um, and and when you say diarrhea, what'd you mean by diarrhea? Do you mean you're going to the toilet more often? Or are your stools more loose?
Prediction (Hypothesis): what do you mean by diary do you mean you're going to tell him more often or are your schools more loose
Word Error Rate (WER): 0.6250

Processing File 3: /content/processed_audio/day1_consultation01_doctor_segment_7_reduced.wav


Reference Text: Okay. And how many times a day are you going, let's say, in the last couple of days?
Prediction (Hypothesis): okay and how many times a day are you going to see the last couple of days
Word Error Rate (WER): 0.3889

Processing File 4: /content/processed_audio/day1_consultation01_doctor_segment_9_reduced.wav


Reference Text: Six, seven times a day. And you  mention it's mainly watery. Have you noticed any other things, like blood in your stools?
Prediction (Hypothesis): 7 * 37 things like blood in your stools
Word Error Rate (WER): 0.8182

Processing File 5: /content/processed_audio/day1_consultation01_doctor_segment_11_reduced.wav


Reference Text: Okay. And you mentioned you've had some pain in your tummy as well.  Whereabouts is the pain, exactly?
Prediction (Hypothesis): whereabouts is the pain exactly
Word Error Rate (WER): 0.8889

Processing File 6: /content/processed_audio/day1_consultation01_doctor_segment_13_reduced.wav


Reference Text: One side. And what side is that?
Prediction (Hypothesis): what side is that
Word Error Rate (WER): 0.5714

Processing File 7: /content/processed_audio/day1_consultation01_doctor_segment_15_reduced.wav


Reference Text: Left side. Okay, and can you describe the pain to me?
Prediction (Hypothesis): can you describe the pain to me
Word Error Rate (WER): 0.4545

Processing File 8: /content/processed_audio/day1_consultation01_doctor_segment_17_reduced.wav


Reference Text: Okay. And is the pain, is that, is it there all the time, or does it come and go?
Prediction (Hypothesis): and this pain is that is it there all the time or does it come and go
Word Error Rate (WER): 0.4211

Processing File 9: /content/processed_audio/day1_consultation01_doctor_segment_19_reduced.wav


Reference Text: Come and go. Does the pain move anywhere else, for example towards your back?
Prediction (Hypothesis): is a pain move anywhere else because your back
Word Error Rate (WER): 0.7143

Processing File 10: /content/processed_audio/day1_consultation01_doctor_segment_21_reduced.wav


Reference Text: Okay, fine. And you mentioned you've been feeling quite weak and shaky as well. What do you mean by shaky? Do you mean you've been having, uh have you been feeling feverish, for example?
Prediction (Hypothesis): weak and shaky as well would you even been having anything feverish for example
Word Error Rate (WER): 0.7941

Processing File 11: /content/processed_audio/day1_consultation01_doctor_segment_25_reduced.wav


Reference Text: You measure your temperature then?
Prediction (Hypothesis): measure temperature then
Word Error Rate (WER): 0.6000

Processing File 12: /content/processed_audio/day1_consultation01_doctor_segment_27_reduced.wav


Reference Text: Okay. Okay. Any other symptoms like sweating, or um, night sweats? No? And, uh, any vomiting at all?
Prediction (Hypothesis): any other symptoms like sweating or not and any vomiting at all
Word Error Rate (WER): 0.6111

Report saved to: /content/primock_evaluation_report.csv


In [None]:
# Offer the evaluation report as a browser download (requires the Colab runtime)
from google.colab import files

download_choice = input("Do you want to download the transcription report? (yes/no): ")
download_choice = download_choice.strip().lower()
# Only an explicit "yes" triggers the download
if download_choice == "yes":
    files.download(report_path)

Do you want to download the transcription report? (yes/no): no
