# Generating "Gold" data

Use the `large-v3` model as baseline:

- Perform prediction on all audio files and use that as "ground truth"
- Why use these as labels?
	- The labels provided by NUHS are not perfect, and audio mixing are often intertwined between multiple speakers (i.e., patient, SLT, family member, etc.)
    - Some labels are more "complete" than others
	- Consistency issues within labels: Some human labellers may ignore noise of a certain threshold - Utilising Whisper as "Ground Truth" helps with that inconsistency
	- The end goal: To improve the performance of Whisper for Aphasic Patients in a localised context (Singapore)
- Use the new ground truth transcriptions + audio and train the `small.en` model
	- This fine-tuned model should perform better than the "baseline" small model for our use case


In [None]:
import os

os.chdir('..')
os.getcwd()

In [1]:
import warnings
warnings.filterwarnings("ignore")

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


## OPTIONAL: Uploading the files to cloud storage

In [None]:
import os
from tqdm import tqdm
from google.cloud import storage

# Set up your credentials and initialize the client
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()

# Define the bucket and target directory in the bucket
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name
directory_prefix = 'audio'  # Destination prefix in the GCS bucket

# Specify your local directory with files to upload
local_directory = 'samples/audio'  # Replace with your local directory path

# Get list of files in the directory
files = [f for f in os.listdir(local_directory) if os.path.isfile(os.path.join(local_directory, f))]

# Upload each file with a progress bar
with tqdm(total=len(files), desc="Uploading files", unit="file") as pbar:
    for filename in files:
        local_file_path = os.path.join(local_directory, filename)
        blob_path = os.path.join(directory_prefix, filename)
        blob = bucket.blob(blob_path)
        
        # Upload the file and update the progress bar
        blob.upload_from_filename(local_file_path)
        pbar.update(1)

## Step 1: Load the audio files

In [2]:
import os
from tqdm import tqdm
from google.cloud import storage

# Set up Google Cloud credentials and initialize the client
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name

# Define GCS directory and local download directory
directory_prefix = 'audio/'  # GCS directory prefix
download_directory = 'samples/audio'  # Local directory path

# Check if the local directory exists
if not os.path.exists(download_directory):
    # If directory doesn't exist, create it
    os.makedirs(download_directory)

    # List all blobs (files) in the specified GCS directory
    audios = bucket.list_blobs(prefix=directory_prefix)

    # Initialize tqdm, but set the total to len(audio_file_set), the actual number of files to download
    progress_bar = tqdm(total=14159, desc="Downloading Files", unit=" files", leave=False)
    for index, audio in enumerate(audios):
        if index != 0:
            audio_file_name = os.path.basename(audio.name)
            local_file_path = os.path.join(download_directory, audio_file_name)
            audio.download_to_filename(local_file_path)
            progress_bar.update(1)
    progress_bar.close()
else:
    # If the directory exists, read files from local storage
    print("Reading files from the local directory...")
    files = sorted(os.listdir(download_directory))
    print(f"Loaded {len(files)} files from local directory.")

Reading files from the local directory...
Loaded 14159 files from local directory.


## Step 2: Remove Files > 30s long and Extract Waveforms

The purpose of this step is to identify audio files that are longer than 30s. We can then remove them.

In [3]:
import os
import librosa
import numpy as np
from pydub import AudioSegment
from tqdm import tqdm

# Paths
input_dir = "samples/audio"
output_waveform_dir = "samples/waveform"
output_audio_dir = "samples/audio_processed"

# Ensure output directories exist
os.makedirs(output_audio_dir, exist_ok=True)
os.makedirs(output_waveform_dir, exist_ok=True)
        
def save_audio_and_waveform(file_path, filename, waveform, sample_rate):
    # Save the original audio as a single chunk
    base_name = os.path.splitext(filename)[0]
    
    # Save audio
    output_audio_path = os.path.join(output_audio_dir, f"{base_name}.wav")
    AudioSegment.from_file(file_path).export(output_audio_path, format="wav")
    
    # Save waveform as .npy file
    output_waveform_path = os.path.join(output_waveform_dir, f"waveform_{base_name}.wav.npy")
    np.save(output_waveform_path, waveform)

# Iterate through each audio file in the input directory
for filename in tqdm(os.listdir(input_dir), desc="Processing audio files"):
    if filename.endswith(".m4a"):
        file_path = os.path.join(input_dir, filename)
        try:
            # Load the audio file with librosa to access waveform and sample rate
            waveform, sample_rate = librosa.load(file_path, sr=16000)

            # Duration
            duration = librosa.get_duration(y=waveform, sr=sample_rate)

            # If file > 30s, drop it
            if duration <= 30:
                save_audio_and_waveform(file_path, filename, waveform, sample_rate)
        except Exception as e:
            tqdm.write(f"Skipping {filename} due to error: {e}")
            continue

Processing audio files:  88%|████████▊ | 12466/14159 [16:07<01:45, 16.07it/s]

Skipping hl_e062_A-59.m4a due to error: 


Processing audio files:  98%|█████████▊| 13923/14159 [17:50<00:15, 15.50it/s]

Skipping hl_e071_A-P-02.m4a due to error: 


Processing audio files: 100%|██████████| 14159/14159 [18:08<00:00, 13.01it/s]


## Step 2: Generate the "gold" transcriptions
Use `whisper-large-v3`

In [4]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

0

In [5]:
import os
import pandas as pd
import torch.backends
import torch.backends.mps
import torch.mps
import whisper
from whisper.normalizers import BasicTextNormalizer
from tqdm import tqdm
import torch
import math

# Load the Whisper model
model = whisper.load_model("large-v3")

# Set device
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Specify directory with audio files and checkpoint file path
audio_directory = 'samples/audio_processed'
checkpoint_path = 'transcription_checkpoint.csv'

# Load checkpoint if it exists
if os.path.exists(checkpoint_path):
    df = pd.read_csv(checkpoint_path)
    processed_files = set(df['audio_base_path'])
    print(f"Resuming from checkpoint. {len(processed_files)} files already processed.")
else:
    df = pd.DataFrame(columns=["patient", "path", "audio_base_path", "gold_transcript"])
    processed_files = set()

# List audio files in the directory
audio_files = [f for f in os.listdir(audio_directory) if os.path.isfile(os.path.join(audio_directory, f))]

# Transcribe each audio file and store results in DataFrame format
for audio_file in tqdm(audio_files, desc="Transcribing Audio Files"):
    if audio_file in processed_files:
        continue  # Skip already processed files

    audio_path = os.path.join(audio_directory, audio_file)
    audio_base_path = os.path.basename(audio_path)

    try:
        # Transcribe the audio
        result = model.transcribe(audio_path, language="en")
        gold_transcript = result["text"]

        # Ensure that patient file name is in the format 'al_xxxx' or 'hl_xxxx'
        patient = audio_base_path[:7]

        # Create a new row to add
        new_row = pd.DataFrame({
            "patient": [patient],
            "path": [audio_path],
            "audio_base_path": [audio_base_path],
            "gold_transcript": [gold_transcript]
        })

        # Append the new row to the DataFrame
        df = pd.concat([df, new_row], ignore_index=True)

        # Save checkpoint every 10 files
        if len(df) % 10 == 0:
            df.to_csv(checkpoint_path, index=False)

    except Exception as e:
        # Log the error and continue with the next file
        print(f"Error processing {audio_file}: {e}")
        continue

# Final save to CSV after processing all files
df.to_csv(checkpoint_path, index=False)
print("Transcription complete. Final checkpoint saved.")

# Display the DataFrame to verify
print(df.head())

100%|█████████████████████████████████████| 2.88G/2.88G [00:46<00:00, 65.9MiB/s]
Transcribing Audio Files: 100%|██████████| 13769/13769 [2:21:36<00:00,  1.62it/s]  

Transcription complete. Final checkpoint saved.
   patient                                      path   audio_base_path  \
0  al_e026  samples/audio_processed/al_e026_A-02.wav  al_e026_A-02.wav   
1  al_e026  samples/audio_processed/al_e026_A-03.wav  al_e026_A-03.wav   
2  al_e026  samples/audio_processed/al_e026_A-04.wav  al_e026_A-04.wav   
3  al_e026  samples/audio_processed/al_e026_A-05.wav  al_e026_A-05.wav   
4  al_e026  samples/audio_processed/al_e026_A-06.wav  al_e026_A-06.wav   

                                     gold_transcript  
0   I do body. And I have a bag. Racking a... ......  
1   My body is a frog, frog, frog, cow, cow, cow,...  
2   A crack? It looks like a bag of people clicki...  
3   The broke, a broke croaking, cracked calf, cr...  
4   This is a frog calf. The two persons, Bok is ...  





In [6]:
df = pd.read_csv(checkpoint_path)

In [7]:
df = df.dropna(ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13680 entries, 0 to 13679
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          13680 non-null  object
 1   path             13680 non-null  object
 2   audio_base_path  13680 non-null  object
 3   gold_transcript  13680 non-null  object
dtypes: object(4)
memory usage: 427.6+ KB


In [8]:
df.tail()

Unnamed: 0,patient,path,audio_base_path,gold_transcript
13675,hl_e072,samples/audio_processed/hl_e072_B-48.wav,hl_e072_B-48.wav,Running.
13676,hl_e072,samples/audio_processed/hl_e072_B-49.wav,hl_e072_B-49.wav,Sewing with a needle?
13677,hl_e072,samples/audio_processed/hl_e072_B-50.wav,hl_e072_B-50.wav,Shaving
13678,hl_e072,samples/audio_processed/hl_e072_B-51.wav,hl_e072_B-51.wav,Shooting
13679,hl_e072,samples/audio_processed/hl_e072_B-52.wav,hl_e072_B-52.wav,Singing.


In [9]:
import numpy as np
import pandas as pd

patients = df['patient'].unique()
np.random.shuffle(patients)

train_patients = patients[:int(0.7 * len(patients))]
val_patients = patients[int(0.7 * len(patients)):int(0.85 * len(patients))]
test_patients = patients[int(0.85 * len(patients)):]

train_df = df[df['patient'].isin(train_patients)]
val_df = df[df['patient'].isin(val_patients)]
test_df = df[df['patient'].isin(test_patients)]

train_df.to_csv('train_set.csv', index=False)
val_df.to_csv('val_set.csv', index=False)
test_df.to_csv('test_set.csv', index=False)

## Step 3: Export the transcripts to the bucket

In [10]:
import os
from tqdm import tqdm
from google.cloud import storage

# Set up your credentials and initialize the client
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()

# Define the bucket and target directory in the bucket
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name

directory_prefix = 'transcripts/'

# Function to upload a file to the specified GCS bucket path
def upload_to_bucket(local_file, bucket_path):
    blob = bucket.blob(bucket_path)
    blob.upload_from_filename(local_file)
    print(f"Uploaded {local_file} to {bucket_path}")

# Upload each CSV file to the bucket
upload_to_bucket('train_set.csv', os.path.join(directory_prefix, 'train_set.csv'))
upload_to_bucket('val_set.csv', os.path.join(directory_prefix, 'val_set.csv'))
upload_to_bucket('test_set.csv', os.path.join(directory_prefix, 'test_set.csv'))

Uploaded train_set.csv to transcripts/train_set.csv
Uploaded val_set.csv to transcripts/val_set.csv
Uploaded test_set.csv to transcripts/test_set.csv


## Step 4: Export the waveforms to the bucket

In [16]:
import os
from tqdm import tqdm
from google.cloud import storage
import time

# Set up your credentials and initialize the client
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()

# Define the bucket and target directory in the bucket
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name
directory_prefix = 'waveform'  # Destination prefix in the GCS bucket

# Specify your local directory with files to upload
local_directory = 'samples/waveform'  # Replace with your local directory path

# Get list of files in the directory
files = [f for f in os.listdir(local_directory) if os.path.isfile(os.path.join(local_directory, f))]

# Upload each file with a progress bar
with tqdm(total=len(files), desc="Uploading files", unit="file") as pbar:
    for filename in files:
        local_file_path = os.path.join(local_directory, filename)
        blob_path = os.path.join(directory_prefix, filename)
        blob = bucket.blob(blob_path)
        
        # Upload the file and update the progress bar
        blob.upload_from_filename(local_file_path)
        pbar.update(1)
        time.sleep(0.5)

Uploading files: 100%|██████████| 13769/13769 [2:47:44<00:00,  1.37file/s] 
