# Exploratory Data Analysis of Our Audio Files

## Download Audio Files, Train, Test, Validation Sets, and Waveforms

In [1]:
import os
from tqdm import tqdm
from google.cloud import storage

# Set up Google Cloud credentials and initialize the client.
# NOTE(review): the service-account key file name is hard-coded; consider
# exporting GOOGLE_APPLICATION_CREDENTIALS outside the notebook instead.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name

# Define GCS directory and local download directory
directory_prefix = 'audio/'  # GCS directory prefix
download_directory = 'samples/audio'  # Local directory path

# Download when there is no usable local copy: the directory is missing, or it
# exists but is empty (e.g. an earlier run failed right after creating it).
if not os.path.isdir(download_directory) or not os.listdir(download_directory):
    os.makedirs(download_directory, exist_ok=True)

    # List the objects under the prefix, dropping "folder" placeholder objects
    # (names ending in '/') by name instead of assuming the placeholder is
    # always the first listing entry.
    audios = [
        blob
        for blob in bucket.list_blobs(prefix=directory_prefix)
        if not blob.name.endswith('/')
    ]

    # The progress total is the real number of objects to fetch; the context
    # manager closes the bar even if a download raises.
    with tqdm(total=len(audios), desc="Downloading Files", unit=" files", leave=False) as progress_bar:
        for audio in audios:
            audio_file_name = os.path.basename(audio.name)
            local_file_path = os.path.join(download_directory, audio_file_name)
            audio.download_to_filename(local_file_path)
            progress_bar.update(1)
else:
    # A populated directory is treated as a complete earlier download.
    # NOTE(review): a partially completed download is not detected here.
    print("Reading files from the local directory...")

# Defined on both paths so `files` is available whether or not a download ran.
files = sorted(os.listdir(download_directory))
print(f"Loaded {len(files)} files from local directory.")

                                                                            

In [2]:
import os
from tqdm import tqdm
from google.cloud import storage

# Set up Google Cloud credentials and initialize the client.
# NOTE(review): the service-account key file name is hard-coded; consider
# exporting GOOGLE_APPLICATION_CREDENTIALS outside the notebook instead.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name

# Define GCS directory and local download directory
directory_prefix = 'transcripts/'  # GCS directory prefix
download_directory = 'samples/transcripts'  # Local directory path

# Download when there is no usable local copy: the directory is missing, or it
# exists but is empty (e.g. an earlier run failed right after creating it).
if not os.path.isdir(download_directory) or not os.listdir(download_directory):
    os.makedirs(download_directory, exist_ok=True)

    # List the objects under the prefix. A "folder" placeholder object (name
    # ending in '/') has an empty basename and cannot be saved as a file, so
    # such entries are filtered out.
    audios = [
        blob
        for blob in bucket.list_blobs(prefix=directory_prefix)
        if not blob.name.endswith('/')
    ]

    # The progress total is the real number of objects to fetch; the context
    # manager closes the bar even if a download raises.
    with tqdm(total=len(audios), desc="Downloading Gold Dataset", unit=" files", leave=False) as progress_bar:
        for audio in audios:
            audio_file_name = os.path.basename(audio.name)
            local_file_path = os.path.join(download_directory, audio_file_name)
            audio.download_to_filename(local_file_path)
            progress_bar.update(1)
else:
    # A populated directory is treated as a complete earlier download.
    # NOTE(review): a partially completed download is not detected here.
    print("Reading files from the local directory...")

# Defined on both paths so `files` is available whether or not a download ran.
files = sorted(os.listdir(download_directory))
print(f"Loaded {len(files)} files from local directory.")

                                                                           

In [3]:
import os
from tqdm import tqdm
from google.cloud import storage

# Set up Google Cloud credentials and initialize the client.
# NOTE(review): the service-account key file name is hard-coded; consider
# exporting GOOGLE_APPLICATION_CREDENTIALS outside the notebook instead.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'aphasia-chatter-5a70166fc2f1.json'
client = storage.Client()
bucket = client.get_bucket('speech-sit-bucket')  # Replace with your bucket name

# Define GCS directory and local download directory
directory_prefix = 'waveform/'  # GCS directory prefix
download_directory = 'samples/waveform'  # Local directory path

# Download when there is no usable local copy: the directory is missing, or it
# exists but is empty (e.g. an earlier run failed right after creating it).
if not os.path.isdir(download_directory) or not os.listdir(download_directory):
    os.makedirs(download_directory, exist_ok=True)

    # List the objects under the prefix, dropping "folder" placeholder objects
    # (names ending in '/') by name instead of assuming the placeholder is
    # always the first listing entry.
    audios = [
        blob
        for blob in bucket.list_blobs(prefix=directory_prefix)
        if not blob.name.endswith('/')
    ]

    # The progress total is the real number of objects to fetch; the context
    # manager closes the bar even if a download raises.
    with tqdm(total=len(audios), desc="Downloading Files", unit=" files", leave=False) as progress_bar:
        for audio in audios:
            audio_file_name = os.path.basename(audio.name)
            local_file_path = os.path.join(download_directory, audio_file_name)
            audio.download_to_filename(local_file_path)
            progress_bar.update(1)
else:
    # A populated directory is treated as a complete earlier download.
    # NOTE(review): a partially completed download is not detected here.
    print("Reading files from the local directory...")

# Defined on both paths so `files` is available whether or not a download ran.
files = sorted(os.listdir(download_directory))
print(f"Loaded {len(files)} files from local directory.")

                                                                             

## Calculate the Total Audio Time

In [1]:
import pandas as pd

# Read the three transcript splits (train / validation / test) from the
# local transcripts directory, in that order.
split_csv_paths = (
    'samples/transcripts/train_set.csv',
    'samples/transcripts/val_set.csv',
    'samples/transcripts/test_set.csv',
)
train_set, val_set, test_set = (pd.read_csv(csv_path) for csv_path in split_csv_paths)


In [46]:
# Summarise the training split: row count, column names, non-null counts and dtypes.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9422 entries, 0 to 9421
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          9422 non-null   object
 1   path             9422 non-null   object
 2   audio_base_path  9422 non-null   object
 3   gold_transcript  9422 non-null   object
 4   waveform_path    9422 non-null   object
dtypes: object(5)
memory usage: 368.2+ KB


In [47]:
# Summarise the validation split: row count, column names, non-null counts and dtypes.
val_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          2047 non-null   object
 1   path             2047 non-null   object
 2   audio_base_path  2047 non-null   object
 3   gold_transcript  2047 non-null   object
 4   waveform_path    2047 non-null   object
dtypes: object(5)
memory usage: 80.1+ KB


In [45]:
# Summarise the test split: row count, column names, non-null counts and dtypes.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2211 entries, 0 to 2210
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          2211 non-null   object
 1   path             2211 non-null   object
 2   audio_base_path  2211 non-null   object
 3   gold_transcript  2211 non-null   object
 4   waveform_path    2211 non-null   object
dtypes: object(5)
memory usage: 86.5+ KB


In [5]:
# Point `waveform_path` in every split at the local array file:
#   samples/waveform/waveform_<audio_base_path>.npy
# One loop replaces three copy-pasted statements, and the column-wise string
# concatenation avoids a Python-level lambda call per row.
# `astype(str)` mirrors the string formatting an f-string would apply.
for split_df in (train_set, val_set, test_set):
    split_df['waveform_path'] = (
        "samples/waveform/waveform_" + split_df['audio_base_path'].astype(str) + ".npy"
    )

train_set.head()

Unnamed: 0,patient,path,audio_base_path,gold_transcript,waveform_path
0,al_e026,samples/audio_processed/al_e026_A-02.wav,al_e026_A-02.wav,I do body. And I have a bag. Racking a... ......,samples/waveform/waveform_al_e026_A-02.wav.npy
1,al_e026,samples/audio_processed/al_e026_A-03.wav,al_e026_A-03.wav,"My body is a frog, frog, frog, cow, cow, cow,...",samples/waveform/waveform_al_e026_A-03.wav.npy
2,al_e026,samples/audio_processed/al_e026_A-04.wav,al_e026_A-04.wav,A crack? It looks like a bag of people clicki...,samples/waveform/waveform_al_e026_A-04.wav.npy
3,al_e026,samples/audio_processed/al_e026_A-05.wav,al_e026_A-05.wav,"The broke, a broke croaking, cracked calf, cr...",samples/waveform/waveform_al_e026_A-05.wav.npy
4,al_e026,samples/audio_processed/al_e026_A-06.wav,al_e026_A-06.wav,"This is a frog calf. The two persons, Bok is ...",samples/waveform/waveform_al_e026_A-06.wav.npy


In [6]:
import os
import librosa
import numpy as np

def compute_audio_length(df: pd.DataFrame, sr: int = 16000):
    """Return the summed duration, in seconds, of every waveform listed in ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``waveform_path`` column whose entries are paths to
        ``.npy`` files loadable with ``np.load``.
    sr : int, default 16000
        Sampling rate (samples per second) used to convert sample counts to
        seconds. The default keeps the previously hard-coded value.

    Returns
    -------
    float
        Total duration in seconds (``0`` when ``df`` has no rows).

    Raises
    ------
    ValueError
        If ``sr`` is not positive.
    KeyError
        If ``df`` has no ``waveform_path`` column.
    """
    if sr <= 0:
        raise ValueError(f"sr must be positive, got {sr}")

    # Only the path column is needed, so iterate over it directly instead of
    # materialising every row with iterrows().
    # Duration = samples along the last axis / sampling rate, which is the
    # quantity librosa.get_duration(y=..., sr=...) reports for an in-memory signal.
    return sum(np.load(path).shape[-1] / sr for path in df['waveform_path'])

In [7]:
# Total training audio: sum the per-file durations (seconds), then convert to hours.
train_seconds = compute_audio_length(train_set)
total_time_train_hours = train_seconds / 3600
print(f"Train set total audio time (hours): {total_time_train_hours:.1f}")

Train set total audio time (hours): 15.1


In [8]:
# Total validation audio: sum the per-file durations (seconds), then convert to hours.
val_seconds = compute_audio_length(val_set)
total_time_val_hours = val_seconds / 3600
print(f"Validation set total audio time (hours): {total_time_val_hours:.1f}")

Validation set total audio time (hours): 3.2


In [9]:
# Total test audio: sum the per-file durations (seconds), then convert to hours.
test_seconds = compute_audio_length(test_set)
total_time_test_hours = test_seconds / 3600
print(f"Test set total audio time (hours): {total_time_test_hours:.1f}")

Test set total audio time (hours): 3.6


In [10]:
# Re-display the training split summary (row count, columns, non-null counts, dtypes).
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9422 entries, 0 to 9421
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          9422 non-null   object
 1   path             9422 non-null   object
 2   audio_base_path  9422 non-null   object
 3   gold_transcript  9422 non-null   object
 4   waveform_path    9422 non-null   object
dtypes: object(5)
memory usage: 368.2+ KB


In [35]:
# Partition the downloaded audio file names into two groups: names that
# contain 'al_' are collected as aphasic, every other name as normal.
# NOTE(review): this is a substring match, not a prefix match.
audio_names = os.listdir('samples/audio')
aphasic = [name for name in audio_names if 'al_' in name]
normal = [name for name in audio_names if 'al_' not in name]

# Report the size of each group, one count per line.
print(len(aphasic), len(normal), sep='\n')

2989
11170


In [42]:
import librosa
import numpy as np

# Partition the waveform file names the same way as the audio files:
# names containing 'al_' are aphasic, everything else is normal.
waveform_names = os.listdir('samples/waveform')
aphasic_wf = [name for name in waveform_names if 'al_' in name]
normal_wf = [name for name in waveform_names if 'al_' not in name]

# Accumulate each group's duration in seconds, loading one array at a time.
total_aphasic_seconds = sum(
    librosa.get_duration(y=np.load(f'samples/waveform/{name}'), sr=16000)
    for name in aphasic_wf
)
total_normal_seconds = sum(
    librosa.get_duration(y=np.load(f'samples/waveform/{name}'), sr=16000)
    for name in normal_wf
)

# Report both totals in hours (aphasic first, then normal).
print(f"{total_aphasic_seconds / (3600):.01f} hours")
print(f"{total_normal_seconds / (3600):.01f} hours")

7.3 hours
14.9 hours


In [11]:
# Re-display the validation split summary (row count, columns, non-null counts, dtypes).
val_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          2047 non-null   object
 1   path             2047 non-null   object
 2   audio_base_path  2047 non-null   object
 3   gold_transcript  2047 non-null   object
 4   waveform_path    2047 non-null   object
dtypes: object(5)
memory usage: 80.1+ KB


In [44]:
# List the distinct values of the `patient` column in the training split.
train_set['patient'].unique()

array(['al_e026', 'al_e028', 'al_e078', 'al_e085', 'al_e099', 'al_e100',
       'al_e101', 'al_e117', 'al_e118', 'al_e122', 'al_e132', 'al_e179',
       'hl_e002', 'hl_e003', 'hl_e005', 'hl_e006', 'hl_e007', 'hl_e008',
       'hl_e010', 'hl_e011', 'hl_e013', 'hl_e014', 'hl_e015', 'hl_e016',
       'hl_e017', 'hl_e018', 'hl_e019', 'hl_e020', 'hl_e021', 'hl_e024',
       'hl_e025', 'hl_e023', 'hl_e031', 'hl_e027', 'hl_e032', 'hl_e033',
       'hl_e034', 'hl_e035', 'hl_e037', 'hl_e043', 'hl_e044', 'hl_e045',
       'hl_e046', 'hl_e047', 'hl_e050', 'hl_e051', 'hl_e049', 'hl_e052',
       'hl_e053', 'hl_e057', 'hl_e059', 'hl_e058', 'hl_e060', 'hl_e062',
       'hl_e065', 'hl_e066', 'hl_e069', 'hl_e071', 'hl_e070', 'hl_e072'],
      dtype=object)

In [12]:
# Re-display the test split summary (row count, columns, non-null counts, dtypes).
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2211 entries, 0 to 2210
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   patient          2211 non-null   object
 1   path             2211 non-null   object
 2   audio_base_path  2211 non-null   object
 3   gold_transcript  2211 non-null   object
 4   waveform_path    2211 non-null   object
dtypes: object(5)
memory usage: 86.5+ KB


In [13]:
# Inspect the first row of the validation split.
val_set.iloc[0]

patient                                                   al_e048
path                     samples/audio_processed/al_e048_A-07.wav
audio_base_path                                  al_e048_A-07.wav
gold_transcript                                                 B
waveform_path      samples/waveform/waveform_al_e048_A-07.wav.npy
Name: 0, dtype: object

In [14]:
from whisper import log_mel_spectrogram

In [15]:
# Preview the first rows of the training split.
train_set.head()

Unnamed: 0,patient,path,audio_base_path,gold_transcript,waveform_path
0,al_e026,samples/audio_processed/al_e026_A-02.wav,al_e026_A-02.wav,I do body. And I have a bag. Racking a... ......,samples/waveform/waveform_al_e026_A-02.wav.npy
1,al_e026,samples/audio_processed/al_e026_A-03.wav,al_e026_A-03.wav,"My body is a frog, frog, frog, cow, cow, cow,...",samples/waveform/waveform_al_e026_A-03.wav.npy
2,al_e026,samples/audio_processed/al_e026_A-04.wav,al_e026_A-04.wav,A crack? It looks like a bag of people clicki...,samples/waveform/waveform_al_e026_A-04.wav.npy
3,al_e026,samples/audio_processed/al_e026_A-05.wav,al_e026_A-05.wav,"The broke, a broke croaking, cracked calf, cr...",samples/waveform/waveform_al_e026_A-05.wav.npy
4,al_e026,samples/audio_processed/al_e026_A-06.wav,al_e026_A-06.wav,"This is a frog calf. The two persons, Bok is ...",samples/waveform/waveform_al_e026_A-06.wav.npy


In [17]:
import whisper

# Show the model names reported by the installed whisper package.
whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large-v1',
 'large-v2',
 'large-v3',
 'large',
 'large-v3-turbo',
 'turbo']

In [23]:
import whisper
from whisper import DecodingOptions

import ssl
# SECURITY: certificate verification is switched off so the checkpoint can be
# fetched on machines where the default HTTPS context fails. The override is
# limited to the load_model call and the verifying factory is restored
# afterwards, so later HTTPS requests made from this kernel are verified again.
# NOTE(review): prefer fixing the local CA bundle over disabling verification.
_verified_https_context_factory = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
try:
    model = whisper.load_model('large-v2')
finally:
    ssl._create_default_https_context = _verified_https_context_factory


In [32]:
# Transcribe one training utterance twice to compare decoding settings:
# first with transcribe()'s default temperature argument, then with
# temperature=0.3. The waveform is read from disk once and reused.
# NOTE(review): the temperature=0.3 run samples tokens, so its text may differ
# between executions unless the random seed is fixed.
sample_waveform = np.load(train_set['waveform_path'].iloc[55])

res_temp0 = model.transcribe(
    sample_waveform,
    verbose=True,
    language="en"
)

res_temp03 = model.transcribe(
    sample_waveform,
    verbose=True,
    temperature=0.3,
    language="en"
)

print(res_temp0['text'])
print(res_temp03['text'])

[00:00.000 --> 00:13.000]  Bride boy, you have a small little bride. She's a very pretty girl. She's a small boy.
[00:13.000 --> 00:25.000]  She's been clicking her body and looking at her body when she's clicking small things.
[00:00.000 --> 00:12.000]  Bright boy, you have a small little bright. She's a very pretty people. She's a small boy.
[00:12.000 --> 00:25.000]  She's been clicking her body and looking at us, talking her body when she's clicking small things.
 Bride boy, you have a small little bride. She's a very pretty girl. She's a small boy. She's been clicking her body and looking at her body when she's clicking small things.
 Bright boy, you have a small little bright. She's a very pretty people. She's a small boy. She's been clicking her body and looking at us, talking her body when she's clicking small things.


In [33]:
from jiwer import wer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

# Both hypotheses are scored against the same normalised gold transcript
# (training row 55), so the reference is normalised once and reused.
reference_text = normalizer(train_set['gold_transcript'].iloc[55])
hypothesis_temp_0 = normalizer(res_temp0['text'])
hypothesis_temp_03 = normalizer(res_temp03['text'])

wer_temp_0 = wer(reference_text, hypothesis_temp_0)
wer_temp_03 = wer(reference_text, hypothesis_temp_03)

# Word error rates are reported as percentages.
print(f"WER for temp = 0.0: {wer_temp_0 * 100}")
print(f"WER for temp = 0.3: {wer_temp_03 * 100}")

WER for temp = 0.0: 5.555555555555555
WER for temp = 0.3: 11.11111111111111


In [40]:
# Build a fixed-width log-mel spectrogram for the first validation utterance.
# NOTE(review): 3000 frames is presumably the input width the model expects;
# confirm against whisper.audio.N_FRAMES.
# NOTE(review): padding is done with zeros in log-mel space; confirm this
# matches how the model's inputs are padded elsewhere.
target_frames = 3000
spec = np.asarray(log_mel_spectrogram(np.load(val_set['waveform_path'].iloc[0])))
# Truncate first so a longer-than-target clip cannot produce a negative pad
# width (np.pad raises ValueError for negative widths).
spec = spec[:, :target_frames]
spec = np.pad(spec, ((0, 0), (0, target_frames - spec.shape[1])), mode='constant')


In [43]:
import torch

# Convert the padded spectrogram into a torch tensor (it is later passed to model.decode).
# NOTE(review): this rebinds `spec`, so re-running the cell operates on a tensor.
spec = torch.tensor(spec)

In [45]:
from whisper import DecodingOptions

# Decode the prepared spectrogram directly, asking for an English
# transcription, and display the resulting DecodingResult.
decoding_options = DecodingOptions(task="transcribe", language="en")
res = model.decode(spec, options=decoding_options)

res

DecodingResult(audio_features=tensor([[-1.1787,  1.7676, -0.6934,  ..., -0.4246, -0.9922,  0.3442],
        [-0.8120,  2.2559, -0.1475,  ..., -0.3032, -0.8276,  0.2561],
        [ 0.0821,  2.2441, -0.0927,  ..., -0.5024, -0.6113, -0.2166],
        ...,
        [-0.7119, -1.5713, -0.0238,  ...,  1.3818,  0.2429,  0.7856],
        [-1.0498, -1.6338,  0.0679,  ...,  1.1738, -0.1987,  0.6753],
        [-1.6641, -1.1670, -0.0039,  ...,  0.9429, -0.5078,  0.6011]],
       dtype=torch.float16), language='en', language_probs=None, tokens=[50363, 347, 50463], text='B', avg_logprob=-0.7366571426391602, no_speech_prob=0.18790775537490845, temperature=0.0, compression_ratio=0.1111111111111111)

DecodingResult(audio_features=tensor([[-1.1787,  1.7676, -0.6934,  ..., -0.4246, -0.9922,  0.3442],
        [-0.8120,  2.2559, -0.1475,  ..., -0.3032, -0.8276,  0.2561],
        [ 0.0821,  2.2441, -0.0927,  ..., -0.5024, -0.6113, -0.2166],
        ...,
        [-0.7119, -1.5713, -0.0238,  ...,  1.3818,  0.2429,  0.7856],
        [-1.0498, -1.6338,  0.0679,  ...,  1.1738, -0.1987,  0.6753],
        [-1.6641, -1.1670, -0.0039,  ...,  0.9429, -0.5078,  0.6011]],
       dtype=torch.float16), language='en', language_probs=None, tokens=[50363, 347, 50463], text='B', avg_logprob=-0.7366571426391602, no_speech_prob=0.18790775537490845, temperature=0.0, compression_ratio=0.1111111111111111)