In [1]:
import os

def filter_folders(base_path):
    """Split the sub-directories of ``base_path`` into control and dysarthric groups.

    A sub-directory whose name contains an uppercase ``'C'`` goes into the
    control group; every other sub-directory goes into the dysarthric group.
    Entries that are not directories are ignored.

    Args:
        base_path: Directory whose immediate children are inspected.

    Returns:
        A ``(control_folders, dysarthric_folders)`` tuple of lists holding the
        joined paths, in the order ``os.listdir`` reported them.
    """
    # Bucket keyed by the result of the "is this a control folder?" test.
    buckets = {True: [], False: []}
    for entry_name in os.listdir(base_path):
        entry_path = os.path.join(base_path, entry_name)
        if not os.path.isdir(entry_path):
            continue
        buckets['C' in entry_name].append(entry_path)
    return buckets[True], buckets[False]

In [2]:
# Root folder of the corpus, resolved relative to the notebook's working directory.
base_directory = 'TORGO'
control, dysarthric = filter_folders(base_directory)

# Show which speaker folders landed in each group.
print(f"Control folders: {control}")
print(f"Dysarthric folders: {dysarthric}")


Control folders: ['TORGO/MC01', 'TORGO/FC01', 'TORGO/FC03', 'TORGO/MC03', 'TORGO/MC04', 'TORGO/MC02', 'TORGO/FC02']
Dysarthric folders: ['TORGO/M02', 'TORGO/M01', 'TORGO/M04', 'TORGO/F04', 'TORGO/M03', 'TORGO/F01', 'TORGO/M05', 'TORGO/F03']


In [3]:
def extract_sessions(base_path):
    """Return the session sub-directories found directly inside ``base_path``.

    An entry counts as a session when it is a directory and its own name
    contains ``'Session'``.

    Args:
        base_path: Directory (a speaker folder) whose children are inspected.

    Returns:
        List of joined paths, in the order ``os.listdir`` reported them.
    """
    session_folders = []
    for item in os.listdir(base_path):
        folder_path = os.path.join(base_path, item)
        # Test the entry name, not the joined path: otherwise a parent
        # directory that happens to contain "Session" would make every
        # sub-directory look like a session.
        if 'Session' in item and os.path.isdir(folder_path):
            session_folders.append(folder_path)

    return session_folders

In [4]:
# One entry per speaker folder; each entry is the list of session paths that
# extract_sessions returned for that speaker (so these are lists of lists).
control_session_folders = []
dysarthric_session_folders = []

# Collect and display the sessions of every control speaker.
print("Control Sessions:")
for control_folder in control:
    session_folders = extract_sessions(control_folder)
    control_session_folders.append(session_folders)
    print("Session folders:", session_folders)

# Blank line to separate the two groups in the output.
print()

# Same pass for the dysarthric speakers.
print("Dysarthric Sessions:")
for dysarthric_folder in dysarthric:
    session_folders = extract_sessions(dysarthric_folder)
    dysarthric_session_folders.append(session_folders)
    print("Session folders:", session_folders)

Control Sessions:
Session folders: ['TORGO/MC01/Session2', 'TORGO/MC01/Session1', 'TORGO/MC01/Session3']
Session folders: ['TORGO/FC01/Session1']
Session folders: ['TORGO/FC03/Session3', 'TORGO/FC03/Session1', 'TORGO/FC03/Session2']
Session folders: ['TORGO/MC03/Session2', 'TORGO/MC03/Session1']
Session folders: ['TORGO/MC04/Session2', 'TORGO/MC04/Session1']
Session folders: ['TORGO/MC02/Session1', 'TORGO/MC02/Session2']
Session folders: ['TORGO/FC02/Session2', 'TORGO/FC02/Session3']

Dysarthric Sessions:
Session folders: ['TORGO/M02/Session1', 'TORGO/M02/Session2']
Session folders: ['TORGO/M01/Session1', 'TORGO/M01/Session2_3']
Session folders: ['TORGO/M04/Session1', 'TORGO/M04/Session2']
Session folders: ['TORGO/F04/Session2', 'TORGO/F04/Session1']
Session folders: ['TORGO/M03/Session2']
Session folders: ['TORGO/F01/Session1']
Session folders: ['TORGO/M05/Session2', 'TORGO/M05/Session1']
Session folders: ['TORGO/F03/Session1', 'TORGO/F03/Session3', 'TORGO/F03/Session2']


In [5]:
import os
from pathlib import Path
import wave

def _collect_wav_durations(dir_path):
    """Return ``(paths, durations)`` for the ``.wav`` files directly inside ``dir_path``.

    Paths are sorted by file name and durations are in seconds. A missing
    directory yields two empty lists. Entries that are not regular files or do
    not end in ``.wav`` (case-insensitive) are skipped, because handing them to
    ``wave.open`` would raise and abort the whole session.
    """
    audio_files = []
    durations = []
    if not os.path.isdir(dir_path):
        return audio_files, durations

    for item in sorted(os.listdir(dir_path)):
        audio_file_path = os.path.join(dir_path, item)
        if not (item.lower().endswith('.wav') and os.path.isfile(audio_file_path)):
            continue
        with wave.open(audio_file_path, 'rb') as wav_file:
            # duration = number of frames / frames per second
            duration = wav_file.getnframes() / float(wav_file.getframerate())
        audio_files.append(audio_file_path)
        durations.append(duration)
    return audio_files, durations


def process_session_folder(folder_path):
    """Read the prompts and microphone recordings of one session folder.

    The function looks at three sub-directories of ``folder_path``:
    ``prompts`` (``*.txt`` files), ``wav_arrayMic`` and ``wav_headMic``
    (``.wav`` files). A missing microphone directory gives empty lists.

    Args:
        folder_path: Path of the session directory.

    Returns:
        A 5-tuple ``(prompts, array_mic_audio_files, array_mic_durations,
        head_mic_audio_files, head_mic_durations)`` where ``prompts`` holds the
        text of each prompt file sorted by path, the ``*_audio_files`` lists
        hold WAV paths sorted by file name, and the ``*_durations`` lists hold
        the matching lengths in seconds.

    Raises:
        wave.Error: If a ``.wav`` file cannot be parsed by the ``wave`` module.
    """
    prompts_dir_path = os.path.join(folder_path, 'prompts')
    array_mic_dir_path = os.path.join(folder_path, 'wav_arrayMic')
    head_mic_dir_path = os.path.join(folder_path, 'wav_headMic')

    # read prompts
    prompts = [
        prompt_file.read_text(encoding='utf-8')
        for prompt_file in sorted(Path(prompts_dir_path).glob('*.txt'))
    ]

    # both microphone folders share the same layout, so one helper serves both
    array_mic_audio_files, array_mic_durations = _collect_wav_durations(array_mic_dir_path)
    head_mic_audio_files, head_mic_durations = _collect_wav_durations(head_mic_dir_path)

    return prompts, array_mic_audio_files, array_mic_durations, head_mic_audio_files, head_mic_durations
    

In [6]:
print(dysarthric_session_folders[0][0])
prompts, array_mic_audio_files, array_mic_durations, head_mic_audio_files, head_mic_durations = process_session_folder(dysarthric_session_folders[0][0])
print(f'\nPrompts ({len(prompts)}):')
print(prompts[:5])
print(f'\nArray mic audio files ({len(array_mic_audio_files)}):')
print(array_mic_audio_files[:5])
print(array_mic_durations[:5])
print(f'\nHeadmic audio files ({len(head_mic_audio_files)}):')
print(head_mic_audio_files[:5])
print(head_mic_durations[:5])

TORGO/M02/Session1

Prompts (240):
["[say 'Pah-Tah-Kah' repeatedly]", '[relax your mouth in its normal position]', "[say 'Ah-P-Eee' repeatedly]", "[say 'Eee-P-Ah' repeatedly]", "[say 'OA' as in cOAt in a very low pitch]"]

Array mic audio files (240):
['TORGO/M02/Session1/wav_arrayMic/0001.wav', 'TORGO/M02/Session1/wav_arrayMic/0002.wav', 'TORGO/M02/Session1/wav_arrayMic/0003.wav', 'TORGO/M02/Session1/wav_arrayMic/0004.wav', 'TORGO/M02/Session1/wav_arrayMic/0005.wav']
[17.25, 4.2, 8.7, 7.65, 2.4]

Headmic audio files (240):
['TORGO/M02/Session1/wav_headMic/0001.wav', 'TORGO/M02/Session1/wav_headMic/0002.wav', 'TORGO/M02/Session1/wav_headMic/0003.wav', 'TORGO/M02/Session1/wav_headMic/0004.wav', 'TORGO/M02/Session1/wav_headMic/0005.wav']
[17.4249375, 6.5324375, 11.15825, 6.9386875, 4.43825]


In [7]:
import pandas as pd

def prepare_datasets(base_dir):
    """Build the control and dysarthric datasets for the corpus under ``base_dir``.

    Args:
        base_dir: Root directory containing one folder per speaker.

    Returns:
        A ``(dataset_control, dataset_dysarthric)`` tuple holding the results of
        ``prepare_dataset`` for each speaker group.
    """
    # Use the argument, not the notebook-level ``base_directory`` variable, so
    # the function works for any corpus root and on a fresh kernel.
    control_folders, dysarthric_folders = filter_folders(base_dir)
    dataset_control = prepare_dataset(control_folders)
    dataset_dysarthric = prepare_dataset(dysarthric_folders)
    return dataset_control, dataset_dysarthric

def prepare_dataset(speaker_folders):
    """Build one DataFrame describing every recording of the given speakers.

    Each row describes one audio file with the columns ``speaker``,
    ``session``, ``prompt``, ``path``, ``mic`` (``'array_mic'`` or
    ``'head_mic'``) and ``length`` (seconds).

    Prompts are paired with audio files by position in their sorted lists.
    NOTE(review): this assumes the n-th prompt file belongs to the n-th audio
    file; confirm against the corpus layout, since a missing recording in the
    middle of a session would shift the pairing.

    Args:
        speaker_folders: Iterable of speaker directory paths.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index; empty (but with the six
        columns) when no recordings are found.

    Raises:
        ValueError: If a session has more audio files than prompts for a
            microphone.
    """
    columns = ['speaker', 'session', 'prompt', 'path', 'mic', 'length']
    records = []

    # process speakers
    for speaker_folder in sorted(speaker_folders):
        # basename works for nested base paths and either path separator
        speaker = os.path.basename(os.path.normpath(speaker_folder))

        # process speaker sessions
        for session in sorted(extract_sessions(speaker_folder)):
            session_name = os.path.basename(os.path.normpath(session))
            # Keep everything after the "Session" prefix so that a folder such
            # as "Session2_3" is labelled "2_3" instead of just "3".
            if session_name.startswith('Session'):
                session_label = session_name[len('Session'):] or session_name
            else:
                session_label = session_name

            # process session
            prompts, array_mic, array_mic_durations, head_mic, head_mic_durations = process_session_folder(session)

            for mic, paths, durations in (
                ('array_mic', array_mic, array_mic_durations),
                ('head_mic', head_mic, head_mic_durations),
            ):
                mic_prompts = prompts[:len(paths)]
                if len(mic_prompts) != len(paths):
                    raise ValueError(
                        f"{session}: {len(paths)} {mic} audio files but only "
                        f"{len(prompts)} prompts"
                    )
                for prompt, path, length in zip(mic_prompts, paths, durations):
                    records.append({
                        'speaker': speaker,
                        'session': session_label,
                        'prompt': prompt,
                        'path': path,
                        'mic': mic,
                        'length': length,
                    })

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=columns)
            
    

ModuleNotFoundError: No module named 'pandas'

In [None]:
import pickle

# Small sample record used to demonstrate a pickle round trip
data = dict(name="Alice", age=30, is_student=False)

# Serialize the record to disk
with open("data.pkl", mode="wb") as pickle_out:
    pickle.dump(data, pickle_out)

# Read the same file back and rebuild the object.
# Only unpickle files you created yourself: loading untrusted pickles can run arbitrary code.
with open("data.pkl", mode="rb") as pickle_in:
    loaded_data = pickle.load(pickle_in)

print(loaded_data)