# Load data

In [38]:
import math
import os

import librosa
import pandas as pd

def load_positive_data(root_dir, sample_rate=16000):
    """
    Load positive (cough) segments from a dataset where each sample is in its own folder.

    Every sub-folder of ``root_dir`` is expected to contain ``data.wav`` and
    ``label.label``. The label file is read as CSV and must provide the
    columns ``Time(Seconds)`` (segment start) and ``Length(Seconds)``
    (segment duration). Each usable label row yields one slice of the audio.

    Parameters:
    - root_dir: Root directory containing positive sample folders.
    - sample_rate: Sampling rate for loading audio files (passed to
      ``librosa.load`` as ``sr``).

    Returns:
    - segments: List of NumPy arrays containing cough audio segments. Folders
      are visited in ``os.listdir`` order (arbitrary, so not guaranteed to be
      stable across machines); within a folder, segments follow label-row order.

    Notes:
    - Label rows with a missing, non-numeric or non-finite start/length are
      skipped instead of aborting the whole load.
    - A negative start is clamped to sample 0 so the slice cannot wrap around
      to the end of the recording.
    - Rows that produce an empty slice (interval outside the audio, or a
      non-positive length) are skipped, so every returned segment is non-empty.
    - Folders with missing files or missing label columns are reported on
      stdout and skipped.
    """
    segments = []
    # Iterate over each sample folder
    for sample_folder in os.listdir(root_dir):
        sample_path = os.path.join(root_dir, sample_folder)
        print(sample_path)
        if not os.path.isdir(sample_path):
            # Stray files such as .DS_Store are not samples.
            continue

        # Define paths to audio and label files
        audio_file = os.path.join(sample_path, 'data.wav')
        label_file = os.path.join(sample_path, 'label.label')
        if not (os.path.exists(audio_file) and os.path.exists(label_file)):
            print(f"Missing audio or label file in {sample_path}.")
            continue

        # Use the rate reported by librosa (`sr`) for the seconds -> samples conversion.
        audio, sr = librosa.load(audio_file, sr=sample_rate)
        labels = pd.read_csv(label_file)
        if not {'Time(Seconds)', 'Length(Seconds)'}.issubset(labels.columns):
            print(f"Label file {label_file} missing 'Time(Seconds)' or 'Length(Seconds)' columns.")
            continue

        # Coerce to numbers so malformed cells become NaN and can be skipped
        # row-by-row rather than raising in the middle of the load.
        starts = pd.to_numeric(labels['Time(Seconds)'], errors='coerce')
        lengths = pd.to_numeric(labels['Length(Seconds)'], errors='coerce')

        skipped = 0
        for start_s, length_s in zip(starts, lengths):
            if not (math.isfinite(start_s) and math.isfinite(length_s)):
                skipped += 1
                continue
            # Clamp both bounds at 0: a negative index would otherwise be
            # interpreted relative to the END of the array.
            start_sample = max(int(start_s * sr), 0)
            end_sample = max(int((start_s + length_s) * sr), 0)
            segment = audio[start_sample:end_sample]
            if len(segment) == 0:
                skipped += 1
                continue
            segments.append(segment)

        if skipped:
            print(f"Skipped {skipped} unusable label row(s) in {label_file}.")
    return segments

In [39]:
# Root folder with one sub-folder per cough recording; load_positive_data looks for
# data.wav and label.label inside each sub-folder.
# NOTE(review): despite the "erocshadow" prefix, the folder listing printed by this cell
# shows recordings from many sources, so `erocshadow_cough` holds segments from all of
# them. Consider renaming in a follow-up that also updates the later cells using it.
erocshadow_cough_folder = "data/raw/positive/coughing"

erocshadow_cough = load_positive_data(root_dir=erocshadow_cough_folder)


data/raw/positive/coughing/mix2_cough_train
data/raw/positive/coughing/407839__bobtheross__coughing
data/raw/positive/coughing/594629__tshepangncwane_200253__coughing
data/raw/positive/coughing/184871__eelke__coughing
data/raw/positive/coughing/155650__poorenglishjuggler__cough
data/raw/positive/coughing/366229__millonlazaruspillay__coughing
data/raw/positive/coughing/.DS_Store
data/raw/positive/coughing/mix4_cough_train
data/raw/positive/coughing/457972__fabrizio84__cough
data/raw/positive/coughing/408086__biawinter__cough
data/raw/positive/coughing/mix1_cough_train
data/raw/positive/coughing/425777__thatkellytrna__girl-cough
data/raw/positive/coughing/108017__erocshadow__cough
data/raw/positive/coughing/spectorj__voice-request-33-sarcastic-coughing
data/raw/positive/coughing/650914__frenkfurth__coughing-001
data/raw/positive/coughing/mix3_cough_train
data/raw/positive/coughing/371360__goldkelchen__cough
data/raw/positive/coughing/408888__puzzelz__coughing
data/raw/positive/coughing/4

In [47]:

# Inspect the first loaded segment. Per the original note this is the first cough in
# mix2_cough_train (13363 samples).
# NOTE(review): which folder is loaded first follows os.listdir order, which is
# arbitrary - re-check this claim after re-running on another machine.
print(erocshadow_cough[0])

[ 0.00064087 -0.00192261  0.0020752  ... -0.02819824 -0.01596069
 -0.02526855]
