# Load data

In [13]:
import os
import librosa
import pandas as pd
import src.label_parsing as lp 

def _slice_labelled_segments(audio, sr, labels):
    """Cut every labelled span out of ``audio`` (helper for ``load_positive_data``).

    Parameters:
    - audio: 1-D sequence of audio samples.
    - sr: Sampling rate used to convert seconds into sample indices.
    - labels: DataFrame with 'Time(Seconds)' and 'Length(Seconds)' columns.

    Returns:
    - List of non-empty slices of ``audio``, one per usable label row.
    """
    total_samples = len(audio)
    pieces = []
    for onset, length in zip(labels['Time(Seconds)'], labels['Length(Seconds)']):
        # Clamp to the waveform: a negative index would otherwise be read as an
        # offset from the END of the array and silently return the wrong audio.
        start_sample = max(int(onset * sr), 0)
        end_sample = min(int((onset + length) * sr), total_samples)
        # Labels that fall outside the recording (or have no duration) would
        # yield empty arrays; skip them instead of handing them to callers.
        if end_sample > start_sample:
            pieces.append(audio[start_sample:end_sample])
    return pieces


def load_positive_data(root_dir, sample_rate=16000):
    """
    Load positive (cough) segments from a dataset where each sample is in its own folder.

    Every sub-folder of ``root_dir`` is expected to hold ``data.wav`` and
    ``label.label`` (a CSV with 'Time(Seconds)' and 'Length(Seconds)' columns).
    Folders missing either file, or whose label file lacks those columns, are
    reported on stdout and skipped. Sub-folders are visited in sorted name
    order so the returned list is reproducible between runs and machines.

    Parameters:
    - root_dir: Root directory containing positive sample folders.
    - sample_rate: Sampling rate for loading audio files.

    Returns:
    - segments: List of NumPy arrays containing cough audio segments. Labels
      that lie outside the audio or have zero length contribute no segment.
    """
    segments = []
    # os.listdir() gives no ordering guarantee; sort for a deterministic result.
    for sample_folder in sorted(os.listdir(root_dir)):
        sample_path = os.path.join(root_dir, sample_folder)
        if not os.path.isdir(sample_path):
            continue

        audio_file = os.path.join(sample_path, 'data.wav')
        label_file = os.path.join(sample_path, 'label.label')
        if not (os.path.exists(audio_file) and os.path.exists(label_file)):
            print(f"Missing audio or label file in {sample_path}.")
            continue

        print("------------------------")
        print("Data File:", audio_file)
        print("Label File:", label_file)
        print("------------------------")

        audio, sr = librosa.load(audio_file, sr=sample_rate)
        labels = pd.read_csv(label_file)
        if not {'Time(Seconds)', 'Length(Seconds)'}.issubset(labels.columns):
            print(f"Label file {label_file} missing 'Time(Seconds)' or 'Length(Seconds)' columns.")
            continue

        segments.extend(_slice_labelled_segments(audio, sr, labels))
    return segments


In [None]:

# Load every labelled cough segment from the raw positive dataset.
cough_folder = "data/raw/positive/coughing"
cough_seg = load_positive_data(cough_folder)
# Sanity check: show the first loaded segment. Guard against an empty result
# so a folder without usable samples gives a clear message, not an IndexError.
if cough_seg:
    print(cough_seg[0])  # The first cough in mix2_cough_train, 13363 samples
else:
    print(f"No cough segments were loaded from {cough_folder}.")

# Feature extraction

In [15]:
import librosa
import numpy as np

def extract_mel_spectrogram(segment, sr, n_mels=64):
    """
    Compute a log-scaled mel spectrogram for one audio segment.

    Parameters:
    - segment: Audio samples, passed to librosa as ``y``.
    - sr: Sampling rate of ``segment``.
    - n_mels: Number of mel bands to generate.

    Returns:
    - The mel spectrogram from ``librosa.feature.melspectrogram`` converted
      with ``librosa.power_to_db`` (both called with librosa's other defaults).
    """
    mel_power = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_power)