# Load data

In [4]:
import os
import librosa
import numpy as np
import pandas as pd
import src.label_parsing as lp 

def load_positive_data(dir, sample_rate=16000):
    """
    Load positive (cough) segments from folder.

    Each sub-folder of ``dir`` is expected to contain a ``data.wav`` recording
    and a ``label.label`` CSV file with ``Time(Seconds)`` and
    ``Length(Seconds)`` columns giving the start and duration of every cough.
    Sub-folders missing either file, or whose label file lacks those columns,
    are reported with a printed message and skipped.

    Args:
    - dir: Directory containing positive sample folders.
    - sample_rate: Sampling rate for loading audio files.

    Returns:
    - segments: List of NumPy arrays containing cough audio segments, ordered
      by sample folder name and then by label row. Segments are cut along the
      last (time) axis, so a multi-channel recording keeps all its channels.
    """
    required_columns = {'Time(Seconds)', 'Length(Seconds)'}
    segments = []
    # os.listdir returns entries in arbitrary order; sort so that the segment
    # order (and therefore any index-based inspection) is reproducible.
    for sample_folder in sorted(os.listdir(dir)):
        sample_path = os.path.join(dir, sample_folder)
        if not os.path.isdir(sample_path):
            continue

        audio_file = os.path.join(sample_path, 'data.wav')
        label_file = os.path.join(sample_path, 'label.label')
        if not (os.path.exists(audio_file) and os.path.exists(label_file)):
            print(f"Missing audio or label file in {sample_path}.")
            continue

        # mono=False keeps the original channel layout so stereo files can be
        # detected and reported below.
        audio, sr = librosa.load(audio_file, sr=sample_rate, mono=False)
        if audio.ndim != 1:
            print(f"Stereo Audio from file {audio_file}")

        labels = pd.read_csv(label_file)
        if not required_columns.issubset(labels.columns):
            print(f"Label file {label_file} missing 'Time(Seconds)' or 'Length(Seconds)' columns.")
            continue

        for start_s, length_s in zip(labels['Time(Seconds)'], labels['Length(Seconds)']):
            start_sample = int(start_s * sr)
            end_sample = int((start_s + length_s) * sr)
            # Index the last axis: for multi-channel audio the first axis is
            # the channel axis, and slicing it would select channels rather
            # than a time range.
            segments.append(audio[..., start_sample:end_sample])
    return segments


def get_length_statistics(list_of_lists):
    """
    Calculate the min, max, mean and quartile lengths of sublists within a main list.

    Args:
    - list_of_lists (list of lists): The main list containing sublists. Any
      sized items work (e.g. NumPy arrays); only ``len()`` of each is used.

    Returns:
    - dict: A dictionary containing 'min_length', 'max_length', 'mean_length',
      '25%', '50%' and '75%' (the quartiles of the sublist lengths).

    Raises:
    - ValueError: If the main list is empty or any sublist is empty.
    """
    # Fail early with a clear message; otherwise min() below would raise a
    # generic "empty sequence" error that does not point at the cause.
    if len(list_of_lists) == 0:
        raise ValueError("list_of_lists is empty. At least one sublist is required.")

    lengths = []
    for idx, sublist in enumerate(list_of_lists):
        if len(sublist) == 0:
            raise ValueError(f"Sublist at index {idx} is empty. All sublists must be non-empty.")
        lengths.append(len(sublist))

    # One percentile call yields all three quartiles.
    q1, median, q3 = np.percentile(lengths, [25, 50, 75])

    return {
        'min_length': min(lengths),
        'max_length': max(lengths),
        'mean_length': sum(lengths) / len(lengths),
        '25%': q1,
        '50%': median,
        '75%': q3,
    }


In [None]:
# Load every labelled cough segment from the first batch of recordings.
positive_sample_folder = "data/raw/positive/coughing"
positive_segments = load_positive_data(positive_sample_folder)
print(positive_segments[0]) # The first cough in mix2_cough_train: 13363 samples, lasting 0.835 s
print(f"Number of positive samples from coughing folder is {len(positive_segments)}") 

# Summarise the segment lengths (len() of each segment array): min, max, mean and quartiles.
get_length_statistics(positive_segments)

In [None]:
# Load every labelled cough segment from the second batch of recordings.
positive_sample_folder2 = "data/raw/positive/coughing_batch_2"
positive_segments2 = load_positive_data(positive_sample_folder2)
print(positive_segments2[0])  # First cough segment loaded from the second batch
# Name the folder that was actually loaded so the two batches' outputs can be told apart.
print(f"Number of positive samples from coughing_batch_2 folder is {len(positive_segments2)}")
# Summarise the segment lengths of the second batch: min, max, mean and quartiles.
get_length_statistics(positive_segments2)

# Feature extraction

In [73]:
import librosa
import numpy as np
import matplotlib.pyplot as plt


def extract_mel_spectrogram(segment, sr=16000, n_mels=64, n_fft=512, hop_length=256):
    """
    Extracts a log-scaled Mel spectrogram from an audio segment.

    Args:
    - segment: NumPy array containing the audio signal.
    - sr: Sampling rate (default is 16,000 Hz).
    - n_mels: Number of Mel bands to generate (default is 64).
    - n_fft: Length of the FFT window (default is 512).
    - hop_length: Number of samples between successive frames (default is 256).

    Returns:
    - log_spectrogram: A 2D NumPy array of shape (n_mels, T), where T is the number of frames, and has values in dB
    """
    # Compute the Mel-scaled power spectrogram of the audio segment
    spectrogram = librosa.feature.melspectrogram(
        y=segment, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )
    # Convert power values to a logarithmic (dB) scale
    log_spectrogram = librosa.power_to_db(spectrogram)
    return log_spectrogram


def extract_mfcc(segment, sr=16000, n_mfcc=40, n_fft=512, hop_length=256):
    """
    Compute per-coefficient standardized MFCCs for one audio segment.

    Args:
    - segment: NumPy array holding the audio samples.
    - sr: Sampling rate of the segment (default is 16,000 Hz).
    - n_mfcc: How many MFCC coefficients to compute (default is 40).
    - n_fft: FFT window length in samples (default is 512).
    - hop_length: Step between consecutive frames in samples (default is 256).

    Returns:
    - mfccs: A 2D NumPy array of shape (n_mfcc, T), where T is the number of
      frames. Each coefficient row is shifted by its mean and divided by its
      standard deviation (plus 1e-6) over the frames.
    """
    raw = librosa.feature.mfcc(
        y=segment,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
    )

    # Statistics are taken along the frame axis, one value per coefficient.
    row_mean = np.mean(raw, axis=1, keepdims=True)
    row_std = np.std(raw, axis=1, keepdims=True)

    # The small constant keeps the division finite when a row is constant.
    return (raw - row_mean) / (row_std + 1e-6)


def plot_mfccs(mfccs, sr=16000, hop_length=256):
    """
    Plot an MFCC matrix with a time axis.

    Args:
    - mfccs: 2D NumPy array of MFCC values, one row per coefficient.
    - sr: Sampling rate the MFCCs were computed at (default is 16,000 Hz).
    - hop_length: Hop length the MFCCs were computed with (default is 256).
      Together with sr this sets the scale of the time axis, so both must
      match the values used during feature extraction.
    """
    # Explicit import: older librosa releases do not expose the display
    # submodule through a bare `import librosa`.
    import librosa.display

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mfccs, sr=sr, hop_length=hop_length, x_axis='time')
    plt.colorbar()
    plt.title('MFCC')
    plt.tight_layout()
    plt.show()

def plot_log_spectrogram(log_spectrogram, sr=16000, hop_length=256):
    """
    Plot a log-scaled Mel spectrogram with time and Mel-frequency axes.

    Args:
    - log_spectrogram: 2D NumPy array of dB values, one row per Mel band.
    - sr: Sampling rate the spectrogram was computed at (default is 16,000 Hz).
    - hop_length: Hop length the spectrogram was computed with (default is
      256, the default used by extract_mel_spectrogram). A mismatching value
      stretches or compresses the time axis.
    """
    # Explicit import: older librosa releases do not expose the display
    # submodule through a bare `import librosa`.
    import librosa.display

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(log_spectrogram, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.tight_layout()
    plt.show()


In [72]:
# Log-Mel spectrogram for every positive segment, in segment order.
positive_features_mel = [
    extract_mel_spectrogram(cough, sr=16000) for cough in positive_segments
]

# Normalized MFCC matrix for every positive segment, in the same order.
positive_features_mfcc = [
    extract_mfcc(cough, sr=16000) for cough in positive_segments
]

In [None]:
# Inspect the (n_mfcc, T) shape of the MFCC matrix for segment 20.
positive_features_mfcc[20].shape
# display(positive_features_mfcc[20])

In [None]:
# Inspect the (n_mels, T) shape of the log-Mel spectrogram for segment 10.
positive_features_mel[10].shape

In [None]:
# Visualise both feature types for the same segment (index 1).
plot_log_spectrogram(positive_features_mel[1])
plot_mfccs(positive_features_mfcc[1])