In [191]:
import pandas as pd
from sklearn.model_selection import train_test_split
import librosa
import soundfile as sf
import os
import requests
import zipfile

In [213]:
# Download digiscope dataset

url = "https://physionet.org/content/circor-heart-sound/get-zip/1.0.3/"
local_filename = "digiscope.zip"

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

print("Download complete.")

Download complete.


In [214]:
# Unzip the dataset

with zipfile.ZipFile("digiscope.zip", 'r') as zip_ref:
    zip_ref.extractall("digiscope")  # Extract to this folder

print("Extraction complete.")

Extraction complete.


In [215]:
## load the dataset metadata as a dataframe
df = pd.read_csv("./metadata/digiscope_metadata.csv")
df

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.90,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
1,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
2,23625,AV+PV+TV+MV,Child,Female,92.0,14.00,False,Absent,,,...,,,,,,,,Abnormal,CC2015,50379.0
3,24160,AV+PV+TV+MV,Child,Female,98.0,17.66,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
4,31737,AV+PV+TV+MV,Child,Female,90.0,14.40,False,Absent,,,...,,,,,,,,Abnormal,CC2015,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,85196,AV+PV+TV+MV,Child,Female,129.0,28.80,False,Unknown,,,...,,,,,,,,Normal,CC2015,
938,85203,AV+MV,Infant,Male,66.0,11.00,False,Unknown,,,...,,,,,,,,Normal,CC2015,
939,85229,PV+TV+MV,Child,Female,153.0,36.90,False,Unknown,,,...,,,,,,,,Normal,CC2015,
940,85300,AV+MV,Infant,Female,62.0,7.30,False,Unknown,,,...,,,,,,,,Normal,CC2015,


In [216]:
## Append "Filename" column to the dataframe
cols = df.columns.values.flatten().tolist()
cols.append("Filename")

filenames = []
rows_lst = []

for i in range(len(df)):
    rec_locs_str = df.iloc[i]["Recording locations:"]
    if "+" in rec_locs_str:
        rec_locs_lst = rec_locs_str.split("+")
        for j in range(len(rec_locs_lst)):
            filename = str(df.iloc[i]["Patient ID"]) + "_" + rec_locs_lst[j] + ".wav"
            row = df.iloc[i].values.flatten().tolist()
            row.append(filename)
            rows_lst.append(row)
            
    else:
        filename = str(df.iloc[i]["Patient ID"]) + "_" + rec_locs_str + ".wav"
        row = df.iloc[i].values.flatten().tolist()
        row.append(filename)
        rows_lst.append(row)
        
df = pd.DataFrame(rows_lst, columns = cols)
df

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID,Filename
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_AV.wav
1,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_PV.wav
2,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_TV.wav
3,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_MV.wav
4,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,Abnormal,CC2015,,14998_AV.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,85229,PV+TV+MV,Child,Female,153.0,36.9,False,Unknown,,,...,,,,,,,Normal,CC2015,,85229_TV.wav
3159,85229,PV+TV+MV,Child,Female,153.0,36.9,False,Unknown,,,...,,,,,,,Normal,CC2015,,85229_MV.wav
3160,85300,AV+MV,Infant,Female,62.0,7.3,False,Unknown,,,...,,,,,,,Normal,CC2015,,85300_AV.wav
3161,85300,AV+MV,Infant,Female,62.0,7.3,False,Unknown,,,...,,,,,,,Normal,CC2015,,85300_MV.wav


In [217]:
### Some patients have two different ids beacause they were in both 2014 and 2015 campaign
### we give unique ids to these patients

for i in range(len(df)):
    if str(df.iloc[i]["Additional ID"]) != "nan":
        if df.iloc[i]["Additional ID"] < df.iloc[i]["Patient ID"]:
            df.loc[i, ['Patient ID']] = df.iloc[i]["Additional ID"]
            
df

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID,Filename
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_AV.wav
1,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_PV.wav
2,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_TV.wav
3,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_MV.wav
4,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,Abnormal,CC2015,,14998_AV.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,85229,PV+TV+MV,Child,Female,153.0,36.9,False,Unknown,,,...,,,,,,,Normal,CC2015,,85229_TV.wav
3159,85229,PV+TV+MV,Child,Female,153.0,36.9,False,Unknown,,,...,,,,,,,Normal,CC2015,,85229_MV.wav
3160,85300,AV+MV,Infant,Female,62.0,7.3,False,Unknown,,,...,,,,,,,Normal,CC2015,,85300_AV.wav
3161,85300,AV+MV,Infant,Female,62.0,7.3,False,Unknown,,,...,,,,,,,Normal,CC2015,,85300_MV.wav


In [218]:
### Train test split the data based on the patient id and murmur columns
### The same patients will not appear in both train and test sets
### Also data is stratified based on the murmur column

patients = df.groupby('Patient ID').last().reset_index()
train, test = train_test_split(patients["Patient ID"], train_size=0.75, test_size=0.25, stratify=patients['Murmur'])

In [219]:
train_df = df[df['Patient ID'].isin(train.values)]
train_df

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID,Filename
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_AV.wav
1,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_PV.wav
2,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_TV.wav
3,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,Abnormal,CC2015,,2530_MV.wav
8,23625,AV+PV+TV+MV,Child,Female,92.0,14.0,False,Absent,,,...,,,,,,,Abnormal,CC2015,50379.0,23625_AV.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155,85203,AV+MV,Infant,Male,66.0,11.0,False,Unknown,,,...,,,,,,,Normal,CC2015,,85203_AV.wav
3156,85203,AV+MV,Infant,Male,66.0,11.0,False,Unknown,,,...,,,,,,,Normal,CC2015,,85203_MV.wav
3160,85300,AV+MV,Infant,Female,62.0,7.3,False,Unknown,,,...,,,,,,,Normal,CC2015,,85300_AV.wav
3161,85300,AV+MV,Infant,Female,62.0,7.3,False,Unknown,,,...,,,,,,,Normal,CC2015,,85300_MV.wav


In [220]:
test_df = df[df['Patient ID'].isin(test.values)]
test_df

Unnamed: 0,Patient ID,Recording locations:,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Outcome,Campaign,Additional ID,Filename
4,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,Abnormal,CC2015,,14998_AV.wav
5,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,Abnormal,CC2015,,14998_PV.wav
6,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,Abnormal,CC2015,,14998_TV.wav
7,14998,AV+PV+TV+MV,Child,Male,,,False,Absent,,,...,,,,,,,Abnormal,CC2015,,14998_MV.wav
34,40798,AV+PV+TV+MV,Child,Male,116.0,22.5,False,Absent,,,...,,,,,,,Abnormal,CC2015,,40798_AV.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3153,85196,AV+PV+TV+MV,Child,Female,129.0,28.8,False,Unknown,,,...,,,,,,,Normal,CC2015,,85196_TV.wav
3154,85196,AV+PV+TV+MV,Child,Female,129.0,28.8,False,Unknown,,,...,,,,,,,Normal,CC2015,,85196_MV.wav
3157,85229,PV+TV+MV,Child,Female,153.0,36.9,False,Unknown,,,...,,,,,,,Normal,CC2015,,85229_PV.wav
3158,85229,PV+TV+MV,Child,Female,153.0,36.9,False,Unknown,,,...,,,,,,,Normal,CC2015,,85229_TV.wav


In [221]:
train_df.to_csv("train_set.csv", na_rep="nan", index=False)
test_df.to_csv("test_set.csv", na_rep="nan", index=False)

In [222]:
### Split each audio file into 5-sec fixed-length segments with optional stride 

def segment_and_save_audio(
    csv_path,
    input_dir,
    output_dir,
    output_csv_path,
    segment_dur=5.0,
    stride=2.5,
    target_sr=16000
):
    """
    Segments audio files into fixed-duration segments with overlap, resamples and normalizes them, and saves them to disk.

    Parameters:
        csv_path (str): Path to input CSV with 'Filename', 'Murmur', and 'Outcome' columns.
        input_dir (str): Directory containing input audio files.
        output_dir (str): Directory to save segmented audio files.
        output_csv_path (str): Path to save the updated CSV with segment metadata.
        segment_dur (float): Duration (in seconds) of each audio segment. Default is 5.0.
        stride (float): Stride (in seconds) between segments. Default is 2.5.
        target_sr (int): Target sample rate for resampling. Default is 16000.
    """

    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(csv_path)
    df_rows = []

    for i in range(len(df)):
        fname = df.iloc[i]["Filename"]
        file_path = os.path.join(input_dir, fname)

        murmur_label = df.iloc[i]["Murmur"]
        outcome_label = df.iloc[i]["Outcome"]

        # Load original audio
        try:
            audio, sr = librosa.load(file_path, sr=None)
        except Exception as e:
            print(f"Failed to load {file_path}: {e}")
            continue

        dur = librosa.get_duration(y=audio, sr=sr)

        if dur < segment_dur:
            continue

        # Resample
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

        # Normalize to [-1, 1]
        if audio.max() > 0:
            audio = audio / abs(audio).max()

        # Segment and save
        j = 0
        while (len(audio) - j * stride * sr) >= segment_dur * sr:
            start_ix = int(j * stride * sr)
            end_ix = int(start_ix + segment_dur * sr)

            audio_segment = audio[start_ix:end_ix]

            save_fname = f"{os.path.splitext(fname)[0]}_{j}.wav"
            save_path_full = os.path.join(output_dir, save_fname)

            sf.write(save_path_full, audio_segment, samplerate=sr, subtype='PCM_16')

            df_rows.append([save_fname, murmur_label, outcome_label])
            j += 1

    # Save new metadata CSV
    files_df = pd.DataFrame(df_rows, columns=["Filename", "Murmur", "Outcome"])
    files_df.to_csv(output_csv_path, index=False)
    print(f"Saved segmented data to: {output_csv_path}")

segment_and_save_audio(
    csv_path="train_set.csv",
    input_dir="./digiscope/the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/",
    output_dir="./digiscope_segmented/",
    output_csv_path="train_set_segmented.csv",
    segment_dur=5,
    stride=2.5,
    target_sr=16000
)

segment_and_save_audio(
    csv_path="test_set.csv",
    input_dir="./digiscope/the-circor-digiscope-phonocardiogram-dataset-1.0.3/training_data/",
    output_dir="./digiscope_segmented/",
    output_csv_path="train_set_segmented.csv",
    segment_dur=5,
    stride=2.5,
    target_sr=16000
)

Saved segmented data to: train_set_segmented.csv
Saved segmented data to: train_set_segmented.csv


In [223]:
# split train set into train/dev sets for fine-tuning

df = pd.read_csv("./train_set_segmented.csv")

patient_ids = []

for i in range(len(df)):
    fname_lst = df.iloc[i]["Filename"].split("_")
    patient_id = fname_lst[0]
    patient_ids.append(patient_id)

df["Patient ID"] = patient_ids

patients = df.groupby('Patient ID').last().reset_index()
train, test = train_test_split(patients["Patient ID"], train_size=0.87, test_size=0.13, stratify=patients['Murmur'])

train_df = df[df['Patient ID'].isin(train.values)]
test_df = df[df['Patient ID'].isin(test.values)]

train_df = train_df.drop("Patient ID", axis=1)
test_df = test_df.drop("Patient ID", axis=1)

train_df.to_csv("./metadata/finetune_train_set.csv", na_rep="nan", index=False)
test_df.to_csv("./metadata/finetune_dev_set.csv", na_rep="nan", index=False)