In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from ls.config.loader import load_config

In [3]:
# Load the project configuration from the YAML file. The path is relative,
# so it resolves against the directory the notebook kernel was started in.
cfg = load_config("../configs/config.yaml")

In [4]:
# List the top-level sections available in the loaded config.
cfg.keys()

dict_keys(['seed', 'dataset', 'audio', 'models', 'training', 'mlflow'])

In [5]:
# Print each top-level config section next to a readable label.
# Attributes are resolved one at a time, in the same order as they are shown.
for label, attr in (
    ("Seed:", "seed"),
    ("Dataset config:", "dataset"),
    ("Audio config:", "audio"),
    ("Models config:", "models"),
    ("Training config:", "training"),
    ("MLflow config:", "mlflow"),
):
    print(label, getattr(cfg, attr))

Seed: 42
Dataset config: {'name': 'icbhi', 'data_folder': '/home/AIoT04/Datasets/icbhi_dataset', 'class_split': 'lungsound', 'split_strategy': 'official', 'test_fold': 0, 'n_cls': 4, 'weighted_sampler': True, 'batch_size': 16, 'num_workers': 4, 'h': 128, 'w': 1024}
Audio config: {'sample_rate': 16000, 'desired_length': 8.0, 'remove_dc': True, 'normalize': False, 'pad_type': 'repeat', 'use_fade': True, 'fade_samples_ratio': 64, 'n_mels': 128, 'frame_length': 40, 'frame_shift': 10, 'low_freq': 100, 'high_freq': 5000, 'window_type': 'hanning', 'use_energy': False, 'dither': 0.0, 'mel_norm': 'mit', 'resz': 1.0, 'raw_augment': 1, 'wave_aug': [{'type': 'Crop', 'sampling_rate': 16000, 'zone': [0.0, 1.0], 'coverage': 1.0, 'p': 0.0}, {'type': 'Noise', 'color': 'white', 'p': 0.0}, {'type': 'Speed', 'factor': [0.9, 1.1], 'p': 0.0}, {'type': 'Loudness', 'factor': [0.5, 2.0], 'p': 0.1}, {'type': 'VTLP', 'sampling_rate': 16000, 'zone': [0.0, 1.0], 'fhi': 4800, 'factor': [0.9, 1.1], 'p': 0.1}, {'type

In [8]:
# Create a metadata file (global) that contains:
# PID | Filename | CycleIndex | CycleStart | CycleEnd | Crackles | Wheezes | Split | Device | Fold | Age | Sex | BMI | CW | CH | Disease | AuscLoc

In [9]:
import pandas as pd
import os

In [10]:
# Load all reference files.
# Each entry is (file name inside the dataset folder, field separator, column names);
# none of the files has a header row, so the names are supplied here.
_reference_tables = (
    ("metadata.txt", "\t", ["PID", "Age", "Sex", "BMI", "CW", "CH", "DiseaseShort"]),
    ("patient_diagnosis.txt", "\t", ["PID", "Disease"]),
    ("official_split.txt", "\t", ["Filename", "Split"]),
    ("patient_list_foldwise.txt", " ", ["PID", "Fold"]),
)

# The generator reads the files lazily, in the order listed above.
meta, diagnosis, split, folds = (
    pd.read_csv(
        os.path.join(cfg.dataset.data_folder, table_file),
        sep=table_sep,
        header=None,
        names=table_cols,
    )
    for table_file, table_sep, table_cols in _reference_tables
)

In [11]:
# Preview the first rows of the patient-level table as loaded from metadata.txt.
meta.head()

Unnamed: 0,PID,Age,Sex,BMI,CW,CH,DiseaseShort
0,101,3.0,F,,19.0,99.0,Pr
1,102,0.75,F,,9.8,73.0,Ar
2,103,70.0,F,33.0,,,Ar
3,104,70.0,F,28.47,,,Al
4,105,7.0,F,,32.0,135.0,Tc


In [12]:
# Merge patient-level info (diagnosis and fold) onto the patient table, keyed by PID.
#
# `meta` is reassigned from itself, so without the `drop` a second run of this
# cell would merge onto a frame that already has "Disease"/"Fold" and pandas
# would emit suffixed duplicates (Disease_x/Disease_y, Fold_x/Fold_y). Dropping
# any previously merged columns first makes the cell safe to re-run; on a fresh
# kernel the drop is a no-op because the columns do not exist yet.
meta = (
    meta
    .drop(columns=["Disease", "Fold"], errors="ignore")
    .merge(diagnosis, on="PID", how="left")  # left join: keep every patient row
    .merge(folds, on="PID", how="left")
)
meta.head()

Unnamed: 0,PID,Age,Sex,BMI,CW,CH,DiseaseShort,Disease,Fold
0,101,3.0,F,,19.0,99.0,Pr,URTI,1
1,102,0.75,F,,9.8,73.0,Ar,Healthy,4
2,103,70.0,F,33.0,,,Ar,Asthma,3
3,104,70.0,F,28.47,,,Al,COPD,3
4,105,7.0,F,,32.0,135.0,Tc,URTI,2


In [13]:
from pathlib import Path

In [14]:
# Find recordings that have both an audio file and an annotation file.
items = os.listdir(Path(cfg.dataset.data_folder))

# Single pass over the directory listing: bucket each entry's base name
# (text before the first dot) by its extension.
bases_wav, bases_txt = set(), set()
for entry in items:
    base = entry.split(".")[0]
    if entry.endswith(".wav"):
        bases_wav.add(base)
    elif entry.endswith(".txt"):
        bases_txt.add(base)

# Only names present in both buckets are usable; sort for a stable order.
filenames = sorted(bases_wav.intersection(bases_txt))
print(f"Found {len(filenames)} files with both .wav and .txt")
print(filenames[:5])

Found 920 files with both .wav and .txt
['101_1b1_Al_sc_Meditron', '101_1b1_Pr_sc_Meditron', '102_1b1_Ar_sc_Meditron', '103_2b2_Ar_mc_LittC2SE', '104_1b1_Al_sc_Litt3200']


In [15]:
# Build one per-cycle DataFrame per recording, enriched with file-level and
# patient-level columns, and collect them in `rows` for later concatenation.
rows = []

# Loop-invariant lookups, built once instead of re-scanning `split` and `meta`
# with a boolean mask for every file. Dropping duplicates first keeps the
# "first matching row wins" behaviour of the mask-and-iloc[0] lookups.
split_by_file = split.drop_duplicates("Filename").set_index("Filename")["Split"].to_dict()
patient_by_pid = meta.drop_duplicates("PID").set_index("PID")

# Patient-level columns copied onto every cycle row, in output order.
patient_columns = ("Fold", "Age", "Sex", "BMI", "CW", "CH", "Disease")

# Loop through each filename.txt
for fname in filenames:

    # Parse file-level info from the underscore-separated name (split once).
    parts = fname.split("_")
    pid = int(parts[0])
    rec_index = parts[1]  # parsed but not written to the table
    ausc_loc = parts[2]
    acc_mode = parts[3]   # parsed but not written to the table
    device = parts[-1]

    # Find matching metadata rows. Fail with a message that names the
    # offending file instead of an anonymous out-of-bounds indexer error.
    if pid not in patient_by_pid.index:
        raise IndexError(f"No row in `meta` for PID {pid} (from file {fname!r})")
    patient_info = patient_by_pid.loc[pid]
    # Files missing from the split table are labelled "unknown".
    split_type = split_by_file.get(fname, "unknown")

    # Read cycles: one row per annotated cycle, no header in the file.
    path = os.path.join(cfg.dataset.data_folder, f"{fname}.txt")
    df = pd.read_csv(path, sep="\t", header=None, names=["CycleStart", "CycleEnd", "Crackles", "Wheezes"])

    # File-level columns (scalars are broadcast to every cycle row).
    df["CycleIndex"] = df.index
    df["PID"] = pid
    df["Filename"] = fname
    df["Split"] = split_type
    df["Device"] = device
    df["AuscLoc"] = ausc_loc

    # Patient-level columns; a column absent from `meta` is filled with None.
    for column in patient_columns:
        df[column] = patient_info.get(column, None)

    rows.append(df)

In [16]:
# Stack the per-recording frames into one table and fix the column order.
column_order = [
    "PID", "Filename", "CycleIndex", "CycleStart", "CycleEnd", "Crackles", "Wheezes",
    "Split", "Device", "Fold", "Age", "Sex", "BMI", "CW", "CH", "Disease", "AuscLoc",
]
icbhi_df = pd.concat(rows, ignore_index=True)[column_order]

# Write the table into the dataset folder (no index column), then preview it.
metadata_csv = os.path.join(cfg.dataset.data_folder, "icbhi_metadata.csv")
icbhi_df.to_csv(metadata_csv, index=False)
print(icbhi_df.head())

   PID                Filename  CycleIndex  CycleStart  CycleEnd  Crackles  \
0  101  101_1b1_Al_sc_Meditron           0       0.036     0.579         0   
1  101  101_1b1_Al_sc_Meditron           1       0.579     2.450         0   
2  101  101_1b1_Al_sc_Meditron           2       2.450     3.893         0   
3  101  101_1b1_Al_sc_Meditron           3       3.893     5.793         0   
4  101  101_1b1_Al_sc_Meditron           4       5.793     7.521         0   

   Wheezes Split    Device  Fold  Age Sex  BMI    CW    CH Disease AuscLoc  
0        0  test  Meditron     1  3.0   F  NaN  19.0  99.0    URTI      Al  
1        0  test  Meditron     1  3.0   F  NaN  19.0  99.0    URTI      Al  
2        0  test  Meditron     1  3.0   F  NaN  19.0  99.0    URTI      Al  
3        0  test  Meditron     1  3.0   F  NaN  19.0  99.0    URTI      Al  
4        0  test  Meditron     1  3.0   F  NaN  19.0  99.0    URTI      Al  


In [17]:
# Sanity check: (number of cycle rows, number of columns) in the combined table.
icbhi_df.shape

(6898, 17)

In [18]:
# Confirm the final column names and their order.
icbhi_df.columns

Index(['PID', 'Filename', 'CycleIndex', 'CycleStart', 'CycleEnd', 'Crackles',
       'Wheezes', 'Split', 'Device', 'Fold', 'Age', 'Sex', 'BMI', 'CW', 'CH',
       'Disease', 'AuscLoc'],
      dtype='object')