# Signal Preprocessing

## Steps
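1. Denoising
    - Remove high-frequency noise (> 40 Hz).
    - Remove baseline wander (< 0.5 Hz).
2. Inversion correction
    - If the signal is inverted, correct it.
3. Downsampling
    - From 300 Hz to 100 Hz.
4. Random cropping
    - Apply `RandomCrop` with size `dst_time * dst_freq`.
5. Continuous wavelet transform
    - Morlet wavelet, 31 frequencies from 0.5 to 40 Hz.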

## Load Config

In [1]:
import configparser
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import shutil
import wfdb
import neurokit2 as nk
import numpy as np
import pywt
from dataset_utils.transforms import RandomCrop


# Parse config.ini and keep a handle on the [data_preprocessing] section,
# which holds every setting used below.
config = configparser.ConfigParser()
config.read("config.ini")
preprocessing_cfg = config["data_preprocessing"]

# Label file plus the source and destination directories.
label_file, src_dir, dst_dir = (
    Path(preprocessing_cfg[key]) for key in ("label_file", "src_dir", "dst_dir")
)

# Integer settings: source/destination sampling rates and the value that,
# multiplied by dst_freq, gives the crop size passed to RandomCrop.
src_freq, dst_freq, dst_time = (
    preprocessing_cfg.getint(key) for key in ("src_freq", "dst_freq", "dst_time")
)

# 31 evenly spaced analysis frequencies from 40 down to 0.5, normalised by
# the destination sampling rate before conversion to Morlet ("morl") scales.
cwt_freqs = np.linspace(40, 0.5, num=31)
norm_freqs = cwt_freqs / dst_freq
scales = pywt.frequency2scale("morl", norm_freqs)

random_crop = RandomCrop(dst_time * dst_freq)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_signal(signal, freq, dst_freq):
    """Denoise, inversion-correct, resample and crop one signal.

    Args:
        signal: Samples to process; handed directly to ``nk.signal_filter``.
        freq: Sampling rate of ``signal``.
        dst_freq: Sampling rate the signal is resampled to.

    Returns:
        Whatever the module-level ``random_crop`` returns for the
        resampled signal.
    """
    # Denoising: order-6 Butterworth filter with cut-offs at 0.5 and 40.
    filtered = nk.signal_filter(
        signal,
        sampling_rate=freq,
        lowcut=0.5,
        highcut=40,
        method="butterworth",
        order=6,
    )

    # Inversion correction: the second return value is not needed here.
    upright, _ = nk.ecg_invert(filtered, sampling_rate=freq)

    # Downsampling from `freq` to `dst_freq`.
    resampled = nk.signal_resample(
        upright, sampling_rate=freq, desired_sampling_rate=dst_freq
    )

    # Random crop, applied after resampling.
    return random_crop(resampled)

In [None]:
# Start from a clean output directory: remove anything already there
# (removal errors are ignored), then recreate it.
shutil.rmtree(dst_dir, ignore_errors=True)
dst_dir.mkdir(parents=True, exist_ok=True)

# The label file is read without a header row; its two columns are named here.
dataset_df = pd.read_csv(label_file, names=["record_name", "label"], header=None)

# Read every record up front, keeping only the sample array from
# wfdb.rdsamp and squeezing out length-1 axes.
all_labels: list[str] = dataset_df["label"].tolist()
all_signals: list[np.ndarray] = [
    np.squeeze(wfdb.rdsamp(src_dir / name)[0])
    for name in tqdm(
        dataset_df["record_name"],
        total=len(dataset_df),
        desc="Loading dataset",
    )
]

# Preprocess each record, append its CWT coefficients, and write one CSV
# per record into dst_dir.
for record_name, label, signal in tqdm(
    zip(dataset_df["record_name"], all_labels, all_signals),
    total=len(all_labels),
    desc="Processing dataset",
):
    preprocessed_signal = preprocess_signal(signal, src_freq, dst_freq)

    coefs, _ = pywt.cwt(
        preprocessed_signal,
        scales=scales,
        wavelet="morl",
        sampling_period=1.0 / dst_freq,
    )

    # Row 0 is the preprocessed signal; the CWT coefficient rows follow.
    preprocessed_signal = np.vstack([preprocessed_signal, coefs])

    np.savetxt(dst_dir / f"{record_name}.csv", preprocessed_signal, fmt="%f")


Loading dataset: 100%|██████████| 8528/8528 [00:03<00:00, 2141.78it/s]
Processing dataset: 100%|██████████| 8528/8528 [05:41<00:00, 25.01it/s]
