# Data Preprocessing

## Setup

This section imports all necessary libraries and sets all parameters and the seed. Change the parameters below to create different data sets.

In [1]:
# Imports
import numpy as np

import utilities as utils
from utilities import *

In [2]:
# Parameters
# --- Locations -----------------------------------------------------------------
# NOTE(review): input_physionet is an absolute, machine-specific path; point it at the
# local PhysioNet download before running.
output_dir = "../data/preprocessed/"
input_gdansk = "../data/age_decades/"
input_physionet = "/home/flennic/Downloads/physionet.org/files/crisdb/1.0.0/"
subdirs_physionet = ["e/", "f/", "m/"]

# --- Switches (allowed values listed behind each one) ----------------------------
data_source_type = "physionet"  # ["gdansk", "physionet"]
splice_type = "constant"  # ["complete", "constant", "random"]
label_type = "regression"  # ["classification", "regression"]
output_type = "deep"  # ["features", "deep"]
# Forced to False by the auto-adjustment cell when output_type is "features",
# because the frequency features require milliseconds.
in_seconds = True

# --- Reproducibility and split fractions (train, validation, test) ---------------
seed = 42
splits_gdansk = [0.6, 0.2, 0.2]
splits_physionet = [0.8, 0.1, 0.1]

# --- Source-dependent sizes --------------------------------------------------------
# pad_length: padding length for the deep-learning output.
# N: number of pieces used by the constant splice (pad_length is divided by it later).
pad_length, N = (27_000, 48) if data_source_type == "gdansk" else (135_000, 240)

In [3]:
# Auto adjustment
# Derives the values that depend on the switches chosen in the parameter cell.

# Split fractions follow the selected data source; anything else is rejected up front.
if data_source_type not in ("gdansk", "physionet"):
    raise Exception("Data source not supported.")
splits = splits_gdansk if data_source_type == "gdansk" else splits_physionet

# Feature output always works in milliseconds, so it overrides the user's choice.
if output_type == "features":
    in_seconds = False
unit = "seconds" if in_seconds else "milliseconds"

# With constant splicing every recording is cut into N pieces, so the padding length
# shrinks by the same factor.
# NOTE: this rewrites pad_length in place; re-running this cell without re-running the
# parameter cell divides it again.
if splice_type == "constant":
    pad_length = pad_length // N

# Seed NumPy's global random number generator.
np.random.seed(seed)

## Data to General Dictionary

This section reads the different data sets and writes the information into a dictionary. The dictionary can hold different types of information, but it must contain the entries `Recording` and `AgeDecade` to be eligible for further processing. Outlier detection and missing values do not need to be handled yet.

In [4]:
%%time
if data_source_type == "gdansk":
    recordings = utils.read_gdansk_to_dict(input_gdansk)    
elif data_source_type == "physionet":
    recordings = utils.read_physionet_to_dict(input_physionet, subdirs_physionet)
else:
    raise Exception("Data source not supported.")

Parsing e/ directory...
Parsing f/ directory...
Parsing m/ directory...
CPU times: user 1min 57s, sys: 1min 34s, total: 3min 31s
Wall time: 6min 43s


## Interpolation

In [5]:
%%time
# Cleaning/interpolation stage: hands the raw recordings to clear_data_in_recordings and
# keeps the result under a new name, so `recordings` itself is not rebound here.
# `in_seconds` is forwarded unchanged; presumably it selects seconds vs. milliseconds for
# the cleaned values - confirm against utilities.clear_data_in_recordings.
cleared_recordings = clear_data_in_recordings(recordings, in_seconds=in_seconds)

CPU times: user 3min 9s, sys: 2.08 s, total: 3min 11s
Wall time: 3min 10s


## Split and Splice

In [6]:
%%time
train_recordings, val_recordings, test_recordings = np.array_split(cleared_recordings, (np.array(splits)[:-1].cumsum() * len(cleared_recordings)).astype(int))

CPU times: user 335 µs, sys: 106 µs, total: 441 µs
Wall time: 395 µs


In [7]:
%%time
if splice_type == "complete":
    print("complete")
elif splice_type == "constant":
    print("constant")
    print("Training:")
    train_recordings = splice_lod_constant_by_number(train_recordings, n=N)
    print("Validation:")
    val_recordings = splice_lod_constant_by_number(val_recordings, n=N)
    print("Test:")
    test_recordings = splice_lod_constant_by_number(test_recordings, n=N)
elif splice_type == "random":
    raise Exception("Only do random if absolutely necesarry! Just adds do much overhead and is hardly comparable!")
    print("random")
    print("Training:")
    train_recordings = splice_lod_random(train_recordings, chunksize_in_minutes=5, data_is_seconds=True)
    print("Validation:")
    val_recordings = splice_lod_random(val_recordings, chunksize_in_minutes=5, data_is_seconds=True)
    print("Test:")
    test_recordings = splice_lod_random(test_recordings, chunksize_in_minutes=5, data_is_seconds=True)
else:
    raise Exception("Splice type not supported.")

constant
Training:
Validation:
Test:%
CPU times: user 2min 47s, sys: 3 s, total: 2min 50s
Wall time: 2min 50s


## Save as Features or Deep

In [8]:
# Drop the names of the raw and the cleaned recordings for the PhysioNet source; only the
# train / validation / test splits are used from here on.
# NOTE: raises NameError if this cell is run a second time in the same kernel.
if data_source_type == "physionet":
    del recordings
    del cleared_recordings

In [9]:
%%time
if output_type == "features":
    classification = label_type == "classification"
    print("Featuring train...")
    train_data_set = recordings_to_feature_dataframe(train_recordings, classification)
    print("Featuring val...")
    val_data_set = recordings_to_feature_dataframe(val_recordings, classification)
    print("Featuring test...")
    test_data_set = recordings_to_feature_dataframe(test_recordings, classification)
    print("Saving to disk...")
    out_path = f"{output_dir}original_{data_source_type}_{splice_type}_{output_type}_{label_type}_{unit}"
    print(f"Saving to {out_path}_TYPE.csv")
    train_data_set.to_csv(out_path + "_train.csv")
    val_data_set.to_csv(out_path + "_val.csv")
    test_data_set.to_csv(out_path + "_test.csv")
elif output_type == "deep":
    classification = label_type == "classification"
    print("Deepening train...")
    train_data_set = recordings_to_deep_dataframe(train_recordings,
                                                  pad_length,
                                                  classification,
                                                  data_source_type == "gdansk")
    print("Deepening val...")
    val_data_set = recordings_to_deep_dataframe(val_recordings,
                                                pad_length,
                                                classification,
                                                data_source_type == "gdansk")
    print("Deepening test...")
    test_data_set = recordings_to_deep_dataframe(test_recordings,
                                                 pad_length,
                                                 classification,
                                                 data_source_type == "gdansk")
    print("Saving to disk...")
    out_path = f"{output_dir}original_{data_source_type}_{splice_type}_{output_type}_{label_type}_{unit}"
    print(f"Saving to {out_path}_TYPE.csv")
    train_data_set.to_csv(out_path + "_train.csv")
    val_data_set.to_csv(out_path + "_val.csv")
    test_data_set.to_csv(out_path + "_test.csv")
else:
    raise Exception("Output type not supported.")

Deepening train...
562
Deepening val...
562
Deepening test...
562
Saving to disk...
Saving to ../data/preprocessed/physionet_constant_deep_regression_seconds_TYPE.csv
CPU times: user 6min 37s, sys: 44.9 s, total: 7min 22s
Wall time: 7min 20s


In [10]:
# Preview the first rows of the training data frame as a quick sanity check.
train_data_set.head()

Unnamed: 0,age,rr1,rr2,rr3,rr4,rr5,rr6,rr7,rr8,rr9,...,rr553,rr554,rr555,rr556,rr557,rr558,rr559,rr560,rr561,rr562
0,52,0.914,0.914,0.914,0.914,0.898,0.906,0.891,0.898,0.883,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52,0.797,0.797,0.797,0.797,0.781,0.789,0.789,0.797,0.781,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52,0.742,0.727,0.734,0.734,0.734,0.719,0.734,0.734,0.727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,52,0.734,0.742,0.734,0.75,0.734,0.719,0.727,0.727,0.734,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52,0.961,0.961,0.953,0.953,0.93,0.953,0.93,0.953,0.938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
