In [None]:
import pandas as pd
import numpy as np
from dateutil.rrule import rrule, SECONDLY, MINUTELY, HOURLY
import warnings

from heart_rate_variability import apply_hrv
from load_raw_vital_signs import *
from datasets import create_dataloaders
from preprocess_outputs import preprocess_crt_avpu

# Input locations, given relative to the notebook's working directory.
# RAW_VITAL_DATA_PATH is not referenced in the visible part of this notebook.
RAW_VITAL_DATA_PATH = "./../DATA/Raw Data/filtered_df_removed_nan_files.parquet"
# Clinical CSV; read below by get_age_in_months and passed to preprocess_crt_avpu.
CLINICAL_DATA_PATH = "./../DATA/Clean Data/IMPALA_Clinical_Data_202308211019_Raw.csv"
# Passed to load_patient_dict to load the per-patient vital-sign dictionary.
PROCESSED_RAW_VITAL_SIGN_DATA_PATH = "./../DATA/Raw Data/raw_patient_dict_p30"

### Load data

In [None]:

# Load the preprocessed vital-sign data. `data` is indexed by patient ID further
# down (data['Z-H-0120']), so the printed keys are presumably the available
# patient IDs -- TODO confirm against load_patient_dict.
data = load_patient_dict(PROCESSED_RAW_VITAL_SIGN_DATA_PATH)
print(data.keys())


### Split data into windows

In [None]:

def split_data_into_window(df, time_unit='m', time_freq=15):
    """
    Split the data into consecutive, non-overlapping time windows.

    Windows are half-open intervals [start, start + time_freq), so a sample that
    falls exactly on a window boundary belongs to the later window only. The first
    window starts at the first timestamp (truncated to whole seconds) and a new
    window starts every `time_freq` units up to and including the last timestamp.

    :param df: Pandas DataFrame with a 'datetime' column, sorted on that column
               (the first and last rows are used as the time bounds). The frame
               is not modified.
    :param time_unit: time unit of the data window: 'h' (hours), 'm' (minutes)
                      or 's' (seconds).
    :param time_freq: number of time units in the data window.
    :return: tuple (windows, datetimes). `windows` is a list with one 2-D array
             per window (rows are samples, columns follow `df`, with 'datetime'
             replaced by the hour of day). A window without samples is a single
             all-NaN row whose 'datetime' entry is the hour of the window end.
             `datetimes` is an array with, per window, the timestamp of its first
             sample, or the window start if the window is empty.
    :raises KeyError: if `time_unit` is not one of 'h', 'm', 's'.
    """

    if time_unit not in ('h', 'm', 's'):
        raise KeyError(time_unit)

    # Nothing to split; avoids an IndexError on .iloc[0] below.
    if df.empty:
        return [], np.array([])

    timestamps = df['datetime']
    datetime_col = df.columns.get_loc('datetime')
    num_features = df.shape[1]
    step = pd.Timedelta(time_freq, unit=time_unit)

    # Window starts on a regular grid anchored at the first (whole-second) timestamp.
    first_start = timestamps.iloc[0].replace(microsecond=0, nanosecond=0)
    starts = pd.date_range(start=first_start, end=timestamps.iloc[-1], freq=step)

    windows = []
    datetimes = []

    for start in starts:

        # Select window; the end is exclusive so windows never share a sample
        end = start + step
        in_window = (timestamps >= start) & (timestamps < end)

        if in_window.any():
            window = df[in_window]
            first_timestamp = window['datetime'].iloc[0]

            # From datetime only keep hours. assign() builds a new frame, so the
            # datetime column is replaced rather than written into in place.
            window = window.assign(datetime=window['datetime'].dt.hour)

            # Save windows and timepoints separately
            windows.append(window.values)
            datetimes.append(first_timestamp)

        else: # If no data in time window, still add empty window
            window = np.full((1, num_features), np.nan)
            window[0, datetime_col] = end.hour

            # Save windows and timepoints separately
            windows.append(window)
            datetimes.append(start)

    return windows, np.array(datetimes)


In [None]:

# Keep only the signal columns of one patient, ordered in time, then cut the
# recording into 15-minute windows.
df = (
    data['Z-H-0120']
    .drop(columns=['patient_id', 'location'])
    .sort_values('datetime')
)

windows, datetimes = split_data_into_window(df, time_unit='m', time_freq=15)

# The per-sample frame is no longer needed once the windows exist.
del df


### Aggregate data windows, calculate HRV and add patient's age

In [None]:

# Window columns, in order: ECGHR,      ECGRR,      SPO2HR,    SPO2,
#                           NIBP_lower, NIBP_upper, NIBP_mean, datetime (hour of day)

def aggregate_windows(windows):
    """
    Aggregate every window into a single feature row.

    For the first four columns of a window the mean, min, max and standard
    deviation over the window are computed (ignoring NaN). For the next four
    columns the last entry that is neither NaN nor negative is taken. Missing
    results are encoded as -999.

    'Mean of empty slice' or 'All-NaN slice encountered' warnings are suppressed
    as these cases are dealt with manually.

    :param windows: list of 2-D NumPy arrays (rows are samples) with at least
                    eight columns.
    :return: NumPy array of shape (len(windows), 20), laid out as
             [mean x4, min x4, max x4, std x4, latest valid value x4].
    """

    num_stat_features = 4   # leading columns summarised by mean/min/max/std
    num_last_features = 4   # following columns reduced to their latest valid value
    missing = -999
    reducers = (np.nanmean, np.nanmin, np.nanmax, np.nanstd)

    # Without windows the per-statistic arrays would be 1-D and could not be
    # concatenated along axis 1, so return a correctly shaped empty result.
    if len(windows) == 0:
        return np.empty((0, len(reducers) * num_stat_features + num_last_features))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        # One (n_windows, 4) array per statistic, in the order of `reducers`
        stats = [np.array([reduce(w[:, :num_stat_features], axis=0) for w in windows])
                 for reduce in reducers]

    # Replace NaN values (windows without any valid sample) with -999
    stats = [np.where(np.isnan(s), missing, s) for s in stats]

    # Choose latest valid entries of the next four features
    other_features = []
    for w in windows:

        tail = w[:, num_stat_features:]
        # NaN is mapped to -999 first so that it fails the >= 0 test
        is_valid = np.where(np.isnan(tail), missing, tail) >= 0

        current_window = []
        for col in range(num_last_features):
            valid_entries = tail[:, col][is_valid[:, col]]
            current_window.append(valid_entries[-1] if valid_entries.size > 0 else missing)

        other_features.append(current_window)

    other_features = np.array(other_features)

    return np.concatenate(stats + [other_features], axis=1)


def get_age_in_months(path):
    """
    Map every patient to their age in months.

    Only the 'record_id' and 'recru_age_months' columns of the CSV are read. When
    a record_id occurs on several rows, the age on its first row is used.

    :param path: string containing path to the clinical data.
    :return: dictionary {record_id: recru_age_months}.
    """

    clinical = pd.read_csv(path, low_memory=False, usecols=['record_id', 'recru_age_months'])

    ages = {}
    for record_id, rows in clinical.groupby('record_id'):
        ages[record_id] = rows['recru_age_months'].iloc[0]

    return ages


In [None]:

# record_id -> age in months, read from the clinical CSV.
age_dict = get_age_in_months(CLINICAL_DATA_PATH)

# One row per window: mean/min/max/std of the first four columns followed by
# the latest valid value of the next four.
agg_data = aggregate_windows(windows)
print(agg_data.shape)

# apply_hrv comes from heart_rate_variability and is not shown here; its result
# is concatenated column-wise below, so it must have one row per window.
hrv_data = apply_hrv(windows, return_features=['lfnu'])
print(hrv_data.shape)

# Column vector repeating this patient's age for every window.
age_data = (np.ones((agg_data.shape[0], 1)) * age_dict['Z-H-0120'])
print(age_data.shape)

# Final per-window feature matrix: aggregates, HRV features, age.
new_data = np.concatenate((agg_data, hrv_data, age_data), axis=1)
print(new_data.shape)



### Apply sliding window

In [None]:

def sliding_window_backward(data, outputs, datetimes, sample_window_hours,
                            windows_per_hour=4):
    """
    Apply a sliding window over the given data, walking backwards through
    `datetimes`. Every timestamp is used once as the end of a sample covering the
    preceding `sample_window_hours` hours (both ends inclusive). If a sample
    contains too many windows, the earliest ones are discarded. If it contains too
    few, the sample is discarded completely.

    :param data: NumPy Array containing the aggregated vital sign windows, one
                 row per entry of `datetimes`.
    :param outputs: Pandas Dataframe containing CRT and AVPU values per time,
                    indexed on timestamps.
    :param datetimes: List containing the starting times for each window.
    :param sample_window_hours: length of one sample in hours.
    :param windows_per_hour: number of aggregated windows per hour; the default
                             of 4 corresponds to 15-minute windows.
    :return: tuple (X, y). X has shape (samples, dimensions, time) and y has
             shape (samples, dimensions); samples are in reverse order of
             `datetimes`.
    """

    X, y = [], []

    datetimes = pd.Series(datetimes, name='time')

    # Loop invariants. Rounding up keeps the "too short" test identical for
    # fractional lengths while giving a valid integer for slicing.
    span = pd.Timedelta(sample_window_hours, unit='h')
    window_length = int(np.ceil(sample_window_hours * windows_per_hour))
    output_times = outputs.index

    for end in datetimes[::-1]:

        in_span = datetimes.between(end - span, end).values
        window = data[in_span]

        # Not enough history before this end time
        if window.shape[0] < window_length:
            continue

        # Keep only the most recent entries (no-op if the length already matches)
        window = window[-window_length:]

        # Choose output that is nearest to the end time of the window
        nearest_output_idx = np.argmin([abs(end - t) for t in output_times])

        X.append(window.T)
        y.append(outputs.iloc[nearest_output_idx].values)

    X = np.array(X) # Shape: samples, dimensions, time
    y = np.array(y) # Shape: samples, dimensions

    return X, y


In [None]:

# Build (sample, target) pairs for one patient from 4-hour samples.
outputs = preprocess_crt_avpu(CLINICAL_DATA_PATH)
X, y = sliding_window_backward(
    data=new_data,
    outputs=outputs['Z-H-0120'],
    datetimes=datetimes,
    sample_window_hours=4,
)


### Turn data to PyTorch dataloaders

In [None]:

# Split the samples into train/validation/test dataloaders with a fixed seed.
(train_dataloader, val_dataloader, test_dataloader) = create_dataloaders(
    X, y, batch_size=32, seed=42
)
