In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.rrule import rrule, SECONDLY, MINUTELY
from matplotlib.collections import LineCollection
from matplotlib.lines import Line2D

# Make the project's preprocessing code importable. The paths are relative,
# so they only resolve when the kernel's working directory is the folder
# this notebook lives in.
sys.path.append('./../preprocessing/')
sys.path.append('./../preprocessing/load_raw_vital_signs')
sys.path.append('./../preprocessing/preprocess_raw_vital_signs')

# Star imports: every public name of both modules lands in the notebook
# namespace. NOTE(review): load_patient_dict, split_data_into_windows and
# aggregate_windows, used in later cells, presumably come from here - confirm
# and consider importing them explicitly.
from load_raw_vital_signs import *
from preprocess_raw_vital_signs import *

# Locations of the parquet exports. Neither constant is referenced in the
# cells visible in this notebook section.
VITAL_DATA_PATH = "./../DATA/Clean Data/Vital Signs Data/df_merged.parquet"
RAW_VITAL_DATA_PATH = "./../DATA/Raw Data/filtered_df_removed_nan_files.parquet"

#### Load data

In [None]:

# Load the raw per-patient data. Later cells index `data` by patient id and
# treat every value as a DataFrame with a 'datetime' column.
raw_patient_dict_path = './../DATA/Raw Data/raw_patient_dict_p30'
data = load_patient_dict(raw_patient_dict_path)

# Show which patient ids are available.
print(data.keys())


#### Visualize raw vital signs

In [None]:

def visualize_raw_vital_sign(patient_id, df, vital_sign, rounded_time):
    """
    Plot one or more vital signs of a single patient as time series.

    Timestamps are rounded to `rounded_time` and all measurements that fall
    on the same rounded timestamp are averaged before plotting.

    :param patient_id: Identifier of the patient; only used in the plot title.
    :param df: Pandas DataFrame containing a 'datetime' column and one column
               per vital sign. The DataFrame itself is not modified.
    :param vital_sign: String indicating a certain vital sign. If vital_sign
                       is a list, multiple vital signs will be plotted.
    :param rounded_time: Pandas frequency string to which the timestamps are
                         rounded (e.g. '5min' = 5 minutes, '2h' = 2 hours).
                         Older pandas versions also accept '5T' / '2H'.
    """

    # isinstance also recognises str subclasses such as numpy.str_, which an
    # exact type comparison would treat as an iterable of characters.
    if isinstance(vital_sign, str):
        vital_sign = [str(vital_sign)]

    # set_index/sort_index return new objects, so the caller's frame is left
    # untouched without an explicit copy.
    indexed = df.set_index('datetime').sort_index()
    indexed.index = indexed.index.round(rounded_time)

    for name in vital_sign:
        # Drop missing measurements, then average everything that was
        # rounded onto the same timestamp.
        series = indexed[name].dropna().groupby(level=0).mean()

        plt.plot(series.index, series.values)

    plt.title(f'ID: {patient_id}, metrics: {vital_sign}')

    plt.legend(vital_sign)
    plt.gcf().autofmt_xdate()
    plt.tight_layout()
    plt.show()


In [None]:

# Metrics: ECGHR, ECGRR, SPO2HR, SPO2, NIBP_lower, NIBP_upper, NIBP_mean

patient_id = 'Z-H-0120'

# Round to one minute. 'min' is accepted by old and new pandas versions,
# whereas the single-letter alias 'T' is deprecated since pandas 2.2.
t = 'min'

# Only the first 20000 rows are plotted. To inspect another part of the
# recording change the slice (e.g. 20000:40000 or 60000:80000), or pass
# data[patient_id] unsliced for the whole recording.
visualize_raw_vital_sign(patient_id, data[patient_id].iloc[:20000], ['NIBP_upper'], t)


#### Compare ECG and SPO2 heart rate

In [None]:

patient_id = 'Z-H-0216'
# NOTE(review): 'T' is the deprecated pandas alias for minutes ('5min' is the
# current spelling). The value is kept because later cells embed it in the
# saved file names and axis labels.
rounded_time = '5T'


def rounded_mean_heart_rate(df, column, rounded_time):
    """
    Average one heart-rate column per rounded timestamp.

    :param df: Pandas DataFrame with a 'datetime' column and `column`.
    :param column: Name of the heart-rate column (e.g. 'ECGHR').
    :param rounded_time: Pandas frequency string the timestamps are rounded to.
    :return: DataFrame indexed by the rounded 'datetime' with `column` as its
             only column, stored as int16 (the per-timestamp mean is
             truncated by the integer cast).
    """
    hr = df[[column, 'datetime']].set_index('datetime').sort_index().dropna()
    hr.index = hr.index.round(rounded_time)
    return hr.groupby('datetime').mean().astype(np.int16)


ecg_hr = rounded_mean_heart_rate(data[patient_id], 'ECGHR', rounded_time)
spo2_hr = rounded_mean_heart_rate(data[patient_id], 'SPO2HR', rounded_time)

# Fail with a clear message instead of an IndexError on `.index[0]` below.
if ecg_hr.empty or spo2_hr.empty:
    raise ValueError(f'Patient {patient_id} has no ECGHR or no SPO2HR measurements')

print(ecg_hr.shape)
print(spo2_hr.shape)

print(ecg_hr.index[0], spo2_hr.index[0])


In [None]:

def follow_closest_heart_rate(ecg_hr, spo2_hr, start_with_spo2):
    """
    Build one heart-rate baseline out of the ECG and SPO2 heart rates.

    At every timestamp where both sources have a value, the value closest to
    the previously selected one is chosen (ECG wins ties). Where only one
    source has a value, that value is used.

    :param ecg_hr: Single-column DataFrame with the ECG heart rate, indexed
                   by timestamp.
    :param spo2_hr: Single-column DataFrame with the SPO2 heart rate, indexed
                    by timestamp.
    :param start_with_spo2: If True the first SPO2 value is the initial
                            reference, otherwise the first ECG value.
    :return: Tuple (timestamps, values, sources); sources holds 0 where the
             ECG value was used and 1 where the SPO2 value was used.
    """
    ecg = ecg_hr.iloc[:, 0]
    spo2 = spo2_hr.iloc[:, 0]
    reference = spo2.iloc[0] if start_with_spo2 else ecg.iloc[0]

    timestamps, values, sources = [], [], []

    # Only timestamps present in at least one index can contribute, so walk
    # the sorted union instead of stepping through every second in between.
    for t in ecg.index.union(spo2.index):
        has_ecg = t in ecg.index
        has_spo2 = t in spo2.index

        if has_ecg and has_spo2:
            ecg_value = ecg.at[t]
            spo2_value = spo2.at[t]

            # Compare as Python ints so the int16 subtraction cannot wrap.
            ecg_diff = abs(int(reference) - int(ecg_value))
            spo2_diff = abs(int(reference) - int(spo2_value))

            source = 1 if spo2_diff < ecg_diff else 0
            value = spo2_value if source else ecg_value
            reference = value

        elif has_ecg:
            # NOTE(review): as in the original loop, the reference is not
            # updated when only one source is available - confirm intent.
            source, value = 0, ecg.at[t]

        else:
            source, value = 1, spo2.at[t]

        timestamps.append(t)
        values.append(value)
        sources.append(source)

    return timestamps, values, sources


# Kept for reference: the time span covered by the two series.
begin_time = min(ecg_hr.index[0], spo2_hr.index[0])
end_time = max(ecg_hr.index[-1], spo2_hr.index[-1])

x = [[], []] # First index: start ECG, second index: start SPO2
y = [[], []]
side = [[], []]

for i, start_with_spo2 in enumerate((False, True)):
    x[i], y[i], side[i] = follow_closest_heart_rate(ecg_hr, spo2_hr, start_with_spo2)

# Both baselines visit the same timestamps, so the arrays are rectangular
# with shape (2, number of timestamps).
x = np.array(x)
y = np.array(y)
side = np.array(side)

print(x.shape)
print(y.shape)
print(side.shape)


In [None]:
path = "./RESULTS/Raw_HR_baseline_experiments/"
a = 0.4

# savefig does not create missing directories.
os.makedirs(path, exist_ok=True)

# Colour and legend label of each baseline row in `x` / `y`.
BASELINE_STYLES = {
    0: ('tab:blue', 'baseline (ECG_HR)'),
    1: ('tab:orange', 'baseline (SPO2_HR)'),
}


def plot_heart_rate_baselines(baseline_rows, title, filename, show_raw=True):
    """
    Plot the selected baselines, optionally on top of the raw heart rates.

    Reads the notebook-level `ecg_hr`, `spo2_hr`, `x`, `y`, `a` and
    `rounded_time`.

    :param baseline_rows: Rows of `x` / `y` to draw (0 = started from ECG,
                          1 = started from SPO2).
    :param title: Title of the figure.
    :param filename: Path the figure is saved to.
    :param show_raw: If True the ECG and SPO2 heart rates are drawn
                     semi-transparently behind the baselines.
    """
    fig, ax = plt.subplots()

    if show_raw:
        ax.plot(ecg_hr.index, ecg_hr.values, alpha=a, color='tab:green', label='ECG_HR')
        ax.plot(spo2_hr.index, spo2_hr.values, alpha=a, color='tab:red', label='SPO2_HR')

    for row in baseline_rows:
        color, label = BASELINE_STYLES[row]
        ax.plot(x[row], y[row], color=color, label=label)

    ax.set_title(title)
    ax.legend()
    ax.set_xlabel(f'Time ({rounded_time})')
    ax.set_ylabel('Heart rate')

    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


# Plot 1: ECGHR start, ECGHR, SPO2HR
plot_heart_rate_baselines(
    [0],
    f'ID: {patient_id}, start: [ECG_HR]',
    f'{path}{patient_id}_startECG_{rounded_time}.png',
)

# Plot 2 SPO2HR start vs ECGHR, SPO2HR
plot_heart_rate_baselines(
    [1],
    f'ID: {patient_id}, start: [SPO2_HR]',
    f'{path}{patient_id}_startSPO2_{rounded_time}.png',
)

# Plot 3 ECGHR start vs SPO2HR start
plot_heart_rate_baselines(
    [0, 1],
    f'ID: {patient_id}, start: [ECG_HR, SPO2_HR]',
    f'{path}{patient_id}_startECG_startSPO2_{rounded_time}.png',
    show_raw=False,
)


In [None]:

# savefig does not create missing directories.
os.makedirs(path, exist_ok=True)


def plot_selected_source(row, start_label, filename):
    """
    Plot one baseline against the data point number, coloured by source.

    Reads the notebook-level `x`, `y`, `side` and `patient_id`.

    :param row: Row of `x` / `y` / `side` to draw (0 = started from ECG,
                1 = started from SPO2).
    :param start_label: Name of the starting source shown in the title.
    :param filename: Path the figure is saved to.
    """
    positions = range(len(x[row]))

    # A segment gets the colour of the source of its first point; the colour
    # of the very last point is therefore never used.
    colors = ['tab:blue' if source == 0 else 'tab:orange' for source in side[row]]

    segments = [((x0, y0), (x1, y1)) for x0, y0, x1, y1 in
                zip(positions[:-1], y[row, :-1], positions[1:], y[row, 1:])]

    fig, ax = plt.subplots(1)
    ax.add_collection(LineCollection(segments, colors=colors))
    ax.autoscale_view()

    ax.set_title(f'ID: {patient_id}, start: [{start_label}]')
    ax.legend(handles=[Line2D([0], [0], color='tab:blue', label='ECGHR'),
                       Line2D([0], [0], color='tab:orange', label='SPO2HR')])
    ax.set_xlabel('Data points')
    ax.set_ylabel('Heart rate')
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


# Plot 4: Identify where ECGHR & SPO2HR are used, startECGHR
plot_selected_source(0, 'ECGHR', f'{path}{patient_id}_startECG_combined_{rounded_time}.png')

# Plot 5: Identify where ECGHR & SPO2HR are used, startSPO2HR
plot_selected_source(1, 'SPO2HR', f'{path}{patient_id}_startSPO2_combined_{rounded_time}.png')


#### Visualize NaN values

In [None]:

vital_columns = ['ECGHR', 'ECGRR', 'SPO2HR', 'SPO2', 'NIBP_lower',
                 'NIBP_upper', 'NIBP_mean']

# Despite its name this holds the percentage of NON-missing rows: one row per
# patient, one column per vital sign plus a final column for rows in which
# every column of the frame is present.
perc_missing = []

# Separate loop names keep the `patient_id` used by the plotting cells intact.
for pid, patient_df in data.items():

    # An empty frame has no defined percentage and would divide by zero.
    if patient_df.empty:
        continue

    present = patient_df[vital_columns].notna().mean() * 100
    complete = patient_df.notna().all(axis=1).mean() * 100

    perc_missing.append([*present, complete])

perc_missing = np.array(perc_missing)


# Plot results (no notches, outlier markers hidden)
plt.boxplot(perc_missing, 0, '')

plt.xticks(range(1, len(vital_columns) + 2),
           vital_columns + ['Any'],
           rotation=-45)

plt.title(f"Percentage of non-missing values, averaged over {len(perc_missing)} patients")
plt.tight_layout()
plt.show()
plt.close()


In [None]:
from sklearn.decomposition import PCA

pca_columns = ['ECGHR', 'ECGRR', 'SPO2HR', 'SPO2', 'NIBP_lower',
               'NIBP_upper', 'NIBP_mean']
n_components = len(pca_columns)

# One (n_components, n_features) matrix of principal axes per patient.
overall_pca = []

# Separate loop names keep the `patient_id` used by the plotting cells intact.
for pid, patient_df in data.items():

    # PCA needs rows in which every vital sign is present.
    complete_rows = patient_df[pca_columns].dropna(how='any')

    # PCA cannot extract more components than there are samples, so patients
    # with fewer complete rows than components are skipped instead of
    # raising a ValueError.
    if len(complete_rows) < n_components:
        continue

    pca = PCA(n_components=n_components)
    pca.fit(complete_rows.values)

    overall_pca.append(pca.components_)


overall_pca = np.array(overall_pca)


In [None]:
import seaborn as sns

# Average the principal axes over all patients (axis 0 = patient).
mean_pca = overall_pca.mean(axis=0)

pca_feature_labels = ['ECGHR', 'ECGRR', 'SPO2HR', 'SPO2',
                      'NIBP_lower', 'NIBP_upper', 'NIBP_mean']

# Heat map: one row per principal component, one column per vital sign.
fig, ax = plt.subplots()
image = ax.imshow(mean_pca, cmap='magma', norm='linear')

fig.colorbar(image, ax=ax)

# Components are numbered from 1 on the y-axis.
ax.set_yticks(range(7))
ax.set_yticklabels(range(1, 8))
ax.set_ylabel('Nth component')

ax.set_xticks(range(7))
ax.set_xticklabels(pca_feature_labels, rotation=-45)

ax.set_title('Principal components and variable contribution')
plt.show()


#### Compare mean and median

In [None]:

# One aggregated result per patient, in the iteration order of `data`.
agg_data = []

# Iterating over the values only keeps the `patient_id` used by the plotting
# cells intact.
for df in data.values():

    # 1. Split raw data in 15 minute windows (window length is decided inside
    #    split_data_into_windows)
    df = df.drop(columns=['patient_id', 'location']).sort_values('datetime')
    windows, _ = split_data_into_windows(df)

    # 2. Aggregate vital signs
    agg_data.append(aggregate_windows(windows))


In [None]:

# Number of patients to draw a figure for.
max_patients = 6
features = ['ECGHR', 'ECGRR', 'SPO2HR', 'SPO2']

# agg_data was filled in the iteration order of `data`, so the two can be
# paired position by position.
for patient_key, d in zip(list(data)[:max_patients], agg_data[:max_patients]):

    fig, axes = plt.subplots(2, 2)

    for j, (feature, ax) in enumerate(zip(features, axes.flat)):

        # Mean-Median
        # NOTE(review): column j is taken as the mean and column 16+j as the
        # median of the feature - confirm against aggregate_windows.
        sns.histplot(d[:, j], bins=50, color='tab:blue', ax=ax)
        sns.histplot(d[:, 16+j], bins=50, color='tab:orange', ax=ax)
        ax.legend(['Mean', 'Median'])

        # Min-Max alternative: use columns 4+j and 8+j with the legend
        # ['Min', 'Max'].

        ax.set_title(feature)

    fig.suptitle(patient_key)
    fig.tight_layout()
    plt.show()
    plt.close(fig)
