### Extract Time Series

Notebook for extracting the vital-sign time series of a given patient over one ICU stay.
If the ICU stay is covered by several records, the periods not covered by any record are left as gaps (NaN) in the time series.

In [2]:
import pandas as pd
import numpy as np

import psycopg2
import os
import wfdb
import urllib.request
import datetime
import re

from utils import (
    get_record_from_line,
    get_time_range_icustay,
    get_time_range_record,
    get_signal_index,
    ranges_overlap,
    get_delta_earliest_end_latest_start,
)

In [3]:
# Patient whose signals are extracted. The waveform directory is addressed by
# the subject id zero-padded to six digits, grouped under its first two digits.
subject_id = 55638
current_user_id = f"{subject_id:06d}"
wdb_base_path = "https://physionet.org/files/"
wdb_dir_path = f"mimic3wdb-matched/1.0/p{current_user_id[:2]}/p{current_user_id}/"

# Signals that a record must provide (all of them) to be copied into the output.
required_signals_list = [
    "HR",
    "ABPSYS",
    "ABPDIAS",
    "ABPMEAN",
    "RESP",
    "SPO2",
]

In [4]:
# Open the MIMIC database and make the mimiciii schema the default one.
connection = psycopg2.connect(database="mimic", user="holthausen")
cursor = connection.cursor()
cursor.execute("set search_path to mimiciii")

# Bind the subject id as a query parameter instead of formatting it into the
# SQL text, so the driver takes care of quoting.
query_subject_id = "SELECT  * FROM sepsis3_cohort coh WHERE coh.subject_id = %(subject_id)s"
df_subject_id = pd.read_sql(query_subject_id, connection, params={"subject_id": int(subject_id)})

# Fail here with a clear message rather than later with an IndexError on .iloc[0].
if df_subject_id.empty:
    raise ValueError(f"subject_id {subject_id} was not found in sepsis3_cohort")

In [5]:
# Use the first cohort row of the patient. The result is indexed as
# [0] (start) and [1] (end) by the cells that follow.
cohort_row = df_subject_id.iloc[0]
time_range_icustay = get_time_range_icustay(cohort_row)

In [7]:
# Duration of the ICU stay in seconds (end minus start of the stay range).
# The bare `time_range_icustay` expression that used to precede this line was
# not the last statement of the cell, so it displayed nothing; it was removed.
icustay_length_in_seconds = (time_range_icustay[1] - time_range_icustay[0]).total_seconds()

In [53]:
# NOTE: this cell calls get_overlapped_range, get_empty_signals_dict,
# get_index_difference and update_signals_dict, which are defined in cells
# further down the notebook. On a fresh kernel those cells must be run first.

# Read the RECORDS index of the patient directory and close the HTTP response
# as soon as it has been consumed.
with urllib.request.urlopen(wdb_base_path + wdb_dir_path + "RECORDS") as wdb_records:
    record_names = [get_record_from_line(line) for line in wdb_records.readlines()]

# Keep only the records whose name ends in "n".
numerics_files_list = [record_name for record_name in record_names if record_name[-1] == "n"]

# Created lazily from the first record that overlaps the ICU stay.
signals_dict = None
# Sampling step ("seconds" or "minutes") that signals_dict was sized for.
grid_magnitude = None

for record in numerics_files_list:
    signals, fields = wfdb.rdsamp(record, pn_dir=wdb_dir_path)

    time_range_record = get_time_range_record(fields)
    time_range_overlapped = get_overlapped_range(time_range_icustay, time_range_record)

    if time_range_overlapped is None:
        continue

    # An fs of 1.000 (to three decimals) is indexed per second; anything else per minute.
    sampling_frequency_magnitude = "seconds" if "%.3f" % (fields["fs"]) == "1.000" else "minutes"

    if signals_dict is None:
        icustay_length_in_seconds = (time_range_icustay[1] - time_range_icustay[0]).total_seconds()
        signal_length = int(icustay_length_in_seconds if sampling_frequency_magnitude == "seconds" else icustay_length_in_seconds // 60)
        signals_dict = get_empty_signals_dict(signal_length, required_signals_list, time_range_icustay)
        grid_magnitude = sampling_frequency_magnitude
    elif sampling_frequency_magnitude != grid_magnitude:
        # The output arrays were sized for another step; indices computed for
        # this record would point at the wrong rows, so do not write it.
        print(f"Skipping record {record}: its sampling step ({sampling_frequency_magnitude}) differs from the output grid ({grid_magnitude})")
        continue

    # Normalise signal names: upper case, without whitespace and "%".
    signals_names_list = [re.sub(r"[\s%]", "", item.upper()) for item in fields["sig_name"]]

    # Only records that carry every required signal are copied.
    signals_exist = all(x in signals_names_list for x in required_signals_list)
    if not signals_exist:
        continue

    index_start_signal = get_index_difference(time_range_icustay[0], time_range_overlapped[0], sampling_frequency_magnitude)
    update_signals_dict(signals, signals_dict, required_signals_list, signals_names_list, index_start_signal)

df = pd.DataFrame(signals_dict)
df.to_csv(f"signals_{subject_id}.csv", index=False)

Updating field HR in ranges 226 to 2546
Updating field ABPSYS in ranges 226 to 2546
Updating field ABPDIAS in ranges 226 to 2546
Updating field ABPMEAN in ranges 226 to 2546
Updating field RESP in ranges 226 to 2546
Updating field SPO2 in ranges 226 to 2546


In [52]:
def get_empty_signals_dict(length, signals_list, time_range_icustay, magnitude=None):
    """Build the empty container that the per-record signals are written into.

    Parameters
    ----------
    length : int
        Number of samples in every column.
    signals_list : iterable of str
        Names of the signal columns to create.
    time_range_icustay : sequence
        Only element 0 is used: the base time the TIME column counts from.
    magnitude : str, optional
        "seconds" for a one-second step; any other value gives a one-minute
        step. When omitted, the notebook-level variable
        ``sampling_frequency_magnitude`` is used, which is what this function
        read before the parameter existed.

    Returns
    -------
    dict
        ``"TIME"`` maps to an array of ``length`` timestamps (base time plus
        1, 2, ..., ``length`` steps); every name in ``signals_list`` maps to a
        float array of ``length`` NaNs.
    """
    if magnitude is None:
        # Backward-compatible fallback to the global set by the calling cell.
        magnitude = sampling_frequency_magnitude

    if magnitude == "seconds":
        step = datetime.timedelta(seconds=1)
    else:
        step = datetime.timedelta(minutes=1)

    basetime = time_range_icustay[0]
    signals_dict = dict()
    # Size the time axis from the `length` argument (not from a global) so it
    # always matches the signal columns created below.
    signals_dict["TIME"] = np.array([basetime + (i + 1) * step for i in range(length)])
    for item in signals_list:
        # NaN marks samples that no record has filled in.
        signals_dict[item] = np.full(length, np.nan, dtype=float)
    return signals_dict

In [50]:
def update_signals_dict(signals, signals_dict, signals_names, signals_names_list, index_start):
    """Copy the required signals of one record into ``signals_dict`` in place.

    Parameters
    ----------
    signals : 2-D array
        Record samples, indexed as ``signals[:, column]``.
    signals_dict : dict
        Output arrays keyed by signal name; modified in place.
    signals_names : iterable of str
        Names of the signals to copy.
    signals_names_list : list of str
        Names of the columns of ``signals``; the column of each signal is
        looked up with ``get_signal_index``.
    index_start : int
        Position in the output arrays where the first record sample is written.

    Samples that would fall beyond the end of an output array are dropped.
    Samples are always taken from the beginning of the record.
    NOTE(review): a record that starts before the output range would need its
    leading samples skipped as well; this function has no input for that.
    """
    index_dict = {item: get_signal_index(item, signals_names_list) for item in signals_names}
    for item in signals_names:
        signal_info = signals[:, index_dict[item]]
        signal_length = signal_info.shape[0]
        icustay_length = signals_dict[item].shape[0]
        # Clamp to the end of the output array. The upper bound is the array
        # length itself, not the length minus the start offset.
        index_end = min(index_start + signal_length, icustay_length)
        if index_end <= index_start:
            # Nothing of this record fits into the output array.
            continue
        print(f"Updating field {item} in ranges {index_start} to {index_end}")
        signals_dict[item][index_start:index_end] = signal_info[:index_end - index_start]

In [41]:
def get_index_difference(point_in_time_a, point_in_time_b, magnitude):
    """Return how many whole steps lie between two points in time.

    The step is one second when ``magnitude`` is "seconds" and sixty seconds
    for any other value. The elapsed time from ``point_in_time_a`` to
    ``point_in_time_b`` is floor-divided by the step, so the result is
    negative when ``point_in_time_b`` precedes ``point_in_time_a``.
    """
    seconds_per_step = 60
    if magnitude == "seconds":
        seconds_per_step = 1
    elapsed_seconds = (point_in_time_b - point_in_time_a).total_seconds()
    return int(elapsed_seconds // seconds_per_step)

In [42]:
def read_record(record: str, wdb_dir_path: str) -> [str]:
    """Return the signal names stored in one record.

    The record is read with ``wfdb.rdsamp`` from ``wdb_dir_path``. If reading
    raises ``ValueError``, a message is printed and the names collected so far
    (normally none) are returned instead of propagating the error.
    """
    names = []
    try:
        # Only the header fields are needed; the samples are discarded.
        _, header = wfdb.rdsamp(record, pn_dir=wdb_dir_path)
        names.extend(header["sig_name"])
    except ValueError:
        print(f"Error occurred while reading waveform: {record}")
    return names

In [43]:
def get_overlapped_range(range_icustay, range_record):
    """Return the part of the record range that lies inside the ICU stay.

    Both arguments are indexed as ``[0]`` (start) and ``[1]`` (end). The result
    is ``(latest start, earliest end)``, or ``None`` when the earliest end lies
    before the latest start. Ranges that merely touch (end equal to start) are
    reported as a zero-length range, not as ``None``.
    """
    start = max(range_icustay[0], range_record[0])
    end = min(range_icustay[1], range_record[1])
    # For datetime differences, `.days` is negative whenever `end` precedes
    # `start`, however small the gap is.
    if (end - start).days < 0:
        return None
    return (start, end)
    