In [16]:
import pandas as pd
import numpy as np

import psycopg2
import os
import wfdb
import urllib.request
import datetime
import re

In [2]:
connection = psycopg2.connect(database="mimic", user=os.environ["USERNAME"])
cursor = connection.cursor()
cursor.execute("set search_path to mimiciii")

In [3]:
query_waveform_exists = """SELECT  *
    FROM sepsis3_cohort coh 
    WHERE coh.waveform_exists = 1"""
df_waveform_exists = pd.read_sql(query_waveform_exists, connection)

In [17]:
required_signals_list = ["HR", "ABPSYS", "ABPDIAS", "ABPMEAN", "RESP", "SPO2"]

for index, row in df_waveform_exists.iterrows():
    current_user_id = str(row.subject_id).zfill(6)
    wdb_base_path = "https://physionet.org/files/"
    wdb_dir_path = f"mimic3wdb-matched/1.0/p{current_user_id[:2]}/p{current_user_id}/"

    wdb_records = urllib.request.urlopen(wdb_base_path + wdb_dir_path + "RECORDS")

    for line in wdb_records.readlines():
        record = line.decode("utf-8")
        record = str(record).rstrip()
        if record[-1] == "n":
            signals, fields = wfdb.rdsamp(record, pn_dir=wdb_dir_path)
            signals_names_list = [
                re.sub(r"[\s%]", "", item.upper()) for item in fields["sig_name"]
            ]

            signals_exist = all(
                x in signals_names_list
                for x in required_signals_list
            )

            if signals_exist:
                indexes_dict = {}
                for item in required_signals_list:
                    indexes_dict[item.lower()] = get_signal_index(item, signals_names_list)
                
                record_start_time = datetime.datetime.combine(
                        fields["base_date"], fields["base_time"]
                    )

                if "%.3f" % (fields["fs"]) == "1.000":
                    record_end_time = record_start_time + datetime.timedelta(
                        seconds=(fields["sig_len"] - 1)
                        )
                elif "%.3f" % (fields["fs"]) == "0.017":
                    record_end_time = record_start_time + datetime.timedelta(
                        minutes=(fields["sig_len"] - 1)
                        )
                else:
                    print("ERROR IN SAMPLING")
                

                # Caculate if we have a recording for the time of icu stay

                time_range_icustay = (
                    datetime.datetime.strptime(str(row["intime"]), "%Y-%m-%d %H:%M:%S"),
                    datetime.datetime.strptime(str(row["outtime"]), "%Y-%m-%d %H:%M:%S"),
                )
                
                time_range_record = (record_start_time, record_end_time)

                latest_start = max(time_range_icustay[0], time_range_record[0])
                earliest_end = min(time_range_icustay[0], time_range_record[0])
                print(earliest_end)
                print(latest_start)
                print("---")
                delta_earliest_end_latest_start = (
                    earliest_end - latest_start
                ).days
                
                print(delta_earliest_end_latest_start)

                if delta_earliest_end_latest_start >= 0:
                    print(indexes_list.values())
                    input()
                    nparray_subset = signals[:, indexes_list.values()]
                    np_missing_values = np.count_nonzero(~np.isnan(nparray_subset))
                    print(np_missing_values)

-1
-2050
-2053
-2055
-2092
-2099
-1
-1


KeyboardInterrupt: 

In [14]:
def get_signal_index(signal: str, signal_names: [str]):
    return signal_names.index(signal)