In [1]:
import pandas as pd
import numpy as np

import psycopg2
import os
import wfdb
import urllib.request
import datetime
import re

In [2]:
# Open a PostgreSQL connection to the "mimic" database as user "holthausen".
connection = psycopg2.connect(database="mimic", user="holthausen")
cursor = connection.cursor()
# Put the mimiciii schema on the search path so that later queries can use
# unqualified table names.
cursor.execute("set search_path to mimiciii")

In [3]:
# Load every row of sepsis3_cohort whose waveform_exists column equals 1
# into a DataFrame, using the connection opened earlier in the notebook.
query_waveform_exists = """SELECT  *
    FROM sepsis3_cohort coh 
    WHERE coh.waveform_exists = 1"""
df_waveform_exists = pd.read_sql(query_waveform_exists, connection)

In [4]:
# Display the queried cohort rows (pandas truncates the rendering of large frames).
df_waveform_exists

Unnamed: 0,hadm_id,icustay_id,intime,outtime,age,gender,ethnicity,first_service,dbsource,suspected_of_infection_poe,...,exclusion_csurg,exclusion_carevue,exclusion_early_suspicion,exclusion_late_suspicion,exclusion_bad_data,excluded,waveform_exists,has_sepsis,sepsis_onsettime,subject_id
0,193820,235055,2170-02-15 13:49:22,2170-02-18 14:25:05,82.2770,F,WHITE,CMED,metavision,1,...,0,0,0,0,0,0,1,0,,68780
1,134516,210357,2124-08-23 21:02:58,2124-08-24 16:40:07,47.8833,F,WHITE,NSURG,metavision,1,...,0,0,0,0,0,0,1,0,,68789
2,145714,214367,2134-12-11 00:12:10,2134-12-13 17:05:31,60.5845,M,WHITE,NSURG,metavision,1,...,0,0,0,0,0,0,1,0,,68797
3,153514,252326,2116-05-20 02:52:29,2116-05-21 12:12:36,52.0559,F,WHITE,MED,metavision,1,...,0,0,0,0,0,0,1,0,,68806
4,136015,278608,2146-09-23 06:58:03,2146-09-24 15:43:04,87.5337,F,WHITE,TRAUM,metavision,0,...,0,0,0,0,0,0,1,0,,68807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4441,163988,270621,2123-11-02 17:42:30,2123-11-03 18:51:56,49.5890,F,HISPANIC OR LATINO,MED,metavision,0,...,0,0,0,0,0,0,1,0,,91824
4442,191601,274657,2164-07-26 20:48:26,2164-07-29 17:05:45,88.5386,F,WHITE,NSURG,metavision,0,...,0,0,0,0,0,0,1,0,,93062
4443,160899,225701,2115-02-26 14:05:30,2115-02-27 15:25:24,73.5102,F,WHITE,MED,metavision,0,...,0,0,0,0,0,0,1,0,,97778
4444,186728,294375,2105-03-13 18:17:49,2105-03-14 15:12:35,59.8370,F,WHITE,CMED,metavision,0,...,0,0,0,0,0,0,1,0,,98336


In [None]:
# For every cohort row, look up the subject's numerics records on PhysioNet and
# summarise (a) how many numerics records exist, (b) which share of the ICU stay
# is NOT covered by them and (c) whether every record has all required signals.
#
# NOTE: this cell calls get_time_range_icustay, get_record_from_line and
# get_time_range_record; the cells defining them must have been run first.
# NOTE: it performs one HTTP request per cohort row plus one download per
# numerics record, so a full run is slow.

# Signals a numerics record must contain to count as usable.
required_signals_list = ["HR", "ABPSYS", 'ABPDIAS', "ABPMEAN", "RESP", "SPO2"]
df_missing_values_columns = ["subject_id", "number_of_icu_stays", "percentage_missing_values", "required_signals_available"]

subject_id_column = df_waveform_exists["subject_id"]
number_of_icu_stays_column = np.zeros(len(subject_id_column), dtype=np.int16)
# Start at 100 % missing and subtract the share covered by each record.
percentage_missing_values_column = np.full(len(subject_id_column), 100.0, dtype=float)
# Starts True and is AND-ed with every record; it therefore stays True for a
# subject without any numerics record.
required_signals_available_column = np.ones(len(subject_id_column), dtype=bool)

wdb_base_path = "https://physionet.org/files/"

# `position` is the 0-based row number used to index the numpy result arrays;
# `index` is the DataFrame label, which is only equal to it for a default index.
for position, (index, row) in enumerate(df_waveform_exists.iterrows()):
    time_range_icustay = get_time_range_icustay(row)
    icustay_length_in_seconds = (time_range_icustay[1] - time_range_icustay[0]).total_seconds()

    current_user_id = str(row.subject_id).zfill(6)
    wdb_dir_path = f"mimic3wdb-matched/1.0/p{current_user_id[:2]}/p{current_user_id}/"

    # Close the HTTP response as soon as the RECORDS listing has been read.
    with urllib.request.urlopen(wdb_base_path + wdb_dir_path + "RECORDS") as wdb_records:
        record_names = [get_record_from_line(line) for line in wdb_records.readlines()]

    # Numerics records are the entries whose name ends in "n"; endswith() also
    # copes with blank lines, which indexing [-1] would not.
    numerics_files_list = [name for name in record_names if name.endswith("n")]

    number_of_icu_stays_column[position] = len(numerics_files_list)

    for record in numerics_files_list:
        signals, fields = wfdb.rdsamp(record, pn_dir=wdb_dir_path)

        # Normalise names such as "ABP SYS" or "%SpO2" to "ABPSYS" / "SPO2".
        signals_names_list = [re.sub(r"[\s%]", "", item.upper()) for item in fields["sig_name"]]

        signals_exist = all(x in signals_names_list for x in required_signals_list)

        required_signals_available_column[position] = required_signals_available_column[position] and signals_exist

        record_range = get_time_range_record(fields)
        record_length_in_seconds = (record_range[1] - record_range[0]).total_seconds()

        # A zero-length stay cannot be covered by anything; keep it at 100 %
        # instead of dividing by zero.
        # NOTE(review): the full record length is subtracted even when the
        # record extends beyond the ICU stay or overlaps another record, so
        # the value can drop below 0.
        if icustay_length_in_seconds > 0:
            percentage_missing_values_column[position] -= (record_length_in_seconds / icustay_length_in_seconds) * 100

# One row per cohort entry: build the frame column-wise from the result arrays.
df_missing_values = pd.DataFrame(
    {
        "subject_id": subject_id_column.to_numpy(),
        "number_of_icu_stays": number_of_icu_stays_column,
        "percentage_missing_values": percentage_missing_values_column,
        "required_signals_available": required_signals_available_column,
    },
    columns=df_missing_values_columns,
)
        

In [None]:
def get_percentage_of_missing_records(wdb_dir_path, numerics_files_list, row):
    """Return the percentage of an ICU stay that is not covered by numerics records.

    Starts at 100 and subtracts, for every record, the record duration as a
    share of the ICU stay duration.

    Args:
        wdb_dir_path: PhysioNet directory passed to ``wfdb.rdsamp`` as ``pn_dir``.
        numerics_files_list: Names of the numerics records to read.
        row: Cohort row handed to ``get_time_range_icustay``.

    Returns:
        float: Percentage of the stay without record coverage. It is 100.0 when
        there are no records or the stay has no positive duration. The full
        record duration is subtracted even if the record extends beyond the
        stay, so the value can become negative.
    """
    time_range_icustay = get_time_range_icustay(row)
    icustay_length_in_seconds = (time_range_icustay[1] - time_range_icustay[0]).total_seconds()

    percentage_missing = 100.0
    if icustay_length_in_seconds <= 0:
        # Nothing can be covered; also avoids a division by zero below.
        return percentage_missing

    for file in numerics_files_list:
        # Only the header fields are needed to derive the record duration.
        _, fields = wfdb.rdsamp(file, pn_dir=wdb_dir_path)

        record_range = get_time_range_record(fields)
        record_length_in_seconds = (record_range[1] - record_range[0]).total_seconds()

        percentage_missing -= (record_length_in_seconds / icustay_length_in_seconds) * 100

    return percentage_missing

In [None]:
def _get_sampling_step(fields):
    """Return the time between two samples of a record, or None if unsupported."""
    sampling_frequency = "%.3f" % fields["fs"]
    if sampling_frequency == "1.000":
        return datetime.timedelta(seconds=1)
    if sampling_frequency == "0.017":
        # 1/60 Hz rounds to 0.017, i.e. one sample per minute.
        return datetime.timedelta(minutes=1)
    return None


def _build_gap_frame(record, gap_start, gap_end, sampling_step, columns):
    """Return rows with empty signals for the timestamps strictly between two instants."""
    gap_times = pd.date_range(start=gap_start + sampling_step, end=gap_end, freq=sampling_step)
    # Drop a timestamp that coincides with gap_end: that row belongs to the record.
    gap_times = gap_times[gap_times < gap_end]
    gap_frame = pd.DataFrame({"TIME": gap_times})
    gap_frame["RECORD"] = record
    # reindex adds the signal columns filled with NaN.
    return gap_frame.reindex(columns=columns)


def process_numerics_files(numerics_files_list, row, df_ts_records, wdb_dir_path=None):
    """Append the numerics records of one ICU stay to a time-series frame.

    Every record is read with ``wfdb.rdsamp``. Records that cannot be read,
    have an unsupported sampling frequency, have no samples, or lack one of the
    signal columns are skipped. For each remaining record that passes the
    overlap check against the ICU stay, the function adds

    * placeholder rows (signals NaN) for the time between the ICU admission
      (first record) or the end of the previously added record (later records)
      and the start of the record, and
    * one row per sample with the signal values.

    Args:
        numerics_files_list: Names of the numerics records to read.
        row: Cohort row; must provide "subject_id" and whatever
            ``get_time_range_icustay`` reads.
        df_ts_records: DataFrame that the new rows are appended to. It is not
            modified.
        wdb_dir_path: PhysioNet directory passed to ``wfdb.rdsamp`` as
            ``pn_dir``. When omitted it is derived from ``row["subject_id"]``.

    Returns:
        pandas.DataFrame: ``df_ts_records`` followed by the new rows, with a
        fresh 0-based index. ``df_ts_records`` itself is returned when no rows
        were added.
    """
    df_ts_records_columns = ['RECORD', 'TIME', 'HR', 'SPO2', 'ABPSYS', 'ABPDIAS', 'ABPMEAN', 'RESP']
    # Every column after RECORD and TIME is a signal a record has to provide.
    signal_columns = df_ts_records_columns[2:]

    if wdb_dir_path is None:
        subject_id = str(row["subject_id"]).zfill(6)
        wdb_dir_path = f"mimic3wdb-matched/1.0/p{subject_id[:2]}/p{subject_id}/"

    time_range_icustay = get_time_range_icustay(row)

    new_frames = []
    # End time of the most recently added record; None until the first one.
    last_record_time = None

    for record in numerics_files_list:
        try:
            signals, fields = wfdb.rdsamp(record, pn_dir=wdb_dir_path)
        except Exception as error:
            # Best effort: one unreadable record must not abort the whole stay.
            print(f"Could not read record {record}: {error}")
            continue

        sampling_step = _get_sampling_step(fields)
        if sampling_step is None or fields["sig_len"] < 1:
            print(f"Skipping record {record}: unsupported sampling frequency or no samples")
            continue

        # Normalise names such as "ABP SYS" or "%SpO2" to "ABPSYS" / "SPO2".
        signals_names_list = [re.sub(r"[\s%]", "", item.upper()) for item in fields["sig_name"]]
        if not all(name in signals_names_list for name in signal_columns):
            continue

        time_range_record = get_time_range_record(fields)
        delta_earliest_end_latest_start = get_delta_earliest_end_latest_start(time_range_icustay, time_range_record)

        if delta_earliest_end_latest_start < 0:
            print(f"Record does not exist for the ICU stays with the signals needed: {row['subject_id']}")
            continue

        if last_record_time is None:
            # First usable record: pad from the ICU admission time.
            gap_start = time_range_icustay[0]
        elif time_range_record[0] < time_range_icustay[1]:
            # Later record: pad from the end of the previous record.
            gap_start = last_record_time
        else:
            # Later record that starts after the end of the ICU stay.
            continue

        gap_frame = _build_gap_frame(record, gap_start, time_range_record[0], sampling_step, df_ts_records_columns)
        if not gap_frame.empty:
            new_frames.append(gap_frame)

        record_frame = pd.DataFrame(
            {name: signals[:, signals_names_list.index(name)] for name in signal_columns}
        )
        record_frame["TIME"] = pd.date_range(start=time_range_record[0], periods=fields["sig_len"], freq=sampling_step)
        record_frame["RECORD"] = record
        new_frames.append(record_frame[df_ts_records_columns])

        last_record_time = time_range_record[1]

    if not new_frames:
        return df_ts_records

    # Concatenate once at the end instead of growing the frame piece by piece.
    existing_frames = [] if df_ts_records.empty else [df_ts_records]
    return pd.concat(existing_frames + new_frames, ignore_index=True)

In [4]:
def get_record_from_line(line):
    """Decode one raw bytes line as UTF-8 and strip its trailing whitespace."""
    decoded_line = line.decode("utf-8")
    return decoded_line.rstrip()

In [5]:
def get_signal_index(signal: str, signal_names: [str]):
    """Return the position of the first entry in ``signal_names`` equal to ``signal``.

    Raises:
        ValueError: If ``signal`` does not occur in ``signal_names``.
    """
    position = signal_names.index(signal)
    return position

In [6]:
def get_time_range_record(fields):
    """Return the (start, end) datetimes covered by a record.

    The start is ``base_date`` combined with ``base_time``. The end is the
    timestamp of the last sample, i.e. ``sig_len - 1`` sampling intervals after
    the start.

    Args:
        fields: Mapping with the keys "base_date", "base_time", "fs" and
            "sig_len".

    Returns:
        tuple: ``(record_start_time, record_end_time)`` as ``datetime.datetime``.

    Raises:
        ValueError: If the sampling frequency, formatted to three decimals, is
            neither "1.000" (one sample per second) nor "0.017" (one sample
            per minute).
    """
    record_start_time = datetime.datetime.combine(fields["base_date"], fields["base_time"])

    sampling_frequency = "%.3f" % (fields["fs"])
    if sampling_frequency == "1.000":
        record_end_time = record_start_time + datetime.timedelta(seconds=(fields["sig_len"] - 1))
    elif sampling_frequency == "0.017":
        record_end_time = record_start_time + datetime.timedelta(minutes=(fields["sig_len"] - 1))
    else:
        # Fail with a clear message instead of falling through to an
        # UnboundLocalError on the return statement.
        raise ValueError(f"Unsupported sampling frequency: {fields['fs']}")

    return (record_start_time, record_end_time)
    

In [7]:
def get_time_range_icustay(row):
    """Return the row's "intime" and "outtime" values as a tuple of datetimes.

    Each value is converted to ``str`` and parsed with the format
    "%Y-%m-%d %H:%M:%S".
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    icu_in, icu_out = (
        datetime.datetime.strptime(str(row[column]), timestamp_format)
        for column in ("intime", "outtime")
    )
    return (icu_in, icu_out)

In [8]:
def get_delta_earliest_end_latest_start(range_a, range_b):
    """Return the whole-day overlap indicator of two (start, end) ranges.

    The result is ``(earliest end - latest start).days + 1``. Because
    ``timedelta.days`` rounds towards negative infinity, the value is at least
    1 when the ranges overlap, 0 when they are disjoint by up to one day, and
    negative when they are further apart.
    """
    range_starts = (range_a[0], range_b[0])
    range_ends = (range_a[1], range_b[1])
    overlap = min(range_ends) - max(range_starts)
    return overlap.days + 1