In [198]:
from datetime import datetime

print(datetime.now())
# data preprocessing
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import collections
from collections import defaultdict
import os
import sys
import shutil
from collections import Counter

# the full input files pathes are here
DATA_PATH_stages = "data/kdigo_stages_measured.csv"
DATA_PATH_labs = "data/labs-kdigo_stages_measured.csv"
DATA_PATH_vitals = "data/vitals-kdigo_stages_measured.csv"
DATA_PATH_vents = "data/vents-vasopressor-sedatives-kdigo_stages_measured.csv"
DATA_PATH_detail = "data/icustay_detail-kdigo_stages_measured.csv"
SEPARATOR = ";"

2024-07-11 10:58:45.178836


In [199]:
# the output pathes are here
OUTPUT_PATH = "data/AKI"

In [200]:
# Set parameter as constant

# which classifier to use, only run one classifier at one time
ALL_STAGES = False  # not binary label, each class separately 0,1,2,3

CLASS1 = True  # AnyAKI
CLASS2 = False  # ModerateSevereAKI
CLASS3 = False  # SevereAKI


MAX_FEATURE_SET = True


# resampling  and imputing
TIME_SAMPLING = True
SAMPLING_INTERVAL = "1H"
# RESAMPLE_LIMIT = 16 # 4 days*6h interval

# if MOST_COMMON is not applied,sampling with different strategies per kind of variable,
# numeric variables use mean value, categorical variables use max value
MOST_COMMON = False  # resampling with most common

# fit Yereva's time span
MAX_HOUR = 48

IMPUTE_EACH_ID = True  # imputation within each icustay_id with most common value
IMPUTE_COLUMN = False  # imputation based on whole column
IMPUTE_METHOD = "most_frequent"
FILL_VALUE = 0  # fill missing value and ragged part of 3d array

# Age constraints: adults
ADULTS_MIN_AGE = 18
ADULTS_MAX_AGE = -1

NORMALIZATION = "min-max"
NORM_TYPE = "min_max"

CAPPING = True
if CAPPING:
    CAPPING_THRESHOLD_UPPER = 0.99
    CAPPING_THRESHOLD_LOWER = 0.01


# use random split or fixed train/val/test set
RANDOM_SPLIT = True
FIXED = False
RANDOM_SEED = 42
SPLIT_SIZE = 0.2

# set changable info corresponding to each classifier as variables

min_set = ["icustay_id", "charttime", "creat", "uo_rt_6hr", "uo_rt_12hr", "uo_rt_24hr", "aki_stage"]


max_set = [
    "icustay_id",
    "charttime",
    "aki_stage",
    "hadm_id",
    "albumin_avg",
    "aniongap_avg",
    "bicarbonate_avg",
    "bilirubin_avg",
    "bun_avg",
    "chloride_avg",
    "creat",
    "diasbp_mean",
    "glucose_avg",
    "heartrate_mean",
    "hematocrit_avg",
    "hemoglobin_avg",
    "potassium_avg",
    "resprate_mean",
    "sodium_avg",
    "spo2_mean",
    "sysbp_mean",
    "uo_rt_12hr",
    "uo_rt_24hr",
    "uo_rt_6hr",
    "wbc_avg",
    "sedative",
    "vasopressor",
    "vent",
    "age",
    "F",
    "M",
    "asian",
    "black",
    "hispanic",
    "native",
    "other",
    "unknown",
    "white",
    "ELECTIVE",
    "EMERGENCY",
    "URGENT",
]

In [201]:
# Some functions used later


def cap_data(df):
    print("Capping between the {} and {} quantile".format(CAPPING_THRESHOLD_LOWER, CAPPING_THRESHOLD_UPPER))
    cap_mask = df.columns.difference(["icustay_id", "charttime", "aki_stage", "subject_id", "intime", "HOURS"])

    print("cap_maks", cap_mask)
    # Filtrar solo columnas numéricas
    numeric_cols = df[cap_mask].select_dtypes(include=[np.number]).columns
    print("numeric_cols", numeric_cols)

    df[numeric_cols] = df[numeric_cols].clip(
        df[numeric_cols].quantile(CAPPING_THRESHOLD_LOWER), df[numeric_cols].quantile(CAPPING_THRESHOLD_UPPER), axis=1
    )

    return df


def normalise_data(df, norm_mask):
    print("Normalizing in [0,1] with {} normalization".format(NORMALIZATION))

    df[norm_mask] = (df[norm_mask] - df[norm_mask].min()) / (df[norm_mask].max() - df[norm_mask].min())

    return df


# impute missing value in resampleing data with most common based on each id
def fast_mode(df, key_cols, value_col):
    """Calculate a column mode, by group, ignoring null values.

    key_cols : list of str - Columns to groupby for calculation of mode.
    value_col : str - Column for which to calculate the mode.

    Return
    pandas.DataFrame
        One row for the mode of value_col per key_cols group. If ties, returns the one which is sorted first."""
    return (
        df.groupby(key_cols + [value_col])
        .size()
        .to_frame("counts")
        .reset_index()
        .sort_values("counts", ascending=False)
        .drop_duplicates(subset=key_cols)
    ).drop("counts", axis=1)


# get max shape of 3d array
def get_dimensions(array, level=0):
    yield level, len(array)
    try:
        for row in array:
            yield from get_dimensions(row, level + 1)
    except TypeError:  # not an iterable
        pass


def get_max_shape(array):
    dimensions = defaultdict(int)
    for level, length in get_dimensions(array):
        dimensions[level] = max(dimensions[level], length)
    return [value for _, value in sorted(dimensions.items())]


# pad the ragged 3d array to rectangular shape based on max size
def iterate_nested_array(array, index=()):
    try:
        for idx, row in enumerate(array):
            yield from iterate_nested_array(row, (*index, idx))
    except TypeError:  # final level
        yield (*index, slice(len(array))), array  # think of the types


def pad(array, fill_value):
    dimensions = get_max_shape(array)
    result = np.full(dimensions, fill_value, dtype=np.float64)
    for index, value in iterate_nested_array(array):
        result[index] = value
    return result

# read csv files

In [202]:
print("read csv files")
# reading csv files
X = pd.read_csv(DATA_PATH_stages, sep=SEPARATOR)
X.drop(["aki_stage_creat", "aki_stage_uo"], axis=1, inplace=True)
# remove totally empty rows
X = X.dropna(how="all", subset=["creat", "uo_rt_6hr", "uo_rt_12hr", "uo_rt_24hr", "aki_stage"])
print("convert charttime to timestamp")
X["charttime"] = pd.to_datetime(X["charttime"])

# merge rows if they have exact timestamp within same icustay_id AL : it substitutes missing values with zero
# X = X.groupby(['icustay_id', 'charttime']).sum().reset_index(['icustay_id', 'charttime'])

dataset_detail = pd.read_csv(DATA_PATH_detail, sep=SEPARATOR)  # age constraint
# keep "intime" to calculate Hours in Yereva

# subject_id;hadm_id;stay_id;gender;anchor_age;anchor_year;anchor_year_group;admittime;dischtime;deathtime;
# race -> ethnicity
# deathtime -> dod

dataset_detail.drop(
    [
        "dod",
        "admittime",
        "dischtime",
        "los_hospital",
        "race",
        "hospital_expire_flag",
        "hospstay_seq",
        "first_hosp_stay",
        "outtime",
        "los_icu",
        "icustay_seq",
        "first_icu_stay",
    ],
    axis=1,
    inplace=True,
    errors="ignore",
)
print("convert intime to timestamp")
dataset_detail["intime"] = pd.to_datetime(dataset_detail["icu_intime"])

INTIME = pd.DataFrame()
INTIME["stay_id"] = dataset_detail["stay_id"]
INTIME["intime"] = dataset_detail["intime"]

read csv files
convert charttime to timestamp
convert intime to timestamp


In [203]:
dataset_detail.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'gender', 'admission_age',
       'icu_intime', 'icu_outtime', 'subject_id.1', 'gender.1', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'dod.1', 'subject_id.2',
       'hadm_id.1', 'admittime.1', 'dischtime.1', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status',
       'race.1', 'edregtime', 'edouttime', 'hospital_expire_flag.1', 'intime'],
      dtype='object')

In [204]:
dataset_detail = dataset_detail.drop(["subject_id.1", "subject_id.2", "hadm_id.1", "gender.1"], axis=1)

In [205]:
dataset_detail.rename(
    columns={
        "dod.1": "dod",
        "admittime.1": "admittime",
        "dischtime.1": "dischtime",
        "race.1": "race",
        "hospital_expire_flag.1": "hospital_expire_flag",
    },
    inplace=True,
)

In [206]:
dataset_labs = pd.read_csv(DATA_PATH_labs, sep=SEPARATOR)  # 'bands lactate platelet ptt inr pt
dataset_labs.drop(
    [
        "albumin_min",
        "albumin_max",
        "bilirubin_min",
        "bilirubin_max",
        "bands_min",
        "bands_max",
        "lactate_min",
        "lactate_max",
        "platelet_min",
        "platelet_max",
        "ptt_min",
        "ptt_max",
        "inr_min",
        "inr_max",
        "pt_min",
        "pt_max",
    ],
    axis=1,
    inplace=True,
)
dataset_labs = dataset_labs.dropna(subset=["charttime"])
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how="all")
dataset_labs["charttime"] = pd.to_datetime(dataset_labs["charttime"])
dataset_labs = dataset_labs.sort_values(by=["stay_id", "charttime"])

if MAX_FEATURE_SET:
    dataset_vitals = pd.read_csv(DATA_PATH_vitals, sep=SEPARATOR)
    dataset_vents = pd.read_csv(DATA_PATH_vents, sep=SEPARATOR)
    # dataset_icd = pd.read_csv(DATA_PATH_icd, sep= SEPARATOR)
    dataset_vitals.drop(
        [
            "heartrate_min",
            "heartrate_max",
            "sysbp_min",
            "sysbp_max",
            "diasbp_min",
            "diasbp_max",
            "meanbp_min",
            "meanbp_max",
            "meanbp_mean",
            "tempc_min",
            "tempc_max",
            "tempc_mean",
            "resprate_min",
            "resprate_max",
            "spo2_min",
            "spo2_max",
            "glucose_min",
            "glucose_max",
        ],
        axis=1,
        inplace=True,
    )
    print("convert charttime to timestamp")
    dataset_vitals["charttime"] = pd.to_datetime(dataset_vitals["charttime"])
    dataset_vents["charttime"] = pd.to_datetime(dataset_vents["charttime"])
    dataset_vitals = dataset_vitals.sort_values(by=["stay_id", "charttime"])
    dataset_vents = dataset_vents.sort_values(by=["stay_id", "charttime"])
    # AL drop those where all columns are nan (empty rows)
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how="all")

convert charttime to timestamp


In [207]:
def break_up_stays_by_subject(stays, output_path, subjects=None, verbose=1):
    subjects = stays.subject_id.unique() if subjects is None else subjects
    nb_subjects = subjects.shape[0]
    for i, subject_id in enumerate(subjects):
        if verbose:
            sys.stdout.write("\rSUBJECT {0} of {1}...".format(i + 1, nb_subjects))
        dn = os.path.join(output_path, str(subject_id))
        try:
            os.makedirs(dn)
        except:
            pass

        stays.ix[stays.subject_id == subject_id].sort_values(by="intime").to_csv(
            os.path.join(dn, "stays.csv"), index=False
        )
    if verbose:
        sys.stdout.write("DONE!\n")

In [208]:
print("compute avg from min/max in labs file")
print(datetime.now())
# Labs file: instead of min and max their avg
counter = 0
col1 = 4
col2 = 5
null_l = []  # no null values in those that are different
changed = 0  # 4316 records changed to avg

while counter < 11:
    row = 0
    # find where min and max are different and save their row indices
    while row < len(dataset_labs):
        a = dataset_labs.iloc[row, col1]
        b = dataset_labs.iloc[row, col2]
        if a == b or (np.isnan(a) and np.isnan(b)):
            pass
        elif a != b:
            changed += 1
            avg = (a + b) / 2
            dataset_labs.iloc[row, col1] = avg
            if (np.isnan(a) and ~np.isnan(b)) or (np.isnan(b) and ~np.isnan(a)):
                null_l.append(row)
        else:
            print(a)
            print(b)
        row += 1
    # delete the redundant column max, update counters
    dataset_labs.drop(dataset_labs.columns[col2], axis=1, inplace=True)
    counter = counter + 1
    col1 = col1 + 1
    col2 = col2 + 1

dataset_labs.columns = [
    "subject_id",
    "hadm_id",
    "stay_id",
    "charttime",
    "aniongap_avg",
    "bicarbonate_avg",
    "creatinine_avg",
    "chloride_avg",
    "glucose_avg",
    "hematocrit_avg",
    "hemoglobin_avg",
    "potassium_avg",
    "sodium_avg",
    "bun_avg",
    "wbc_avg",
]
if len(null_l) > 0:
    print("null values encountered")
print(datetime.now())

compute avg from min/max in labs file
2024-07-11 11:00:32.767478
2024-07-11 11:18:34.943684


In [209]:
dataset_labs.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'aniongap_avg',
       'bicarbonate_avg', 'creatinine_avg', 'chloride_avg', 'glucose_avg',
       'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'sodium_avg',
       'bun_avg', 'wbc_avg'],
      dtype='object')

In [210]:
print("Merge creatinine and glucose.")
# merge creatinine from labs and set with labels
creat_l = dataset_labs[["stay_id", "charttime", "creatinine_avg"]].copy()
creat_l = creat_l.dropna(subset=["creatinine_avg"])
creat = X[["stay_id", "charttime", "creat"]].copy()
creat = creat.dropna(subset=["creat"])
creat_l = creat_l.rename(columns={"creatinine_avg": "creat"})

# creat = creat.append(creat_l, ignore_index=True)
creat = pd.concat([creat, creat_l], ignore_index=True)

creat.drop_duplicates(inplace=True)
# delete old columns
dataset_labs.drop(["creatinine_avg"], axis=1, inplace=True)
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how="all")
X.drop(["creat"], axis=1, inplace=True)
# merge new column
X = pd.merge(X, creat, on=["stay_id", "charttime"], sort=True, how="outer", copy=False)

Merge creatinine and glucose.


In [211]:
if MAX_FEATURE_SET:
    # merge glucose from vitals and labs
    glucose_v = dataset_vitals[["subject_id", "hadm_id", "stay_id", "charttime", "glucose_mean"]].copy()
    glucose_v = glucose_v.dropna(subset=["glucose_mean"])
    glucose = dataset_labs[["subject_id", "hadm_id", "stay_id", "charttime", "glucose_avg"]].copy()
    glucose = glucose.dropna(subset=["glucose_avg"])
    glucose_v = glucose_v.rename(columns={"glucose_mean": "glucose_avg"})

    # glucose = glucose.append(glucose_v, ignore_index=True)
    glucose = pd.concat([glucose, glucose_v], ignore_index=True)

    glucose.drop_duplicates(inplace=True)
    # delete old columns
    dataset_labs.drop(["glucose_avg"], axis=1, inplace=True)
    dataset_vitals.drop(["glucose_mean"], axis=1, inplace=True)
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how="all")
    # merge new column
    dataset_labs = pd.merge(
        dataset_labs,
        glucose,
        on=[
            "subject_id",
            "hadm_id",
            "stay_id",
            "charttime",
        ],
        sort=True,
        how="outer",
        copy=False,
    )

dataset_labs = dataset_labs.sort_values(by=["stay_id", "charttime"], ignore_index=True)
X = X.sort_values(by=["stay_id", "charttime"], ignore_index=True)

In [212]:
X.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'creat_low_past_7day',
       'creat_low_past_48hr', 'uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr',
       'aki_stage_crrt', 'aki_stage', 'aki_stage_smoothed', 'creat'],
      dtype='object')

In [213]:
dataset_labs.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'aniongap_avg',
       'bicarbonate_avg', 'chloride_avg', 'hematocrit_avg', 'hemoglobin_avg',
       'potassium_avg', 'sodium_avg', 'bun_avg', 'wbc_avg', 'glucose_avg'],
      dtype='object')

In [214]:
dataset_vitals.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'heartrate_mean',
       'sysbp_mean', 'diasbp_mean', 'resprate_mean', 'spo2_mean'],
      dtype='object')

In [215]:
print("Merging labs, vitals and vents files")
if MAX_FEATURE_SET:
    Y = pd.merge(X, dataset_labs, on=["stay_id", "charttime"], how="outer", copy=False)
    Y = pd.merge(X, dataset_vitals, on=["stay_id", "charttime", "subject_id", "hadm_id"], how="outer", copy=False)
    Y = pd.merge(X, dataset_vents, on=["stay_id", "charttime"], how="outer", copy=False)
    # X.drop(["subject_id"], axis = 1, inplace = True)

Merging labs, vitals and vents files


In [216]:
# print("Merging labs, vitals and vents files")
# if MAX_FEATURE_SET:
#     X = pd.merge(X, dataset_labs, on = ["stay_id", "charttime"], how= "outer", copy = False)
#     X = pd.merge(X, dataset_vitals, on = ["stay_id", "charttime","subject_id", "hadm_id"], how= "outer", copy = False)
#     X = pd.merge(X, dataset_vents, on = ["stay_id", "charttime"], how= "outer", copy = False)
#     #X.drop(["subject_id"], axis = 1, inplace = True)

In [217]:
dataset_detail.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'gender', 'admission_age',
       'icu_intime', 'icu_outtime', 'anchor_age', 'anchor_year',
       'anchor_year_group', 'dod', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag', 'intime'],
      dtype='object')

In [218]:
dataset_detail_copy = dataset_detail

In [219]:
Y_copy = Y

In [220]:
print("start preprocessing time dependent data")
print("Removing patients under the min age")
dataset_detail = dataset_detail.loc[dataset_detail["admission_age"] >= ADULTS_MIN_AGE]
adults_icustay_id_list = dataset_detail["stay_id"].unique()
Y = Y[Y.stay_id.isin(adults_icustay_id_list)].sort_values(by=["stay_id"], ignore_index=True)
Y = Y.sort_values(by=["stay_id", "charttime"], ignore_index=True)
adults_icustay_id_list = np.sort(adults_icustay_id_list)

start preprocessing time dependent data
Removing patients under the min age


In [221]:
X_original = X

In [222]:
print("drop stay_id with time span less than 48hrs")


def more_than_HOURS_ahead(adults_icustay_id_list, X):
    drop_list = []
    los_list = []  # calculating LOS ICU based on charttime
    long_stays_id = []  # LOS longer than MAX DAYS days
    last_charttime_list = []
    seq_length = X.groupby(
        ["stay_id"], as_index=False
    ).size()  # Sian modified to above code, AL: seq_length = X.groupby(['stay_id'],as_index=False).size().to_frame('size')
    id_count = 0
    first_row_index = 0

    while id_count < len(adults_icustay_id_list):
        stay_id = adults_icustay_id_list[id_count]
        last_row_index = (
            first_row_index + seq_length.iloc[id_count, 1] - 1
        )  # Sian modified, AL: seq_length.iloc[id_count,0]-1
        first_time = X.iat[first_row_index, X.columns.get_loc("charttime")]
        last_time = X.iat[last_row_index, X.columns.get_loc("charttime")]
        los = round(float((last_time - first_time).total_seconds() / 60 / 60 / 24), 4)  # in days
        if los < 48 / 24:
            drop_list.append(stay_id)
        else:
            los_list.append(los)
            if los > 35:
                long_stays_id.append(stay_id)
                last_charttime_list.append(last_time)
        # udpate for the next stay_id
        first_row_index = last_row_index + 1
        id_count += 1
    if len(long_stays_id) != len(last_charttime_list):
        print("ERROR")
    print("%d long stays" % len(long_stays_id))
    # drop all the rows with the saved stay_id
    print("there are %d id-s shorter than 48 hours" % len(drop_list))
    X = X[~X.stay_id.isin(drop_list)]
    id_list = X["stay_id"].unique()
    X = X.sort_values(by=["stay_id", "charttime"], ignore_index=True)

    return id_list, X, long_stays_id, last_charttime_list


id_list, Y, long_stays_id, last_charttime_list = more_than_HOURS_ahead(adults_icustay_id_list, Y)

long = pd.DataFrame()
long["stay_id"] = long_stays_id
long["last_time"] = last_charttime_list

drop stay_id with time span less than 48hrs
3168 long stays
there are 8082 id-s shorter than 48 hours


# extract Label 

In [223]:
print("binarise labels")
if ALL_STAGES:
    pass
elif CLASS1:
    Y.loc[Y["aki_stage"] > 1, "aki_stage"] = 1
elif CLASS2:
    Y.loc[Y["aki_stage"] < 2, "aki_stage"] = 0
    Y.loc[Y["aki_stage"] > 1, "aki_stage"] = 1
elif CLASS3:
    Y.loc[Y["aki_stage"] < 3, "aki_stage"] = 0
    Y.loc[Y["aki_stage"] > 2, "aki_stage"] = 1

binarise labels


In [224]:
print("choose one label for each stay_id (whenever it turn pos in the whole staying)")


def one_label_per_icustay(id_list, X):
    dataset = X
    temp_icustay_df = pd.DataFrame()
    target_list = []

    for icustay in id_list:
        temp_icustay_df = dataset.loc[dataset["stay_id"] == icustay].sort_values(by=["charttime"])
        if any(temp_icustay_df.aki_stage == 1):
            target_list.append(1)
        else:
            target_list.append(0)

    return target_list


target_list = one_label_per_icustay(id_list, Y)

target = pd.DataFrame()
target["stay_id"] = id_list
target["y_true"] = target_list

choose one label for each stay_id (whenever it turn pos in the whole staying)


In [225]:
target

Unnamed: 0,stay_id,y_true
0,30000153,1
1,30000213,1
2,30000484,1
3,30000646,0
4,30001148,1
...,...,...
65005,39999286,1
65006,39999384,0
65007,39999552,0
65008,39999562,0


In [226]:
print("number of neg and pos label in target(whole stay)")
target["y_true"].value_counts()

number of neg and pos label in target(whole stay)


y_true
1    44466
0    20544
Name: count, dtype: int64

In [227]:
hour = 48  # set time span
print("calculate how many pos label within the first 48hrs, could be different time span")


def count_pos_label(X, INTIME, hour):
    dataset = X
    dataset = pd.merge(dataset, INTIME, on=["stay_id"], how="left", copy=False)
    dataset["HOURS"] = (dataset.charttime - dataset.intime).apply(lambda s: s / np.timedelta64(1, "s")) / 60.0 / 60
    dataset = dataset[dataset["HOURS"] >= 0]
    dataset = dataset[dataset["HOURS"] <= hour]
    dataset = dataset.reset_index(drop=True)

    temp_icustay_df = pd.DataFrame()
    target_list = []

    for icustay in id_list:
        temp_icustay_df = dataset.loc[dataset["stay_id"] == icustay].sort_values(by=["charttime"])
        if any(temp_icustay_df.aki_stage == 1):
            target_list.append(1)
        else:
            target_list.append(0)
    print("number of neg and pos label within the first " + str(hour) + "hr")
    print(Counter(target_list))


count_pos_label(Y, INTIME, hour)

# TODO 3/24: also compute within 24 hours
hour = 24
count_pos_label(Y, INTIME, hour)

calculate how many pos label within the first 48hrs, could be different time span
number of neg and pos label within the first 48hr
Counter({1: 39691, 0: 25319})
number of neg and pos label within the first 24hr
Counter({0: 32771, 1: 32239})


In [228]:
Y_copy_2 = Y

In [229]:
print("dataset drop AKI stages column")
Y = Y.drop(["aki_stage"], axis=1)

dataset drop AKI stages column


In [230]:
Y = Y.drop(["subject_id", "hadm_id"], axis=1)

In [231]:
Y.columns

Index(['stay_id', 'charttime', 'creat_low_past_7day', 'creat_low_past_48hr',
       'uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr', 'aki_stage_crrt',
       'aki_stage_smoothed', 'creat', 'vent', 'vasopressor', 'sedative'],
      dtype='object')

# Resampling

In [232]:
# label = ['aki_stage']
skip = ["stay_id", "charttime"]
if MAX_FEATURE_SET:
    discrete_feat = ["sedative", "vasopressor", "vent"]
    skip.extend(discrete_feat)
    # all features that are not in skip are numeric
numeric_feat = list(Y.columns.difference(skip))


if TIME_SAMPLING and MOST_COMMON:
    print("resampling: MOST_COMMON with interval of " + str(SAMPLING_INTERVAL))
    # Resample the data using assigned interval,mode() for most common
    Y = Y.set_index("charttime").groupby("stay_id").resample(SAMPLING_INTERVAL).mode().reset_index()
elif TIME_SAMPLING:
    print("resampling: MEAN & ZERO with interval of " + str(SAMPLING_INTERVAL))
    # Sampling with different strategies per kind of variable
    # label = ['aki_stage']
    skip = ["stay_id", "charttime"]
    if MAX_FEATURE_SET:
        discrete_feat = ["sedative", "vasopressor", "vent"]
        skip.extend(discrete_feat)
    # all features that are not in skip are numeric
    numeric_feat = list(Y.columns.difference(skip))

    # Applying aggregation to features depending on their type
    Y = Y.set_index("charttime").groupby("stay_id").resample(SAMPLING_INTERVAL)
    if MAX_FEATURE_SET:
        X_discrete = Y[discrete_feat].max().fillna(FILL_VALUE).astype(np.int64)
    X_numeric = Y[numeric_feat].mean()
    # X_label = X['aki_stage'].max()
    print("Merging sampled features")
    try:
        Y = pd.concat([X_numeric, X_discrete], axis=1).reset_index()
    except:
        Y = X_numeric.reset_index()
print(Y.shape)

# Label forward fill
# X['aki_stage'] = X['aki_stage'].ffill(limit=RESAMPLE_LIMIT)

resampling: MEAN & ZERO with interval of 1H
Merging sampled features
(18744760, 13)


In [233]:
X = Y

In [234]:
X

Unnamed: 0,stay_id,charttime,aki_stage_crrt,aki_stage_smoothed,creat,creat_low_past_48hr,creat_low_past_7day,uo_rt_12hr,uo_rt_24hr,uo_rt_6hr,sedative,vasopressor,vent
0,30000153,2174-09-29 10:00:00,,0.0,1.2,,,,,,0,0,0
1,30000153,2174-09-29 11:00:00,,,,,,,,,0,0,0
2,30000153,2174-09-29 12:00:00,,0.0,,,,,,,1,0,1
3,30000153,2174-09-29 13:00:00,,,,,,,,,1,0,1
4,30000153,2174-09-29 14:00:00,,0.0,,,,,,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18744755,39999810,2115-12-04 18:00:00,,,,,,,,,0,0,0
18744756,39999810,2115-12-04 19:00:00,,,,,,,,,0,0,0
18744757,39999810,2115-12-04 20:00:00,,,,,,,,,0,0,0
18744758,39999810,2115-12-04 21:00:00,,,,,,,,,0,0,0


# fit time span with Yereva

In [235]:
print("Merging intime column to X")
print("Drop rows that has HOURS > 48h, could be other time span. And drop rows that has HOURS < 0")
X = pd.merge(X, INTIME, on=["stay_id"], how="left", copy=False)
X["HOURS"] = (X.charttime - X.intime).apply(lambda s: s / np.timedelta64(1, "s")) / 60.0 / 60
X = X[X["HOURS"] >= 0]
X = X[X["HOURS"] <= MAX_HOUR]
X = X.reset_index(drop=True)

Merging intime column to X
Drop rows that has HOURS > 48h, could be other time span. And drop rows that has HOURS < 0


In [236]:
# check if follows one hour interval
X.loc[X["stay_id"] == 272725].sort_values(by=["HOURS"])["HOURS"]
# X.loc[X["icustay_id"]== 244882].sort_values(by=['HOURS'])["HOURS"]
# X.loc[X["icustay_id"]== 217128].sort_values(by=['HOURS'])["HOURS"]

Series([], Name: HOURS, dtype: float64)

# Imputing 

In [237]:
print("Imputation.")
remove_list = ["stay_id", "charttime", "intime", "HOURS"]

# using most common within each stay_id
if IMPUTE_EACH_ID:
    column_name = list(X.columns)
    for item in remove_list:
        column_name.remove(item)
    for feature in column_name:
        X.loc[X[feature].isnull(), feature] = X.stay_id.map(
            fast_mode(X, ["stay_id"], feature).set_index("stay_id")[feature]
        )

# imputation based on whole column
if IMPUTE_COLUMN:
    imp = SimpleImputer(missing_values=np.nan, strategy=IMPUTE_METHOD)
    cols = list(X.columns)
    for item in remove_list:
        cols.remove(item)
    X[cols] = imp.fit_transform(X[cols])

# If no imputation method selected or only impute each id, for the remaining nan impute direclty with FILL_VALUE
X = X.fillna(FILL_VALUE)

Imputation.


In [238]:
X.loc[X["stay_id"] == 272725].sort_values(by=["HOURS"])["HOURS"]

Series([], Name: HOURS, dtype: float64)

In [239]:
X

Unnamed: 0,stay_id,charttime,aki_stage_crrt,aki_stage_smoothed,creat,creat_low_past_48hr,creat_low_past_7day,uo_rt_12hr,uo_rt_24hr,uo_rt_6hr,sedative,vasopressor,vent,intime,HOURS
0,30000153,2174-09-29 13:00:00,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,1,0,1,2174-09-29 12:09:00,0.850000
1,30000153,2174-09-29 14:00:00,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,0,0,1,2174-09-29 12:09:00,1.850000
2,30000153,2174-09-29 15:00:00,0.0,0.0,0.9,1.2,1.2,0.5321,0.5827,0.7045,1,0,1,2174-09-29 12:09:00,2.850000
3,30000153,2174-09-29 16:00:00,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,1,0,1,2174-09-29 12:09:00,3.850000
4,30000153,2174-09-29 17:00:00,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,0,0,1,2174-09-29 12:09:00,4.850000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053127,39999810,2115-12-02 20:00:00,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,0,0,0,2115-12-01 00:37:00,43.383333
3053128,39999810,2115-12-02 21:00:00,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,0,0,0,2115-12-01 00:37:00,44.383333
3053129,39999810,2115-12-02 22:00:00,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,0,0,0,2115-12-01 00:37:00,45.383333
3053130,39999810,2115-12-02 23:00:00,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,0,0,0,2115-12-01 00:37:00,46.383333


In [240]:
# more comfortable to review in this order
print("check variables")
try:
    cols = [
        "stay_id",
        "charttime",
        "aniongap_avg",
        "bicarbonate_avg",
        "bun_avg",
        "chloride_avg",
        "creat",
        "diasbp_mean",
        "glucose_avg",
        "heartrate_mean",
        "hematocrit_avg",
        "hemoglobin_avg",
        "potassium_avg",
        "resprate_mean",
        "sodium_avg",
        "spo2_mean",
        "sysbp_mean",
        "uo_rt_12hr",
        "uo_rt_24hr",
        "uo_rt_6hr",
        "wbc_avg",
        "sedative",
        "vasopressor",
        "vent",
        "HOURS",
        "intime",
    ]
    X = X[cols]
    print("success")
except:
    try:
        cols = [
            "stay_id",
            "charttime",
            "aki_stage_crrt",
            "aki_stage_smoothed",
            "creat",
            "creat_low_past_48hr",
            "creat_low_past_7day",
            "uo_rt_12hr",
            "uo_rt_24hr",
            "uo_rt_6hr",
        ]
        X = X[cols]
        print("try 2")
    except:
        print("error")

check variables
try 2


In [241]:
# 'stay_id', 'charttime', 'aki_stage_crrt', 'aki_stage_smoothed', 'creat',
#        'creat_low_past_48hr', 'creat_low_past_7day', 'uo_rt_12hr',
#        'uo_rt_24hr', 'uo_rt_6hr', 'sedative', 'vasopressor', 'vent'

# Add categorical features (details)

In [242]:
copy_X_2 = X

In [243]:
# X = copy_X

In [244]:
dataset_detail.columns

Index(['subject_id', 'hadm_id', 'stay_id', 'gender', 'admission_age',
       'icu_intime', 'icu_outtime', 'anchor_age', 'anchor_year',
       'anchor_year_group', 'dod', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag', 'intime'],
      dtype='object')

In [245]:
if MAX_FEATURE_SET:
    # extract datasets based on id_list
    dataset_detail = dataset_detail.loc[dataset_detail["stay_id"].isin(id_list)]
    # sort by ascending order
    dataset_detail = dataset_detail.sort_values(by=["stay_id"])
    # print(dataset_detail)

    # transfrom categorical data to binary form
    dataset_detail = dataset_detail.drop(["icu_intime"], axis=1)
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop("gender")))
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop("race")))
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop("admission_type")))
    # X = X.drop(['subject_id', 'hadm_id'], axis=1)
    # dataset_detail = dataset_detail.drop(['subject_id', 'hadm_id'], axis=1)
    X = pd.merge(X, dataset_detail, on=["stay_id"], how="left", copy=False)

    numeric_feat.append("admission_age")

In [247]:
X = X.drop(["charttime", "icu_intime"], axis=1)

In [248]:
X

Unnamed: 0,stay_id,aki_stage_crrt,aki_stage_smoothed,creat,creat_low_past_48hr,creat_low_past_7day,uo_rt_12hr,uo_rt_24hr,uo_rt_6hr,subject_id,...,WHITE - RUSSIAN,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT
0,30000153,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,12466550,...,False,False,False,False,False,False,True,False,False,False
1,30000153,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,12466550,...,False,False,False,False,False,False,True,False,False,False
2,30000153,0.0,0.0,0.9,1.2,1.2,0.5321,0.5827,0.7045,12466550,...,False,False,False,False,False,False,True,False,False,False
3,30000153,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,12466550,...,False,False,False,False,False,False,True,False,False,False
4,30000153,0.0,0.0,0.8,0.9,0.9,0.5321,0.5827,0.7045,12466550,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053127,39999810,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,17840864,...,False,False,False,False,False,False,True,False,False,False
3053128,39999810,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,17840864,...,False,False,False,False,False,False,True,False,False,False
3053129,39999810,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,17840864,...,False,False,False,False,False,False,True,False,False,False
3053130,39999810,0.0,0.0,0.9,0.9,0.9,0.2596,0.2596,0.6410,17840864,...,False,False,False,False,False,False,True,False,False,False


In [249]:
X.loc[X["stay_id"] == 272725].sort_values(by=["HOURS"])["HOURS"]

KeyError: 'HOURS'

In [250]:
feature_names = [
    "Anion gap",
    "Bicarbonate",
    "Blood Urea Nitrogen",
    "Chloride",
    "Creatinine",
    "Diastolic BP",
    "Glucose",
    "Heart rate",
    "Hematocrit",
    "Hemoglobin",
    "Potassium",
    "Respiratory rate",
    "Sodium",
    "Oxygen saturation",
    "Systolic BP",
    "Urine output 12h",
    "Urine output 24h",
    "Urine output 6h",
    "White cell count",
    "Sedative",
    "Vasopressor",
    "Ventilation",
    "Age",
    "Female gender",
    "Male gender",
    "Asian ethnicity",
    "Black ethnicity",
    "Hispanic ethnicity",
    "Native american",
    "Other ethnicity",
    "Ethnicity unknown",
    "White ethnicity",
    "Elective admission",
    "Emergency admission",
    "Urgent admission",
]

# Cap features between 0.01 / 0.99 quantile and normalisation

In [251]:
numeric_feat

['aki_stage_crrt',
 'aki_stage_smoothed',
 'creat',
 'creat_low_past_48hr',
 'creat_low_past_7day',
 'uo_rt_12hr',
 'uo_rt_24hr',
 'uo_rt_6hr',
 'admission_age']

In [252]:
X.columns

Index(['stay_id', 'aki_stage_crrt', 'aki_stage_smoothed', 'creat',
       'creat_low_past_48hr', 'creat_low_past_7day', 'uo_rt_12hr',
       'uo_rt_24hr', 'uo_rt_6hr', 'subject_id', 'hadm_id', 'admission_age',
       'icu_outtime', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod',
       'admittime', 'dischtime', 'deathtime', 'admit_provider_id',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'marital_status', 'edregtime', 'edouttime', 'hospital_expire_flag',
       'intime', 'F', 'M', 'AMERICAN INDIAN/ALASKA NATIVE', 'ASIAN',
       'ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE', 'ASIAN - KOREAN',
       'ASIAN - SOUTH EAST ASIAN', 'BLACK/AFRICAN', 'BLACK/AFRICAN AMERICAN',
       'BLACK/CAPE VERDEAN', 'BLACK/CARIBBEAN ISLAND', 'HISPANIC OR LATINO',
       'HISPANIC/LATINO - CENTRAL AMERICAN', 'HISPANIC/LATINO - COLUMBIAN',
       'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - DOMINICAN',
       'HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - HO

In [253]:
X = cap_data(X)

Capping between the 0.01 and 0.99 quantile
cap_maks Index(['AMBULATORY OBSERVATION', 'AMERICAN INDIAN/ALASKA NATIVE', 'ASIAN',
       'ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE', 'ASIAN - KOREAN',
       'ASIAN - SOUTH EAST ASIAN', 'BLACK/AFRICAN', 'BLACK/AFRICAN AMERICAN',
       'BLACK/CAPE VERDEAN', 'BLACK/CARIBBEAN ISLAND', 'DIRECT EMER.',
       'DIRECT OBSERVATION', 'ELECTIVE', 'EU OBSERVATION', 'EW EMER.', 'F',
       'HISPANIC OR LATINO', 'HISPANIC/LATINO - CENTRAL AMERICAN',
       'HISPANIC/LATINO - COLUMBIAN', 'HISPANIC/LATINO - CUBAN',
       'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN',
       'HISPANIC/LATINO - HONDURAN', 'HISPANIC/LATINO - MEXICAN',
       'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - SALVADORAN', 'M',
       'MULTIPLE RACE/ETHNICITY', 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
       'OBSERVATION ADMIT', 'OTHER', 'PATIENT DECLINED TO ANSWER',
       'PORTUGUESE', 'SOUTH AMERICAN', 'SURGICAL SAME DAY ADMISSION',
       'UNABLE TO 

In [254]:
X = normalise_data(X, numeric_feat)

Normalizing in [0,1] with min-max normalization


In [255]:
X.loc[X["stay_id"] == 272725].sort_values(by=["HOURS"])["HOURS"]

KeyError: 'HOURS'

In [256]:
# X = X.sort_values(by=['icustay_id', 'HOURS'])
seq_lengths = X.groupby(["stay_id"], as_index=False).size().sort_values(by=["size"], ascending=False)
sequence_length = seq_lengths.max()  # the longest sequence per icustay-id
print(sequence_length)

stay_id    39900002
size          30540
dtype: int64


In [257]:
X.loc[X["stay_id"] == 272725].sort_values(by=["HOURS"])["HOURS"]

KeyError: 'HOURS'

In [258]:
# AL re-write as try except to make it work as hadm_id is not used if only one csv file is used and none are merged
try:
    X.drop(["hadm_id"], axis=1, inplace=True)
except:
    pass

In [260]:
X = X.sort_values(by=["subject_id", "HOURS"])

# Count number of variables for final dataset

In [261]:
features_list = list(X.columns)
print(features_list)
# list of variables to be removed at the end
remove_list_final = ["stay_id", "subject_id", "F"]
for item in remove_list_final:
    features_list.remove(item)

features = len(features_list)
print("number of features: " + str(features))

['stay_id', 'aki_stage_crrt', 'aki_stage_smoothed', 'creat', 'creat_low_past_48hr', 'creat_low_past_7day', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr', 'subject_id', 'admission_age', 'icu_outtime', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod', 'admittime', 'dischtime', 'deathtime', 'admit_provider_id', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'edregtime', 'edouttime', 'hospital_expire_flag', 'intime', 'F', 'M', 'AMERICAN INDIAN/ALASKA NATIVE', 'ASIAN', 'ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE', 'ASIAN - KOREAN', 'ASIAN - SOUTH EAST ASIAN', 'BLACK/AFRICAN', 'BLACK/AFRICAN AMERICAN', 'BLACK/CAPE VERDEAN', 'BLACK/CARIBBEAN ISLAND', 'HISPANIC OR LATINO', 'HISPANIC/LATINO - CENTRAL AMERICAN', 'HISPANIC/LATINO - COLUMBIAN', 'HISPANIC/LATINO - CUBAN', 'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN', 'HISPANIC/LATINO - HONDURAN', 'HISPANIC/LATINO - MEXICAN', 'HISPANIC/LATINO - PUERTO RICAN', 'HISPANIC/LATINO - SALVADORAN', 'MUL

# Random split subject ID into train(val), and test 

In [262]:
if RANDOM_SPLIT:
    print("extract subject_id list")
    subject_id = X["subject_id"].unique()
    subject_id = np.sort(subject_id)

extract subject_id list


In [263]:
if RANDOM_SPLIT:
    print("number of unique subject id: " + str(len(subject_id)))

number of unique subject id: 44969


In [264]:
if RANDOM_SPLIT:
    print("RANDOM SPLIT")
    print("divide dataset into train, test and validation sets")
    id_train_val, id_test = train_test_split(subject_id, test_size=0.1, random_state=RANDOM_SEED)  # train set is 80%)
    print("test is %d" % len(id_test))
    # remaining 20% split in halves as test and validation 10% and 10%
    id_train, id_val = train_test_split(id_train_val, test_size=0.111, random_state=RANDOM_SEED)  # test 10% valid 10%
    print("train is %d" % len(id_train))
    print("val is %d" % len(id_val))

    # sort list
    id_test.sort()
    id_train.sort()
    id_val.sort()

RANDOM SPLIT
divide dataset into train, test and validation sets
test is 4497
train is 35979
val is 4493


# Use fixed id_list from Yereva

In [265]:
if FIXED:
    # the Yereva id files pathes are here
    DATA_PATH_yereva_test = "data/id_list_yereva/test_listfile.csv"
    DATA_PATH_yereva_train = "data/id_list_yereva/train_listfile.csv"
    DATA_PATH_yereva_val = "data/id_list_yereva/val_listfile.csv"

    print("read csv files")
    # reading csv files
    yereva_test = pd.read_csv(DATA_PATH_yereva_test, sep=",")
    yereva_train = pd.read_csv(DATA_PATH_yereva_train, sep=",")
    yereva_val = pd.read_csv(DATA_PATH_yereva_val, sep=",")

    # convert to list
    yereva_test = yereva_test["notes"].tolist()
    yereva_train = yereva_train["notes"].tolist()
    yereva_val = yereva_val["notes"].tolist()

    yereva_test_subject = []
    yereva_train_subject = []
    yereva_val_subject = []

    for subject in yereva_test:
        yereva_test_subject.append(int(subject.split("_")[0]))
    for subject in yereva_train:
        yereva_train_subject.append(int(subject.split("_")[0]))
    for subject in yereva_val:
        yereva_val_subject.append(int(subject.split("_")[0]))

In [266]:
if FIXED:
    id_test = []
    id_train = []
    id_val = []

    n = 0

    while n < len(subject_id):
        if subject_id[n] in yereva_test_subject:
            id_test.append(subject_id[n])
            n = n + 1
        elif subject_id[n] in yereva_train_subject:
            id_train.append(subject_id[n])
            n = n + 1
        elif subject_id[n] in yereva_val_subject:
            id_val.append(subject_id[n])
            n = n + 1
        else:
            n = n + 1

    id_test.sort()
    id_train.sort()
    id_val.sort()

    print("Fixed list from Yereva")
    print("test is %d" % len(id_test))
    print("train is %d" % len(id_train))
    print("val is %d" % len(id_val))

In [267]:
if FIXED:
    print("combine subject_id list")
    subject_id = id_test + id_train
    subject_id = subject_id + id_val
    subject_id.sort()

    print("number of unique subject id: " + str(len(subject_id)))

# Convert icustay data into individual timeseries csv

In [268]:
# str(list(target.loc[target['icustay_id'] == 237693]['y_true'])[0])

In [269]:
def convert_icustay_to_AKIfolder(dataset, subject_id, output_path, id_train, id_test, id_val, target):

    temp_icustay_list = []  # to store the stay_id under same subject_id
    n = 0  # index to loop through temp_icustay_list
    num_stay = 0
    dataset = X
    temp_dataset = pd.DataFrame()
    sub_temp_dataset = pd.DataFrame()

    train_pairs = []
    test_pairs = []
    val_pairs = []

    for subject in subject_id:
        # make path for subject folder
        dn = os.path.join(OUTPUT_PATH, str(subject))
        try:
            os.makedirs(dn)
        except:
            pass

        temp_dataset = dataset.loc[dataset["subject_id"] == subject].sort_values(by=["stay_id"])
        temp_icustay_list = temp_dataset["stay_id"].unique()
        num_stay = len(temp_icustay_list)
        print(num_stay)
        n = 0

        while n < num_stay:
            sys.stdout.write(
                "\rSUBJECT_ID: {0} STAY_ID: {1} Episode {2}...".format(subject, temp_icustay_list[n], n + 1)
            )
            sub_temp_dataset = temp_dataset.loc[temp_dataset["stay_id"] == temp_icustay_list[n]]
            sub_temp_dataset = sub_temp_dataset.drop(remove_list_final, axis=1)
            # sub_temp_dataset = sub_temp_dataset.set_index('HOURS').sort_index(axis=0)

            sub_temp_dataset.to_csv(
                os.path.join(
                    OUTPUT_PATH,
                    str(subject),
                    "{}_episode{}_timeseries_{}.csv".format(subject, n + 1, temp_icustay_list[n]),
                ),
                index_label="Hours",
            )

            # create list for id list for train/test/val
            if subject in id_train:
                train_pairs.append(
                    (
                        str(subject) + "_note.txt",
                        str(subject) + "_episode" + str(n + 1) + "_timeseries_" + str(temp_icustay_list[n]) + ".csv",
                        str(list(target.loc[target["stay_id"] == temp_icustay_list[n]]["y_true"])[0]),
                    )
                )
            elif subject in id_test:
                test_pairs.append(
                    (
                        str(subject) + "_note.txt",
                        str(subject) + "_episode" + str(n + 1) + "_timeseries_" + str(temp_icustay_list[n]) + ".csv",
                        str(list(target.loc[target["stay_id"] == temp_icustay_list[n]]["y_true"])[0]),
                    )
                )
            elif subject in id_val:
                val_pairs.append(
                    (
                        str(subject) + "_note.txt",
                        str(subject) + "_episode" + str(n + 1) + "_timeseries_" + str(temp_icustay_list[n]) + ".csv",
                        str(list(target.loc[target["stay_id"] == temp_icustay_list[n]]["y_true"])[0]),
                    )
                )

            n = n + 1

    sys.stdout.write("DONE!\n")

    return train_pairs, test_pairs, val_pairs


train_pairs, test_pairs, val_pairs = convert_icustay_to_AKIfolder(
    X, subject_id, OUTPUT_PATH, id_train, id_test, id_val, target
)

1
SUBJECT_ID: 10000980 STAY_ID: 39765666 Episode 1...2
SUBJECT_ID: 10001217 STAY_ID: 37067082 Episode 2...1
SUBJECT_ID: 10001725 STAY_ID: 31205490 Episode 1...1
SUBJECT_ID: 10001884 STAY_ID: 37510196 Episode 1...1
SUBJECT_ID: 10002013 STAY_ID: 39060235 Episode 1...2
SUBJECT_ID: 10002155 STAY_ID: 33685454 Episode 2...1
SUBJECT_ID: 10002348 STAY_ID: 32610785 Episode 1...4
SUBJECT_ID: 10002428 STAY_ID: 38875437 Episode 4...1
SUBJECT_ID: 10002430 STAY_ID: 38392119 Episode 1...1
SUBJECT_ID: 10002443 STAY_ID: 35044219 Episode 1...1
SUBJECT_ID: 10002495 STAY_ID: 36753294 Episode 1...1
SUBJECT_ID: 10002760 STAY_ID: 31831386 Episode 1...1
SUBJECT_ID: 10002930 STAY_ID: 35629889 Episode 1...1
SUBJECT_ID: 10003019 STAY_ID: 30676350 Episode 1...1
SUBJECT_ID: 10003046 STAY_ID: 35514836 Episode 1...3
SUBJECT_ID: 10003400 STAY_ID: 38383343 Episode 3...1
SUBJECT_ID: 10004235 STAY_ID: 34100191 Episode 1...7
SUBJECT_ID: 10004401 STAY_ID: 39801884 Episode 7...1
SUBJECT_ID: 10004422 STAY_ID: 32155744 Episo

In [270]:
def convert_icustay_to_AKIfolder(dataset, subject_id, output_path, id_train, id_test, id_val, target):

    temp_icustay_list = []  # to store the stay_id under same subject_id
    n = 0  # index to loop through temp_icustay_list
    num_stay = 0
    dataset = X
    temp_dataset = pd.DataFrame()
    sub_temp_dataset = pd.DataFrame()

    train_pairs = []
    test_pairs = []
    val_pairs = []

    for subject in subject_id:
        # make path for subject folder
        dn = os.path.join(OUTPUT_PATH, str(subject))
        try:
            os.makedirs(dn)
        except:
            pass

        temp_dataset = dataset.loc[dataset["subject_id"] == subject].sort_values(by=["stay_id"])
        temp_icustay_list = temp_dataset["stay_id"].unique()
        num_stay = len(temp_icustay_list)
        print(num_stay)
        n = 0

        while n < num_stay:
            sys.stdout.write(
                "\rSUBJECT_ID: {0} STAY_ID: {1} Episode {2}...".format(subject, temp_icustay_list[n], n + 1)
            )
            sub_temp_dataset = temp_dataset.loc[temp_dataset["stay_id"] == temp_icustay_list[n]]
            sub_temp_dataset = sub_temp_dataset.drop(remove_list_final, axis=1)
            # sub_temp_dataset = sub_temp_dataset.set_index('HOURS').sort_index(axis=0)

            sub_temp_dataset.to_csv(
                os.path.join(
                    OUTPUT_PATH,
                    str(subject),
                    "{}_episode{}_timeseries_{}.csv".format(subject, n + 1, temp_icustay_list[n]),
                ),
                index_label="Hours",
            )

            # create list for id list for train/test/val
            if subject in id_train:
                train_pairs.append(
                    (
                        str(subject) + "_note.txt",
                        str(subject) + "_episode" + str(n + 1) + "_timeseries_" + str(temp_icustay_list[n]) + ".csv",
                        str(list(target.loc[target["stay_id"] == temp_icustay_list[n]]["y_true"])[0]),
                    )
                )
            elif subject in id_test:
                test_pairs.append(
                    (
                        str(subject) + "_note.txt",
                        str(subject) + "_episode" + str(n + 1) + "_timeseries_" + str(temp_icustay_list[n]) + ".csv",
                        str(list(target.loc[target["stay_id"] == temp_icustay_list[n]]["y_true"])[0]),
                    )
                )
            elif subject in id_val:
                val_pairs.append(
                    (
                        str(subject) + "_note.txt",
                        str(subject) + "_episode" + str(n + 1) + "_timeseries_" + str(temp_icustay_list[n]) + ".csv",
                        str(list(target.loc[target["stay_id"] == temp_icustay_list[n]]["y_true"])[0]),
                    )
                )

            n = n + 1

    sys.stdout.write("DONE!\n")

    return train_pairs, test_pairs, val_pairs


train_pairs, test_pairs, val_pairs = convert_icustay_to_AKIfolder(
    X, subject_id, OUTPUT_PATH, id_train, id_test, id_val, target
)

1
SUBJECT_ID: 10000980 STAY_ID: 39765666 Episode 1...2
SUBJECT_ID: 10001217 STAY_ID: 37067082 Episode 2...1
SUBJECT_ID: 10001725 STAY_ID: 31205490 Episode 1...1
SUBJECT_ID: 10001884 STAY_ID: 37510196 Episode 1...1
SUBJECT_ID: 10002013 STAY_ID: 39060235 Episode 1...2
SUBJECT_ID: 10002155 STAY_ID: 33685454 Episode 2...1
SUBJECT_ID: 10002348 STAY_ID: 32610785 Episode 1...4
SUBJECT_ID: 10002428 STAY_ID: 38875437 Episode 4...1
SUBJECT_ID: 10002430 STAY_ID: 38392119 Episode 1...1
SUBJECT_ID: 10002443 STAY_ID: 35044219 Episode 1...1
SUBJECT_ID: 10002495 STAY_ID: 36753294 Episode 1...1
SUBJECT_ID: 10002760 STAY_ID: 31831386 Episode 1...1
SUBJECT_ID: 10002930 STAY_ID: 35629889 Episode 1...1
SUBJECT_ID: 10003019 STAY_ID: 30676350 Episode 1...1
SUBJECT_ID: 10003046 STAY_ID: 35514836 Episode 1...3
SUBJECT_ID: 10003400 STAY_ID: 38383343 Episode 3...1
SUBJECT_ID: 10004235 STAY_ID: 34100191 Episode 1...7
SUBJECT_ID: 10004401 STAY_ID: 39801884 Episode 7...1
SUBJECT_ID: 10004422 STAY_ID: 32155744 Episo

# Move subject_timeseries.csv file to train/test folder 

In [271]:
def move_to_partition(subjects_root_path, patients, partition):
    if not os.path.exists(os.path.join(subjects_root_path, partition)):
        os.mkdir(os.path.join(subjects_root_path, partition))
    for patient in patients:
        src = os.path.join(subjects_root_path, str(patient))
        dest = os.path.join(subjects_root_path, partition)
        for filename in os.listdir(src):
            shutil.move(os.path.join(src, str(filename)), dest)
        os.rmdir(src)

In [272]:
move_to_partition(OUTPUT_PATH, id_train, "train")
move_to_partition(OUTPUT_PATH, id_val, "train")
move_to_partition(OUTPUT_PATH, id_test, "test")

Error: Destination path 'data/AKI\train\10001217_episode1_timeseries_34592300.csv' already exists

# Create test_listfile.csv, train_listfile.csv, val_listfile.csv, and move to AKI folder

In [273]:
with open(os.path.join(OUTPUT_PATH, "train_listfile.csv"), "w") as listfile:
    listfile.write("notes,stay,y_true\n")
    for (n, x, y) in train_pairs:
        listfile.write("{},{},{}\n".format(n, x, str(y)))
with open(os.path.join(OUTPUT_PATH, "val_listfile.csv"), "w") as listfile:
    listfile.write("notes,stay,y_true\n")
    for (n, x, y) in val_pairs:
        listfile.write("{},{},{}\n".format(n, x, str(y)))

with open(os.path.join(OUTPUT_PATH, "test_listfile.csv"), "w") as listfile:
    listfile.write("notes,stay,y_true\n")
    for (n, x, y) in test_pairs:
        listfile.write("{},{},{}\n".format(n, x, str(y)))