# Cleaning Data

## Reading data and basic cleaning

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import fastparquet as fp
import seaborn as sns
import matplotlib.pyplot as plt
import os
from functools import reduce
import math

# for jupyter notebook, this is necessary to show plots
%matplotlib inline

In [None]:
# Change the current working directory to the project folder.
# os.getcwd() always returns an absolute path, so comparing it with the raw
# (possibly relative) `default_dir` string is always unequal. Compare the
# resolved paths instead so the check is meaningful.
# NOTE(review): a relative value such as '..' is resolved against the current
# directory, so re-running this cell still moves one level up every time --
# prefer an absolute path here.
default_dir = '..'

target_dir = os.path.realpath(default_dir)
if os.path.realpath(os.getcwd()) != target_dir:
    os.chdir(target_dir)

In [None]:
# Folder that contains one sub-folder of parquet files per table.
input_dir = '..'


def _load_table(folder):
    """Read all parquet files below `input_dir/<folder>/` into one dataframe."""
    return pd.read_parquet(input_dir + '/' + folder + '/')


# one raw dataframe per source table
df_vitals = _load_table('Vitals')
df_surgery = _load_table('Surgery')
df_reanimatie = _load_table('Reanimatiebeleid')
df_lab = _load_table('Lab')
df_ic = _load_table('IC_Opnames')
df_demo = _load_table('Demographics')

In [None]:
# # check duplicates
# df_reanimatie.duplicated().sum()

# Remove exact duplicate rows from the two raw tables and renumber their index.
df_reanimatie, df_lab = (
    frame.drop_duplicates().reset_index(drop=True)
    for frame in (df_reanimatie, df_lab)
)

In [None]:
# Work on copies so the raw dataframes stay available for comparison.
(
    copy_vitals,
    copy_surgery,
    copy_reanimatie,
    copy_lab,
    copy_ic,
    copy_demo,
) = (
    frame.copy()
    for frame in (df_vitals, df_surgery, df_reanimatie, df_lab, df_ic, df_demo)
)

In [None]:
# Remove the columns that are not used in the rest of the notebook.
copy_vitals = copy_vitals.drop(
    ['MetingOms', 'MetingEenheid', 'MeetOptie'], axis='columns')
copy_surgery = copy_surgery.drop(
    ['NaarAfdelingDatumTijd', 'HoofdverrichtingOms', 'PrioriteitOms'], axis='columns')
copy_reanimatie = copy_reanimatie.drop(
    ['Gekozen_beleid', 'Reanimeren', 'TmDatum'], axis='columns')
copy_lab = copy_lab.drop(
    ['BepalingCode', 'GroepOms', 'Eenheid', 'GlimsCode'], axis='columns')

In [None]:
# # drop duplicated rows and reset indices because dropping variables give duplicates
# # but we'll just do this at the next section per each dataframe
# copy_reanimatie = copy_reanimatie.drop_duplicates().reset_index(drop=True)
# copy_lab = copy_lab.drop_duplicates().reset_index(drop=True)

# copy_reanimatie.duplicated().sum()

In [None]:
# # check duplicated values from copy dataframes
# # Dropping columns introduces duplicated rows because, 
# # for example, in copy_lab, the BepalingCode was different for the same patient, or etc, 
# # and dropping those kinds of columns can introduce duplicates. 

# duplicates_vitals = copy_vitals[copy_vitals.duplicated()]
# duplicates_surgery = copy_surgery[copy_surgery.duplicated()]
# duplicates_reanimatie = copy_reanimatie[copy_reanimatie.duplicated()]
# duplicates_lab = copy_lab[copy_lab.duplicated()]
# duplicates_ic = copy_ic[copy_ic.duplicated()]
# duplicates_demo = copy_demo[copy_demo.duplicated()]

# duplicates_lab

In [None]:
# Lower-case every column name so later cells do not depend on the original
# capitalisation of the source tables.
for frame in (copy_vitals, copy_surgery, copy_reanimatie,
              copy_lab, copy_ic, copy_demo):
    frame.columns = frame.columns.str.lower()

In [None]:
# Rename variables to snake_case (words separated by _).
# Every table shares the two identifier columns; the table-specific columns
# are added on top of this common mapping.
_id_renames = {'pid': 'p_id', 'opnameid': 'opname_id'}

copy_vitals = copy_vitals.rename(columns={
    **_id_renames,
    'metingdatumtijd': 'meting_datum_tijd',
    'meetwaarde1': 'meet_waarde1',
    'meetwaarde2': 'meet_waarde2',
    'meetwaarde3': 'meet_waarde3',
})

copy_surgery = copy_surgery.rename(columns={
    **_id_renames,
    'operatieid': 'operatie_id',
    'ok_begindatumtijd': 'ok_begin_datum_tijd',
    'ok_einddatumtijd': 'ok_eind_datum_tijd',
    'hoofdverrichtingcode': 'hoofdverrichting_code',
    'prioriteitcode': 'prioriteit_code',
})

copy_reanimatie = copy_reanimatie.rename(columns={
    **_id_renames,
    'vanafdatum': 'vanaf_datum',
})

copy_lab = copy_lab.rename(columns={
    **_id_renames,
    'bepalingoms': 'bepaling_oms',
    'labdatumtijd': 'lab_datum_tijd',
})

# the IC admission/discharge columns get an ic_ prefix so they stay
# distinguishable from the columns with the same source name in demographics
copy_ic = copy_ic.rename(columns={
    **_id_renames,
    'opnamedatumtijd': 'ic_opname_datum_tijd',
    'ontslagdatumtijd': 'ic_ontslag_datum_tijd',
    'specialismecode': 'ic_specialisme_code',
    'afdelingcode': 'afdelings_code',
})

copy_demo = copy_demo.rename(columns={
    **_id_renames,
    'overlijdensdatum': 'overlijdens_datum',
    'opnamedatumtijd': 'opname_datum_tijd',
    'ontslagdatumtijd': 'ontslag_datum_tijd',
    'specialismecode': 'specialisme_code',
    'opnametypeoms': 'opname_type_oms',
})

In [None]:
# # check if there is only date
# copy_demo['overlijdens_datum'].dt.time.value_counts()

In [None]:
# dataframes = [copy_demo, copy_vitals, copy_surgery, copy_reanimatie, copy_lab, copy_ic]
# merged_df = dataframes[0]
# for df in dataframes[1:]:
#     merged_df = pd.merge(merged_df, df, on=['p_id', 'opname_id'], how='left')
#     del df

# merged_df

## Converting datetime and handling duplicated rows

### Vitals:    
The vitals (at least NIBP) were recorded with sub-second precision (e.g. 11:30:00.020, 11:30:00.010). Converting 'meting_datum_tijd' to a standard datetime truncated to seconds therefore creates many duplicates for the combination of 'p_id', 'opname_id', 'meting_datum_tijd' and 'meting'. Casting to datetime64[s] truncates the sub-second part in the same way (e.g. 16:25:26.290448 becomes 16:25:26 in df_demo).    
Some data loss is inevitable when handling this, because sub-second differences are not meaningful in practice. Averaging MW1, MW2 and MW3 per combination was considered (see the cell marked NOT USED below); the code that is actually run keeps only the most recent measurement per combination.    
The same approach is used when rounding 'meting_datum_tijd' to coarser intervals, such as one minute or 10 minutes. It might be interesting to experiment with the discretization of time and its effect on the results. 

In [None]:
# # NOT USED
# # convert meting_datum_tijd to standard datetime format with seconds
# copy_vitals['meting_datum_tijd'] = copy_vitals['meting_datum_tijd'].values.astype('datetime64[s]')

# # check converted dataframe that do not have same datetime value as original dataframe
# copy_vitals[copy_vitals['meting_datum_tijd'] != df_vitals['MetingDatumTijd']].head()

# # calculate the average value for the combination
# copy_vitals = copy_vitals.groupby(['p_id', 'opname_id', 'meting_datum_tijd', 'meting'], as_index=False).agg({
#     'meet_waarde1': 'mean', 'meet_waarde2': 'mean', 'meet_waarde3': 'mean'})

# # drop duplicated rows
# copy_vitals = copy_vitals.drop_duplicates().reset_index(drop=True)
# copy_vitals.head()

In [None]:
# # check which rows are kept and dropped if assuming date time is converted
# copy_vitals = copy_vitals.sort_values(by='meting_datum_tijd')
# copy_vitals['temp_meting_datum_tijd'] = copy_vitals['meting_datum_tijd'].values.astype('datetime64[s]')
# cols = ['p_id', 'opname_id', 'meting', 'temp_meting_datum_tijd']
# condition = copy_vitals.duplicated(subset=cols, keep=False)
# duplicates = copy_vitals[condition].copy()
# duplicates['action'] = duplicates.duplicated(subset=cols, keep='last').map({True: 'drop', False: 'keep'})

# kept_rows = duplicates[duplicates['action'] == 'keep']
# dropped_rows = duplicates[duplicates['action'] == 'drop']

# comparison = kept_rows.merge(
#     dropped_rows,
#     on=cols,
#     how='inner',
#     suffixes=('_kept', '_dropped')
# )
# comparison.head()

Rounding measurement times to the nearest interval

In [None]:
# # OLD CODE
# # assume the date time type is rounded from nanoseconds to hour, 
# # then based on the combination, only keep the recent value per time
# copy_vitals = copy_vitals.sort_values(by='meting_datum_tijd')
# copy_vitals['temp_meting_datum_tijd'] = copy_vitals['meting_datum_tijd'].values.astype('datetime64[h]')
# copy_vitals = copy_vitals.drop_duplicates(subset=['p_id', 'opname_id', 'meting', 'temp_meting_datum_tijd'], keep='last')
# copy_vitals = copy_vitals.drop(columns=['temp_meting_datum_tijd'])
# copy_vitals['meting_datum_tijd'] = copy_vitals['meting_datum_tijd'].values.astype('datetime64[h]')
# copy_vitals.head()

In [None]:
def round_nearest_interval(df, datetime_cols, info_cols, N, additional_ids=None):
    """Round datetime columns to the nearest interval and drop duplicates.

    Every column in `datetime_cols` is converted with `pd.to_datetime`
    (unparseable values become NaT) and rounded to the nearest interval.
    Rows are then de-duplicated on the combination of 'p_id', 'opname_id',
    `info_cols`, the rounded datetime columns and `additional_ids`; of each
    group only the last row is kept, where rows are ordered by the original
    (unrounded) values of the first datetime column. The input dataframe is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'p_id', 'opname_id' and all columns named below.
    datetime_cols : list of str
        Datetime columns to round; the first one defines the row order.
    info_cols : list of str
        Variables that can change over time (e.g. measured values). They are
        part of the duplicate key, so only rows with identical values inside
        one interval are collapsed.
    N : int or float
        Interval length in hours. 0 means one minute, 0.5 means 30 minutes,
        1 means one hour, 1.5 means 90 minutes, and so on.
    additional_ids : list of str, optional
        Extra identifier columns that belong to the duplicate key.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with the datetime columns replaced by their
        rounded values; the original index labels are kept.

    Raises
    ------
    ValueError
        If `datetime_cols` is empty, or `N` is negative or shorter than one
        minute (while not being 0).
    """
    if not datetime_cols:
        raise ValueError("datetime_cols must contain at least one column name")
    if N < 0:
        raise ValueError("N must be zero or positive")

    # a None default avoids sharing one mutable list between calls
    extra_ids = [] if additional_ids is None else list(additional_ids)

    if N == 0:
        # nearest minute
        minutes = 1
    else:
        # whole minutes; the epsilon guards against float error such as
        # 2.3 * 60 == 137.99999999999997
        minutes = int(N * 60 + 1e-9)
        if minutes < 1:
            raise ValueError("N must be 0 or at least one minute (1/60 hour)")

    # 'min' is accepted by old and new pandas; the 'T' and 'H' aliases are
    # deprecated since pandas 2.2. Expressing hours in minutes also keeps
    # fractional hours (e.g. 1.5) from being truncated to whole hours.
    interval = f'{minutes}min'

    # sort_values returns a new frame, so the caller's dataframe is untouched;
    # a stable sort keeps the outcome of keep='last' deterministic on ties
    df = df.sort_values(by=datetime_cols[0], kind='mergesort')

    temp_datetime_cols = ['temp_' + col for col in datetime_cols]
    for datetime_col, temp_col in zip(datetime_cols, temp_datetime_cols):
        df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')
        df[temp_col] = df[datetime_col].dt.round(interval)

    # keep only the most recent row for each combination
    subset_cols = ['p_id', 'opname_id'] + list(info_cols) + temp_datetime_cols + extra_ids
    df = df.drop_duplicates(subset=subset_cols, keep='last')

    # replace the original datetimes by the rounded ones; assign() builds a new
    # frame instead of writing into the result of drop_duplicates
    rounded = {col: df[temp_col] for col, temp_col in zip(datetime_cols, temp_datetime_cols)}
    return df.assign(**rounded).drop(columns=temp_datetime_cols)

In [None]:
# 'meting' names the kind of measurement (HR, NIBP, ...). It has to be part of
# the duplicate key: without it, two different kinds of measurement of the same
# admission that happen to carry the same values within one interval are
# treated as duplicates and one of them is silently dropped.
copy_vitals = round_nearest_interval(
    df=copy_vitals,
    datetime_cols=['meting_datum_tijd'],
    info_cols=['meet_waarde1', 'meet_waarde2', 'meet_waarde3'],
    N=1,
    additional_ids=['meting']
)

In [None]:
copy_surgery = round_nearest_interval(
    df=copy_surgery,
    datetime_cols=['ok_begin_datum_tijd', 'ok_eind_datum_tijd'],
    info_cols=['hoofdverrichting_code', 'prioriteit_code'],
    N=1,
    additional_ids=['operatie_id']
)

In [None]:
copy_reanimatie = round_nearest_interval(
    df=copy_reanimatie,
    datetime_cols=['vanaf_datum'],
    info_cols=['care_order'],
    N=1
)

In [None]:
copy_lab = round_nearest_interval(
    df=copy_lab,
    datetime_cols=['lab_datum_tijd'],
    info_cols=['bepaling_oms', 'uitslag'],
    N=1
)

In [None]:
copy_ic.info()

In [None]:
# Cast the IC admission and discharge columns to second precision first,
# then round them to the nearest hour and drop the resulting duplicates.
ic_datetime_cols = ['ic_opname_datum_tijd', 'ic_ontslag_datum_tijd']
copy_ic[ic_datetime_cols] = copy_ic[ic_datetime_cols].values.astype('datetime64[s]')

copy_ic = round_nearest_interval(
    copy_ic,
    ic_datetime_cols,
    ['ic_specialisme_code', 'afdelings_code'],
    1,
)

In [None]:
copy_demo = round_nearest_interval(
    df=copy_demo,
    datetime_cols=['overlijdens_datum', 'opname_datum_tijd', 'ontslag_datum_tijd'],
    info_cols=['specialisme_code', 'spoed', 'opname_type_oms'],
    N=1
)

In [None]:
# # NOT USED
# # check original dataframe that do not have same datetime value as original dataframe
# df_vitals[copy_vitals['meting_datum_tijd'] != df_vitals['MetingDatumTijd']].head()

In [None]:
# check if there are duplicated rows based on the combination
duplicates = copy_vitals.duplicated(subset=['p_id', 'opname_id', 'meting_datum_tijd', 'meting'], keep=False)
copy_vitals[duplicates]

### Surgery:   
There are no duplicates based on the defined combination.  
Therefore, we do not do anything extra here. 

In [None]:
copy_surgery.head()

In [None]:
# convert date and time columns to a standard datetime format with seconds
copy_surgery[['ok_begin_datum_tijd', 'ok_eind_datum_tijd']] = (
    copy_surgery[['ok_begin_datum_tijd', 'ok_eind_datum_tijd']].values.astype('datetime64[s]')
)
    
# check if there are duplicated rows based on the combination
duplicates = copy_surgery.duplicated(subset=['p_id', 'opname_id', 'operatie_id', 
                                             'ok_begin_datum_tijd', 'ok_eind_datum_tijd'], 
                                     keep=False)
copy_surgery[duplicates]

In [None]:
copy_surgery.info()

### Reanimatie:   
After removing the columns that are not going to be used (Gekozen_beleid, Reanimeren, TmDatum), some rows become duplicates and are dropped.   
For the remaining rows that are duplicated on the defined combination, only the most recent row is kept.    
A row where vanaf_datum == TmDatum does not give a valid care order, because it means the care order was updated later. 

In [None]:
copy_reanimatie.head()

In [None]:
# convert date and time column to a standard datetime format with seconds
copy_reanimatie['vanaf_datum'] = copy_reanimatie['vanaf_datum'].values.astype('datetime64[s]')

# check the first occurence of the duplicates based on the combination
first_duplicates = copy_reanimatie.duplicated(subset=['p_id', 'opname_id', 'vanaf_datum'], keep='first')
copy_reanimatie[first_duplicates]

In [None]:
# # # check if there are any rows that have different vanaf_datum before converting and after converting
# # # there are not. Therefore the duplicates above occured likely due to other columns (Gekozen_beleid, Reanimeren, TmDatum) then. 
# df_reanimatie[copy_reanimatie['vanaf_datum'] != df_reanimatie['VanafDatum']].head(10)

In [None]:
# check if there are duplicated rows based on the combination
duplicates = copy_reanimatie.duplicated(subset=['p_id', 'opname_id', 'vanaf_datum'], keep=False)
copy_reanimatie[duplicates]

In [None]:
# get the first duplicated row
duplicate0 = copy_reanimatie[duplicates].iloc[0]

# store the first duplicated row's p_id and opname_id
p_id0 = duplicate0['p_id']
opname_id0 = duplicate0['opname_id']

# show the original data of the patient from the duplicated row
# vanaf_datum is same and only keep the top row (latest decision)
# as if vanaf_datum and TmDatum (end date) are the same, 
# this means care order has been changed afterwards
original0 = df_reanimatie[(df_reanimatie['PID'] == p_id0) & (df_reanimatie['OpnameID'] == opname_id0)]
original0

In [None]:
# get the third duplicated row
duplicate2 = copy_reanimatie[duplicates].iloc[2]

# store the third duplicated row's p_id and opname_id
p_id2 = duplicate2['p_id']
opname_id2 = duplicate2['opname_id']

# show the original data of the patient from the duplicated row
original2 = df_reanimatie[(df_reanimatie['PID'] == p_id2) & (df_reanimatie['OpnameID'] == opname_id2)]
original2

In [None]:
copy_reanimatie = copy_reanimatie[~first_duplicates].reset_index(drop=True)
copy_reanimatie.info()

In [None]:
# check if only the first occurence indeed keeps intact
copy_reanimatie[(copy_reanimatie['p_id'] == p_id0) & (copy_reanimatie['opname_id'] == opname_id0)]

In [None]:
copy_reanimatie.info()

### Lab:   
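Similar to the Reanimatie dataframe, the duplicates are caused by other, unused columns. Averaging uitslag per lab_datum_tijd was considered (see the cell marked NOT USED below); the code that is actually run keeps only the most recent uitslag for a given lab_datum_tijd. 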

In [None]:
copy_lab.head()

In [None]:
# convert date and time column to a standard datetime format with seconds
copy_lab['lab_datum_tijd'] = copy_lab['lab_datum_tijd'].values.astype('datetime64[s]')

# check if there are duplicated rows based on the combination
duplicates = copy_lab.duplicated(subset=['p_id', 'opname_id', 'bepaling_oms', 'lab_datum_tijd'], keep=False)
copy_lab[duplicates]

In [None]:
# # check original dataframe that do not have same datetime value as original dataframe
# # there are no duplicates. The duplicates exist also likely due to other columns 
# df_lab[copy_lab['lab_datum_tijd'] != df_lab['Labdatumtijd']].head()

In [None]:
# # NOT USED
# # calculate the average value for the combination
# copy_lab = copy_lab.groupby(['p_id', 'opname_id', 'bepaling_oms', 'lab_datum_tijd'], 
#                             as_index=False).agg({'uitslag': 'mean'})

# # drop duplicated rows
# copy_lab = copy_lab.drop_duplicates().reset_index(drop=True)
# copy_lab.head()

In [None]:
# Turn uitslag into numbers; anything that is not numeric becomes NaN.
copy_lab['uitslag'] = pd.to_numeric(copy_lab['uitslag'], errors='coerce')

# Same idea as for 'meting_datum_tijd' in vitals: order the rows by time,
# cast the time to second precision and keep only the last row of every
# (patient, admission, test, time) combination.
copy_lab = copy_lab.sort_values(by='lab_datum_tijd')
copy_lab['lab_datum_tijd'] = copy_lab['lab_datum_tijd'].values.astype('datetime64[s]')
copy_lab = copy_lab.drop_duplicates(
    subset=['p_id', 'opname_id', 'bepaling_oms', 'lab_datum_tijd'],
    keep='last'
)
copy_lab.head()

In [None]:
# check if there are duplicated rows based on the combination
duplicates = copy_lab.duplicated(subset=['p_id', 'opname_id', 'bepaling_oms', 'lab_datum_tijd'], keep=False)
copy_lab[duplicates]

### IC_Opnames:   
Same as Surgery dataframe, there are no duplicates based on the defined combination.  
Therefore, we do not do anything extra here. 

In [None]:
copy_ic.head()

In [None]:
# check if there are duplicated rows based on the combination
duplicates = copy_ic.duplicated(subset=['p_id', 'opname_id', 
                                        'ic_opname_datum_tijd', 'ic_ontslag_datum_tijd'], keep=False)
copy_ic[duplicates]

In [None]:
# Drop rows that repeat the same stay for the same specialism; the last one wins.
copy_ic = copy_ic.drop_duplicates(
    subset=['p_id', 'opname_id', 'ic_opname_datum_tijd', 'ic_ontslag_datum_tijd', 'ic_specialisme_code'],
    keep='last'
)

# The earlier `duplicates` mask was computed before rows were dropped, so it no
# longer lines up with copy_ic. Recompute it to see what is still duplicated.
duplicates = copy_ic.duplicated(subset=['p_id', 'opname_id',
                                        'ic_opname_datum_tijd', 'ic_ontslag_datum_tijd'], keep=False)
copy_ic[duplicates]

In [None]:
copy_ic.info()

### Demographics:   
Same as Surgery dataframe, there are no duplicates based on the defined combination.  
Therefore, we do not do anything extra here.  

In [None]:
copy_demo.head()

In [None]:
# convert date and time column to a standard datetime format with seconds
copy_demo['overlijdens_datum'] = pd.to_datetime(copy_demo['overlijdens_datum'])
copy_demo['opname_datum_tijd'] = pd.to_datetime(copy_demo['opname_datum_tijd'])
copy_demo['ontslag_datum_tijd'] = pd.to_datetime(copy_demo['ontslag_datum_tijd'])

copy_demo[['overlijdens_datum', 'opname_datum_tijd', 'ontslag_datum_tijd']] = (
    copy_demo[['overlijdens_datum', 'opname_datum_tijd', 'ontslag_datum_tijd']].values.astype('datetime64[s]')
)
    
# check if there are duplicated rows based on the combination
duplicates = copy_demo.duplicated(subset=['p_id', 'opname_id', 'geslacht', 
                                          'leeftijd', 'overlijdens_datum', 
                                          'opname_datum_tijd', 'ontslag_datum_tijd'], 
                                  keep=False)
copy_demo[duplicates]

In [None]:
copy_demo.info()

## Check duplicates rows overall

In [None]:
# check any remaining duplicated rows
duplicates_vitals = copy_vitals[copy_vitals.duplicated()]
duplicates_surgery = copy_surgery[copy_surgery.duplicated()]
duplicates_reanimatie = copy_reanimatie[copy_reanimatie.duplicated()]
duplicates_lab = copy_lab[copy_lab.duplicated()]
duplicates_ic = copy_ic[copy_ic.duplicated()]
duplicates_demo = copy_demo[copy_demo.duplicated()]

duplicates_vitals

In [None]:
duplicates_vitals

In [None]:
duplicates_surgery

In [None]:
duplicates_reanimatie

In [None]:
duplicates_lab

In [None]:
duplicates_ic

In [None]:
duplicates_demo

## Filtering and handling missing datetime

In [None]:
# condition = copy_demo['opname_datum_tijd'] > copy_demo['ontslag_datum_tijd']
# copy_demo[condition]

In [None]:
# Discard admissions whose discharge (ontslag) lies before the admission
# (opname); rows where the comparison is not true are all kept.
condition = copy_demo['opname_datum_tijd'].gt(copy_demo['ontslag_datum_tijd'])
copy_demo = copy_demo.loc[~condition].reset_index(drop=True)

In [None]:
# Print the number of missing values in the date columns of every table.
for frame, date_cols in (
    (copy_vitals, 'meting_datum_tijd'),
    (copy_surgery, ['ok_begin_datum_tijd', 'ok_eind_datum_tijd']),
    (copy_reanimatie, 'vanaf_datum'),
    (copy_lab, 'lab_datum_tijd'),
    (copy_ic, ['ic_opname_datum_tijd', 'ic_ontslag_datum_tijd']),
    (copy_demo, ['overlijdens_datum', 'opname_datum_tijd', 'ontslag_datum_tijd']),
):
    print(frame[date_cols].isna().sum())

Handling missing values from above

In [None]:
# check where the ontslag_datum_tijd is missing
copy_demo[copy_demo['ontslag_datum_tijd'].isna()]

In [None]:
# drop the case where OntslagDatum missing as we cannot assume 'time' but only 'date' from Overlijdens'Datum'
# also this ontslag_datum_tijd is missing possibly due to extracting data around that time
copy_demo = copy_demo[copy_demo['ontslag_datum_tijd'].notna()].reset_index(drop=True)
copy_demo[copy_demo['ontslag_datum_tijd'].isna()]

In [None]:
# overlijdens_datum: we have to assume patients are alive when this variable is missing. 
# leave them as they are. Some models can accept NaN values

# TODO for future -> Do it in cleaning_v2: 
# however, we could create an extra column that indicates alive or not alive (this is an event)
# mark True if they're alive within 12 hours?? because that's the focus? 
# but from which 'event' 12 hours? from admission (opname)?

In [None]:
# ok_eind_datum_tijd:
# For surgeries without an end time, check (via copy_demo) whether the
# admission at least has an ontslag_datum_tijd.
# Open questions: fill the missing end time with ontslag_datum_tijd, or leave
# it as is? Was surgery performed at all for these patients?
# Current plan: drop these rows, because a row without ok_eind_datum_tijd
# cannot be defined as a valid episode / event (start and end time are both
# needed). Surgery probably was not performed, but this still has to be
# confirmed with Ashley. See below.

surgery_times = copy_surgery[['p_id', 'opname_id', 'ok_eind_datum_tijd', 'ok_begin_datum_tijd']]
merged_df = pd.merge(
    copy_demo,
    surgery_times,
    how='left',
    on=['p_id', 'opname_id'],
).reset_index(drop=True)

# admissions for which the surgery end time is missing
filtered_df = merged_df.loc[merged_df['ok_eind_datum_tijd'].isna()]

# how many of those also lack ontslag_datum_tijd (0 expected: everyone with a
# missing ok_eind_datum_tijd does have an ontslag_datum_tijd)
num = filtered_df['ontslag_datum_tijd'].isna().sum()
print(f"Numbers of patients whose einddatumtijd don't exist, as well as ontslagdatumtijd: {num}")

In [None]:
# people who have missing ok_eind_datum_tijd have ontslag_datum_tijd
filtered_df[filtered_df['ontslag_datum_tijd'].isna()].head()

In [None]:
# cannot assume exact datetime
# But I think we could keep this for defining an event,
# if our focus in on the ontslag_datum_tijd instead of ok_eind_datum_tijd

# no, we just remove these.
# This should generally not happen but it might happen
# if there are some smaller surgery that are performed in holding or in the recovery.
copy_surgery = copy_surgery[copy_surgery['ok_eind_datum_tijd'].notna()].reset_index(drop=True)
copy_surgery[copy_surgery['ok_eind_datum_tijd'].isna()]

In [None]:
# Why the date time starts at 00:00:00 a lot? (both opname datum tijd_ic, ontslag ic)
# somehow the rows were created separately and should find a continous time per p_id, opname_id, and smth smth 
# ic_opnames has many separate rows for patients, surgery.. opnamedatum tijd etc 
# Look the note 1-1
# reason: this is because of finance. A new IC admissions starts then because of billing hours

# when do they change patient's afdelings code? what does that mean?
# and why do they split times for the same afdelings code too? 
# should I find a continous time for both cases? 
# (either drop afdelings code and merge rows, or only merge rows for the same afdelings code)

# Separate the rows whose stay was split in time (billing hours) from the rest.
# Rows that share the key below belong to one and the same stay.
split_key = ['p_id', 'opname_id', 'ic_specialisme_code', 'afdelings_code']

non_split_ic = copy_ic.loc[~copy_ic.duplicated(subset=split_key, keep=False)]

sorted_ic = copy_ic.sort_values(by=split_key)
split_ic = sorted_ic.loc[sorted_ic.duplicated(subset=split_key, keep=False)]
split_ic

In [None]:
# Merge the split rows of one stay: take the earliest ic_opname_datum_tijd and
# the latest ic_ontslag_datum_tijd of every group.
# The group key is p_id, opname_id, ic_specialisme_code, afdelings_code. If
# afdelings_code is no longer of interest, group on the first three only; for
# now it is kept.
# dropna=False is required because `duplicated` (used to build split_ic) treats
# missing key values as equal, whereas groupby silently drops groups with a
# missing key by default -- those rows would otherwise disappear.
# NOTE(review): aggregated_ic only carries the key and the two datetime
# columns; any other column of copy_ic becomes NaN for these rows after concat.
aggregated_ic = (
    split_ic
    .groupby(['p_id', 'opname_id', 'ic_specialisme_code', 'afdelings_code'], dropna=False)
    .agg({'ic_opname_datum_tijd': 'min', 'ic_ontslag_datum_tijd': 'max'})
    .reset_index()
)

copy_ic = pd.concat([non_split_ic, aggregated_ic]).reset_index(drop=True)
copy_ic

In [None]:
copy_ic['ic_opname_datum_tijd'].value_counts()

In [None]:
copy_ic['ic_ontslag_datum_tijd'].value_counts()

In [None]:
# check if there are still duplicated rows based on the combination
duplicates = copy_ic.sort_values(by=['p_id', 'opname_id', 'ic_specialisme_code', 'afdelings_code'])
duplicates = duplicates[duplicates.duplicated(subset=['p_id', 'opname_id', 'ic_specialisme_code', 'afdelings_code'], keep=False)]
duplicates

# Filtering and cleaning other columns

## Vitals

In [None]:
# check whether columns with float type are indeed float or integer
cols = ['meet_waarde1', 'meet_waarde2', 'meet_waarde3']

def is_actually_int(x):
    """Tell whether a scalar holds a whole number.

    Returns None for missing values (anything `pd.isna` reports as missing),
    True for integers and for real numbers without a fractional part, and
    False for everything else (fractional numbers, booleans, text, ...).
    """
    import numbers

    if pd.isna(x):
        return None
    # bool is a subclass of int but is not a measurement value
    if isinstance(x, bool):
        return False
    # integer-typed values (Python int, numpy integers) are whole by definition
    if isinstance(x, numbers.Integral):
        return True
    # covers Python float as well as numpy floats of any width (e.g. float32)
    if isinstance(x, numbers.Real):
        return float(x).is_integer()
    return False

# Evaluate the helper for every cell. Series.map is applied per column because
# DataFrame.applymap is deprecated since pandas 2.1; this form works on both
# older and newer pandas versions.
applied = copy_vitals[cols].apply(lambda column: column.map(is_actually_int))

# a row counts as "integer" when all of its non-missing values are whole numbers
copy_vitals['integers'] = applied.all(axis=1, skipna=True)
rows_integers = copy_vitals[copy_vitals['integers']]
rows_floats = copy_vitals[~copy_vitals['integers']]

In [None]:
rows_integers

In [None]:
rows_floats

In [None]:
rows_integers['meting'].value_counts()

In [None]:
rows_floats['meting'].value_counts()

In [None]:
rows_floats[rows_floats['meting']=='AVPU']

In [None]:
# copy_vitals[(copy_vitals['meting'] == 'Temp')]['meet_waarde1'].value_counts()

In [None]:
# check any rows that have missing values to calculate the meet_waarde3 values for NIBP meting
copy_vitals[(copy_vitals['meting'] == 'NIBP') & (copy_vitals.isna().any(axis=1))]

In [None]:
# Recompute meet_waarde3 for the NIBP rows in which MW1 and MW2 are both present.
# MAP = DP + 1/3(SP – DP), with meet_waarde1 in the role of SP and
# meet_waarde2 in the role of DP.
condition = (
    copy_vitals['meting'].eq('NIBP') &
    copy_vitals[['meet_waarde1', 'meet_waarde2']].notna().all(axis=1)
)

nibp_values = copy_vitals.loc[condition, ['meet_waarde1', 'meet_waarde2']]
pulse_pressure = nibp_values['meet_waarde1'] - nibp_values['meet_waarde2']
copy_vitals.loc[condition, 'meet_waarde3'] = nibp_values['meet_waarde2'] + pulse_pressure / 3

copy_vitals.head()

In [None]:
# For every meting other than NIBP a row without 'meet_waarde1' carries no
# value at all, so those rows are removed.
condition2 = copy_vitals['meting'].ne('NIBP') & copy_vitals['meet_waarde1'].isna()

copy_vitals = (
    copy_vitals.loc[~condition2]
    .drop_duplicates(keep='last')
    .reset_index(drop=True)
)
copy_vitals[copy_vitals['meet_waarde1'].isna()]
# copy_vitals[copy_vitals['meting']=='HR']

In [None]:
# An NIBP row needs at least 'meet_waarde1' and 'meet_waarde2'; remove the
# NIBP rows in which one of them (or both) is missing.
condition3 = (
    copy_vitals['meting'].eq('NIBP') &
    copy_vitals[['meet_waarde1', 'meet_waarde2']].isna().any(axis=1)
)

copy_vitals = (
    copy_vitals.loc[~condition3]
    .drop_duplicates(keep='last')
    .reset_index(drop=True)
)

# verify: this should show no NIBP rows with a missing value any more
copy_vitals[
    copy_vitals['meting'].eq('NIBP') &
    copy_vitals[['meet_waarde1', 'meet_waarde2']].isna().any(axis=1)
]

In [None]:
# just check the values of AVPU
copy_vitals[copy_vitals['meting']=='AVPU']['meet_waarde1'].unique()

In [None]:
# Keep only plausible values per meting. AVPU (categorical) and O2 (not
# needed) are kept unfiltered; rows of any other meting that is not listed
# here (e.g. EMV) are excluded.
vital_kind = copy_vitals['meting']
mw1 = copy_vitals['meet_waarde1']
mw2 = copy_vitals['meet_waarde2']
mw3 = copy_vitals['meet_waarde3']

nibp_ok = (
    (vital_kind == 'NIBP') &
    (mw1 > 0) & (mw1 < 400) &
    (mw2 > 0) & (mw2 < 400) &
    (mw3 > 0) & (mw3 < 400)
)
hr_ok = (vital_kind == 'HR') & (mw1 >= 0) & (mw1 < 300)
temp_ok = (vital_kind == 'Temp') & (mw1 >= 29) & (mw1 <= 43)
resp_ok = (vital_kind == 'Resp') & (mw1 >= 0) & (mw1 < 60)
spo2_ok = (vital_kind == 'SpO2') & (mw1 > 50)
kept_unfiltered = vital_kind.isin(['O2', 'AVPU'])

condition4 = nibp_ok | hr_ok | temp_ok | resp_ok | spo2_ok | kept_unfiltered

copy_vitals = copy_vitals[condition4].reset_index(drop=True)
copy_vitals

In [None]:
# check if rows with O2 are still included
copy_vitals[copy_vitals['meting'] == 'O2'].head()

## Surgery

In [None]:
copy_surgery.head(5)

In [None]:
# Drop rows whose hoofdverrichting_code does not have the regular code length,
# as we cannot interpret such codes.
# The reference length is the most common length in the data instead of the
# length of whichever code happens to come first, and missing / non-text codes
# are treated as irregular instead of crashing on len().
# NOTE(review): the original note speaks of codes "without letters", but the
# filter is length based -- confirm that both describe the same codes.
code_lengths = copy_surgery['hoofdverrichting_code'].map(
    lambda code: len(code) if isinstance(code, str) else None
).dropna()
expected_length = code_lengths.mode().iloc[0] if not code_lengths.empty else None

unique_codes = copy_surgery['hoofdverrichting_code'].unique().tolist()
diff_codes = [
    code for code in unique_codes
    if not (isinstance(code, str) and len(code) == expected_length)
]
copy_surgery = copy_surgery[~copy_surgery['hoofdverrichting_code'].isin(diff_codes)].reset_index(drop=True)

In [None]:
# create a new column for prioriteit code that has fewer categories
def convert_prioriteit(code):
    """Map a detailed prioriteit code to one of three coarse categories.

    Returns 'Elective', 'Acute' or 'Unknown'; any code that is not listed
    yields None.
    """
    categories = (
        ('Elective', ('E', 'B', 'SE', 'SW')),
        ('Acute', ('S', 'SA', 'A')),
        ('Unknown', ('O',)),
    )
    for label, members in categories:
        if code in members:
            return label
    return None

copy_surgery['prioriteit'] = copy_surgery['prioriteit_code'].apply(convert_prioriteit)
copy_surgery['prioriteit'].nunique()

## Reanimatie

In [None]:
# nothing to do
copy_reanimatie.head()

## Lab

In [None]:
copy_lab.head()

In [None]:
# Keep numeric results only: text values turn into NaN and are removed.
# uitslag stays a float column because real fractional values occur, so no
# rounding / Int64 conversion is done:
# copy_lab['uitslag'] = copy_lab['uitslag'].round()
# copy_lab['uitslag'] = copy_lab['uitslag'].astype('Int64')
copy_lab['uitslag'] = pd.to_numeric(copy_lab['uitslag'], errors='coerce')

# remove the rows without a result, then exact duplicate rows
copy_lab = (
    copy_lab.loc[copy_lab['uitslag'].notna()]
    .drop_duplicates()
    .reset_index(drop=True)
)

for summary in (copy_lab['uitslag'].isna().sum(), copy_lab['uitslag'].dtypes):
    print(summary)

In [None]:
# check if some of values are indeed float
is_integer = copy_lab['uitslag'] % 1 == 0
print(copy_lab['uitslag'][~is_integer])

## IC_Opnames

In [None]:
# nothing to do
copy_ic.head()

## Demographics

In [None]:
copy_demo.head()

In [None]:
copy_demo.info()

In [None]:
def remove_spaces(df):
    """Strip leading and trailing whitespace from the text values of `df`.

    Only columns of object dtype are touched, and within those only values
    that are strings; everything else is left as it is. The dataframe is
    changed in place and also returned.
    """
    def _strip(value):
        if isinstance(value, str):
            return value.strip()
        return value

    object_columns = df.select_dtypes(include=[object]).columns
    for name in object_columns:
        df[name] = df[name].map(_strip)
    return df

copy_demo = remove_spaces(copy_demo)

In [None]:
copy_demo.info()

In [None]:
# # create a new column for specialisemecode + spoed
# copy_demo['specialisme_spoed'] = copy_demo['specialisme_code'] + "_" + copy_demo['spoed']

In [None]:
def replace_spaces(df):
    """Replace every space by an underscore in the text values of `df`.

    Only columns of object dtype are touched, and within those only values
    that are strings; everything else is left as it is. The dataframe is
    changed in place and also returned.
    """
    def _underscore(value):
        if isinstance(value, str):
            return value.replace(" ", "_")
        return value

    object_columns = df.select_dtypes(include=[object]).columns
    for name in object_columns:
        df[name] = df[name].map(_underscore)
    return df

copy_reanimatie = replace_spaces(copy_reanimatie)
copy_demo = replace_spaces(copy_demo)

In [None]:
copy_demo.info()

## Check if there are duplicated rows just in case

In [None]:
# collect the fully duplicated rows of each cleaned dataframe, just in case;
# duplicated() flags every repeat of a row except its first occurrence
duplicates_vitals = copy_vitals[copy_vitals.duplicated()]
duplicates_surgery = copy_surgery[copy_surgery.duplicated()]
duplicates_reanimatie = copy_reanimatie[copy_reanimatie.duplicated()]
duplicates_lab = copy_lab[copy_lab.duplicated()]
duplicates_ic = copy_ic[copy_ic.duplicated()]
duplicates_demo = copy_demo[copy_demo.duplicated()]

# the remaining frames are displayed one per cell below
duplicates_vitals

In [None]:
duplicates_surgery

In [None]:
duplicates_reanimatie

In [None]:
duplicates_lab

In [None]:
duplicates_ic

In [None]:
duplicates_demo

# Reorganizing

### Vitals

The original Vitals dataframe has a 'meting' column that mixes several measurement types (HR, NIBP, etc.); some of them have 3 values per measurement time, while others have only 1.   
Therefore, we prefer to have one column per measurement type and value, as illustrated below.    

* Possible cons of the reorganized dataframe:    
1. More missing values (but we don't know how it'll do as a consequence)    
2. More columns (For EMM, it doesn't matter)

Update: reshaping the dataframe and dropping the unnecessary columns actually made it much less sparse (fewer missing values), which is good.    
Also, not many extra columns were created.    

======    
This is necessary to reduce overhead when merging/joining: the reshaped dataframe has fewer rows, and rows would otherwise multiply quickly in the joins.    
It also reduces memory usage.     

In [None]:
copy_vitals.shape

In [None]:
copy_vitals.info()

In [None]:
copy_vitals.isna().sum().sum()

In [None]:
memory = copy_vitals.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
# reorganizing dataframe vitals. must increase RAM >= 32 (at least)

# unpivot the dataframe from wide to long format:
# the identifier columns (id_vars) stay as they are, and every other column
# is turned into two columns: waarde_type (the name of the original column)
# and waarde (the value that was in that column)
copy_vitals = copy_vitals.melt(id_vars=['p_id', 'opname_id', 'meting_datum_tijd', 'meting'], 
                             var_name='waarde_type', value_name='waarde')

In [None]:
copy_vitals.info()

In [None]:
# create a new column with the name of meting and its value (MW1, MW2, MW3)
copy_vitals['nieuwe_waarde'] = copy_vitals['meting'] + "_" + copy_vitals['waarde_type']
copy_vitals

In [None]:
copy_vitals.info()

In [None]:
# # check why _intergers columns were created
# p = copy_vitals['p_id'].iloc[-1]
# df_vitals[df_vitals['PID']==p]

In [None]:
copy_vitals.head(10)

In [None]:
# pivot the dataframe from long back to wide format: one column per nieuwe_waarde
# value (meting + "_" + waarde_type), filled with the original values
copy_vitals = copy_vitals.drop(['meting', 'waarde_type'], axis=1)

copy_vitals = copy_vitals.pivot_table(
    index=['p_id', 'opname_id', 'meting_datum_tijd'],
    columns='nieuwe_waarde',
    values='waarde',
    aggfunc='last'  # if the same (index, nieuwe_waarde) pair occurs several times, keep the last value
).reset_index()

# remove the leftover 'nieuwe_waarde' label from the columns axis
copy_vitals.columns.name = None

copy_vitals.head()

In [None]:
# cast every measurement column back to float64 (values that cannot be parsed become NaN);
# the identifier and timestamp columns listed in exceptions are left as they are
exceptions = ['p_id', 'opname_id', 'meting_datum_tijd']
measurement_columns = [name for name in copy_vitals.columns if name not in exceptions]
for column in measurement_columns:
    as_numeric = pd.to_numeric(copy_vitals[column], errors='coerce')
    copy_vitals[column] = as_numeric.astype('float64')

In [None]:
copy_vitals.info()

In [None]:
copy_vitals.columns

In [None]:
# remove columns that end with _integers, this occured most likely due to NaN values
copy_vitals = copy_vitals.loc[:, ~copy_vitals.columns.str.endswith('_integers')]
copy_vitals.head()

In [None]:
copy_vitals.columns

In [None]:
# remove columns if they only contain na values
copy_vitals = copy_vitals.dropna(axis=1, how='all')
copy_vitals.head()

In [None]:
copy_vitals.columns

In [None]:
# less cells than copy_vitals, seems good
copy_vitals.shape

In [None]:
copy_vitals.isna().sum().sum()

In [None]:
memory = copy_vitals.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
copy_vitals.info()

### Surgery

In [None]:
# keep only the first 3 characters of the code; the column is overwritten in place,
# so the full original code is no longer available afterwards
copy_surgery['hoofdverrichting_code'] = copy_surgery['hoofdverrichting_code'].str[:3]
# frequency of each shortened code
copy_surgery['hoofdverrichting_code'].value_counts()

In [None]:
# remove the exception 
copy_surgery['hoofdverrichting_code'] = copy_surgery['hoofdverrichting_code'].replace('BM0', 'BM')

In [None]:
copy_surgery.isna().sum().sum()

In [None]:
memory = copy_surgery.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
# copy_surgery["hoofdverrichting_code_1"] = copy_surgery["hoofdverrichting_code"].astype("category")

In [None]:
# # the best
# copy_surgery["hoofdverrichting_code_2"] = copy_surgery["hoofdverrichting_code"].astype("category")
# copy_surgery["hoofdverrichting_code_2"] = copy_surgery["hoofdverrichting_code_2"].cat.codes.astype('int8')

In [None]:
# codes, uniques = pd.factorize(copy_surgery["hoofdverrichting_code"])
# copy_surgery["hoofdverrichting_code_3"] = codes

In [None]:
# integer-encode hoofdverrichting_code: each category gets its position in the printed
# category list, missing values get -1
copy_surgery["hoofdverrichting_code1"] = copy_surgery["hoofdverrichting_code"].astype("category")
print(copy_surgery["hoofdverrichting_code1"].cat.categories)
# cat.codes already uses the smallest signed integer dtype that fits (int8 below 128
# categories); forcing int8 would silently wrap around once there are 128 or more
# categories, so the codes are kept in their own dtype
copy_surgery["hoofdverrichting_code1"] = copy_surgery["hoofdverrichting_code1"].cat.codes

In [None]:
copy_surgery.memory_usage(deep=True)

In [None]:
# create dummy variables
# DO NOT use dtype=bool as a merge function will convert the dtype into object which is expensive
# DO use 'boolean'
# USE this for cleaning the data and remove them later
def create_dummies(df, columns, prefix=None):
    """Append one nullable-boolean indicator column per category to df.

    columns: a single column name or a list of column names to encode.
    prefix:  prefix for the new column names; when empty/None the name of
             the encoded column is used instead.

    Returns a new dataframe holding the original columns (the encoded
    columns are kept) followed by the indicator columns.
    """
    targets = [columns] if isinstance(columns, str) else columns

    for name in targets:
        label = prefix or name
        encoded = pd.get_dummies(df[name], prefix=label, dtype='boolean')
        # append the indicators to the right of the current columns
        df = pd.concat([df, encoded], axis=1)

    return df

In [None]:
# this creates too many 
copy_surgery = create_dummies(copy_surgery, 'hoofdverrichting_code')
copy_surgery.head()

In [None]:
copy_surgery.isna().sum().sum()

In [None]:
copy_surgery["prioriteit1"] = copy_surgery["prioriteit"].astype("category")
print(copy_surgery["prioriteit1"].cat.categories)
copy_surgery["prioriteit1"] = copy_surgery["prioriteit1"].cat.codes.astype('int8')

In [None]:
# earlier variant (kept for reference): one boolean column per original prioriteit_code
# # # make the prioriteit_code and _nieuw as boolean column
# # dummies = pd.get_dummies(copy_surgery['prioriteit_code'], prefix='prioriteit_code', dtype='boolean')
# # copy_surgery = pd.concat([copy_surgery, dummies], axis=1)
# # copy_surgery = copy_surgery.drop(columns=['prioriteit_code'])

# only the coarse 'prioriteit' category is used from here on, so the detailed code is dropped
copy_surgery = copy_surgery.drop(columns=['prioriteit_code'])

# one boolean column per coarse category, named with the 'prioriteit_code' prefix
# (e.g. prioriteit_code_Acute, which the event 4 cell reads)
copy_surgery = create_dummies(copy_surgery, 'prioriteit', 'prioriteit_code')
copy_surgery.head()

In [None]:
copy_surgery.memory_usage(deep=True)

### Reanimatie

In [None]:
memory = copy_reanimatie.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
# # make the care_order as boolean column
copy_reanimatie = create_dummies(copy_reanimatie, 'care_order')
copy_reanimatie.head()

copy_reanimatie["care_order1"] = copy_reanimatie["care_order"].astype("category")
print(copy_reanimatie["care_order1"].cat.categories)
copy_reanimatie["care_order1"] = copy_reanimatie["care_order1"].cat.codes.astype('int8')

In [None]:
memory = copy_reanimatie.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

### Lab

In [None]:
# replacing space with _ from text values (column wise)
copy_lab['bepaling_oms'] = copy_lab['bepaling_oms'].str.strip()
copy_lab['bepaling_oms'] = copy_lab['bepaling_oms'].str.replace(" ", "_")
copy_lab['bepaling_oms'].value_counts()

In [None]:
memory = copy_lab.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
# reorganize the lab dataframe to wide format: one row per (p_id, opname_id, lab_datum_tijd)
# and one column per bepaling_oms holding its uitslag
# NOTE(review): unlike pivot_table, pivot raises if the same (index, bepaling_oms) pair
# occurs more than once — presumably the earlier de-duplication guarantees uniqueness; confirm
copy_lab = copy_lab.pivot(index=['p_id', 'opname_id', 'lab_datum_tijd'], 
                                  columns='bepaling_oms', values='uitslag').reset_index()
# remove the leftover 'bepaling_oms' label from the columns axis
copy_lab.columns.name = None
copy_lab.head()

In [None]:
copy_lab.columns

In [None]:
memory = copy_lab.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

### IC_Opnames

In [None]:
memory = copy_ic.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
# NOTE: I'm not using these vars, so it's fine to keep it like this
# make the ic_specialisme_code and afdelings_code as boolean column
copy_ic = create_dummies(copy_ic, ['ic_specialisme_code', 'afdelings_code'])
copy_ic.head()

In [None]:
memory = copy_ic.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

### Demographics

In [None]:
memory = copy_demo.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

In [None]:
copy_demo.columns

In [None]:
copy_demo = create_dummies(copy_demo, ['geslacht', 'spoed', 'specialisme_code', 'opname_type_oms'])
copy_demo.head()

In [None]:
# integer-encode specialisme_code: each category gets its position in the printed
# category list, missing values get -1
copy_demo['specialisme_code1'] = copy_demo['specialisme_code'].astype("category")
print(copy_demo['specialisme_code1'].cat.categories)
# cat.codes already uses the smallest signed integer dtype that fits (int8 below 128
# categories); forcing int8 would silently wrap around once there are 128 or more
# categories, so the codes are kept in their own dtype
copy_demo['specialisme_code1'] = copy_demo['specialisme_code1'].cat.codes

In [None]:
memory = copy_demo.memory_usage(deep=True).sum() / (1024 ** 2)
print(f"Total memory usage: {memory} MB")

# Creating a deterioration event and time to first event

In [None]:
copy_reanimatie.info()

In [None]:
# event 1: the death occurs while a full resuscitation code is in effect.
# if care_order_full_code is missing, handle it as False

# time-to-event: copy_demo's ontslag datum tijd - 12 hours (instead of overlijdens_datum)
merged_temp = copy_demo.merge(copy_reanimatie[['opname_id', 'p_id', 'care_order_full_code']],
                            on=['opname_id', 'p_id'],
                            how='left')

merged_temp['care_order_full_code'] = merged_temp['care_order_full_code'].fillna(False)
merged_temp['death_fullcode'] = merged_temp['overlijdens_datum'].notna() & merged_temp['care_order_full_code']

# the left merge can produce several rows per admission (one per matching care order row),
# so merged_temp's row index does not line up with copy_demo's. Collapse to one flag per
# admission (True if any of its rows is True) and look it up by key instead of by position.
death_fullcode_per_admission = merged_temp.groupby(['p_id', 'opname_id'])['death_fullcode'].any()
admission_keys = pd.MultiIndex.from_frame(copy_demo[['p_id', 'opname_id']])
copy_demo['death_fullcode'] = (
    death_fullcode_per_admission.reindex(admission_keys).astype('boolean').fillna(False).array
)

merged_temp.info()

In [None]:
copy_demo['death_fullcode'].isna().sum()

In [None]:
# event 2: the death occurs in the ICU
# if overlijdens_datum exists and ic_ontslag_datum_tijd equals ontslag_datum_tijd,
# then the patient died at the IC
# time-to-event: ic_ontslag_datum_tijd - 12 hours

merged_temp = merged_temp.merge(copy_ic[['opname_id', 'p_id', 'ic_ontslag_datum_tijd']],
                            on=['opname_id', 'p_id'],
                            how='left')

merged_temp['death_ic'] = (merged_temp['overlijdens_datum'].notna() &
                            (merged_temp['ic_ontslag_datum_tijd'] == merged_temp['ontslag_datum_tijd']))

# merged_temp can hold several rows per admission (the left merge adds one per matching
# IC stay), so its row index does not line up with copy_demo's. Collapse to one flag per
# admission (True if any of its rows is True) and look it up by key instead of by position;
# admissions without any row count as False.
death_ic_per_admission = merged_temp.groupby(['p_id', 'opname_id'])['death_ic'].any()
admission_keys = pd.MultiIndex.from_frame(copy_demo[['p_id', 'opname_id']])
copy_demo['death_ic'] = (
    death_ic_per_admission.reindex(admission_keys).astype('boolean').fillna(False).to_numpy(dtype=bool)
)

merged_temp[merged_temp['death_ic']==True]

In [None]:
copy_demo['death_ic'].isna().sum()

In [None]:
# event 3: ICU admission when the ICU stay is 6 hours or longer
# but almost 80 percent is True
# time-to-event: ic_opname_datum_tijd - 12 hours

# stay length = IC discharge - IC admission; a missing timestamp makes the
# comparison False, so the flag itself never contains missing values
copy_ic['ic_6hr'] = (copy_ic['ic_ontslag_datum_tijd'] - copy_ic['ic_opname_datum_tijd'] >= pd.Timedelta(hours=6))

# show the IC stays that do not reach 6 hours
copy_ic[copy_ic['ic_6hr']==False]

In [None]:
copy_ic['ic_6hr'].isna().sum()

In [None]:
# event 4: acute surgery, if followed by ICU admission
# time-to-event: ok_begin_datum_tijd - 12 hours
# NOTE(review): the code below works with ok_eind_datum_tijd, not ok_begin_datum_tijd — confirm

copy_ic1 = copy_ic[['p_id', 'opname_id', 'ic_opname_datum_tijd', 'ic_6hr']]
copy_surgery1 = copy_surgery[['p_id', 'opname_id', 'ok_eind_datum_tijd', 'prioriteit_code_Acute']]

# every (IC stay, surgery) combination within the same admission
merged_temp1 = pd.merge(copy_ic1, copy_surgery1, on=['p_id', 'opname_id'], how='inner')

merged_temp1['time_diff'] = merged_temp1['ic_opname_datum_tijd'] - merged_temp1['ok_eind_datum_tijd']

# acute surgery whose end is followed by an IC admission within one hour
merged_temp1['acute_ic'] = (
    (merged_temp1['time_diff'] >= pd.Timedelta(0)) &
    (merged_temp1['time_diff'] <= pd.Timedelta(hours=1)) &
    (merged_temp1['prioriteit_code_Acute'] == True)
)

merged_temp1 = merged_temp1.drop('time_diff', axis=1)

# merged_temp1 has its own row index (one row per IC stay / surgery pair), unrelated to
# copy_demo's rows, so the flag must be transferred by admission key, not by position:
# an admission is flagged if any of its pairs qualifies, all other admissions get False
acute_ic_per_admission = merged_temp1.groupby(['p_id', 'opname_id'])['acute_ic'].any()
admission_keys = pd.MultiIndex.from_frame(copy_demo[['p_id', 'opname_id']])
copy_demo['acute_ic'] = (
    acute_ic_per_admission.reindex(admission_keys).astype('boolean').fillna(False).array
)

copy_demo[copy_demo['acute_ic']==True]

In [None]:
# merge only the dataframes that are necessary to calculate the time-to-event
# (inner join: only admissions present in both merged_temp and merged_temp1 remain)
merged_temp2 = pd.merge(merged_temp, merged_temp1, on=['p_id', 'opname_id'], how='inner')
# use the nullable boolean dtype for the event flags
merged_temp2['death_ic'] = merged_temp2['death_ic'].astype('boolean')
merged_temp2['ic_6hr'] = merged_temp2['ic_6hr'].astype('boolean')

In [None]:
merged_temp2.info()

In [None]:
def get_time_to_first_event(row):
    """Determine the first deterioration event of one row and the time until it.

    row must provide the event flags 'death_fullcode', 'death_ic', 'ic_6hr' and
    'acute_ic' plus the timestamps 'opname_datum_tijd', 'ontslag_datum_tijd',
    'ic_ontslag_datum_tijd', 'ic_opname_datum_tijd' and 'ok_eind_datum_tijd'.

    Returns a 7-tuple:
        (time_to_first_event, death_fullcode_first, death_ic_first,
         ic_6hr_first, acute_ic_first, first_event, is_first)
    where first_event is 0 (no event), 1 (death_fullcode), 2 (death_ic),
    3 (ic_6hr) or 4 (acute_ic) and is_first tells whether any event happened.
    Without an event the time is the observation time (discharge - admission);
    with an event it is event time - admission - 12 hours.
    """
    # (event type, event name, event timestamp); an event only counts when its
    # flag is set and the timestamp it is measured at is known
    events = []
    if row['death_fullcode'] and pd.notna(row['ontslag_datum_tijd']):
        events.append((1, 'death_fullcode', row['ontslag_datum_tijd']))

    if row['death_ic'] and pd.notna(row['ic_ontslag_datum_tijd']):
        events.append((2, 'death_ic', row['ic_ontslag_datum_tijd']))

    if row['ic_6hr'] and pd.notna(row['ic_opname_datum_tijd']):
        events.append((3, 'ic_6hr', row['ic_opname_datum_tijd']))

    if row['acute_ic'] and pd.notna(row['ok_eind_datum_tijd']) and pd.notna(row['ic_opname_datum_tijd']):
        events.append((4, 'acute_ic', row['ok_eind_datum_tijd']))

    # if none of the 4 events happened, return the observation time,
    # False for the boolean vars and 0 as event type
    if not events:
        no_event_time = row['ontslag_datum_tijd'] - row['opname_datum_tijd']
        return (no_event_time, False, False, False, False, 0, False)

    # earliest timestamp wins; for equal timestamps min keeps the first entry,
    # i.e. the order in which the events were appended above
    first_event, first_name, first_time = min(events, key=lambda event: event[2])

    time_to_first_event = first_time - row['opname_datum_tijd'] - pd.Timedelta(hours=12)

    # target variable: True because one of the four events happened
    is_first = first_event != 0

    return (
        time_to_first_event,
        first_name == 'death_fullcode',
        first_name == 'death_ic',
        first_name == 'ic_6hr',
        first_name == 'acute_ic',
        first_event,
        is_first,
    )

# one tuple per row, in the order returned by get_time_to_first_event
result = merged_temp2.apply(get_time_to_first_event, axis=1)

# spread the tuple elements over their own columns, position by position
result_columns = [
    'time_to_first_event',
    'death_fullcode_first',
    'death_ic_first',
    'ic_6hr_first',
    'acute_ic_first',
    'first_event',
    'is_first',
]
for position, column_name in enumerate(result_columns):
    merged_temp2[column_name] = result.apply(lambda x, position=position: x[position])

merged_temp2.head()

In [None]:
merged_temp2.columns

In [None]:
# for each event type: count, per opname_id, the number of distinct timestamps at which
# the event is flagged (flag set and timestamp known), then store that count on every
# row of the admission; admissions without the event get 0

# event 1: distinct discharge times with death under full code
count_death_fullcode = (
    merged_temp2[merged_temp2['death_fullcode'] & merged_temp2['ontslag_datum_tijd'].notna()]
    .groupby('opname_id')['ontslag_datum_tijd']
    .nunique()
)
merged_temp2['count_death_fullcode'] = merged_temp2['opname_id'].map(count_death_fullcode).fillna(0).astype(int)

# event 2: distinct IC discharge times with death at the IC
count_death_ic = (
    merged_temp2[merged_temp2['death_ic'] & merged_temp2['ic_ontslag_datum_tijd'].notna()]
    .groupby('opname_id')['ic_ontslag_datum_tijd']
    .nunique()
)
merged_temp2['count_death_ic'] = merged_temp2['opname_id'].map(count_death_ic).fillna(0).astype(int)

# event 3: distinct IC admission times of stays of 6 hours or longer
count_ic_6hr = (
    merged_temp2[merged_temp2['ic_6hr'] & merged_temp2['ic_opname_datum_tijd'].notna()]
    .groupby('opname_id')['ic_opname_datum_tijd']
    .nunique()
)
merged_temp2['count_ic_6hr'] = merged_temp2['opname_id'].map(count_ic_6hr).fillna(0).astype(int)

# event 4: distinct surgery end times of acute surgeries followed by an IC admission
count_acute_ic = (
    merged_temp2[merged_temp2['acute_ic'] & merged_temp2['ok_eind_datum_tijd'].notna() 
                 & merged_temp2['ic_opname_datum_tijd'].notna()]
    .groupby('opname_id')['ok_eind_datum_tijd']
    .nunique()
)
merged_temp2['count_acute_ic'] = merged_temp2['opname_id'].map(count_acute_ic).fillna(0).astype(int)

print(merged_temp2.head())

In [None]:
max(merged_temp2['count_acute_ic'])

In [None]:
pd.set_option('display.max_columns', None)
max_count = merged_temp2['count_acute_ic'].max()
merged_temp2[merged_temp2['count_acute_ic'] == max_count]


In [None]:
merged_temp2['first_event'].value_counts()

In [None]:
# investigate how to handle the negative values of time
# 1) if the time period is > -(12hrs), then we see it as error, discard that (so -1 days +18:43:00 is NOT error)
# 2) otherwise set to 0, (instead of without -12 hours, because we agreed to maximize the time)
negative = merged_temp2['time_to_first_event'] < pd.Timedelta(0)
negative = merged_temp2[negative]
negative

In [None]:
# how does it have late ok_begin_datum_tijd than opname datum tijd?
opname1 = merged_temp2[merged_temp2['time_to_first_event'] == '-8 days +12:47:00']['opname_id']

In [None]:
opname1

In [None]:
copy_demo[copy_demo['opname_id'].isin(opname1)]

In [None]:
copy_surgery[copy_surgery['opname_id'].isin(opname1)]

In [None]:
# discard rows whose time to first event is below -12 hours (treated as data errors)
condition = merged_temp2['time_to_first_event'] < pd.Timedelta(hours=-12)
merged_temp2 = merged_temp2.loc[~condition]

# show the negative times that remain (all between -12 hours and 0 now)
negative = merged_temp2['time_to_first_event'] < pd.Timedelta(0)
negative = merged_temp2[negative]
negative

In [None]:
merged_temp2[merged_temp2['time_to_first_event'] == '-8 days +12:47:00']

In [None]:
# if the time is between -12 hours to 0, then we consider this as 0
condition = (merged_temp2['time_to_first_event'] >= pd.Timedelta(hours=-12)) & (merged_temp2['time_to_first_event'] < pd.Timedelta(hours=0))
merged_temp2.loc[condition, 'time_to_first_event'] = pd.Timedelta(hours=0)
merged_temp2.head()

In [None]:
merged_temp2.head(10)

In [None]:
# attach the event columns of merged_temp2 to copy_demo by admission key;
# how='left' keeps admissions without a match in merged_temp2 (their new columns are NaN).
# For columns present in both frames copy_demo's version is kept: the merged_temp2
# duplicate gets the '_drop' suffix and is removed on the next line.
copy_demo = pd.merge(copy_demo, merged_temp2, on=['p_id', 'opname_id'], how='left', suffixes=('', '_drop'))
copy_demo = copy_demo[[c for c in copy_demo.columns if not c.endswith('_drop')]]
copy_demo.head()

In [None]:
# for boolean columns that contain NaN, fill in False
# (such columns show up with object dtype, hence 'object' in the selection)
# NOTE(review): this selects every object column, so missing values in text columns
# are replaced by False as well — confirm that this is intended
boolean_col = copy_demo.select_dtypes(include=['bool', 'object']).columns
copy_demo[boolean_col] = copy_demo[boolean_col].fillna(False)

# convert the plain bool columns to the nullable 'boolean' dtype
boolean_col = copy_demo.select_dtypes(include=['bool']).columns
copy_demo[boolean_col] = copy_demo[boolean_col].astype('boolean')

In [None]:
copy_demo.info()

In [None]:
copy_demo['first_event'] = copy_demo['first_event'].fillna(0).astype('int8')

In [None]:
# express the time to first event in minutes
copy_demo['time_to_first_event'] = copy_demo['time_to_first_event'].dt.total_seconds() / 60
# round to whole minutes first: the nullable integer cast refuses floats with a
# fractional part, which would occur as soon as a timestamp carries seconds
copy_demo['time_to_first_event'] = copy_demo['time_to_first_event'].round().astype('UInt32')

This is old code below. It's not used.

In [None]:
# # create deceased event
# copy_demo['is_deceased'] = copy_demo['overlijdens_datum'].notna()
# copy_demo[copy_demo['overlijdens_datum'].notna()].head()

In [None]:
# # create deceased event based on the period (discharge - admission)
# # assume that as soon as a patient passes away, discharge happens at the same time, 
# # as overlijdens_datum does not have time
# # also we only deal with patients who passed away 

# # do not consider calculating rows where ontslag_datum_tijd dates are eariler than overlijdens_datum
# # because the focus is in death within hospital
# # TODO: however, can we not use these patients for estimating deterioration on other stuff? (vital score)

# deceased= copy_demo['is_deceased']
# same_date = copy_demo['ontslag_datum_tijd'].dt.date == copy_demo['overlijdens_datum'].dt.date
# copy_demo['is_deceased_hospital'] = deceased & same_date
# copy_demo.loc[copy_demo['is_deceased_hospital'], 'survival_period_hospital'] = \
#     ((copy_demo['ontslag_datum_tijd'] - copy_demo['opname_datum_tijd']).dt.total_seconds() / 3600).round(1)

# # based on deceased with 12 hours or 24 hours
# copy_demo = copy_demo.assign(
#     is_deceased_12h = deceased & same_date & (copy_demo['survival_period_hospital'] < 12),
#     is_deceased_24h = deceased & same_date & (copy_demo['survival_period_hospital'] < 24)
# )

# copy_demo.loc[copy_demo['is_deceased_12h'], 'survival_period_hospital_12h'] = \
#     ((copy_demo['ontslag_datum_tijd'] - copy_demo['opname_datum_tijd']).dt.total_seconds() / 3600).round(1)

# copy_demo.loc[copy_demo['is_deceased_24h'], 'survival_period_hospital_24h'] = \
#     ((copy_demo['ontslag_datum_tijd'] - copy_demo['opname_datum_tijd']).dt.total_seconds() / 3600).round(1)

In [None]:
# copy_demo[copy_demo['survival_period_hospital_24h'].notna()]

In [None]:
# lower-case all column names again (dummy columns such as prioriteit_code_Acute
# carry the capitals of their category values)
for frame in (copy_vitals, copy_surgery, copy_reanimatie, copy_lab, copy_ic, copy_demo):
    frame.columns = frame.columns.str.lower()

# Convert some datetime variables as duration

In [None]:
# # duration surgery
# copy_surgery['duration_surgery'] = (
#     copy_surgery['ok_eind_datum_tijd'] - copy_surgery['ok_begin_datum_tijd']).dt.total_seconds() / 60

# copy_surgery = copy_surgery.drop(['ok_begin_datum_tijd', 'ok_eind_datum_tijd'], axis=1)

In [None]:
# # duration ic
# copy_ic['duration_ic'] = (
#     copy_ic['ic_ontslag_datum_tijd'] - copy_ic['ic_opname_datum_tijd']).dt.total_seconds() / 60

# copy_ic = copy_ic.drop(['ic_ontslag_datum_tijd', 'ic_opname_datum_tijd'], axis=1)

In [None]:
# # duration hospital
# copy_demo['duration_hospital'] = (
#     copy_demo['ontslag_datum_tijd'] - copy_demo['opname_datum_tijd']).dt.total_seconds() / 60

# copy_demo = copy_demo.drop(['opname_datum_tijd', 'ontslag_datum_tijd', 'care_order_full_code',
#                             'ic_ontslag_datum_tijd', 'ic_opname_datum_tijd', 'ok_begin_datum_tijd', 
#                             'prioriteit_code_acute'], axis = 1)

# Converting for datetime

In [None]:
# total deep memory footprint of all six dataframes, in MB
dataframes = [copy_vitals, copy_surgery, copy_reanimatie, copy_lab, copy_ic, copy_demo]
total_memory = sum(
    frame.memory_usage(deep=True).sum() / (1024 ** 2)
    for frame in dataframes
)

print(f"Total memory usage: {total_memory} MB")

In [None]:
copy_surgery.info()

In [None]:
# using this function because if I read the parquet file in other jupyterfile, then datetime64 changes
# # this should be used after rounding up time
# # NOT USED, only parts of it
# # defining more variables (season, time of day (morning))
def get_year(year):
    """Return 'year1' for 2018-2019 (inclusive) and 'year2' for any other year."""
    return 'year1' if 2018 <= year <= 2019 else 'year2'

def get_season(month):
    """Return the season name for a month number; anything outside March-November is 'Winter'."""
    # (first month, last month, season), both bounds inclusive
    seasons = ((3, 5, 'Spring'), (6, 8, 'Summer'), (9, 11, 'Autumn'))
    for first, last, season in seasons:
        if first <= month <= last:
            return season
    return 'Winter'
    
def get_day(day):
    """Return the third of the month a day falls in: 'day1' (1-10), 'day2' (11-20), else 'day3'."""
    # (first day, last day, label), both bounds inclusive
    thirds = ((1, 10, 'day1'), (11, 20, 'day2'))
    for first, last, label in thirds:
        if first <= day <= last:
            return label
    return 'day3'
    
def get_time_of_day(hour):
    """Return 'Morning' (5-11), 'Afternoon' (12-16), 'Evening' (17-20) or 'Night' for an hour."""
    # (start hour inclusive, end hour exclusive, label)
    day_parts = ((5, 12, 'Morning'), (12, 17, 'Afternoon'), (17, 21, 'Evening'))
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'Night'
    
# this increases memory but we might need this, at least temporarily, for calculating periods
# split datetime columns into separate columns such as year, day, hour, etc
def split_datetime_to_numeric_cols(dataframes):
    """Add year/month/day-of-month/hour columns for every datetime64 column.

    dataframes: list of dataframes. Each dataframe gets the new columns added
    in place (named <column>_year, <column>_month, <column>_day_of_month and
    <column>_hour, stored as nullable unsigned integers); the list entry is
    then replaced by a copy with a fresh 0..n-1 index.

    Returns the same list object.
    """
    # (column suffix, datetime attribute, target dtype)
    # the explicit dtypes are not strictly needed, they keep the memory footprint small
    parts = (
        ('_year', 'year', 'UInt16'),
        ('_month', 'month', 'UInt8'),
        ('_day_of_month', 'day', 'UInt8'),
        ('_hour', 'hour', 'UInt8'),
    )

    for position, frame in enumerate(dataframes):
        # the datetime columns are determined before any column is added
        stamp_columns = frame.select_dtypes(include=['datetime64']).columns

        for stamp_column in stamp_columns:
            for suffix, attribute, dtype in parts:
                extracted = getattr(frame[stamp_column].dt, attribute)
                frame[stamp_column + suffix] = extracted.astype(dtype)

        dataframes[position] = frame.reset_index(drop=True)

    return dataframes

In [None]:
# add the numeric date parts to every dataframe and rebind the names to the results
# (same order as the list was built in)
dataframes = split_datetime_to_numeric_cols(dataframes)
(copy_vitals, copy_surgery, copy_reanimatie,
 copy_lab, copy_ic, copy_demo) = dataframes

In [None]:
# def meting_bool(df, col="meting_datum_tijd"):
#     df[col] = pd.to_datetime(df[col])

#     # extract
#     df['year'] = df[col].dt.year
#     df['month'] = df[col].dt.month
#     df['day'] = df[col].dt.day
#     df['hour'] = df[col].dt.hour
    
#     # year
#     df['is_year1'] = (df['year'] >= 2018) & (df['year'] <= 2019)
#     df['is_year2'] = ~df['is_year1']
    
#     # seasons
#     df['is_spring'] = df['month'].isin([3, 4, 5])
#     df['is_summer'] = df['month'].isin([6, 7, 8])
#     df['is_autumn'] = df['month'].isin([9, 10, 11])
#     df['is_winter'] = df['month'].isin([12, 1, 2])
    
#     # day
#     df['is_day1'] = (df['day'] >= 1) & (df['day'] <= 10)
#     df['is_day2'] = (df['day'] >= 11) & (df['day'] <= 20)
#     df['is_day3'] = df['day'] >= 21
    
#     # time of day
#     df['is_morning'] = (df['hour'] >= 5) & (df['hour'] < 12)
#     df['is_afternoon'] = (df['hour'] >= 12) & (df['hour'] < 17)
#     df['is_evening'] = (df['hour'] >= 17) & (df['hour'] < 21)
#     df['is_night'] = ~(df['is_morning'] | df['is_afternoon'] | df['is_evening'])

#     df.drop(columns=['year', 'month', 'day', 'hour'], inplace=True)
    
#     return df

# meting_bool(copy_vitals, col="meting_datum_tijd")

In [None]:
def meting_int(df, col="meting_datum_tijd"):
    """Derive integer year/month/day/hour columns from a datetime column.

    ``df[col]`` is converted to datetime in place and the columns
    ``m_year``, ``m_month``, ``m_day`` and ``m_hour`` are added to ``df``
    itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the datetime column; it is modified in place.
    col : str
        Name of the column to parse.

    Returns
    -------
    pandas.DataFrame
        A copy containing only the four derived columns.
    """
    # new column name -> attribute of the .dt accessor
    parts = {
        'm_year': 'year',
        'm_month': 'month',
        'm_day': 'day',
        'm_hour': 'hour',
    }

    df[col] = pd.to_datetime(df[col])
    for new_col, attr in parts.items():
        df[new_col] = getattr(df[col].dt, attr)

    return df[list(parts)].copy()

# NOTE: the returned 4-column frame is only displayed here; meting_int also
# writes the m_year/m_month/m_day/m_hour columns into copy_vitals itself
meting_int(copy_vitals)

In [None]:
copy_vitals.info()

In [None]:
# collect the six frames and report their combined in-memory size
dataframes = [copy_vitals, copy_surgery, copy_reanimatie, copy_lab, copy_ic, copy_demo]

# memory_usage() reports bytes per column; deep=True also measures the
# contents of object columns. Divide by 1024**2 to express the size in MB.
bytes_per_mb = 1024 ** 2
total_memory = sum(
    frame.memory_usage(deep=True).sum() / bytes_per_mb for frame in dataframes
)

print(f"Total memory usage: {total_memory} MB")

### OLD CODE for NEWS2 (NOT NEEDED)

In [None]:
# # calculate the total NEWS2 score based on vital measurements
# # Not needed 
# def calculate_news2(vitals):
    
#     # resp
#     if 'resp_meet_waarde1' in vitals.columns:
#         vitals['resp_score'] = 0
#         vitals.loc[vitals['resp_meet_waarde1'] <= 8, 'resp_score'] = 3
#         vitals.loc[vitals['resp_meet_waarde1'].between(9, 11), 'resp_score'] = 1
#         vitals.loc[vitals['resp_meet_waarde1'].between(21, 24), 'resp_score'] = 2
#         vitals.loc[vitals['resp_meet_waarde1'] >= 25, 'resp_score'] = 3
    
#     # o2
#     if 'o2_meet_waarde1' in vitals.columns:
#         vitals['o2_score'] = 0
#         vitals.loc[vitals['o2_meet_waarde1'] <= 91, 'o2_score'] = 3
#         vitals.loc[vitals['o2_meet_waarde1'].between(92, 93), 'o2_score'] = 2
#         vitals.loc[vitals['o2_meet_waarde1'].between(94, 95), 'o2_score'] = 1
    
#     # temperature
#     if 'temp_meet_waarde1' in vitals.columns:
#         vitals['temp_score'] = 0
#         vitals.loc[vitals['temp_meet_waarde1'] <= 35, 'temp_score'] = 3
#         vitals.loc[vitals['temp_meet_waarde1'].between(35.1, 36), 'temp_score'] = 1
#         vitals.loc[vitals['temp_meet_waarde1'].between(38.1, 39), 'temp_score'] = 1
#         vitals.loc[vitals['temp_meet_waarde1'] >= 39.1, 'temp_score'] = 2
    
#     # systolic blood pressure (nibp_meet_waarde1)
#     if 'nibp_meet_waarde1' in vitals.columns:
#         vitals['nibp1_score'] = 0
#         vitals.loc[vitals['nibp_meet_waarde1'] <= 90, 'nibp1_score'] = 3
#         vitals.loc[vitals['nibp_meet_waarde1'].between(91, 100), 'nibp1_score'] = 2
#         vitals.loc[vitals['nibp_meet_waarde1'].between(101, 110), 'nibp1_score'] = 1
#         vitals.loc[vitals['nibp_meet_waarde1'] >= 220, 'nibp1_score'] = 3
    
#     # heart rate
#     if 'hr_meet_waarde1' in vitals.columns:
#         vitals['hr_score'] = 0
#         vitals.loc[vitals['hr_meet_waarde1'] <= 40, 'hr_score'] = 3
#         vitals.loc[vitals['hr_meet_waarde1'].between(41, 50), 'hr_score'] = 1
#         vitals.loc[vitals['hr_meet_waarde1'].between(91, 110), 'hr_score'] = 1
#         vitals.loc[vitals['hr_meet_waarde1'].between(111, 130), 'hr_score'] = 2
#         vitals.loc[vitals['hr_meet_waarde1'] >= 131, 'hr_score'] = 3
        
#     # spo2
#     if 'spo2_meet_waarde1' in vitals.columns:
#         vitals['spo2_score'] = 0
#         vitals.loc[vitals['spo2_meet_waarde1'] != 0, 'spo2_score'] = 2
    
#     # calculate the total NEWS2 score
#     cols = ['resp_score', 'o2_score', 'temp_score', 'nibp1_score', 'hr_score', 'avpu_score', 'spo2_score']
#     score = [col for col in cols if col in vitals.columns]
#     vitals['news2_score'] = vitals[score].sum(axis=1)
    
#     return vitals

# calculate_news2(copy_vitals)

In [None]:
# f = calculate_news2(copy_vitals)
# f['news2_score'].value_counts()

https://www.nice.org.uk/advice/mib205/chapter/The-technology#:~:text=Medium%20risk%20(aggregate%20score%205,to%20higher%2Ddependency%20care%20area.

Low risk (aggregate score 1 to 4) – prompt assessment by ward nurse to decide on change to frequency of monitoring or escalation of clinical care.

Medium risk (aggregate score 5 to 6) – urgent review by ward-based doctor or acute team nurse to decide on escalation to critical care team.

High risk (aggregate score of 7 or over) – emergency assessment by critical care team, usually leading to patient transfer to higher-dependency care area.

So deterioration is defined as a transition from low risk to medium risk, or from medium risk to high risk.

In [None]:
# copy_vitals['news2_low_risk'] = copy_vitals['news2_score'].between(1, 4, inclusive='both')
# copy_vitals['news2_medium_risk'] = copy_vitals['news2_score'].between(5, 6, inclusive='both')
# copy_vitals['news2_high_risk'] = copy_vitals['news2_score'] >= 7

In [None]:
# copy_vitals.head()

In [None]:
# copy_vitals = copy_vitals.sort_values(['p_id', 'opname_id', 'meting_datum_tijd']).reset_index(drop=True)

# grouped = copy_vitals.groupby(['p_id', 'opname_id'])

# copy_vitals['prev_low_risk'] = grouped['news2_low_risk'].shift(1)
# copy_vitals['prev_medium_risk'] = grouped['news2_medium_risk'].shift(1)
# copy_vitals['prev_meting_datum_tijd'] = grouped['meting_datum_tijd'].shift(1)

# copy_vitals['time_diff_hours'] = (
#     copy_vitals['meting_datum_tijd'] - copy_vitals['prev_meting_datum_tijd']
# ).dt.total_seconds() / 3600

# copy_vitals['is_vital_worse_12h'] = (
#     (copy_vitals['time_diff_hours'] <= 12) &
#     ((copy_vitals['prev_low_risk'] & copy_vitals['news2_medium_risk']) |
#         (copy_vitals['prev_medium_risk'] & copy_vitals['news2_high_risk'])))

# copy_vitals.head()

In [None]:
# copy_vitals[(copy_vitals['is_vital_worse_12h']==True) & (copy_vitals['news2_high_risk']==True)]

In [None]:
# op1 = copy_vitals[(copy_vitals['is_vital_worse_12h']==True) & (copy_vitals['news2_high_risk']==True)].iloc[0]['opname_id']

In [None]:
# copy_vitals[copy_vitals['opname_id']==op1]

# Converting data types

Columns are converted to smaller data types to reduce memory usage.

In [None]:
copy_vitals.info()

In [None]:
copy_surgery.info()

In [None]:
copy_reanimatie.info()

In [None]:
copy_lab.info()

In [None]:
copy_ic.info()

In [None]:
copy_demo.info()

In [None]:
# check which cols don't have NaN values (float)
copy_lab.columns[copy_lab.notna().all()]

In [None]:
# Vitals: compare how float32 and float16 represent a value that was rounded
# to 1 decimal place (digits after the point) as the integer part grows.
# Observation from the printed pairs: once the integer part has more than
# 2 digits, float32 is the better choice.
for sample in (4.59, 45.59, 455.59):
    o = round(sample, 1)
    o_float32 = np.float32(o)
    o_float_n = o_float32.astype('float16')

    print(o_float32)
    print(o_float_n)

In [None]:
# Lab: compare how float32 and float16 represent a value that was rounded
# to 2 decimal places (digits after the point) as the integer part grows.
# Observation from the printed pairs: once the integer part has more than
# 2 digits, float32 is the better choice.
for sample in (-1.188, 1.188, 11.188, 111.188, 1111.188):
    o = round(sample, 2)
    o_float32 = np.float32(o)
    o_float_n = o_float32.astype('float16')

    print(o_float32)
    print(o_float_n)

In [None]:
# float precision: approximate number of decimal digits each type is precise to
precision_float16 = np.finfo(np.float16).precision
precision_float32 = np.finfo(np.float32).precision
precision_float64 = np.finfo(np.float64).precision
# np.float128 is not defined on every platform (it raises AttributeError where
# it is missing); np.longdouble always exists and is the type that np.float128
# aliases on platforms that provide it
precision_float128 = np.finfo(np.longdouble).precision

print(precision_float16)
print(precision_float32)
print(precision_float64)
print(precision_float128)

In [None]:
# print the column-wise minimum and maximum of the vitals and lab frames
for frame in (copy_vitals, copy_lab):
    print(frame.min())
    print(frame.max())

In [None]:
copy_vitals

In [None]:
copy_vitals['nibp_meet_waarde3'][0]

In [None]:
def convert_dtypes(df, decimal_places=None):
    """Downcast the numeric columns of ``df`` to smaller dtypes.

    Float columns are optionally rounded and then stored as ``float32``.
    Integer columns are stored in the smallest nullable integer dtype whose
    range holds the column's minimum and maximum: ``Int8``/``Int16`` when the
    column contains a negative value, ``UInt8``/``UInt16`` otherwise. Integer
    columns that do not fit in 16 bits, integer columns without any value
    (empty or all missing) and all non-float, non-integer columns are left
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Its columns are replaced in place.
    decimal_places : int, optional
        When given, float columns are rounded to this many decimals before
        the cast. ``None`` (the default) skips the rounding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # candidate (pandas nullable dtype, numpy type used for its limits),
    # ordered from smallest to largest
    signed = (('Int8', np.int8), ('Int16', np.int16))
    unsigned = (('UInt8', np.uint8), ('UInt16', np.uint16))

    for col in df.columns:
        series = df[col]

        # only the declared dtype is inspected: a float column that happens
        # to hold whole numbers is still treated as float
        if pd.api.types.is_float_dtype(series):
            if decimal_places is not None:
                series = series.round(decimal_places)

            # always float32: per the original author float16 does not work
            # on every machine, so no digit-count based choice is made (the
            # old digit computation also crashed on all-NaN columns)
            df[col] = series.astype(np.float32)

        elif pd.api.types.is_integer_dtype(series):
            col_min = series.min()
            col_max = series.max()

            # an empty or all-missing column has no range to size the dtype by
            if pd.isna(col_min) or pd.isna(col_max):
                continue

            # a negative value requires a signed type
            candidates = signed if col_min < 0 else unsigned
            for dtype_name, np_type in candidates:
                limits = np.iinfo(np_type)
                if limits.min <= col_min and col_max <= limits.max:
                    df[col] = series.astype(dtype_name)
                    break

    # return the converted dataframe
    return df

In [None]:
# convert the data types of all dataframes; each name is rebound to the frame
# returned by convert_dtypes
# vitals floats are rounded to 1 decimal place and lab floats to 2;
# the remaining frames are converted without rounding
copy_vitals = convert_dtypes(copy_vitals, decimal_places = 1)
copy_surgery = convert_dtypes(copy_surgery)
copy_reanimatie = convert_dtypes(copy_reanimatie)
copy_lab = convert_dtypes(copy_lab, decimal_places = 2)
copy_ic = convert_dtypes(copy_ic)
copy_demo = convert_dtypes(copy_demo)

In [None]:
# check that the value is now rounded to 1 decimal place
copy_vitals['nibp_meet_waarde3'][0]

In [None]:
# collect the six frames again and report their combined in-memory size
dataframes = [copy_vitals, copy_surgery, copy_reanimatie, copy_lab, copy_ic, copy_demo]

# memory_usage() reports bytes per column; deep=True also measures the
# contents of object columns. Divide by 1024**2 to express the size in MB.
bytes_per_mb = 1024 ** 2
total_memory = sum(
    frame.memory_usage(deep=True).sum() / bytes_per_mb for frame in dataframes
)

print(f"Total memory usage: {total_memory} MB")

In [None]:
copy_vitals.head()

In [None]:
# display-only setting: it changes how floats are printed, not the stored data
# 3 digits are shown rather than 2 so that the display does not round the
# already-rounded values a second time
# the values are stored as binary floats, so the printed number can still
# differ slightly from the rounded decimal value
pd.set_option('display.precision', 3)
copy_lab

In [None]:
copy_vitals['nibp_meet_waarde3'][0]

# Extra Cleaning

In [None]:
# lab column names: strip parentheses, then turn hyphens into underscores
paren_free = copy_lab.columns.str.replace(r'[\(\)]', '', regex=True)
copy_lab.columns = paren_free.str.replace('-', '_')
copy_lab.info()

In [None]:
copy_vitals.info()

In [None]:
copy_surgery.info()

In [None]:
copy_reanimatie.info()

In [None]:
copy_lab.info()

In [None]:
copy_ic.info()

In [None]:
copy_demo.info()

In [None]:
# copy_vitals = copy_vitals.drop('meting_datum_tijd', axis=1)

# # because it's 0 now
# copy_vitals = copy_vitals.drop('meting_datum_tijd_minute', axis=1)

# copy_reanimatie = copy_reanimatie.drop('vanaf_datum', axis=1)

# copy_lab = copy_lab.drop('lab_datum_tijd', axis=1)

# copy_ic = copy_ic.drop('ic_6hr', axis=1)

# copy_demo = copy_demo.drop(['overlijdens_datum', 'overlijdens_datum_hour', 'overlijdens_datum_minute'], axis=1)

In [None]:
# count columns: treat a missing count as 0 and store as nullable UInt8
count_cols = ['count_death_fullcode', 'count_death_ic', 'count_ic_6hr', 'count_acute_ic']
copy_demo[count_cols] = copy_demo[count_cols].fillna(0).astype('UInt8')

In [None]:
copy_demo.info()

# Save dataframes to parquet   
Note: saving to parquet somehow messes up the `datetime64` columns.

In [None]:
dataframes = [copy_vitals, copy_surgery, copy_reanimatie, copy_lab, copy_ic, copy_demo]

In [None]:
# output file names, in the same order as the frames in `dataframes`
titles = ["vitals", "surgery", "reanimatie", "lab", "ic_opnames", "demographics"]

# specify the output directory for saving parquet files
output_dir = '..'

# create the folder if it does not exist; exist_ok avoids the separate
# exists-check, which could race with another process creating the folder
os.makedirs(output_dir, exist_ok=True)

# save each dataframe as a parquet file
for df, title in zip(dataframes, titles):
    output_path = os.path.join(output_dir, f"{title}.parquet")

    # existing files are never overwritten; report the skip so that a stale
    # file from an earlier run does not go unnoticed
    if os.path.isfile(output_path):
        print(f"Skipped {title}: {output_path} already exists.")
        continue

    try:
        # save dataframe to parquet file
        df.to_parquet(output_path)
    except Exception as e:
        # best effort: report the failure and carry on with the next frame
        print(f"Failed to save {title}: {e}.")

In [None]:
# read the saved vitals file back from the output directory to inspect it
v = pd.read_parquet(output_dir + '/vitals.parquet')
v.head()

In [None]:
v.info()

In [None]:
v['nibp_meet_waarde3'][0]

In [None]:
copy_vitals.columns

In [None]:
copy_demo['opname_datum_tijd']

In [None]:
copy_demo['opname_datum_tijd'].sort_values()