In [1]:
%reset -f
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, sqlite3, pickle, time, datetime, random, os
from dateutil.relativedelta import relativedelta

pd.options.display.float_format = '{:,.4f}'.format

# Load Processed parameters and data

In [2]:
##############################################################################
#######################################
### Relevant fields for pre-processing
#######################################
reference_date = datetime.datetime(1970,1,1)

##### Fields in table patients ##########
patient_field         = 'patient_id'
age_field             = 'age_at_admission\r'
sex_field             = 'sex'
ethnic_field          = 'ethnic_origin'
death_ind_field       = 'death_indicator'
death_date_field      = 'date_of_death'
death_date_code_field = 'date_code_of_death'
mort_in_hosp_field    = 'Thirty_day_mort'

##### Fields in table admissions ########
admission_field           = 'admission_id'
diagnosis_field           = 'episode_diagnoses'
admn_date_field           = 'admission_date_time'
admn_discharge_field      = 'discharge_date_time'
admn_date_code_field      = 'admission_date_code_time'
admn_discharge_code_field = 'discharge_date_code_time'
lengthofstay_field        = 'lengthofstay'
isPneumonia_field         = 'isPneumonia'
mortal_admin_field        = 'mortal_admin'
comorbidity_field         = 'Comorbidity_score'
icu_admin_field           = 'icu_count\r'
no_eobs_field             = 'no_obs_eobs'
no_haematology_field      = 'no_haematology_eobs'
meds_drug_field           = 'Drug_supplied'
specific_comorb_field     = 'Specific Comorbidity'
prev_admin_field          = 'had_Prev_admin'
has_spin_field            = 'adm_has_spin'
cubr65_field              = 'CURB65'
antibiot_4h_field         = 'antibiotic_4h'

 ##### Fields in table eObservations #####
eObs_time_field      = 'timestamp'
eObs_time_code_field = 'timestamp_code'
eObs_time_prev_obs   = 'time_since_prev_obs_in_mins'
eObs_resprate_field  = 'rr'
eObs_sbp_field       = 'sbp'
eObs_dbp_field       = 'dbp'
eObs_newscore_field  = 'ews'
eObs_heartrate_field = 'heart_rate'
eObs_temptr_field    = 'temperature\r'
eObs_oxygen_field    = 'Oxygen_Saturation'

##### Fields in table haematological tests #####
test_code_field      = 'local_test_code'
test_time_field      = 'sample_collection_date_time'
test_time_code_field = 'sample_collection_date_code_time'

In [3]:
data = pickle.load( open('DataFrame_pickles/df_patients_admissions_2016_18_v4.pickle', 'rb'))
df_patients   = data[0]
df_admissions = data[1]

df_eobs = pickle.load( open("DataFrame_pickles/df_eobs_3d.pickle", "rb" ))
df_haem = pickle.load( open('DataFrame_pickles/new_haematology_3d.pickle', 'rb')) 
df_haem = df_haem.rename(columns = {'sample_collection_date_time':'timestamp'})

# Selection of Blood Tests


In [4]:
df = pd.DataFrame(df_haem.count())
df = df.rename(columns = {0: 'Count'}).sort_values(by = ['Count'], ascending = False)
df['Per'] = df['Count'] / len(df_haem)
display(df)
BT_include = df[df['Per'] >= 0.7].index.tolist()[2:]
BT_include = ['CREA','UREA','K','GFR','WBC','PLT','HCT','HGB','RBC','MCH','MCV','NEUAB','TLYMAB',
              'EOSAB','MONAB','BASAB','ALB','ALP','BILI']

print("------------------------------------------------------")
print("Number of Blood Test to include in the merging dataset", len(BT_include))
print("")
print(BT_include)

Unnamed: 0,Count,Per
admission_id,123168,1.0
WBC,123168,1.0
MCV,123168,1.0
MCH,123168,1.0
timestamp,123168,1.0
HGB,123168,1.0
HCT,123168,1.0
RBC,123168,1.0
K,123168,1.0
UREA,123168,1.0


------------------------------------------------------
Number of Blood Test to include in the merging dataset 19

['CREA', 'UREA', 'K', 'GFR', 'WBC', 'PLT', 'HCT', 'HGB', 'RBC', 'MCH', 'MCV', 'NEUAB', 'TLYMAB', 'EOSAB', 'MONAB', 'BASAB', 'ALB', 'ALP', 'BILI']


In [5]:
df_haem_merge = df_haem[[admission_field, eObs_time_field] + BT_include]

In [6]:
display(df_haem_merge.describe())
display(df_eobs.describe())

Unnamed: 0,admission_id,timestamp,CREA,UREA,K,GFR,WBC,PLT,HCT,HGB,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
count,123168,123168,123168.0,123168.0,123168.0,123060.0,123168.0,123144.0,123168.0,123168.0,...,123168.0,123168.0,122076.0,122064.0,122088.0,122064.0,122016.0,119172.0,119100.0,117624.0
unique,10264,4091,15888.0,18220.0,5580.0,9524.0,18875.0,23202.0,19790.0,8950.0,...,14136.0,4939.0,46953.0,20935.0,9097.0,14268.0,3594.0,4447.0,15214.0,5728.0
top,99155969,2018-03-07 12:00:00,600.0,30.0,4.0,90.0015,3.0,90.0,0.362,115.0,...,30.0,90.0,20.0,0.3,0.01,0.5,0.02,38.0,400.0,6.0
freq,12,69,656.0,2309.0,3358.0,35979.0,1562.0,5129.0,374.0,971.0,...,867.0,3354.0,2788.0,1655.0,5347.0,1839.0,12999.0,4732.0,2309.0,5586.0


Unnamed: 0,Oxygen_Saturation,Assisted_O2
count,4017600.0,4017600.0
mean,95.7939,0.9873
std,2.5353,0.1121
min,60.0,0.0
25%,95.1053,1.0
50%,96.0,1.0
75%,97.1429,1.0
max,100.0,1.0


In [7]:
A = df_admissions[admission_field].unique().tolist()
B = df_eobs[admission_field].unique().tolist()
C = df_haem_merge[admission_field].unique().tolist()
print("No. admissions in admissions", len(A))
print("No. admissions in eobs", len(B))
print("No. admissions in haematology", len(C))

D = set(A)&set(B)&set(C)

print("No. admissions in all", len(D))
admissions_to_process = list(D)

No. admissions in admissions 18537
No. admissions in eobs 13950
No. admissions in haematology 10264
No. admissions in all 10264


In [8]:
df = pd.merge(df_eobs[df_eobs[admission_field].isin(admissions_to_process)], df_haem_merge, on = [admission_field, eObs_time_field],how="left")

In [9]:
df.describe()

Unnamed: 0,Oxygen_Saturation,Assisted_O2
count,2956032.0,2956032.0
mean,95.8487,0.993
std,2.5281,0.0836
min,60.0,0.0
25%,95.2,1.0
50%,96.0,1.0
75%,97.2222,1.0
max,100.0,1.0


### Merging check Point before interpolation

In [10]:
admin = 67762672
admin = 52573088

print("__________________________________________________________________________________________________________")
print("ADMISSION INFORMATION")
display(df_admissions[df_admissions[admission_field]==admin])

print("")
print("")
print("=========================== time series EOBS table ====================================")
display(df_eobs[df_eobs[admission_field] == admin].iloc[:4])
print("")
print("")
print("=========================== time series HAEMATOLOGY info table ====================================")
display(df_haem_merge[df_haem_merge[admission_field] == admin].iloc[:4])
print("")
print("")
print("=========================== time series HMIXED table ====================================")
display(df[df[admission_field] == admin].iloc[:4])

print("__________________________________________________________________________________________________________")
print("")
print("df_eobs, number of records:", len(df_eobs[df_eobs[admission_field] == admin]))

print("df_haem, number of records:", len(df_haem_merge[df_haem_merge[admission_field] == admin]))

print("df_mixed number of records", len(df[df[admission_field] == admin]))


__________________________________________________________________________________________________________
ADMISSION INFORMATION


Unnamed: 0,admission_id,episode_diagnoses,patient_id,age_at_admission\r,admission_date_time,admission_date_code_time,discharge_date_time,discharge_date_code_time,lengthofstay,isPneumonia,...,icu_count\r,Comorbidity_score,no_obs_eobs,no_haematology_eobs,receivedMedicines,Specific Comorbidity,had_Prev_admin,adm_has_spin,CURB65,antibiotic_4h
9234,52573088,"||A41.9,L97.X,K59.0,Y45.0,R33.X,I25.9,E11.9,E0...",39843456,80,2018-03-01 16:18:00,17591.6792,2018-03-16 11:30:00,17606.4792,14 days 19:12:00,1,...,0,6,80,10,0,1,1.0,,,






Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,Oxygen_Saturation,Assisted_O2
0,52573088,2018-03-01 11:15:00,15,20.0,1,71.0,38.3,17591.4668,113.0,57.0,96.0,1.0
1,52573088,2018-03-01 11:30:00,15,20.0,1,72.0,38.2,17591.4763,114.0,56.3333,95.8333,1.0
2,52573088,2018-03-01 11:45:00,15,20.0,1,73.0,38.1,17591.4861,115.0,55.6667,95.6667,1.0
3,52573088,2018-03-01 12:00:00,15,20.0,1,74.0,38.0,17591.496,116.0,55.0,95.5,1.0






Unnamed: 0,admission_id,timestamp,CREA,UREA,K,GFR,WBC,PLT,HCT,HGB,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
0,52573088,2018-03-01 12:00:00,62.0,3.5,4.8,85.0,19.6,332.0,0.338,111.0,...,30.6,94.0,18.23,0.53,0.23,0.61,0.05,38.0,93.0,12.0
1,52573088,2018-03-01 18:00:00,60.4167,3.4083,4.7,85.4168,18.775,331.9167,0.3373,110.75,...,30.6417,94.0833,17.3958,0.5658,0.2175,0.5983,0.0467,38.0,93.0,12.0
2,52573088,2018-03-02 00:00:00,58.8333,3.3167,4.6,85.8336,17.95,331.8333,0.3367,110.5,...,30.6833,94.1667,16.5617,0.6017,0.205,0.5867,0.0433,38.0,93.0,12.0
3,52573088,2018-03-02 06:00:00,57.25,3.225,4.5,86.2504,17.125,331.75,0.336,110.25,...,30.725,94.25,15.7275,0.6375,0.1925,0.575,0.04,38.0,93.0,12.0






Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
1154880,52573088,2018-03-01 11:15:00,15,20.0,1,71.0,38.3,17591.4668,113.0,57.0,...,,,,,,,,,,
1154881,52573088,2018-03-01 11:30:00,15,20.0,1,72.0,38.2,17591.4763,114.0,56.3333,...,,,,,,,,,,
1154882,52573088,2018-03-01 11:45:00,15,20.0,1,73.0,38.1,17591.4861,115.0,55.6667,...,,,,,,,,,,
1154883,52573088,2018-03-01 12:00:00,15,20.0,1,74.0,38.0,17591.496,116.0,55.0,...,30.6,94.0,18.23,0.53,0.23,0.61,0.05,38.0,93.0,12.0


__________________________________________________________________________________________________________

df_eobs, number of records: 288
df_haem, number of records: 12
df_mixed number of records 288


## Interpolation of Haematology Features

In [11]:
def interpolate_by_method(series, method, plot = 0):
    if ('polynomial' in method) or ('nearest' in method):
        interpolate = series.interpolate(method = method[:-1], order = int(method[-1]))
    else:
        interpolate = series.interpolate(method = method, order = method)
    if plot != 0: interpolate.plot()
    return interpolate

def fill_top_bottom_values(series):
    interp =series.copy()
    first_non_na = 0
    last_non_na  = 0
    for idx,val in interp.items():
        if (val != val) and (first_non_na == 0): # if val is nan and first_non_na = 0
            continue
        elif not (val != val) and (first_non_na == 0):
            first_non_na = idx       
        elif (val != val) and (first_non_na != 0) and (last_non_na == 0):
            last_non_na = idx - relativedelta(minutes = 15)
    
    if first_non_na !=0:
        for i in range(len(interp.loc[:first_non_na]) - 1):
            time_to_change  = first_non_na - relativedelta(minutes = 15 * (i+1))
            fifth_next_time = time_to_change + relativedelta(minutes = 15*10)
            interp.at[time_to_change] = interp.loc[time_to_change:fifth_next_time].mean()

    if last_non_na !=0:
        for i in range(len(interp.loc[last_non_na:])-1):
            time_to_change  = last_non_na + relativedelta(minutes = 15 * (i+1))
            fifth_prev_time = time_to_change - relativedelta(minutes = 15*10)
            interp.at[time_to_change] = interp.loc[fifth_prev_time:time_to_change].mean()
    return interp

In [12]:
t =time.time()
df_eobs_haem_mx = pd.DataFrame(columns = df.columns)
list_features = list(df_haem_merge.columns[2:])

for adm in df[admission_field].unique().tolist():

    df_eob_mx_adm   = df[df[admission_field] == adm].copy()
    upsamp_times    = df_eob_mx_adm[eObs_time_field].values
    
    for idx_feat, feature in enumerate(list_features):
        
        values  = df_eob_mx_adm[feature].values
        series_ = pd.Series(values, index=upsamp_times)
        if len(series_) - series_.isna().sum() < 4: continue
        interp  = interpolate_by_method(series_, 'linear')
        if len(interp) == interp.isna().sum(): continue
            
        interp = interp.fillna(method="bfill")    
        interp = interp.fillna(method="ffill")    
    
        #interp  = fill_top_bottom_values(interp)
        
        df_eob_mx_adm[feature] = interp.values
        
    df_eobs_haem_mx = pd.concat([df_eobs_haem_mx, df_eob_mx_adm])
   
print("elpased:", time.time()-t)

elpased: 7942.566093206406


In [13]:
# --------------------------------------------------------------------
#Reduce the time series to 144 samples in the three days. which means a reading every 30 minutes. 
# --------------------------------------------------------------------
df_eobs_haem_mx = df_eobs_haem_mx.iloc[::2]
df_eobs_haem_mx.reset_index(drop=True)

Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
0,74554266,2018-02-19 00:45:00,15,18.0000,0,78.0000,36.1000,17581.0378,120.0000,67.0000,...,31.7000,95.3333,17.1900,7.0000,0.0400,0.5000,,46.0000,198.0000,13.0000
1,74554266,2018-02-19 01:15:00,15,18.5000,0,74.7500,36.1250,17581.0592,121.2500,66.5000,...,31.7000,95.3333,17.1900,7.0000,0.0400,0.5000,,46.0000,198.0000,13.0000
2,74554266,2018-02-19 01:45:00,15,19.0000,0,71.5000,36.1500,17581.0805,122.5000,66.0000,...,31.7000,95.3333,17.1900,7.0000,0.0400,0.5000,,46.0000,198.0000,13.0000
3,74554266,2018-02-19 02:15:00,15,19.5000,1,68.2500,36.1750,17581.1018,123.7500,65.5000,...,31.7000,95.3333,17.1900,7.0000,0.0400,0.5000,,46.0000,198.0000,13.0000
4,74554266,2018-02-19 02:45:00,15,20.0000,1,65.0000,36.2000,17581.1231,125.0000,65.0000,...,31.7000,95.3333,17.1900,7.0000,0.0400,0.5000,,46.0000,198.0000,13.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478011,35965296,2017-10-09 11:30:00,15,17.2222,2,103.0556,36.8000,17448.4788,105.7222,76.7778,...,32.0000,92.0000,11.3500,1.1400,0.1600,0.6000,0.0200,41.0000,105.0000,41.0000
1478012,35965296,2017-10-09 12:00:00,15,17.0000,2,102.5000,36.8000,17448.4991,106.5000,77.0000,...,32.0000,92.0000,11.3500,1.1400,0.1600,0.6000,0.0200,41.0000,105.0000,41.0000
1478013,35965296,2017-10-09 12:30:00,15,16.7778,2,101.9444,36.8000,17448.5194,107.2778,77.2222,...,32.0000,92.0000,11.3500,1.1400,0.1600,0.6000,0.0200,41.0000,105.0000,41.0000
1478014,35965296,2017-10-09 13:00:00,15,16.5556,2,101.3889,36.8000,17448.5397,108.0556,77.4444,...,32.0000,92.0000,11.3500,1.1400,0.1600,0.6000,0.0200,41.0000,105.0000,41.0000


# Resumen

In [14]:
admin = 67762672
admin = 52573088


print("__________________________________________________________________________________________________________")
print("ADMISSION INFORMATION")
display(df_admissions[df_admissions[admission_field]==admin])

print("")
print("")
print("=========================== MIX time series BEFORE interpolation ====================================")
display(df[df[admission_field] == admin].iloc[:5])

print("")
print("")
print("=========================== MIX time series AFTER interpolation ====================================")
display(df_eobs_haem_mx[df_eobs_haem_mx[admission_field] == admin].iloc[:5])

print("__________________________________________________________________________________________________________")
print("")
print("Number of readings in timeseries BEFORE interpolation", len(df[df[admission_field] == admin]))
print("Number of readings in timeseries AFTER  interpolation", len(df_eobs_haem_mx[df_eobs_haem_mx[admission_field] == admin]))

__________________________________________________________________________________________________________
ADMISSION INFORMATION


Unnamed: 0,admission_id,episode_diagnoses,patient_id,age_at_admission\r,admission_date_time,admission_date_code_time,discharge_date_time,discharge_date_code_time,lengthofstay,isPneumonia,...,icu_count\r,Comorbidity_score,no_obs_eobs,no_haematology_eobs,receivedMedicines,Specific Comorbidity,had_Prev_admin,adm_has_spin,CURB65,antibiotic_4h
9234,52573088,"||A41.9,L97.X,K59.0,Y45.0,R33.X,I25.9,E11.9,E0...",39843456,80,2018-03-01 16:18:00,17591.6792,2018-03-16 11:30:00,17606.4792,14 days 19:12:00,1,...,0,6,80,10,0,1,1.0,,,






Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
1154880,52573088,2018-03-01 11:15:00,15,20.0,1,71.0,38.3,17591.4668,113.0,57.0,...,,,,,,,,,,
1154881,52573088,2018-03-01 11:30:00,15,20.0,1,72.0,38.2,17591.4763,114.0,56.3333,...,,,,,,,,,,
1154882,52573088,2018-03-01 11:45:00,15,20.0,1,73.0,38.1,17591.4861,115.0,55.6667,...,,,,,,,,,,
1154883,52573088,2018-03-01 12:00:00,15,20.0,1,74.0,38.0,17591.496,116.0,55.0,...,30.6,94.0,18.23,0.53,0.23,0.61,0.05,38.0,93.0,12.0
1154884,52573088,2018-03-01 12:15:00,15,20.0,1,75.0,37.9,17591.5062,117.0,54.3333,...,,,,,,,,,,






Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
1154880,52573088,2018-03-01 11:15:00,15,20.0,1,71.0,38.3,17591.4668,113.0,57.0,...,30.6,94.0,18.23,0.53,0.23,0.61,0.05,38.0,93.0,12.0
1154882,52573088,2018-03-01 11:45:00,15,20.0,1,73.0,38.1,17591.4861,115.0,55.6667,...,30.6,94.0,18.23,0.53,0.23,0.61,0.05,38.0,93.0,12.0
1154884,52573088,2018-03-01 12:15:00,15,20.0,1,75.0,37.9,17591.5062,117.0,54.3333,...,30.6417,94.0833,17.3958,0.5658,0.2175,0.5983,0.0467,38.0,93.0,12.0
1154886,52573088,2018-03-01 12:45:00,15,20.0,1,77.0,37.7,17591.5274,119.0,53.0,...,30.6417,94.0833,17.3958,0.5658,0.2175,0.5983,0.0467,38.0,93.0,12.0
1154888,52573088,2018-03-01 13:15:00,15,20.0,2,76.6,37.78,17591.55,110.2,52.2,...,30.6417,94.0833,17.3958,0.5658,0.2175,0.5983,0.0467,38.0,93.0,12.0


__________________________________________________________________________________________________________

Number of readings in timeseries BEFORE interpolation 288
Number of readings in timeseries AFTER  interpolation 144


# Add Confusion to data
The confusion field is a binary field computed from the Early warning score. 
Document NEWS2 Chart 1_The NEWS scoring system from [Link](https://www.rcplondon.ac.uk/projects/outputs/national-early-warning-score-news-2)

In [15]:
# -----------------------------------------------------------------
# Functions to find confusion
# -----------------------------------------------------------------
def ews_rr_score(rr_row):
    rr_score = 0
    if (rr_row >= 25) or (rr_row < 9):     rr_score = 3
    elif (21 <= rr_row) and (rr_row <= 24):rr_score = 2
    elif (9 <= rr_row) and (rr_row <= 11): rr_score = 1
    else: rr_score = 0
    return rr_score
# -----------------------------------------------------------------
# Oxygen saturation Score
def ews_ox_score(ox_row, as_ox_row):
    ox_score = 0
    if (ox_row <= 91): ox_score = 3
    elif (92 <= ox_row) and (ox_row <= 93): ox_score = 2
    elif (93 < ox_row) and (ox_row <= 95): ox_score  = 1
    else: ox_score = 0        
    if as_ox_row == 1: ox_score = ox_score + 2
    return ox_score

# -----------------------------------------------------------------
# Systolic blood presure score
def ews_sbp_score(sbp_row):
    sbp_score = 0
    if (sbp_row >= 220) or (sbp_row <= 90):    sbp_score = 3
    elif (91 <= sbp_row) and (sbp_row <= 100): sbp_score = 2
    elif (101 <= sbp_row) and (sbp_row <= 110):sbp_score = 1
    else: sbp_score = 0
    return sbp_score

# -----------------------------------------------------------------
# Heart rate score
def ews_hr_score(hr_row):
    hr_score = 0
    if (hr_row >= 131) or (hr_row <= 40):    hr_score = 3
    elif (111 <= hr_row) and (hr_row <= 130): hr_score = 2
    elif (91 <= hr_row) and (hr_row <= 110):hr_score = 1
    elif (41 <= hr_row) and (hr_row <= 50): hr_score = 1
    else: hr_score = 0
    return hr_score

# -----------------------------------------------------------------
# Temperature score
def ews_temp_score(temp_row):
    temp_score = 0
    if (temp_row <= 31.5):   temp_score = 3
    elif (39.1 <= temp_row): temp_score = 2
    elif (35.1 <= temp_row) and (temp_row <= 36):temp_score = 1
    elif (38.1 <= temp_row) and (temp_row <= 39): temp_score = 1
    else: temp_score = 0
    return temp_score

# -----------------------------------------------------------------
# Compute Confusion using the information of the Row
def confusion_from_row(row):
    confusion_row = 0
    ews_row = row['ews'] 
    rr_row = row['rr']
    ox_row    = row['Oxygen_Saturation']
    as_ox_row = row['Assisted_O2']
    sbp_row    = row['sbp']
    hr_row    = row['heart_rate']
    temp_row    = row['temperature\r']
    ews_score = ews_rr_score(rr_row)
    ews_score = ews_score + ews_ox_score(ox_row, as_ox_row)
    ews_score = ews_score + ews_sbp_score(sbp_row)
    ews_score = ews_score + ews_hr_score(hr_row)
    ews_score = ews_score + ews_temp_score(temp_row)
    if (abs(ews_score - ews_row) <= 3) and (abs(ews_score - ews_row) > 0): confusion_row = 1
    return confusion_row 

In [16]:
t = time.time()
df_eobs_haem_mx['Confusion'] = df_eobs_haem_mx.apply(lambda x:confusion_from_row(x), axis = 1)
col = df_eobs_haem_mx.pop('Confusion')
df_eobs_haem_mx.insert(12, col.name, col)

print("time elapsed: ", time.time() - t)

time elapsed:  43.91019582748413


In [17]:
df_eobs_haem_mx.columns

Index(['admission_id', 'timestamp', 'time_since_prev_obs_in_mins', 'rr', 'ews',
       'heart_rate', 'temperature\r', 'timestamp_code', 'sbp', 'dbp',
       'Oxygen_Saturation', 'Assisted_O2', 'Confusion', 'CREA', 'UREA', 'K',
       'GFR', 'WBC', 'PLT', 'HCT', 'HGB', 'RBC', 'MCH', 'MCV', 'NEUAB',
       'TLYMAB', 'EOSAB', 'MONAB', 'BASAB', 'ALB', 'ALP', 'BILI'],
      dtype='object')

In [18]:
df_eobs_haem_mx.head(5)

Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
0,74554266,2018-02-19 00:45:00,15,18.0,0,78.0,36.1,17581.0378,120.0,67.0,...,31.7,95.3333,17.19,7.0,0.04,0.5,,46.0,198.0,13.0
2,74554266,2018-02-19 01:15:00,15,18.5,0,74.75,36.125,17581.0592,121.25,66.5,...,31.7,95.3333,17.19,7.0,0.04,0.5,,46.0,198.0,13.0
4,74554266,2018-02-19 01:45:00,15,19.0,0,71.5,36.15,17581.0805,122.5,66.0,...,31.7,95.3333,17.19,7.0,0.04,0.5,,46.0,198.0,13.0
6,74554266,2018-02-19 02:15:00,15,19.5,1,68.25,36.175,17581.1018,123.75,65.5,...,31.7,95.3333,17.19,7.0,0.04,0.5,,46.0,198.0,13.0
8,74554266,2018-02-19 02:45:00,15,20.0,1,65.0,36.2,17581.1231,125.0,65.0,...,31.7,95.3333,17.19,7.0,0.04,0.5,,46.0,198.0,13.0


# Cleanning the final mixed data frame

In [19]:
df_eobs_haem_mx.isna().sum()

admission_id                        0
timestamp                           0
time_since_prev_obs_in_mins         0
rr                                  0
ews                                 0
heart_rate                          0
temperature\r                       0
timestamp_code                      0
sbp                                 0
dbp                                 0
Oxygen_Saturation                   0
Assisted_O2                         0
Confusion                           0
CREA                            46479
UREA                            46479
K                               46479
GFR                             47343
WBC                             46479
PLT                             46767
HCT                             46479
HGB                             46479
RBC                             46479
MCH                             46479
MCV                             46479
NEUAB                           59295
TLYMAB                          59439
EOSAB       

In [20]:
t =time.time()
df_eobs_haem_mx_v2 = pd.DataFrame(columns = df_eobs_haem_mx.columns)
for adm in df_eobs_haem_mx[admission_field].unique().tolist():
    df_adm = df_eobs_haem_mx[df_eobs_haem_mx[admission_field] == adm].copy()
    if df_adm.isna().sum().sum() == 0:
        df_eobs_haem_mx_v2 = pd.concat([df_eobs_haem_mx_v2, df_adm])
print("elpased:", time.time()-t)   

elpased: 5260.413067340851


In [21]:
df_eobs_haem_mx_v2.isna().sum()

admission_id                   0
timestamp                      0
time_since_prev_obs_in_mins    0
rr                             0
ews                            0
heart_rate                     0
temperature\r                  0
timestamp_code                 0
sbp                            0
dbp                            0
Oxygen_Saturation              0
Assisted_O2                    0
Confusion                      0
CREA                           0
UREA                           0
K                              0
GFR                            0
WBC                            0
PLT                            0
HCT                            0
HGB                            0
RBC                            0
MCH                            0
MCV                            0
NEUAB                          0
TLYMAB                         0
EOSAB                          0
MONAB                          0
BASAB                          0
ALB                            0
ALP       

In [22]:
df_eobs_haem_mx_v2.describe()

Unnamed: 0,admission_id,timestamp,time_since_prev_obs_in_mins,rr,ews,heart_rate,temperature\r,timestamp_code,sbp,dbp,...,MCH,MCV,NEUAB,TLYMAB,EOSAB,MONAB,BASAB,ALB,ALP,BILI
count,1352160,1352160,1352160,1352160.0,1352160,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0,...,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0,1352160.0
unique,9390,91239,1,51243.0,20,102516.0,78061.0,1352047.0,118023.0,98683.0,...,13270.0,4664.0,42978.0,19600.0,8535.0,13369.0,3423.0,4288.0,14502.0,5476.0
top,78793793,2017-12-30 08:15:00,15,18.0,0,90.0,36.5,17123.862,111.0,68.0,...,30.0,90.0,20.0,0.3,0.01,0.5,0.02,36.0,400.0,6.0
freq,144,40,1352160,283666.0,361903,21374.0,23856.0,2.0,7273.0,9426.0,...,8961.0,34550.0,24867.0,17243.0,55036.0,20841.0,138295.0,51335.0,25471.0,63280.0


In [23]:
len(df_eobs_haem_mx_v2[admission_field].unique().tolist())

9390

In [24]:
features = df_eobs_haem_mx_v2.columns[2:]
for feat in features:
    if df_eobs_haem_mx_v2[feat].dtypes == 'O':
        df_eobs_haem_mx_v2[feat] = pd.to_numeric(df_eobs_haem_mx_v2[feat])
pickle.dump([df_eobs_haem_mx_v2], open('DataFrame_pickles/df_eobs_heam_mixed_2016_2018_V2.pickle', 'wb'))

In [25]:
t =time.time()
df_eobs_haem_mx_v3 = pd.DataFrame(columns = df_eobs_haem_mx.columns[:15])
for adm in df_eobs_haem_mx[admission_field].unique().tolist():
    df_adm = df_eobs_haem_mx[df_eobs_haem_mx[admission_field] == adm][df_eobs_haem_mx.columns[:15]].copy()
    if df_adm.isna().sum().sum() == 0:
        df_eobs_haem_mx_v3 = pd.concat([df_eobs_haem_mx_v3, df_adm])
print("elpased:", time.time()-t)   

elpased: 2977.750351190567


In [26]:
len(df_eobs_haem_mx_v3[admission_field].unique().tolist())

9941

In [27]:
features = df_eobs_haem_mx_v3.columns[2:]
for feat in features:
    if df_eobs_haem_mx_v3[feat].dtypes == 'O':
        df_eobs_haem_mx_v3[feat] = pd.to_numeric(df_eobs_haem_mx_v3[feat])
pickle.dump([df_eobs_haem_mx_v3], open('DataFrame_pickles/df_eobs_heam_mixed_2019_2020_V3.pickle', 'wb'))