### Descriptions:
COHORT:
* Conor's original cohort: 52,314
* Remove admit years of 2014 and 2020: 52,298
* Remove 8805 csn with non-full code or <18y.old: 43,493 
* Remove 173 csn without vital signs and GCS, and 29 csn with only GCS and no other vital signs: 43,291
* Remove 201 csn without any labels during hospital stays: 43,090
* Further remove 82 patients without any labels at 24 hour mark: 43,008 (**cohort_labels.csv**)
* **final**: remove all patients without a complete set of VS: 41,654

On BQ **cohort_final_with_labels_complete1VS** is our most updated cohort, size of 41654 unique encounters, and *cohort_labels* from Tiffany is marked as to_keep on BQ

JOIN all features together with the final cohort:

Inputs: cohort_final (processed in R2 notebook), cohort_demo_final (R1), vitals_clean (R2), labs_clean (R3)

* Combine cohort with demographic, vitals, and labs to the long format
* Use final cohort size of 41,654, only patients who are full code, 18yr or above, and have at least a complete set of 1st VS
* Demographics have indicators of missingness (ESI, H and W) and one-hot encoding for categorical variables (gender and race)

Output: 
* **features_demos_vitals_labs.csv** 3,308,906 rows in long format with anon_id, pat_enc_csn_id_coded (csn), label_24hr_recent, admit_time,
* with feature_type, features, values, and time (NA for demo, recorded for vitals and result for labs)

### Importing R libraries

In [None]:
library(caret) # import this before glmnet to avoid rlang version problem
library(xgboost)
library(data.table)
library(tidyverse)
library(lubridate)
library(Matrix)
# library(slam)
library(glmnet)
library(bit64)
# library(mtools) for one hot coder, not available on Nero or use caret or tidyr
library(mice)
options(repr.matrix.max.rows=200, repr.matrix.max.cols=40)

### Call back all datasets: 
* demographic with ESI
* vitals with GCS (note that this data has 43320 rows, but for ESI imputation, remove those with GCS as well, so we have 43291 left)
* labs, still with 9999999 values

In [None]:
# Earlier spot checks on a single patient, kept for reference:
# nrow(cohort_vitals_clean %>% filter(anon_id == "JCd49287")) #23
# cohort_demo_clean %>% filter(anon_id == "JCd49287")

# Load the final cohort and the cleaned demographic, vitals and labs tables.
cohort <- read.csv("./Data/cohort_final.csv")
# cohort_demo_final.csv replaces the older ./Data/cohort_demo_completed.csv
demos <- read.csv("./Data/cohort_demo_final.csv")
vitals <- read.csv("./Data/vitals_clean.csv")
labs <- read.csv("./Data/labs_clean.csv")

# Row counts of each input table.
nrow(cohort) # cohort final 41654
nrow(demos)
nrow(vitals) # 1,274,314
nrow(labs) # 1,368,351

# Distinct (anon_id, encounter) pairs present in each feature table.
nrow(distinct(demos, anon_id, pat_enc_csn_id_coded))
nrow(distinct(vitals, anon_id, pat_enc_csn_id_coded))
nrow(distinct(labs, anon_id, pat_enc_csn_id_coded)) # 39226

# Distinct cohort encounters that also appear in each feature table.
sum(unique(cohort$pat_enc_csn_id_coded) %in% demos$pat_enc_csn_id_coded)
sum(unique(cohort$pat_enc_csn_id_coded) %in% vitals$pat_enc_csn_id_coded)
sum(unique(cohort$pat_enc_csn_id_coded) %in% labs$pat_enc_csn_id_coded)

In [None]:
# Column names of each input table (to see which keys they share).
names(cohort)
names(demos)
names(vitals)
names(labs)

### OLD --- when Tiffany had a list of patients who had no labels throughout the hospital stays

### Remove patients in hospitals with missing levels of care
* 202: no labels at all for the entire hospital stays, smallest set
* 806: no levels at the admission level, adt table
* 136: no levels of care at 0 - 12hrs
* 82: no levels of care at the 24hrs

In [None]:
# NOTE(review): `no_labels` is not created anywhere in the visible part of this
# notebook; it has to be loaded beforehand or this (OLD) cell fails -- confirm.
# Example: JCdcafca and 131187786922, 20015518, 0, 2016-05-15 20:57:00+00:00
# is in the no_labels cohort (and Conor's cohort) but not in the updated cohort.
# noinco: cohort rows whose encounter id also appears in no_labels.
noinco <- cohort[cohort$pat_enc_csn_id_coded %in% no_labels$pat_enc_csn_id_coded,]
# Preview the no_labels rows whose encounter id is absent from noinco.
head(no_labels[!no_labels$pat_enc_csn_id_coded %in% noinco$pat_enc_csn_id_coded, ])

In [None]:
# Remove patients in the final cohort with missing labels: drop every cohort
# row that has a matching (anon_id, pat_enc_csn_id_coded) pair in no_labels.
# NOTE(review): `no_labels` is not created anywhere in the visible part of this
# notebook, and this cell overwrites `cohort` -- confirm before running (OLD).
cohort <- anti_join(cohort, no_labels, by = c("anon_id", "pat_enc_csn_id_coded"))
nrow(cohort) # 43291 - 202
head(cohort)

### OK to continue here

In [None]:
# Distribution of the recorded values, reported separately for each feature.
# `summary()` ignores dplyr grouping, so `group_by(features)` followed by
# `summary()` only produced one overall summary of the whole column;
# `tapply()` applies `summary()` to the values of each feature instead.
tapply(vitals$values, vitals$features, summary)
tapply(labs$values, labs$features, summary)

### Check cohort patients who are not in the vital signs table
Note that all NA were dropped from vital signs. Another approach is to keep and impute them for same time in wide format tables

In [None]:
# Drop the vital-sign columns from the demographic table.
demos <- select(demos, -c(SBP, DBP, Pulse, RR, SpO2, Temp))
names(demos)

In [None]:
# Shape and columns of the wide demographic table before reshaping.
dim(demos)
names(demos)

# Reshape to long format: every column from ESI_i through race.White becomes
# one (features, values) row; `features` is a factor in column order.
# The two unused columns are dropped and all rows are tagged as "demo".
demo_long <- demos %>%
  gather(key = features, value = values, ESI_i:race.White, factor_key = TRUE) %>%
  select(-c(inpatient_data_id_coded, label_max24)) %>%
  mutate(feature_type = "demo")

head(demo_long, 1)
dim(demos)
# Expected: nrow(demos) * number of gathered columns
# (the original note recorded 43291*29, cols with values).
nrow(demo_long)

In [None]:
# Overall distribution of the demographic values and row count per feature.
summary(demo_long[["values"]])
count(group_by(demo_long, features))

In [None]:
# First row of each long table, to compare their column layouts.
head(demo_long, 1)
head(vitals, 1)
head(labs, 1)

In [None]:
# Give vitals and labs the same column layout so they can be stacked:
# each table's own timestamp column is renamed to `time`.
vitals <- select(
  vitals,
  anon_id, pat_enc_csn_id_coded, admit_time,
  features, values, feature_type,
  time = recorded_time
)
labs <- select(
  labs,
  anon_id, pat_enc_csn_id_coded, admit_time,
  features, values, feature_type,
  time = result_time
)

In [None]:
# Smallest and largest lab values (first rows after sorting by `values`).
labs %>% arrange(values) %>% head()
labs %>% arrange(desc(values)) %>% head()

In [None]:
# combine demos, vitals and labs, long format, with "time"
feat3 <- bind_rows(demo_long, vitals, labs)
feat3 <- as.data.frame(unclass(feat3))
nrow(feat3)

In [None]:
# First and last row of the combined table.
head(feat3, 1)
tail(feat3, 1)

In [None]:
# Rows per feature type.
count(feat3, feature_type)

In [None]:
# Rows per (feature type, feature) combination.
count(group_by(feat3, feature_type, features))

In [None]:
# Column-wise summary of the combined table.
feat3 %>% summary()

In [None]:
# Reminder: labs and vitals still include patients whose only vital sign is a
# GCS, so these counts can exceed the final cohort size.
nrow(distinct(feat3, anon_id, pat_enc_csn_id_coded))
nrow(distinct(feat3, pat_enc_csn_id_coded))

### Explore -- GCS and ESI -- No need to redo
* GCS was not used to impute ESI because there are not many encounters with GCS score
* So encounters without any other VS but even with GCS are excluded from the data
* However, we check to see these encounters with GCS and they all have ESI
* We will not keep them in the data anyways, because if some don't have ESI, it's too many loops to include/exclude

### Join with the final cohort


In [None]:
# Sizes and a sample row of both tables before joining them.
nrow(feat3)
feat3 %>% select(anon_id, pat_enc_csn_id_coded) %>% distinct() %>% nrow()
head(feat3, 1)
nrow(cohort)
head(cohort, 1)

In [None]:
# Parse admit_time into date-times in both tables so the column has the same
# type on each side of the join.
cohort$admit_time <- ymd_hms(cohort$admit_time)
feat3$admit_time <- ymd_hms(feat3$admit_time)
head(cohort, 1)
head(feat3, 1)

In [None]:
# Attach the features to the final cohort. No `by` is given, so this is a
# natural join on every column name the two tables share; being a left join,
# every cohort row is kept.
final_feat3 <- cohort %>% left_join(feat3)
nrow(final_feat3) # 3,308,906
nrow(distinct(final_feat3, anon_id, pat_enc_csn_id_coded))

In [None]:
# Distinct (patient, encounter) pairs, encounters, and patients after the join.
nrow(distinct(final_feat3, anon_id, pat_enc_csn_id_coded))
length(unique(final_feat3$pat_enc_csn_id_coded))
length(unique(final_feat3$anon_id))

In [None]:
# First row of the joined table.
final_feat3 %>% head(n = 1)

In [None]:
# Rows per feature type in the joined table.
count(final_feat3, feature_type)

In [None]:
# Rows per (feature type, feature) combination in the joined table.
count(group_by(final_feat3, feature_type, features))

In [None]:
# Column-wise summary of the joined table.
final_feat3 %>% summary()

In [None]:
# Keep only the identifier, label, admission time and long-format feature
# columns, in the order used for the output file.
final_feat3 <- select(
  final_feat3,
  anon_id, pat_enc_csn_id_coded, label_24hr_recent, admit_time,
  feature_type, features, values, time
)
head(final_feat3)

In [None]:
# Last rows of the output table.
final_feat3 %>% tail()

In [None]:
# Save the long-format feature table without the row-name column.
write.csv(
  x = final_feat3,
  file = "./Data/features_demos_vitals_labs.csv",
  row.names = FALSE
)

### OLD --

In [None]:
# write.csv(cohort, file = "./Data/cohort_has_vs_hxlabels.csv", row.names=FALSE)

In [None]:
# read Tiffany's label
labels <- read.csv("./Data/labels.csv")
nrow(labels)

In [None]:
# One row of each table, to compare their columns before joining.
head(labels, 1)
head(cohort, 1)

In [None]:
# Column-wise summary and column names of the label table.
labels %>% summary()
names(labels)

In [None]:
# Attach the labels to the cohort (minus its admit_time column) with a natural
# left join on the shared column names, then give the label columns their
# final names.
new_cohort <- left_join(select(cohort, -admit_time), labels) %>%
  rename(
    label_24hr_recent = X_24hr_recent_label,
    label_12hr_recent = X_12hr_recent_label
  )
nrow(new_cohort)
summary(new_cohort)

In [None]:
# write.csv(new_cohort, "./Data/cohort_final_with_labels.csv", row.names = FALSE)

In [None]:
# Overlap checks between the encounter ids of cohort, new_cohort and feat3.
# `length(x %in% y)` always equals `length(x)`, regardless of how many ids
# match; `sum()` counts the TRUE entries, i.e. the elements of x found in y.
sum(cohort$pat_enc_csn_id_coded %in% new_cohort$pat_enc_csn_id_coded)
sum(new_cohort$pat_enc_csn_id_coded %in% cohort$pat_enc_csn_id_coded)
sum(new_cohort$pat_enc_csn_id_coded %in% feat3$pat_enc_csn_id_coded)
# feat3 is in long format, so this counts feature rows, not encounters;
# compare it with nrow(feat3) below.
sum(feat3$pat_enc_csn_id_coded %in% new_cohort$pat_enc_csn_id_coded)
nrow(feat3)