# Teil 4 Demo 2: Abfragen mit Tidyverse

## Bibliotheken & Konfiguration

In [1]:
# Packages this notebook depends on.
packages <- c("readr", "dplyr", "stringr", "tidyr")

# Install only the packages that are not available yet; skip the call
# entirely when nothing is missing.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
    install.packages(missing_packages)
}

# Attach with library() instead of require(): library() raises an error
# immediately when a package cannot be loaded, whereas require() only
# returns FALSE and lets the notebook fail later at the first use.
for (pkg in packages) {
    library(pkg, character.only = TRUE)
}

# Location of the MIMIC-III demo CSV files used in the cells below.
base_url <- "https://raw.githubusercontent.com/ganslats/TMF-School-2022-Block-4/master/Rohdaten/mimic-iii-demo/"

Loading required package: readr

Loading required package: dplyr


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: stringr

Loading required package: tidyr



## Ausgewählte MIMIC III-Rohdaten laden

In [2]:
# Read the three MIMIC-III demo tables from base_url. The column types are
# spelled out explicitly so that readr does not have to guess them.
mimic.patients.raw <- read_csv(
    file = paste0(base_url, "PATIENTS.csv"),
    col_types = cols(
        row_id      = col_double(),
        subject_id  = col_double(),
        gender      = col_character(),
        dob         = col_datetime(format = ""),
        dod         = col_datetime(format = ""),
        dod_hosp    = col_datetime(format = ""),
        dod_ssn     = col_datetime(format = ""),
        expire_flag = col_double()
    )
)

mimic.admissions.raw <- read_csv(
    file = paste0(base_url, "ADMISSIONS.csv"),
    col_types = cols(
        row_id               = col_double(),
        subject_id           = col_double(),
        hadm_id              = col_double(),
        admittime            = col_datetime(format = ""),
        dischtime            = col_datetime(format = ""),
        deathtime            = col_datetime(format = ""),
        admission_type       = col_character(),
        admission_location   = col_character(),
        discharge_location   = col_character(),
        insurance            = col_character(),
        language             = col_character(),
        religion             = col_character(),
        marital_status       = col_character(),
        ethnicity            = col_character(),
        edregtime            = col_datetime(format = ""),
        edouttime            = col_datetime(format = ""),
        diagnosis            = col_character(),
        hospital_expire_flag = col_double(),
        has_chartevents_data = col_double()
    )
)

mimic.prescriptions.raw <- read_csv(
    file = paste0(base_url, "PRESCRIPTIONS.csv"),
    col_types = cols(
        row_id            = col_double(),
        subject_id        = col_double(),
        hadm_id           = col_double(),
        icustay_id        = col_double(),
        startdate         = col_datetime(format = ""),
        enddate           = col_datetime(format = ""),
        drug_type         = col_character(),
        drug              = col_character(),
        drug_name_poe     = col_character(),
        drug_name_generic = col_character(),
        formulary_drug_cd = col_character(),
        gsn               = col_character(),
        ndc               = col_character(),
        prod_strength     = col_character(),
        dose_val_rx       = col_character(),
        dose_unit_rx      = col_character(),
        form_val_disp     = col_character(),
        form_unit_disp    = col_character(),
        route             = col_character()
    )
)

# Uncomment to take a quick look at the loaded tables:
#head(mimic.patients.raw)
#head(mimic.admissions.raw)
#head(mimic.prescriptions.raw)

## Alle Spalten eines Tibble abfragen

In [3]:
# Show the first rows of the patients table with all of its columns.
mimic.patients.raw %>% head()

row_id,subject_id,gender,dob,dod,dod_hosp,dod_ssn,expire_flag
<dbl>,<dbl>,<chr>,<dttm>,<dttm>,<dttm>,<dttm>,<dbl>
9467,10006,F,2094-03-05,2165-08-12,2165-08-12,2165-08-12,1
9472,10011,F,2090-06-05,2126-08-28,2126-08-28,,1
9474,10013,F,2038-09-03,2125-10-07,2125-10-07,2125-10-07,1
9478,10017,F,2075-09-21,2152-09-12,,2152-09-12,1
9479,10019,M,2114-06-20,2163-05-15,2163-05-15,2163-05-15,1
9486,10026,F,1895-05-17,2195-11-24,,2195-11-24,1


## Ausgewählte Spalten eines Tibble abfragen

In [4]:
# Keep only the subject_id and gender columns, then show the first rows.
mimic.patients.raw %>%
    select(subject_id, gender) %>%
    head()

subject_id,gender
<dbl>,<chr>
10006,F
10011,F
10013,F
10017,F
10019,M
10026,F


## Ausgewählte Zeilen eines Tibble abfragen

In [5]:
# Keep only the rows whose gender is "M", then show the first rows.
# Note: comparisons must use the double equals sign (==).
mimic.patients.raw %>%
    filter(gender == "M") %>%
    head()

row_id,subject_id,gender,dob,dod,dod_hosp,dod_ssn,expire_flag
<dbl>,<dbl>,<chr>,<dttm>,<dttm>,<dttm>,<dttm>,<dbl>
9479,10019,M,2114-06-20,2163-05-15,2163-05-15,2163-05-15,1
9489,10029,M,2061-04-10,2140-09-21,,2140-09-21,1
9491,10032,M,2050-03-29,2138-05-21,2138-05-21,2138-05-21,1
9494,10035,M,2053-04-13,2133-03-30,,2133-03-30,1
9501,10042,M,2076-05-06,2150-12-03,,2150-12-03,1
9502,10043,M,2109-04-07,2191-02-07,,2191-02-07,1


## Ergebnis auf eindeutige Datensätze reduzieren (DISTINCT)

In [6]:
# List each gender value once; distinct() on a single column returns just
# that column with duplicate values removed.
mimic.patients.raw %>%
    distinct(gender)

gender
<chr>
F
M


## Einfache Aggregation: Gesamtzahl der Datensätze eines Tibble abfragen

In [7]:
# Aggregate the whole table into a single row holding the row count n.
summarize(mimic.patients.raw, n = n())

n
<int>
100


## Aggregation & Gruppierung: Anzahl nach Geschlecht ermitteln

In [8]:
# Count the patients per gender.
# Note: the grouping column is always part of the summarised result. The
# .groups argument only controls whether the result is still grouped
# afterwards; "drop" returns a plain, ungrouped tibble so that later steps
# are not silently evaluated per group.
mimic.patients.raw %>%
    group_by(gender) %>%
    summarize(n = n(), .groups = "drop")

Your code contains a unicode char which cannot be displayed in your
current locale and R will silently convert it to an escaped form when the
R kernel executes this code. This can lead to subtle errors if you use
such chars to do comparisons. For more information, please see
https://github.com/IRkernel/repr/wiki/Problems-with-unicode-on-windows

gender,n
<chr>,<int>
F,55
M,45


## Zeilen nach der Aggregation filtern

In [9]:
# Count the patients per gender and keep only the genders with more than
# 50 patients. The filter runs on the aggregated rows, so the grouping is
# dropped first (.groups = "drop") to work on a plain tibble.
mimic.patients.raw %>%
    group_by(gender) %>%
    summarize(n = n(), .groups = "drop") %>%
    filter(n > 50)

gender,n
<chr>,<int>
F,55


## Tibbles per JOIN miteinander verknüpfen

In [10]:
# Combine every patient with their admissions via the shared subject_id
# key and show patient id, gender and diagnosis for the first rows.
mimic.patients.raw %>%
    inner_join(mimic.admissions.raw, by = "subject_id") %>%
    select(subject_id, gender, diagnosis) %>%
    head()

subject_id,gender,diagnosis
<dbl>,<chr>,<chr>
10006,F,SEPSIS
10011,F,HEPATITIS B
10013,F,SEPSIS
10017,F,HUMERAL FRACTURE
10019,M,ALCOHOLIC HEPATITIS
10026,F,STROKE/TIA


## Patient:innen mit mehr als einem Krankenhausaufenthalt (Aufnahme) abfragen

In [11]:
# Count the joined admission rows per patient and keep the patients that
# have more than one. The grouping is dropped after the aggregation
# (.groups = "drop") so that filter() and head() work on a plain tibble.
mimic.patients.raw %>%
    inner_join(mimic.admissions.raw, by = "subject_id") %>%
    group_by(subject_id) %>%
    summarize(n = n(), .groups = "drop") %>%
    filter(n > 1) %>%
    head()

subject_id,n
<dbl>,<int>
10059,2
10088,3
10094,2
10117,2
10119,2
10124,2


## 2 Subsets von Patient:innen für die Demo verschiedener Joins erzeugen

* Patient:innen mit Hauptdiagnose Sepsis
* Patient:innen mit Gabe von Vancomycin (Reserve-Antibiotikum z.B. bei MRSA)

In [12]:
# Ids of patients with an admission whose diagnosis text contains "sepsis"
# (matched case-insensitively by lower-casing the text first).
demo.sepsis <- mimic.admissions.raw %>%
    filter(str_detect(tolower(diagnosis), "sepsis")) %>%
    distinct(subject_id)

# Ids of patients with a prescription whose drug name contains
# "vancomycin" (same case-insensitive matching).
demo.vancomycin <- mimic.prescriptions.raw %>%
    filter(str_detect(tolower(drug), "vancomycin")) %>%
    distinct(subject_id)

## Patient:innen abfragen, die sowohl eine Sepsis als Hauptdiagnose hatten als auch Vancomycin erhalten haben (INNER JOIN)

In [13]:
# Inner join: only ids present in both subsets survive. keep = TRUE retains
# the key column of both sides, told apart by the two suffixes.
inner_join(
    demo.sepsis,
    demo.vancomycin,
    by = "subject_id",
    suffix = c("_sepsis", "_vanco"),
    keep = TRUE
)

subject_id_sepsis,subject_id_vanco
<dbl>,<dbl>
10006,10006
10036,10036
10056,10056
10088,10088
10094,10094
41976,41976


## Alle Patient:innen abfragen, die eine Sepsis als Hauptdiagnose hatten sowie die mit Vancomycingabe ergänzen (LEFT OUTER JOIN)

In [14]:
# Left join: every sepsis id is kept; the vancomycin key is NA where the
# patient has no matching row in demo.vancomycin.
left_join(
    demo.sepsis,
    demo.vancomycin,
    by = "subject_id",
    suffix = c("_sepsis", "_vanco"),
    keep = TRUE
)

subject_id_sepsis,subject_id_vanco
<dbl>,<dbl>
10006,10006.0
10013,
10036,10036.0
10056,10056.0
10088,10088.0
10094,10094.0
40601,
41976,41976.0


## Alle Patient:innen abfragen, die eine Vancomycingabe hatten, sowie die mit Sepsis als Hauptdiagnose ergänzen (RIGHT OUTER JOIN)

In [15]:
# Right join: every vancomycin id is kept; the sepsis key is NA where the
# patient has no matching row in demo.sepsis. Show the first 15 rows.
demo.sepsis %>%
    right_join(
        demo.vancomycin,
        by = "subject_id",
        suffix = c("_sepsis", "_vanco"),
        keep = TRUE
    ) %>%
    head(15)

subject_id_sepsis,subject_id_vanco
<dbl>,<dbl>
10006.0,10006
10036.0,10036
10056.0,10056
10088.0,10088
10094.0,10094
41976.0,41976
,43735
,10065
,44228
,10124


## Tatsächliche Hauptdiagnosen für die Patient:innen mit Vancomycingabe ohne Sepsis ermitteln

In [16]:
# Diagnoses of the admissions of patients who received vancomycin but are
# not in the sepsis subset, most frequent first (top 20).
# anti_join() keeps exactly the rows that have no match in demo.sepsis,
# which replaces the left join + is.na() filter on a helper key column.
# The grouping is dropped after the aggregation (.groups = "drop").
demo.vancomycin %>%
    inner_join(mimic.admissions.raw, by = "subject_id") %>%
    anti_join(demo.sepsis, by = "subject_id") %>%
    group_by(diagnosis) %>%
    summarize(n = n(), .groups = "drop") %>%
    arrange(desc(n)) %>%
    head(20)

diagnosis,n
<chr>,<int>
FEVER,3
ASTHMA;CHRONIC OBST PULM DISEASE,2
CONGESTIVE HEART FAILURE,2
FAILURE TO THRIVE,2
GASTROINTESTINAL BLEED,2
HYPOTENSION,2
LIVER FAILURE,2
PNEUMONIA,2
UPPER GI BLEED,2
ABDOMINAL PAIN,1
