# Medication and diagnosis visualization 

As part of phase 1.1 we explored the relation between medication and diagnosis using scatter plots of the % of patients with a diagnosis vs the % of patients on a medication. The plots focus only on values before admission, and each scatter plot includes two data series: one for never severe and another one for ever severe patients. Each dot represents a site and its value for never severe, while each triangle represents a site and its value for ever severe. The colors are based on the country of each site. 

In [None]:
#############
# LIBRARIES #
#############
library(dplyr)
library(tidyr)
library(ggplot2)

Set the working directory where the files are (using as input the files in "latest")

In [None]:
# NOTE(review): every relative path used later in this notebook (the data files
# listed from "./", "./mappingFiles/..." and "./toplot.RData") is resolved
# against the directory set here.
setwd("./4CE/phase1.1/latest/")

In [None]:
#function to create a list with the files to analyze
#
# path:           directory that is listed with list.files()
# pattern:        regular expression the file names have to match
# pediatricSites: character vector of site identifiers; a file is considered
#                 pediatric when its name matches any of them (the identifiers
#                 are combined with "|" and used as a regular expression)
# pediatric:      TRUE to keep only the pediatric files, otherwise the
#                 pediatric files are removed
#
# Returns a character vector with the selected file names (without the path).
fileList <- function( path, pattern, pediatricSites, pediatric ){
 
  fileListInput <- list.files( path = path, pattern = pattern)

  if( length( pediatricSites ) == 0 ){
    # collapsing an empty vector gives "", a pattern that matches every file
    # name; with no pediatric sites declared, no file is pediatric
    isPediatric <- rep( FALSE, length( fileListInput ) )
  }else{
    isPediatric <- grepl( paste( pediatricSites, collapse = "|"), x = fileListInput)
  }

  if( pediatric == TRUE){
    fileListInput  <- fileListInput[ isPediatric ]
  }else{
    fileListInput  <- fileListInput[ !isPediatric ]
  }
  return( fileListInput)
}

Create a list with all the files distinguishing between:
- Demographics data
- Medication data
- Diagnosis data

We will distinguish between pediatric and non pediatric (select_pediatric TRUE or FALSE)
The current sites with pediatric data are listed in the vector 'pediatricSites'

In [None]:
# identifiers of the sites that provide pediatric data
pediatricSites <- c("BCH", "CHOP", "RP401PED", "APHPPED")
# TRUE selects the pediatric files, FALSE the non pediatric ones
select_pediatric <- FALSE

# one list of input files per data type, all taken from the working directory
fileListDiag <- fileList(
  path = "./",
  pattern = "Diag",
  pediatricSites = pediatricSites,
  pediatric = select_pediatric
)
fileListMed <- fileList(
  path = "./",
  pattern = "Med",
  pediatricSites = pediatricSites,
  pediatric = select_pediatric
)
fileListDemog <- fileList(
  path = "./",
  pattern = "Demog",
  pediatricSites = pediatricSites,
  pediatric = select_pediatric
)

Put together all the diagnosis data from the distinct sites

In [None]:
#read every diagnosis file and stack them in a single data.frame
if( length( fileListDiag ) == 0 ){
  #1:length() would iterate over c(1, 0) and fail with an obscure read error
  stop( "No diagnosis files to read: 'fileListDiag' is empty", call. = FALSE )
}
diagnosisPerSite <- lapply( seq_along( fileListDiag ), function( i ){
  print(i)
  #all columns are read as character; the numeric conversion is done later
  selection <- read.delim( fileListDiag[i], sep = ",", colClasses = "character")
  colnames(selection) <- tolower( colnames( selection ) )
  selection
})
#bind once at the end instead of growing the data.frame at every iteration
allDiagnosis <- do.call( rbind, diagnosisPerSite )

#transform column names to know they are referring to diagnosis
#NOTE(review): assumes columns 4 to 7 are the patient count columns - confirm against the input files
colnames( allDiagnosis)[4:7] <- paste0( colnames( allDiagnosis)[4:7], "_diagnosis")

Put together all the medication data from the distinct sites

In [None]:
#read every medication file and stack them in a single data.frame
if( length( fileListMed ) == 0 ){
  #1:length() would iterate over c(1, 0) and fail with an obscure read error
  stop( "No medication files to read: 'fileListMed' is empty", call. = FALSE )
}
medicationPerSite <- lapply( seq_along( fileListMed ), function( i ){
  print(i)
  #all columns are read as character; the numeric conversion is done later
  selection <- read.delim( fileListMed[i], sep = ",", colClasses = "character")
  colnames(selection) <- tolower( colnames( selection ) )
  selection
})
#bind once at the end instead of growing the data.frame at every iteration
allMedication <- do.call( rbind, medicationPerSite )

#transform column names to know they are referring to medication
#NOTE(review): assumes columns 3 to 6 are the patient count columns - confirm against the input files
colnames( allMedication)[3:6] <- paste0( colnames( allMedication)[3:6], "_medication")

Put together all the demographic data from the distinct sites

In [None]:
#read every demographics file, keep only the overall row (sex, age group and
#race all equal to "all") and stack the sites in a single data.frame
if( length( fileListDemog ) == 0 ){
  #1:length() would iterate over c(1, 0) and fail with an obscure read error
  stop( "No demographics files to read: 'fileListDemog' is empty", call. = FALSE )
}
demographicsPerSite <- lapply( seq_along( fileListDemog ), function( i ){
  print(i)
  selection <- read.delim( fileListDemog[i], sep = ",", colClasses = "character")
  colnames( selection ) <- tolower( colnames( selection ))
  #%in% never returns NA, so a missing stratum value drops the row instead of
  #producing a row full of NA
  selection[ selection$sex %in% "all" &
               selection$age_group %in% "all" &
               selection$race %in% "all", ]
})
#bind once at the end instead of growing the data.frame at every iteration
allDemographics <- do.call( rbind, demographicsPerSite )

#estimate the total never severe per site and change the column names to know they refer to total counts per site
#the counts are read as character: convert them to numeric BEFORE looking for
#negative values, otherwise "< 0" is a locale dependent string comparison and
#a negative code could be used as a real count in the subtraction below
for( countColumn in c( "num_patients_all", "num_patients_ever_severe" ) ){
  counts <- as.numeric( allDemographics[[ countColumn ]] )
  counts[ !is.na( counts ) & counts < 0 ] <- NA
  allDemographics[[ countColumn ]] <- counts
}
allDemographics$num_patients_never_severe <- allDemographics$num_patients_all - allDemographics$num_patients_ever_severe
allDemographics<- allDemographics[, c("siteid", "num_patients_all", "num_patients_ever_severe", "num_patients_never_severe")]
colnames(allDemographics) <- c("siteid", "totalPatients", "totalEverSever", "totalNeverSevere")

Put all the information together, merging all data frames by siteid

In [None]:
#join diagnosis, medication and demographics, one merge on siteid after another
finalDataSet <- Reduce(
  function( left, right ) merge( left, right, by = "siteid" ),
  list( allDiagnosis, allMedication, allDemographics )
)

Transform obfuscated and unknown values (-99, -999) into NA

In [None]:
#positions of the count columns; the remaining columns are kept as character
#NOTE(review): positional indices assume the column layout produced by the merges above - confirm if the inputs change
countColumns <- c( 4:7, 9:15 )

#lapply always returns one element per column, whatever the number of rows
finalDataSet[ countColumns ] <- lapply( finalDataSet[ countColumns ], as.numeric )

#negative values are only meaningful for the count columns: testing the
#character columns with "< 0" would be a locale dependent string comparison
for( countColumn in countColumns ){
  counts <- finalDataSet[[ countColumn ]]
  counts[ !is.na( counts ) & counts < 0 ] <- NA
  finalDataSet[[ countColumn ]] <- counts
}

Put together all the ICSM datasets as "ICSM"

In [None]:
#rows that belong to one of the individual ICSM sets
icsmSites <- c( "ICSM1", "ICSM20", "ICSM5" )
isIcsmRow <- finalDataSet$siteid %in% icsmSites

#drop the first column (siteid) and add up every remaining column per
#diagnosis code, ICD version and medication class
#NOTE(review): all non grouping columns are summed, whatever they contain - confirm this is intended for every column
ICSMdata <- finalDataSet[ isIcsmRow, -1]
icsmGrouped <- group_by( ICSMdata, icd_code_3chars, icd_version, med_class )
icsmSummed <- summarise_all( icsmGrouped, sum )
ICSM <- mutate( icsmSummed, siteid = "ICSM" )

#remove the individual ICSM sets and add the aggregated one
ICSM <- as.data.frame( ICSM )
finalDataSet <- rbind( finalDataSet[ !isIcsmRow, ], ICSM )

Add the ICD description for each diagnosis, using the mapping file "2020AA_Icd9_Icd10_Dictionary.txt", extracted from UMLS2020AA

In [None]:
#ICD dictionary, every column read as character
mapping <- read.delim( file = "./mappingFiles/2020AA_Icd9_Icd10_Dictionary.txt",
                       colClasses = "character" )
names( mapping ) <- c( "icd", "icd_version", "description" )

#the join key is the pair ICD code + ICD version, so a code that exists in
#both ICD9 and ICD10 with a different meaning gets the right description
mapping[[ "icdPair" ]] <- paste( mapping[[ "icd" ]],
                                 mapping[[ "icd_version" ]],
                                 sep = "-" )
finalDataSet[[ "icdPair" ]] <- paste( finalDataSet[[ "icd_code_3chars" ]],
                                      finalDataSet[[ "icd_version" ]],
                                      sep = "-" )

toanalyze <- merge( x = finalDataSet, y = mapping, by = "icdPair" )

Add percentage estimations:
- ever severe and on X medication / total ever severe
- ever severe and with X disease / total ever severe
- never severe and on X medication / total never severe
- never severe and with X disease / total never severe

In [None]:
# percentage of 'numerator' over 'denominator', rounded to 3 decimals
percentOf <- function( numerator, denominator ){
  round( 100*( numerator / denominator ), 3)
}

# the new columns are added in this exact order

# ever severe and on X medication / total ever severe
toanalyze[[ "ratio_everSevereOnMedication_beforeAdmission" ]] <- percentOf(
  toanalyze[[ "num_patients_ever_severe_before_admission_medication" ]],
  toanalyze[[ "totalEverSever" ]]
)
# ever severe and with X disease / total ever severe
toanalyze[[ "ratio_everSevereWithDiagnosis_beforeAdmission" ]] <- percentOf(
  toanalyze[[ "num_patients_ever_severe_before_admission_diagnosis" ]],
  toanalyze[[ "totalEverSever" ]]
)

# never severe counts = all patients - ever severe patients
toanalyze[[ "num_patients_never_severe_before_admission_diagnosis" ]] <-
  toanalyze[[ "num_patients_all_before_admission_diagnosis" ]] -
  toanalyze[[ "num_patients_ever_severe_before_admission_diagnosis" ]]
toanalyze[[ "num_patients_never_severe_before_admission_medication" ]] <-
  toanalyze[[ "num_patients_all_before_admission_medication" ]] -
  toanalyze[[ "num_patients_ever_severe_before_admission_medication" ]]

# never severe and on X medication / total never severe
toanalyze[[ "ratio_nonSevereOnMedication_beforeAdmission" ]] <- percentOf(
  toanalyze[[ "num_patients_never_severe_before_admission_medication" ]],
  toanalyze[[ "totalNeverSevere" ]]
)
# never severe and with X disease / total never severe
toanalyze[[ "ratio_nonSevereWithDiagnosis_beforeAdmission" ]] <- percentOf(
  toanalyze[[ "num_patients_never_severe_before_admission_diagnosis" ]],
  toanalyze[[ "totalNeverSevere" ]]
)

Remove negative values and values > 100 that can appear due to obfuscation blurring

In [None]:
#negative values: only the numeric columns are checked, testing the character
#columns with "< 0" would be a locale dependent string comparison
for( columnName in names( toanalyze )[ vapply( toanalyze, is.numeric, logical(1) ) ] ){
  values <- toanalyze[[ columnName ]]
  values[ !is.na( values ) & values < 0 ] <- NA
  toanalyze[[ columnName ]] <- values
}

#percentages above 100: the percentage columns are selected by name, so the
#check does not depend on the position of the columns
ratioColumns <- c( "ratio_everSevereOnMedication_beforeAdmission",
                   "ratio_everSevereWithDiagnosis_beforeAdmission",
                   "ratio_nonSevereOnMedication_beforeAdmission",
                   "ratio_nonSevereWithDiagnosis_beforeAdmission" )
for( columnName in ratioColumns ){
  values <- toanalyze[[ columnName ]]
  values[ !is.na( values ) & values > 100 ] <- NA
  toanalyze[[ columnName ]] <- values
}

Add site mapping

In [None]:
#the site mapping file depends on the kind of sites that were selected
siteMapFile <- if( select_pediatric ){
  "./mappingFiles/SiteID_Map_Pediatric.csv"
}else{
  "./mappingFiles/SiteID_Map_None_Pediatric.csv"
}
siteMaping <- read.delim( siteMapFile, colClasses = "character", sep=",")

#siteid in the data corresponds to the Acronym column of the mapping file
toanalyze <- merge( x = toanalyze, y = siteMaping, by.x = "siteid", by.y = "Acronym")

Transform the format of data.frame to plot ever and never severe in the same plot

In [None]:
#columns shared by the ever severe and the never severe series
idColumns <- c( "siteid", "med_class", "description" )

#ever severe series: the prefix is removed from the percentage column names so
#both series end up with the same column names
everSevere <- toanalyze[ , c( idColumns,
                              "ratio_everSevereOnMedication_beforeAdmission",
                              "ratio_everSevereWithDiagnosis_beforeAdmission",
                              "Country.Color" ) ]
everSevere[[ "status" ]] <- "Severe"
names( everSevere ) <- gsub( "ratio_everSevere", "", names( everSevere ) )

#never severe series, same layout
neverSevere <- toanalyze[ , c( idColumns,
                               "ratio_nonSevereOnMedication_beforeAdmission",
                               "ratio_nonSevereWithDiagnosis_beforeAdmission",
                               "Country.Color" ) ]
neverSevere[[ "status" ]] <- "Never Severe"
names( neverSevere ) <- gsub( "ratio_nonSevere", "", names( neverSevere ) )

#stack both series, keep the rows without any missing value and save them
toplot <- rbind( everSevere, neverSevere )
toplot <- toplot[ complete.cases( toplot ), ]
save( toplot, file="./toplot.RData")

Example of the scatter plot generated. 

In [None]:
#medication class and diagnosis shown in the example plot
medication_class <- "COAGB"
diagnosis_group <- "Acute kidney failure"

datatoplot <- toplot[ toplot$description == diagnosis_group &
                        toplot$med_class == medication_class,]
datatoplot <- datatoplot[ complete.cases( datatoplot ), ]

#without rows max() returns -Inf and the axis limits below are not valid
if( nrow( datatoplot ) == 0 ){
  stop( "No complete data for medication class '", medication_class,
        "' and diagnosis '", diagnosis_group, "'", call. = FALSE )
}

#upper axis limits: x is the % with the diagnosis, y is the % on the medication
xlim <- max( c( datatoplot$WithDiagnosis_beforeAdmission), na.rm = TRUE)
ylim <- max( c( datatoplot$OnMedication_beforeAdmission), na.rm = TRUE)


#one point per site and severity status; the shape is mapped to the status and
#the color is taken from the Country.Color column
ggplot( data = datatoplot, mapping = aes( x = WithDiagnosis_beforeAdmission, 
                                          y = OnMedication_beforeAdmission)) +
  geom_point( aes(shape= factor( status)), size = 2, alpha = 0.5, 
              color = datatoplot$Country.Color)+
  geom_text(aes(label=siteid),hjust=0, vjust=0, size = 2)+
  theme(legend.position = "bottom", 
        plot.title = element_text(hjust = 0.5, size = 10), 
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8)) +
  #the axis titles follow the mapped variables: diagnosis on x, medication on y
  scale_x_continuous(name= paste0( "% patients with ", diagnosis_group), limits=c(0, xlim+1)) +
  scale_y_continuous(name= paste0( "% patients with ", medication_class," medication class"), limits=c(0, ylim+1)) +
  labs(title= paste0( medication_class, " - ", diagnosis_group ), shape = "Severity status")