## Diagnosis visualization
- We use the ACT ontology together with a mapping file that lets us translate ICD-9-CM codes to ICD-10-CM.
- The mapping file, `ACT_ICD10_ICD9_3_colmns_actual_nodeName.csv`, is available in the shared folder.

In [None]:
######################
## ICD descriptions ##
######################
# Load the ACT ICD10/ICD9 mapping file. Every column is read as character so
# the codes are kept as text and never converted to numbers.
ncats <- read.delim(file       = "./ACT_ICD10_ICD9_3_colmns_actual_nodeName.csv",
                    sep        = ",",
                    colClasses = "character",
                    header     = TRUE)

# Clean the file: drop rows without a code and rows whose code contains "-"
# (groups of diagnostics rather than individual codes).
# A logical mask is used on purpose: `ncats[-grep(...), ]` returns ZERO rows
# when nothing matches, because `-integer(0)` selects nothing.
ncats <- ncats[ncats$C_BASECODE != "", ]
ncats <- ncats[!grepl("-", ncats$C_BASECODE, fixed = TRUE), ]

# Split the diagnosis code column ("<version>:<code>") so that the code is on
# one side and the version on the other. vapply() guarantees a character
# vector even when the table is empty (sapply() would return a list).
baseCodeParts     <- strsplit(as.character(ncats$C_BASECODE), "[:]")
ncats$icd_code    <- vapply(baseCodeParts, `[`, character(1), 2)
ncats$icd_version <- vapply(baseCodeParts, `[`, character(1), 1)

# Strip the "CM" and "ICD" markers from the version text.
ncats$icd_version <- gsub("CM", "", ncats$icd_version)
ncats$icd_version <- gsub("ICD", "", ncats$icd_version)

# Create an additional column where the dots are removed from all the codes,
# since some sites provide the codes without dots.
ncats$codeNoDots <- gsub("[.]", "", ncats$icd_code)

# Create an additional column with the highest level of the hierarchy, to be
# able to group the different diagnoses in around 20 categories: take the
# third element of the backslash-separated C_FULLNAME path, keep only the
# text before the first "(", and trim surrounding whitespace.
fullNameParts  <- strsplit(as.character(ncats$C_FULLNAME), "[\\]")
ncats$Category <- vapply(fullNameParts, `[`, character(1), 3)
ncats$Category <- vapply(strsplit(as.character(ncats$Category), "[(]"),
                         `[`, character(1), 1)
ncats$Category <- trimws(ncats$Category)

In [None]:
# Map from ICD9 to ICD10 based on the hierarchy.
# `%in%` is used instead of `==` so that rows with a missing version are
# dropped instead of being turned into all-NA rows.
icd10 <- ncats[ncats$icd_version %in% "10", ]
icd9  <- ncats[ncats$icd_version %in% "9", ]

# ICD10 rows: the description is the LAST element of the C_FULLNAME path.
icd10Paths <- strsplit(as.character(icd10$C_FULLNAME), "[\\]")
icd10$description <- vapply(icd10Paths, function(path) {
  if (length(path) < 1L) NA_character_ else path[[length(path)]]
}, character(1))

# ICD9 rows: the description is the SECOND-TO-LAST element of the path.
# NOTE(review): presumably the ICD9 nodes hang below their ICD10 equivalent in
# the ACT hierarchy, so the parent node carries the ICD10 description - confirm.
# Computed in a single vectorised pass; paths that are too short yield NA
# instead of an error or a list column, and an empty table is handled.
icd9Paths <- strsplit(as.character(icd9$C_FULLNAME), "[\\]")
icd9$description <- vapply(icd9Paths, function(path) {
  if (length(path) < 2L) NA_character_ else path[[length(path) - 1L]]
}, character(1))

# Stack both versions back into a single lookup table.
totalDiagnosis <- rbind(icd9, icd10)

In [None]:
#####################
## Diagnosis files ##
#####################

# Denominator used for the percentages of the combined file (15,637 patients).
# Kept as a named constant so it can be updated in one place with a new data cut.
totalPatientsCombined <- 15637

# Columns shared by the combined and the per-country tables (same order, so
# that both tables can be stacked with rbind() at the end).
diagnosisColumns <- c("siteid", "icd_code", "icd_version", "num_patients")

# Combined counts. Everything is read as character; counts are converted with
# as.numeric() only where arithmetic is needed.
diagnosisCombined <- read.csv(file       = "./Diagnoses-Combined200405.csv",
                              sep        = ",",
                              header     = TRUE,
                              colClasses = "character")
diagnosisCombined <- diagnosisCombined[, diagnosisColumns]
diagnosisCombined$perc_patients <- round(
  100 * (as.numeric(diagnosisCombined$num_patients) / totalPatientsCombined), 2)

# Per-country counts.
diagnosisPerCountry <- read.csv(file       = "./Diagnoses-CombinedByCountry200405.csv",
                                sep        = ",",
                                header     = TRUE,
                                colClasses = "character")
diagnosisPerCountry <- diagnosisPerCountry[, diagnosisColumns]

# Per-country totals: keep only the rows where sex is "All".
# `%in%` keeps the filter NA-safe (a missing sex never produces an all-NA row).
# NOTE(review): if the demographics file holds more than one "All" row per
# siteid, the merge below will duplicate diagnosis rows - confirm.
demographicsPerCountry <- read.csv(file       = "./Demographics-CombinedByCountry200405.csv",
                                   sep        = ",",
                                   header     = TRUE,
                                   colClasses = "character")
demographicsPerCountry <- demographicsPerCountry[
  demographicsPerCountry$sex %in% "All", c("siteid", "total_patients")]

# Attach each country's total and derive the percentage of affected patients.
# (siteid is already character in both tables because of colClasses.)
diagnosisPerCountry <- merge(diagnosisPerCountry, demographicsPerCountry, by = "siteid")
diagnosisPerCountry$perc_patients <- round(
  100 * (as.numeric(diagnosisPerCountry$num_patients) /
           as.numeric(diagnosisPerCountry$total_patients)), 2)
diagnosisPerCountry <- diagnosisPerCountry[, c(diagnosisColumns, "perc_patients")]

# Single table holding the combined rows followed by the per-country rows.
diagnosis <- rbind(diagnosisCombined, diagnosisPerCountry)

In [None]:
###########################################################################
### Filter the diagnosis by number and percentage of affected patients ###
##########################################################################
# Thresholds: keep a row only when it has at least `numberOfPatientsFilter`
# patients AND more than `percentageFilter` percent of the patients.
numberOfPatientsFilter <- 10
percentageFilter       <- 0.1

# Both conditions are applied together. Filtering `diagnosis` a second time
# (instead of the already filtered table) would silently discard the
# patient-count filter. which() drops rows whose values are missing or
# non-numeric instead of turning them into all-NA rows.
enoughPatients   <- as.numeric(diagnosis$num_patients)  >= numberOfPatientsFilter
enoughPercentage <- as.numeric(diagnosis$perc_patients) >  percentageFilter
selection <- diagnosis[which(enoughPatients & enoughPercentage), ]

In [None]:
##################################################
## Heatmap with ICD description and categories ##
#################################################
# Lookup table: one row per distinct (code without dots, category, description).
icdMapping <- unique(totalDiagnosis[, c("codeNoDots", "Category", "description")])

# Attach category and description to every selected diagnosis. The join key is
# the code without dots; all.x = TRUE keeps diagnoses that have no mapping.
# NOTE(review): the join ignores icd_version, so an ICD9 and an ICD10 code that
# are identical once the dots are removed would be matched to each other - confirm
# whether icd_version should be part of the key.
selection$codeNoDots <- gsub("[.]", "", selection$icd_code)
selectionDesc <- merge(selection, icdMapping, by = "codeNoDots", all.x = TRUE)

# Plot label: the description when available, otherwise the raw ICD code.
selectionDesc$lbl <- ifelse(is.na(selectionDesc$description),
                            as.character(selectionDesc$icd_code),
                            as.character(selectionDesc$description))

# Full category name -> short label shown in the plots. Keeping each pair on
# one line prevents the levels and the labels from drifting apart. The vector
# order defines the factor level order. Categories not listed here become NA.
# (Fixes the misspelled labels "Pregnance, childbirth" and "Neoplams".)
categoryLabels <- c(
  "Certain conditions originating in the perinatal period" = "Perinatal",
  "Congenital malformations, deformations and chromosomal abnormalities" = "Congenital&Chromosomal",
  "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified" = "Symptoms",
  "Certain infectious and parasitic diseases" = "Infectious",
  "Factors influencing health status and contact with health services" = "Factors influencing health status",
  "Pregnancy, childbirth and the puerperium" = "Pregnancy, childbirth",
  "Diseases of the nervous system" = "Nervous",
  "Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism" = "Blood",
  "Diseases of the circulatory system" = "Circulatory",
  "Neoplasms" = "Neoplasms",
  "Diseases of the musculoskeletal system and connective tissue" = "Musculoskeletal",
  "Injury, poisoning and certain other consequences of external causes" = "Injury,poisoning",
  "Diseases of the digestive system" = "Digestive",
  "Diseases of the respiratory system" = "Respiratory",
  "Diseases of the genitourinary system" = "Genitourinary",
  "Endocrine, nutritional and metabolic diseases" = "Endocrine/metabolic",
  "Diseases of the ear and mastoid process" = "Ear and mastoid",
  "Diseases of the eye and adnexa" = "Eye and adnexa",
  "Mental and behavioral disorders" = "Mental",
  "Diseases of the skin and subcutaneous tissue" = "Skin",
  "External causes of morbidity" = "External causes"
)

selectionDesc$Category <- factor(selectionDesc$Category,
                                 levels = names(categoryLabels),
                                 labels = unname(categoryLabels))

In [None]:
###########################
## Heatmap representation #
###########################
# One tile per (diagnosis label, site). The tile colour identifies the site and
# its transparency encodes the percentage of patients. Diagnoses are grouped in
# one facet column per category; the individual x labels are hidden and only
# the rotated category strips are shown.
ggplot(
  selectionDesc,
  aes(x = lbl, y = siteid, alpha = perc_patients)
) +
  geom_tile(aes(fill = siteid), colour = "white") +
  scale_alpha(range = c(0.5, 1)) +
  scale_fill_manual(
    values = c(
      "Italy"    = "#009E73",
      "France"   = "#0072B2",
      "Germany"  = "#E69F00",
      "USA"      = "#D55E00",
      "Combined" = "#444444"
    )
  ) +
  facet_grid(. ~ Category, scales = "free", switch = "x") +
  coord_cartesian(expand = FALSE) +
  labs(title = "Diagnoses (Date 2020-04-05) | 12 sites | 15,637 patients ") +
  theme_bw() +
  theme(
    panel.grid   = element_blank(),
    plot.title   = element_text(size = 7),
    axis.text.y  = element_text(size = 5),
    axis.title.x = element_blank(),
    axis.text.x  = element_blank(),
    axis.ticks.x = element_blank(),
    strip.text.x = element_text(angle = 90, size = 5)
  )

In [None]:
##################################################
## Heatmap showing the specific diagnosis codes ##
##################################################
# One row per diagnosis, one column per site, one facet row per category.
# The tile colour identifies the site and its transparency encodes the
# percentage of patients.
# The y axis uses `lbl` (description, falling back to the ICD code) rather than
# `description`: diagnoses without a mapped description would otherwise all
# collapse into a single "NA" row. The axis title is kept as "description".
ggplot(selectionDesc, aes(y = lbl, x = siteid, fill = siteid, alpha = perc_patients)) +
  geom_tile() +
  scale_fill_manual(values = c("Italy" = "#009E73", "France" = "#0072B2", "Germany" = "#E69F00", "USA" = "#D55E00", "Combined" = "#444444")) +
  theme_bw() +
  theme(panel.grid   = element_blank(),
        axis.text.y  = element_text(size = 5),
        plot.title   = element_text(size = 7),
        strip.text.y = element_text(size = 5)) +
  coord_cartesian(expand = FALSE) +
  facet_grid(Category ~ ., scales = "free") +
  labs(title = "Diagnoses (Date 2020-04-05) | 12 sites | 15,637 patients ",
       y     = "description")



In [None]:
######################
## Barplot with ICD ##
######################
# Horizontal dodged bars: one bar per (diagnosis label, site), ordered by the
# percentage of patients.
# NOTE(review): `selectionByCountry` is not created in the cells shown here;
# it is assumed to be defined elsewhere with `lbl`, `perc_patients` and
# `siteid` columns - confirm.
# The minimum shown in the title is computed on the numeric value of
# `num_patients`: the column is stored as text, and min() on text compares
# alphabetically (e.g. "100" sorts before "9").
ggplot(data = selectionByCountry, aes(x = reorder(lbl, perc_patients), y = perc_patients)) +
  geom_bar(aes(fill = siteid), stat = "identity", position = position_dodge()) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text.y = element_text(size = 5)) +
  labs(title = paste0("Number of patients by diagnostic code (>=",
                      min(as.numeric(selection$num_patients), na.rm = TRUE),
                      " patients)"),
       x = "diagnostic code", y = "percentage of patients") +
  coord_flip() +
  scale_fill_manual("legend", values = c("Italy" = "#009E73", "France" = "#0072B2", "Germany" = "#E69F00", "USA" = "#D55E00", "Combined" = "#444444"))