# Demographics Summary Pediatrics

This notebook contains the R code to estimate the patient summary in terms of:
- per site  (severe vs total cases)
- per sex
- per age group


First, we set the path where all the files are located and load the required R libraries.

In [None]:
# Remove every object from the current workspace before running the notebook.
rm(list=ls())
# Set working directory where the files are
# Every relative path used further down in this notebook (the "./" passed to
# fileList() and the paths given to read.delim()) is resolved against this
# directory.
setwd("./4CE/phase1.1/latest/")

#############
# LIBRARIES #
#############
library(dplyr)
library(tidyr)

Then we create a function to define the list with all the files to analyze.

In [None]:
# List the input files to analyse.
#
# Args:
#   path:      directory handed to list.files().
#   pattern:   regular expression handed to list.files().
#   pediatric: when TRUE, keep only names matching "PED" or "UNCCH" and drop
#              those matching "APHPPED.csv"; otherwise drop every name
#              matching "PED".
#
# Returns:
#   Character vector of file names. Names matching any of the excluded
#   patterns below are removed in both modes.
fileList <- function( path, pattern, pediatric ){

  excluded   <- c("FICHOS","VA.csv","BCH.csv","CHOP.csv", "RP401.csv")
  candidates <- list.files( path = path, pattern = pattern )
  candidates <- candidates[ !grepl( paste( excluded, collapse = "|" ), x = candidates ) ]

  if( pediatric == TRUE ){
    # Pediatric mode: a single mask combining the keep and the drop rule.
    wanted   <- grepl( paste( c("PED", "UNCCH"), collapse = "|" ), x = candidates )
    unwanted <- grepl( "APHPPED.csv", x = candidates )
    return( candidates[ wanted & !unwanted ] )
  }

  # Non-pediatric mode: everything whose name does not contain "PED".
  candidates[ !grepl( "PED", x = candidates ) ]
}

We create the list with all the pediatric files.

In [None]:
# Select the pediatric sites and collect their Demographics files from the
# current working directory.
select_pediatric <- TRUE
fileListDemog <- fileList(
  path      = "./",
  pattern   = "Demog",
  pediatric = select_pediatric
)

### Summary of number of total cases

In [None]:
# Build one row per site holding the patient count reported for every age
# group and for the whole site ("all"), taken from the rows where both sex
# and race are "all". Counts are stored exactly as read (character strings)
# and only converted to numbers when they are summed at the end.
ageGroups <- c("00to02", "03to05", "06to11", "12to17", "18to20")

allDemographics <- as.data.frame(matrix(ncol = length(ageGroups) + 2, nrow = length(fileListDemog)))
colnames(allDemographics) <- c("site", ageGroups, "all")

# seq_along() rather than 1:length(): with an empty file list the loop is
# skipped instead of iterating over c(1, 0) and failing on a missing file.
for( i in seq_along( fileListDemog ) ){
  print(i)
  selection <- read.delim( fileListDemog[i], sep = ",", colClasses = "character")
  if( grepl( "UNC", x = fileListDemog[i], fixed = TRUE) ){
    # In UNC files every cell equal to 10 is recoded to the -99 marker.
    selection[ selection == 10] <- -99
  }
  colnames( selection ) <- trimws( tolower( colnames( selection )))
  selection$age_group <- trimws( selection$age_group )
  selection <- selection[ selection$sex == "all" &
                            selection$race == "all",]
  selection$patients <- selection$num_patients_all

  allDemographics$site[i] <- selection$siteid[1]

  # One lookup per column. A group that the site does not report is stored
  # as NA; this now also covers "all", which previously aborted the loop
  # although the total below already ignores NA values.
  for( ageGroup in c( ageGroups, "all" ) ){
    counts <- selection[ which( selection$age_group == ageGroup ), "patients"]
    allDemographics[[ageGroup]][i] <- if( length( counts ) > 0 ) counts[1] else NA
  }
}

# Total number of patients over all sites.
sum(as.numeric(allDemographics$all), na.rm = TRUE)

### Summary of number of cases per sex
Obfuscated values (-99) are replaced using the following formula:
0.5 * obfuscationValue, where obfuscationValue is the value listed for the site in the obfuscation file.

In [None]:
# Per-site patient counts by sex, taken from the rows where both age_group
# and race are "all". Counts flagged with the -99 marker are replaced by half
# of the site's obfuscation value.
sexSummary <- as.data.frame(matrix(ncol=4, nrow = length(fileListDemog)))
colnames(sexSummary) <- c("site", "females", "males", "all")

# NOTE(review): this relative path is resolved against the working directory
# set by setwd() at the top of the notebook - confirm it points at the
# intended file.
obfuscation <- read.delim( file   = "./4CE/phase1.1_pediatric/pediatric_obfuscation.txt", 
                           header = TRUE, 
                           sep = "\t")

# Patient count(s) reported for one sex label.
#   rows:     demographics rows of a single site
#   sexLabel: value of the `sex` column to look up
#   obf:      the site's obfuscation entry as a string ("none" or a number)
# With obf == "none" the value is returned exactly as read (character);
# otherwise it is converted to numeric and -99 becomes 0.5 * obf.
sexCount <- function( rows, sexLabel, obf ){
  count <- rows[ rows$sex == sexLabel, "num_patients_all"]
  if( obf == "none" ){
    return( count )
  }
  count <- as.numeric( count )
  ifelse( count == -99, 0.5 * as.numeric( obf ), count )
}

# seq_along() rather than 1:length(): with an empty file list the loop is
# skipped instead of iterating over c(1, 0) and failing on a missing file.
for( i in seq_along( fileListDemog ) ){
  print(i)
  selection <- read.delim( fileListDemog[i], sep = ",", colClasses = "character")
  if( grepl( "UNC", x = fileListDemog[i], fixed = TRUE) ){
    # In UNC files every cell equal to 10 is recoded to the -99 marker.
    selection[ selection == 10] <- -99
  }
  colnames( selection ) <- trimws( tolower( colnames( selection )))
  # Trim age_group before comparing it, as the other summaries do; values
  # padded with whitespace would otherwise never equal "all".
  selection$age_group <- trimws( selection$age_group )
  selection <- selection[ selection$age_group == "all" &
                            selection$race == "all",]

  sexSummary$site[i] <- selection$siteid[1]

  obf <- obfuscation[ which( toupper(obfuscation$siteid) == toupper(selection$siteid[1]) ), "obfuscation"]
  if( length( obf ) != 1 ){
    # Fail with the site name instead of "argument is of length zero".
    stop( "Expected exactly one obfuscation entry for site '", selection$siteid[1],
          "', found ", length( obf ), call. = FALSE )
  }
  # as.character() keeps the written value even if the column was read as a
  # factor, where as.numeric() alone would return the level code.
  obf <- as.character( obf )

  sexSummary$females[i] <- sexCount( selection, "female", obf )
  sexSummary$males[i]   <- sexCount( selection, "male", obf )
  sexSummary$all[i]     <- sexCount( selection, "all", obf )
}

allFemales <- sum( as.numeric( sexSummary$females ))
allFemales
allMales <- sum( as.numeric( sexSummary$males ))
allMales
# `all` is also the name of base::all(); the variable name is kept unchanged
# and calls to all() still resolve to the function.
all <- sum( as.numeric( sexSummary$all ))
all
# Percentage of females and males over the total.
round( allFemales/all * 100, 2)
round( allMales/all * 100, 2)

### Summary of number of cases per age group
Obfuscated values (-99) are replaced using the following formula:
0.5 * obfuscationValue, where obfuscationValue is the value listed for the site in the obfuscation file.

In [None]:
# Per-site patient counts by age group, taken from the rows where both sex
# and race are "all". Counts flagged with the -99 marker are replaced by half
# of the site's obfuscation value; age groups a site does not report count
# as 0.
ageGroups <- c("00to02", "03to05", "06to11", "12to17", "18to20")

ageSummary <- as.data.frame(matrix(ncol = length(ageGroups) + 2, nrow = length(fileListDemog)))
colnames(ageSummary) <- c("site", ageGroups, "all")

# NOTE(review): this relative path is resolved against the working directory
# set by setwd() at the top of the notebook - confirm it points at the
# intended file.
obfuscation <- read.delim( file   = "./4CE/phase1.1_pediatric/pediatric_obfuscation.txt", 
                           header = TRUE, 
                           sep = "\t")

# Patient count(s) reported for one age-group label.
#   rows:     demographics rows of a single site
#   ageLabel: value of the `age_group` column to look up
#   obf:      the site's obfuscation entry as a string ("none" or a number)
# With obf == "none" the value is returned exactly as read (character);
# otherwise it is converted to numeric and -99 becomes 0.5 * obf.
ageCount <- function( rows, ageLabel, obf ){
  count <- rows[ rows$age_group == ageLabel, "num_patients_all"]
  if( obf == "none" ){
    return( count )
  }
  count <- as.numeric( count )
  ifelse( count == -99, 0.5 * as.numeric( obf ), count )
}

# seq_along() rather than 1:length(): with an empty file list the loop is
# skipped instead of iterating over c(1, 0) and failing on a missing file.
for( i in seq_along( fileListDemog ) ){
  print(i)
  selection <- read.delim( fileListDemog[i], sep = ",", colClasses = "character")
  if( grepl( "UNC", x = fileListDemog[i], fixed = TRUE) ){
    # In UNC files every cell equal to 10 is recoded to the -99 marker.
    selection[ selection == 10] <- -99
  }
  colnames( selection ) <- trimws( tolower( colnames( selection )))
  selection$age_group <- trimws( selection$age_group )
  selection <- selection[ selection$sex == "all" &
                            selection$race == "all",]

  ageSummary$site[i] <- selection$siteid[1]

  obf <- obfuscation[ which( toupper(obfuscation$siteid) == toupper(selection$siteid[1]) ), "obfuscation"]
  if( length( obf ) != 1 ){
    # Fail with the site name instead of "argument is of length zero".
    stop( "Expected exactly one obfuscation entry for site '", selection$siteid[1],
          "', found ", length( obf ), call. = FALSE )
  }
  # as.character() keeps the written value even if the column was read as a
  # factor, where as.numeric() alone would return the level code.
  obf <- as.character( obf )

  # One lookup per age group; a group the site does not report counts as 0.
  for( ageGroup in ageGroups ){
    ageSummary[[ageGroup]][i] <- if( ageGroup %in% selection$age_group ){
      ageCount( selection, ageGroup, obf )[1]
    }else{
      0
    }
  }
  # The site total has no fallback: a file without an "all" row stops here.
  ageSummary$all[i] <- ageCount( selection, "all", obf )

}

# Total number of patients per age group and overall.
sum( as.numeric( ageSummary$`00to02`))
sum( as.numeric( ageSummary$`03to05`))
sum( as.numeric( ageSummary$`06to11`))
sum( as.numeric( ageSummary$`12to17`))
sum( as.numeric( ageSummary$`18to20`))
sum( as.numeric( ageSummary$all))

# Percentage of patients in each age group over the total.
round( sum( as.numeric( ageSummary$`00to02`))/ sum( as.numeric( ageSummary$all)) * 100)
round( sum( as.numeric( ageSummary$`03to05`))/ sum( as.numeric( ageSummary$all)) * 100)
round( sum( as.numeric( ageSummary$`06to11`))/ sum( as.numeric( ageSummary$all)) * 100)
round( sum( as.numeric( ageSummary$`12to17`))/ sum( as.numeric( ageSummary$all)) * 100)
round( sum( as.numeric( ageSummary$`18to20`))/ sum( as.numeric( ageSummary$all)) * 100)