In [None]:
# load R packages
library(readxl)
packageVersion('readxl')
library(dplyr)
packageVersion('dplyr')
library(stringr)
packageVersion('stringr')
library(fastDummies)
packageVersion('fastDummies')
library(tidyr)
packageVersion('tidyr')
library(lubridate)
packageVersion('lubridate')
library(ggplot2)
packageVersion('ggplot2')

In [None]:
# Directory layout used by the rest of the notebook.
# NOTE(review): every '...' looks like a redacted placeholder -- fill in real
# paths before running.
project.dir <- '...'
data.dir <- '...'
regeps.dir <- '...'

# Raw and cleaned RPDR folders are both resolved relative to regeps.dir.
raw.rpdr.dir <- file.path(regeps.dir, '...')
cleaned.rpdr.dir <- file.path(regeps.dir, '...')

# Load all RPDR files

In [None]:
# Read the diagnosis table from the cleaned RPDR folder, then report its
# dimensions and how many distinct Subject_Id values it contains.
# Note: the variable name `diag` shadows base::diag for value lookups.
diag <- file.path(cleaned.rpdr.dir, 'All_Diagnosis.csv') %>% read.csv()
diag %>% dim()
diag$Subject_Id %>% unique() %>% length() # 928 noted in the original notebook

In [None]:
# Read the ICS prescription summary, then report its dimensions and the
# number of distinct Subject_Id values.
ics <- file.path(cleaned.rpdr.dir, 'ICS_prescription_summary.csv') %>% read.csv()
ics %>% dim()
ics$Subject_Id %>% unique() %>% length() # 914 noted in the original notebook

In [None]:
# Read the OCS prescription summary, then report its dimensions and the
# number of distinct Subject_Id values.
ocs <- file.path(cleaned.rpdr.dir, 'OCS_prescription_summary.csv') %>% read.csv()
ocs %>% dim()
ocs$Subject_Id %>% unique() %>% length() # 782 noted in the original notebook

In [None]:
# Read the BMI median table, then report its dimensions and the number of
# distinct Subject_Id values.
bmi <- file.path(cleaned.rpdr.dir, 'BMI_median.csv') %>% read.csv()
bmi %>% dim()
bmi$Subject_Id %>% unique() %>% length() # 923 noted in the original notebook

In [None]:
# Read the smoking status summary, then report its dimensions and the number
# of distinct Subject_Id values.
smoking.status <- file.path(cleaned.rpdr.dir, 'Smoking_status_summary.csv') %>%
  read.csv()
smoking.status %>% dim()
smoking.status$Subject_Id %>% unique() %>% length() # 899 noted in the original notebook

In [None]:
# Read the cortisol table (file name: closest-to-collect-date), then report
# its dimensions and the number of distinct Subject_Id values.
cortisol <- file.path(cleaned.rpdr.dir, 'Cortisol_closest_collect_date.csv') %>%
  read.csv()
cortisol %>% dim()
cortisol$Subject_Id %>% unique() %>% length() # 898 noted in the original notebook

In [None]:
# Read the demographic table and inspect it before trimming.
dem <- file.path(cleaned.rpdr.dir, 'Demographic_data.csv') %>% read.csv()
dem %>% dim()
dem$Subject_Id %>% unique() %>% length()
dem %>% head()

# Keep only the identifier, birth/death/vital-status and plasma-collection
# columns; everything else in the demographic file is dropped here.
dem <- select(
  dem,
  Subject_Id,
  EMPI,
  Date_of_Birth,
  Date_Of_Death,
  Vital_status,
  Plasma_collect_date,
  Age_at_plasma_collection_date
)
dem %>% dim()
dem %>% head()

In [None]:
# Read the race table, then report its dimensions and the number of distinct
# Subject_Id values.
race <- file.path(cleaned.rpdr.dir, 'race_info.csv') %>% read.csv()
race %>% dim()
race$Subject_Id %>% unique() %>% length()

In [None]:
# Read the gender table, then report its dimensions and the number of distinct
# Subject_Id values.
gender <- file.path(cleaned.rpdr.dir, 'gender_info.csv') %>% read.csv()
gender %>% dim()
gender$Subject_Id %>% unique() %>% length()

In [None]:
# Build the combined table: start from dem and left-join each other table on
# Subject_Id, in the order listed. Because these are left joins, only subjects
# present in dem are kept, and a subject missing from a joined table gets NA in
# that table's columns.
data <- Reduce(
  function(merged, tbl) left_join(merged, tbl, by = 'Subject_Id'),
  list(race, gender, diag, smoking.status, bmi, ics, ocs, cortisol),
  dem
)
dim(data)

In [None]:
# Smoking status: tabulate, count the NA entries, relabel NA as 'Unknown',
# then tabulate again to confirm the new level.
table(data$Closest_collect_date_smoking_status)
data$Closest_collect_date_smoking_status %>% is.na() %>% sum()
data$Closest_collect_date_smoking_status <- replace(
  data$Closest_collect_date_smoking_status,
  is.na(data$Closest_collect_date_smoking_status),
  'Unknown'
)
table(data$Closest_collect_date_smoking_status)

In [None]:
# Race (white) indicator: tabulate, count the NA entries, relabel NA as
# 'Unknown', then tabulate again to confirm the new level.
table(data$Race_White_KNN_impute_missing)
data$Race_White_KNN_impute_missing %>% is.na() %>% sum()
data$Race_White_KNN_impute_missing <- replace(
  data$Race_White_KNN_impute_missing,
  is.na(data$Race_White_KNN_impute_missing),
  'Unknown'
)
table(data$Race_White_KNN_impute_missing)

In [None]:
# Derive a BMI category from the BMI value closest to the collect date:
#   < 18.5 Underweight | [18.5, 25) Healthy Weight | [25, 30) Overweight | >= 30 Obesity
# Rows whose BMI is NA keep an NA category.
data$BMI_median_closest_measure_date_to_collect_date_category <- NA
data$BMI_median_closest_measure_date_to_collect_date_category <- local({
  bmi.value <- data$BMI_median_closest_measure_date_to_collect_date
  category <- data$BMI_median_closest_measure_date_to_collect_date_category
  category[bmi.value < 18.5] <- 'Underweight'
  category[(bmi.value >= 18.5) & (bmi.value < 25)] <- 'Healthy Weight'
  category[(bmi.value >= 25) & (bmi.value < 30)] <- 'Overweight'
  category[bmi.value >= 30] <- 'Obesity'
  category
})

table(data$BMI_median_closest_measure_date_to_collect_date_category)
# Place the new category column directly after the BMI value it is derived from.
data <- relocate(
  data,
  BMI_median_closest_measure_date_to_collect_date_category,
  .after = BMI_median_closest_measure_date_to_collect_date
)

# Combine ICS and OCS for Corticosteroids

In [None]:
# Count rows whose ICS existence flag is NA.
# Note: nothing is removed from `data` here; this only reports the count.
data$Any_ICS_Medication_Existence_Yes_No %>% is.na() %>% sum()

In [None]:
# Pull the within-5-year prescription counts out of `data` as plain vectors
# so they can be cleaned and summed without touching the source columns.
ocs.5y <- data$OCS_total_number_of_prescriptions_within_5y
ics.5y <- data$ICS_total_number_of_prescriptions_within_5y

In [None]:
# Check the storage type of both count vectors before doing arithmetic on them.
ics.5y %>% typeof()
ocs.5y %>% typeof()

In [None]:
# Number of missing counts in each vector.
ics.5y %>% is.na() %>% sum() # 37 noted in the original notebook
ocs.5y %>% is.na() %>% sum() # 221 noted in the original notebook

In [None]:
# Treat a missing prescription count as zero prescriptions so the two vectors
# can be added without propagating NA.
ics.5y <- replace(ics.5y, is.na(ics.5y), 0)
ocs.5y <- replace(ocs.5y, is.na(ocs.5y), 0)

In [None]:
# Total corticosteroid prescriptions within 5 years = ICS count + OCS count.
corti.5y <- ocs.5y + ics.5y
corti.5y %>% summary()

In [None]:
# Store the combined within-5-year corticosteroid count as a new column.
data[['Corticosteroids_total_number_of_prescriptions_within_5y']] <- corti.5y

In [None]:
# Corticosteroid total for the five years before plasma collection:
# same procedure as the within-5-year total, on the "5y_bef" columns.
ocs.5y.bef <- data$OCS_total_number_of_prescriptions_5y_bef_plasma_collect
ics.5y.bef <- data$ICS_total_number_of_prescriptions_5y_bef_plasma_collect

# Number of missing counts in each vector.
ics.5y.bef %>% is.na() %>% sum() # 55 noted in the original notebook
ocs.5y.bef %>% is.na() %>% sum() # 370 noted in the original notebook

# A missing count is treated as zero prescriptions.
ics.5y.bef <- replace(ics.5y.bef, is.na(ics.5y.bef), 0)
ocs.5y.bef <- replace(ocs.5y.bef, is.na(ocs.5y.bef), 0)

# Combined total = ICS count + OCS count.
corti.5y.bef <- ics.5y.bef + ocs.5y.bef
corti.5y.bef %>% summary()

data[['Corticosteroids_total_number_of_prescriptions_5y_bef_plasma_collect']] <- corti.5y.bef

In [None]:
# Collect the names of every column matching 'Existence_Yes_No' and print a
# frequency table for each one. table() omits NA by default, so rows with no
# record do not appear in these counts.
existence.cols <- data %>%
    select(matches('Existence_Yes_No')) %>%
    colnames()

# Iterate over the names themselves: unlike 1:length(x), this runs zero times
# when no column matches instead of indexing with 1 and 0.
for (existence.col in existence.cols) {
    print(existence.col)
    print(table(data[[existence.col]]))
}

In [None]:
# Replace NA with 'No' in every existence column listed in existence.cols.
# Iterating over the names directly (rather than 1:length(x)) makes this a
# no-op when existence.cols is empty.
for (existence.col in existence.cols) {
    data[[existence.col]][is.na(data[[existence.col]])] <- 'No'
}

In [None]:
# Print a frequency table for every existence column listed in existence.cols.
# Iterating over the names directly (rather than 1:length(x)) means nothing is
# printed, and nothing fails, when existence.cols is empty.
for (existence.col in existence.cols) {
    print(existence.col)
    print(table(data[[existence.col]]))
}

In [None]:
# Save the merged phenotype table to the cleaned RPDR folder, without row names.
write.csv(
  x = data,
  file = file.path(cleaned.rpdr.dir, 'RPDR-pheno.csv'),
  row.names = FALSE
)

In [None]:
# Start a data dictionary: one row per column of `data`, holding its name in
# a single Variable_Name column.
data.dictionary <- data.frame(Variable_Name = colnames(data))
data.dictionary %>% head()

In [None]:
# Save the data dictionary to data.dir, without row names.
write.csv(
  x = data.dictionary,
  file = file.path(data.dir, 'RPDR-Data-Dictionary.csv'),
  row.names = FALSE
)