In [None]:
# Load R packages
library(dplyr)
packageVersion('dplyr') # version 1.1.0
library(readxl)
packageVersion('readxl') # version 1.4.1

In [None]:
# Set directories (placeholder paths); the cleaned RPDR directory is resolved
# relative to regeps.dir
project.dir <- "..."
data.dir <- "..."
results.dir <- "..."
regeps.dir <- "..."
cleaned.rpdr.dir <- file.path(regeps.dir, "...")

In [None]:
# Read the cleaned RPDR phenotype table, then report its dimensions and the
# number of distinct subject identifiers
data <- read.csv(file = file.path(cleaned.rpdr.dir, "pheno.csv"))
dim(data)
length(unique(data[["Subject_Id"]])) # 928 subjects

# Select subjects of interest

- Have an asthma diagnosis, an ICS prescription and a cortisol measurement;
- Exclude subjects with COPD (PPV threshold 0.80) or primary adrenocortical insufficiency;
- Adult (age at plasma collection >= 18);
- Have at least one corticosteroid prescription (ICS + OCS) within 5 years of plasma collection (total prescriptions > 0);
- Have ICS dosage information within 5 years of plasma collection from RPDR;
- Exclude subjects with unknown race, unknown smoking status or missing BMI.

In [None]:
# Cohort selection: a subject is kept only when every criterion below is TRUE
# (filter() combines its conditions with AND and drops rows where a condition
# evaluates to NA)
selected.subjects <- filter(
  data,
  # asthma diagnosis, ICS prescription and cortisol measurement all present
  Any_Asthma_Diagnosis_Existence_Yes_No == "Yes",
  Any_ICS_Medication_Existence_Yes_No == "Yes",
  Any_Cortisol_no_ACTH_Existence_Yes_No == "Yes",
  # exclusions: COPD and primary adrenocortical insufficiency
  COPD_current_or_past_history_custom_PPV_greater_0.80PPV_Existence_Yes_No != "Yes",
  Primary_adrenocortical_insufficiency_Existence_Yes_No != "Yes",
  # adults only
  Age_at_plasma_collection_date >= 18,
  # at least one corticosteroid prescription and a known ICS dose class (5 years)
  Corticosteroids_total_number_of_prescriptions_within_5y > 0,
  !is.na(ICS_Dose_Classification_5Y_Median),
  # covariates must be known
  Closest_collect_date_smoking_status != "Unknown",
  Race_White_KNN_impute_missing != "Unknown",
  !is.na(BMI_median_closest_measure_date_to_collect_date)
)
dim(selected.subjects) # 711 selected subjects

# Summary statistics

In [None]:
# Distribution of age at plasma collection
summary(selected.subjects[["Age_at_plasma_collection_date"]])

In [None]:
# Gender: counts per category, then each category's share of the cohort (%)
table(selected.subjects[["Gender_impute_all"]])
selected.subjects %>%
  group_by(Gender_impute_all) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))

In [None]:
# Race: counts per category, then each category's share of the cohort (%)
table(selected.subjects[["Race_White_KNN_impute_missing"]])
selected.subjects %>%
  group_by(Race_White_KNN_impute_missing) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))

In [None]:
# BMI closest to the collection date: numeric summary, then counts and
# cohort share (%) per BMI category
summary(selected.subjects[["BMI_median_closest_measure_date_to_collect_date"]])
table(selected.subjects[["BMI_median_closest_measure_date_to_collect_date_category"]])
selected.subjects %>%
  group_by(BMI_median_closest_measure_date_to_collect_date_category) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))

In [None]:
# Smoking status closest to the collection date: counts and cohort share (%)
table(selected.subjects[["Closest_collect_date_smoking_status"]])
selected.subjects %>%
  group_by(Closest_collect_date_smoking_status) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))


In [None]:
# Number of corticosteroid prescriptions within 5 years
summary(selected.subjects[["Corticosteroids_total_number_of_prescriptions_within_5y"]])

In [None]:
# ICS daily dose class (5-year median): counts and cohort share (%)
table(selected.subjects[["ICS_Dose_Classification_5Y_Median"]])
selected.subjects %>%
  group_by(ICS_Dose_Classification_5Y_Median) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))

In [None]:
# Minimum cortisol value at the measurement date closest to collection
summary(selected.subjects[["Cortisol_min_value_closest_measure_date_to_collect_date"]])

In [None]:
# Absolute gap between the plasma collection date and the closest cortisol
# measurement date
summary(selected.subjects[["Cortisol_closest_date_collect_date_gap_abs"]])

In [None]:
# Histogram of the collection-to-cortisol gap; 10 is the suggested number of
# breaks (the x expression is kept as-is because hist() labels the plot with it)
hist(selected.subjects$Cortisol_closest_date_collect_date_gap_abs, breaks = 10)

In [None]:
# Bronchiectasis (any record): counts and cohort share (%)
table(selected.subjects[["Any_Bronchiectasis_Existence_Yes_No"]])
selected.subjects %>%
  group_by(Any_Bronchiectasis_Existence_Yes_No) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))


In [None]:
# Chronic bronchitis (any record): counts and cohort share (%)
table(selected.subjects[["Any_Chronic_Bronchitis_Existence_Yes_No"]])
selected.subjects %>%
  group_by(Any_Chronic_Bronchitis_Existence_Yes_No) %>%
  summarise(Percentage = 100 * (n() / nrow(selected.subjects)))

# Transformation of non-normally distributed variables

In [None]:
# Natural-log transform of the 5-year corticosteroid prescription count
# (the cohort filter keeps only counts > 0, so the log is finite)
selected.subjects[["Corticosteroids_total_number_of_prescriptions_within_5y_log"]] <-
  log(selected.subjects[["Corticosteroids_total_number_of_prescriptions_within_5y"]])

In [None]:
# Square-root transformation of min cortisol.
# NOTE: the new column keeps the historical "_square" suffix so downstream
# code and the exported CSV are unchanged, but the values are sqrt(x), not x^2.
selected.subjects$Cortisol_min_value_closest_measure_date_to_collect_date_square <- sqrt(selected.subjects$Cortisol_min_value_closest_measure_date_to_collect_date)

In [None]:
# quartile transformation of duration between plasma collection and cortisol measurement date
### Function
#' Bin a numeric vector into (up to) `n` quantile-based groups
#'
#' Break points are the `1/n, ..., (n - 1)/n` sample quantiles of `x` (missing
#' values ignored), padded with `-Inf` and `Inf`. Tied quantiles are collapsed,
#' so fewer than `n` groups come back when `x` has many repeated values.
#'
#' @param x Numeric vector to bin.
#' @param n Number of quantile groups requested (a single number >= 1).
#' @return Integer vector the same length as `x` with the bin index of each
#'   element (1 = lowest bin); `NA` elements of `x` stay `NA`.
quant_cut <- function(x, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1)
  # seq_len() yields no interior cut points for n = 1, whereas 1:(n - 1) would
  # count down to c(1, 0) and split the data at its minimum and maximum
  qs <- quantile(x, probs = seq_len(n - 1) / n, na.rm = TRUE)
  brks <- c(-Inf, qs, Inf)
  # cut() has no na.rm argument (it was silently ignored before); NA inputs
  # simply map to NA codes
  cut(x, breaks = unique(brks), labels = FALSE)
}

# Assign each subject to a quartile bin of the collection-to-cortisol gap,
# then show the bin sizes and the distinct quartile cut points
selected.subjects[["Cortisol_closest_date_collect_date_gap_abs_quartile"]] <-
  quant_cut(selected.subjects[["Cortisol_closest_date_collect_date_gap_abs"]], n = 4)
table(selected.subjects[["Cortisol_closest_date_collect_date_gap_abs_quartile"]])
unique(
  quantile(
    selected.subjects[["Cortisol_closest_date_collect_date_gap_abs"]],
    probs = c(0.25, 0.5, 0.75),
    na.rm = TRUE
  )
)

In [None]:
# Histogram of the quartile bin codes (the integer codes returned by quant_cut)
hist(selected.subjects$Cortisol_closest_date_collect_date_gap_abs_quartile)

In [None]:
# Keep the subject identifier plus the covariates that are merged with the
# metabolomics table (column order below is the output column order)
selected.variables <- select(
  selected.subjects,
  Subject_Id,
  # respiratory comorbidities
  Any_Bronchiectasis_Existence_Yes_No,
  Any_Chronic_Bronchitis_Existence_Yes_No,
  # demographics and lifestyle
  Gender_impute_all,
  Age_at_plasma_collection_date,
  Race_White_KNN_impute_missing,
  Closest_collect_date_smoking_status,
  BMI_median_closest_measure_date_to_collect_date,
  BMI_median_closest_measure_date_to_collect_date_category,
  # corticosteroid exposure (raw and log-transformed count, ICS dose class)
  Corticosteroids_total_number_of_prescriptions_within_5y,
  Corticosteroids_total_number_of_prescriptions_within_5y_log,
  ICS_Dose_Classification_5Y_Median,
  # cortisol: gap to collection (raw and binned) and minimum value (raw and
  # transformed)
  Cortisol_closest_date_collect_date_gap_abs,
  Cortisol_closest_date_collect_date_gap_abs_quartile,
  Cortisol_min_value_closest_measure_date_to_collect_date,
  Cortisol_min_value_closest_measure_date_to_collect_date_square
)
dim(selected.variables)

# Metabolomics data

In [None]:
# Read the QC'd metabolomics table, then report its dimensions and the number
# of distinct sample identifiers
mets <- read.csv(file = file.path(cleaned.rpdr.dir, "Mets-QC.csv"))
dim(mets)
length(unique(mets[["CLIENT_SAMPLE_ID"]])) # 928 subjects

In [None]:
# Rename the metabolomics ID column so it matches the phenotype join key
names(mets)[names(mets) == "CLIENT_SAMPLE_ID"] <- "Subject_Id"
dim(mets)
head(mets)

In [None]:
# Attach metabolomics columns to the selected phenotype rows; a left join keeps
# every selected subject, matched on Subject_Id
data.mets.pheno <- left_join(selected.variables, mets, by = "Subject_Id")
dim(data.mets.pheno)
head(data.mets.pheno)

In [None]:
# Export the merged phenotype + metabolomics table without row names
write.csv(
  x = data.mets.pheno,
  file = file.path(data.dir, "pheno_met.csv"),
  row.names = FALSE
)