In [None]:
# load R packages
library(readxl)
packageVersion('readxl')
library(dplyr)
packageVersion('dplyr')
library(stringr)
packageVersion('stringr')
library(fastDummies)
packageVersion('fastDummies')
library(tidyr)
packageVersion('tidyr')
library(lubridate)
packageVersion('lubridate')
library(ggplot2)
packageVersion('ggplot2')

In [None]:
# Directory layout used by the rest of the notebook (placeholder paths).
project.dir <- '...'
data.dir <- '...'
regeps.dir <- '...'

# Both the raw and the cleaned RPDR folders are resolved relative to regeps.dir.
raw.rpdr.dir <- file.path(regeps.dir, '...')
cleaned.rpdr.dir <- file.path(regeps.dir, '...')

# Phy file

In [None]:
# Count the raw lines of Phy.txt so the number can be compared with the parsed
# row count below (a mismatch would hint at a parsing problem).
phy.lines <-readLines(file.path(raw.rpdr.dir, "Phy.txt"))
length(phy.lines)

# Load the pipe-delimited Phy file into a data frame
phy.data <- read.delim(file.path(raw.rpdr.dir, "Phy.txt"), sep = '|')
dim(phy.data)
length(unique(phy.data$EMPI)) # only 927 individuals

In [None]:
# Load the subject ID table (the columns Subject_Id, EMPI and
# Plasma_collect_date are used later in this notebook)
data.id <- read.csv(file.path(cleaned.rpdr.dir, 'Subject_Id.csv'))
dim(data.id)

In [None]:
# Attach Subject_Id to every Phy record by joining on EMPI.
# merge() defaults to an inner join, so Phy rows without a matching EMPI are dropped.
phy.data.ID <- merge(phy.data, data.id[,c('Subject_Id', 'EMPI')], by = 'EMPI')
dim(phy.data.ID)
# Missing values per column
colSums(is.na(phy.data.ID))

In [None]:
# Parse the record date (expected as month/day/year text) into Date objects.
phy.data.ID$Date <- as.Date(phy.data.ID$Date, format = "%m/%d/%Y")

# class() confirms the conversion; typeof() would report "double" for any Date
# and therefore cannot tell a successful conversion from plain numbers.
class(phy.data.ID$Date)

# Values that do not match the format are turned into NA without an error,
# so surface how many there are.
sum(is.na(phy.data.ID$Date))

In [None]:
# Keep only the Phy records whose Concept_Name contains 'BMI'
bmi.phy <- phy.data.ID %>% filter(str_detect(Concept_Name, 'BMI'))
dim(bmi.phy)

# make sure Concept_Name has only BMI
unique(bmi.phy$Concept_Name)

# check how many individuals have BMI
length(unique(bmi.phy$Subject_Id)) # 923 -> 7 no BMI

In [None]:
# Convert the Result column to numeric; entries that are not parseable as a
# number become NA (with a warning)
bmi.phy$Result <- as.numeric(bmi.phy$Result)
typeof(bmi.phy$Result)

# check missing values per column (including Result after the conversion)
colSums(is.na(bmi.phy)) # no missing

In [None]:
# Inspect the BMI distribution to spot outliers
summary(bmi.phy$Result)
quantile(bmi.phy$Result, c(.01,.1,.25,.50,.75,.90,.99))
hist(bmi.phy$Result)

# Scatter of BMI value against subject ID
plot(x = bmi.phy$Subject_Id, y = bmi.phy$Result)

In [None]:
# select only Subject_Id, Date, Result
bmi.phy.selected.cols <- bmi.phy %>% select(Subject_Id, Date, Result)
dim(bmi.phy.selected.cols)

# check missing values of each column
colSums(is.na(bmi.phy.selected.cols))

In [None]:
# check individuals with BMI higher than 100
bmi.higher.100 <- bmi.phy.selected.cols %>% filter(Result > 100)
dim(bmi.higher.100)
length(unique(bmi.higher.100$Subject_Id))

In [None]:
# For every subject with at least one BMI reading above 100, draw a histogram
# of ALL of that subject's BMI readings; the title shows the ID and the median.
id.bmi.higher.100 <- unique(bmi.higher.100$Subject_Id)

for (id in id.bmi.higher.100){
    # all BMI records of this subject
    bmi.higher.100.data <- bmi.phy.selected.cols %>% filter(Subject_Id == id)

    plot <- ggplot(bmi.higher.100.data, aes(x = Result)) +
        geom_histogram(bins = 50, color = "black", fill = "white") +
        labs(title = paste(id, median(bmi.higher.100.data$Result)),
             x = "\nBMI",
             y = "\nCount") +
        theme(
            plot.title = element_text(colour = "black", size = 18, face = "bold", hjust = 0.5),
            axis.title.x = element_text(colour = "black", size = 14, face = "bold"),
            axis.title.y = element_text(colour = "black", size = 14, face = "bold"),
            axis.text.x = element_text(colour = "black", size = 12),
            axis.text.y = element_text(colour = "black", size = 12)
        )

    # ggplot objects are not auto-printed inside a loop
    print(plot)
}

In [None]:
# check individuals with BMI lower than 16 (all records below the threshold,
# then the number of distinct subjects they belong to)
bmi.lower.16 <- bmi.phy.selected.cols %>% filter(Result < 16)
dim(bmi.lower.16)
length(unique(bmi.lower.16$Subject_Id)) # 40 ind

In [None]:
# For every subject with at least one BMI reading below 16, draw a histogram
# of ALL of that subject's BMI readings; the title shows the ID and the median.
id.bmi.lower.16 <- unique(bmi.lower.16$Subject_Id)

for (id in id.bmi.lower.16){
    # all BMI records of this subject
    bmi.lower.16.data <- bmi.phy.selected.cols %>% filter(Subject_Id == id)

    # aes() instead of the deprecated aes_string(); same mapping as the
    # BMI > 100 histogram loop.
    bmi.hist <- ggplot(bmi.lower.16.data, aes(x = Result)) +
        geom_histogram(bins = 50, color = "black", fill = "white") +
        ggtitle(paste(id, median(bmi.lower.16.data$Result))) +
        xlab("\nBMI") + ylab("\nCount") +
        theme(
            plot.title = element_text(colour = "black", size = 18, face = "bold", hjust = 0.5),
            axis.title.x = element_text(colour = "black", size = 14, face = "bold"),
            axis.title.y = element_text(colour = "black", size = 14, face = "bold"),
            axis.text.x = element_text(colour = "black", size = 12),
            axis.text.y = element_text(colour = "black", size = 12)
        )

    # ggplot objects are not auto-printed inside a loop
    print(bmi.hist)
}

In [None]:
# Keep only BMI values strictly between 10 and 100 (values equal to 10 or 100
# are removed as well; filter() also drops rows where Result is NA).
# NOTE(review): the exploratory cells above inspect BMI < 16, while the cut-off
# applied here is 10 -- confirm this is the intended lower bound.
bmi.no.outlier <-  bmi.phy.selected.cols %>% filter(Result < 100 & Result > 10)
dim(bmi.no.outlier)
length(unique(bmi.no.outlier$Subject_Id))
colSums(is.na(bmi.no.outlier))

In [None]:
summary(bmi.no.outlier$Result) # already remove outliers

In [None]:
# Number of fully duplicated rows (same Subject_Id, Date and Result)
dim(bmi.no.outlier[duplicated(bmi.no.outlier), ]) # same values: 968

In [None]:
# Drop the exact duplicates; the dimensions are printed before and after
dim(bmi.no.outlier)
bmi.no.outlier <- bmi.no.outlier[!duplicated(bmi.no.outlier), ] # remove 968
dim(bmi.no.outlier)

In [None]:
# Attach each subject's plasma collection date (inner join on Subject_Id)
bmi.w.date <- merge(bmi.no.outlier, data.id[,c('Subject_Id', 'Plasma_collect_date')], by = 'Subject_Id')
dim(bmi.w.date)

In [None]:
# calculate date difference
# Convert the BMI measurement date to Date.
# NOTE(review): Date was already converted from "%m/%d/%Y" on phy.data.ID
# earlier in the notebook, so this call looks redundant -- confirm.
bmi.w.date$Date <- as.Date(bmi.w.date$Date, format = "%m/%d/%Y")
typeof(bmi.w.date$Date)
head(bmi.w.date$Date)

# plasma collect date
bmi.w.date$Plasma_collect_date <- as.Date(bmi.w.date$Plasma_collect_date, format = "%Y -%m -%d")
head(bmi.w.date$Plasma_collect_date)

# Days from the BMI measurement to plasma collection (collection date minus
# measurement date): positive when BMI was measured before collection.
bmi.w.date["Days_Difference"] <- difftime(bmi.w.date$Plasma_collect_date, bmi.w.date$Date, units = "days")

In [None]:
# Absolute gap in days, regardless of direction
bmi.w.date["Days_Difference_Abs"] <- as.numeric(abs(bmi.w.date$Days_Difference))

## BMI median

In [None]:
# One row per subject and measurement day: the median of all BMI readings
# recorded for that subject on that day. Days_Difference is carried along as a
# grouping key so it stays available in the result.
bmi.median.day <- bmi.w.date %>%
    group_by(Subject_Id, Date, Days_Difference) %>%
    summarise(BMI_median_per_day = median(Result), .groups = 'drop_last')

dim(bmi.median.day)
length(unique(bmi.median.day$Subject_Id))

In [None]:
# Distribution of the per-day medians
summary(bmi.median.day$BMI_median_per_day)

In [None]:
# Median BMI per individual, taken over that individual's per-day medians
bmi.median.ind <- bmi.median.day %>% group_by(Subject_Id) %>% summarise_at(vars('BMI_median_per_day'), median)
dim(bmi.median.ind) # only 923 have BMI

# rename the summarised column (second column) to reflect the per-individual level
names(bmi.median.ind)[2] <- 'BMI_median_ind'

colSums(is.na(bmi.median.ind)) # no missing

In [None]:
# check median BMI result to see outliers
summary(bmi.median.ind$BMI_median_ind)
quantile(bmi.median.ind$BMI_median_ind, c(.01,.1,.25,.50,.75,.90,.99))
hist(bmi.median.ind$BMI_median_ind)

# Scatter of per-individual median BMI against subject ID
plot(x = bmi.median.ind$Subject_Id, y = bmi.median.ind$BMI_median_ind)

## BMI category based on CDC recommendations using BMI median
#### Below 18.5 -> Underweight; 18.5 - 24.9 -> Healthy Weight; 25 - 29.9 -> Overweight; 30 and above -> Obesity

In [None]:
# Bin the per-individual median BMI into weight categories using left-closed
# intervals: [-Inf, 18.5) Underweight, [18.5, 25) Healthy Weight,
# [25, 30) Overweight, [30, Inf] Obesity. Missing BMI stays NA.
bmi.median.ind$BMI_median_category <- as.character(
    cut(bmi.median.ind$BMI_median_ind,
        breaks = c(-Inf, 18.5, 25, 30, Inf),
        labels = c('Underweight', 'Healthy Weight', 'Overweight', 'Obesity'),
        right = FALSE,
        include.lowest = TRUE)
)

In [None]:
# Spread BMI_median_category into one column per category. Because the same
# column supplies both the names and the values, each subject has its category
# label in the matching column and NA in the others.
# NOTE(review): the order of the new columns presumably follows the order in
# which categories first appear in the data -- any later positional renaming
# depends on that order.
BMI.category <- bmi.median.ind %>% group_by(Subject_Id, BMI_median_ind) %>%
                                                    pivot_wider(names_from = BMI_median_category, 
                                                                values_from = BMI_median_category)
dim(BMI.category)

In [None]:
# Rename the pivoted category columns BY NAME instead of by position.
# A positional rename silently mislabels categories whenever the pivoted
# columns come out in a different order than assumed (the order depends on
# the data), and fails outright when a category is absent.
bmi.id.cols <- c('Subject_Id', 'BMI_median_ind')
bmi.cat.cols <- setdiff(colnames(BMI.category), bmi.id.cols)

# 'Healthy Weight' -> 'BMI_median_Healthy_Weight'. The sub() strips an already
# present prefix so re-running this cell does not prefix the names twice.
colnames(BMI.category)[match(bmi.cat.cols, colnames(BMI.category))] <-
    paste0('BMI_median_', gsub(' ', '_', sub('^BMI_median_', '', bmi.cat.cols)))

# Make sure every expected category column exists, even if no subject fell
# into that category (filled with NA, i.e. nobody is in it).
bmi.expected.cols <- c('BMI_median_Underweight', 'BMI_median_Obesity',
                       'BMI_median_Healthy_Weight', 'BMI_median_Overweight')
for (missing.col in setdiff(bmi.expected.cols, colnames(BMI.category))){
    BMI.category[[missing.col]] <- NA_character_
}

# Keep a stable column order: IDs, the four expected categories, then any
# unexpected extra column (kept rather than dropped so it stays visible).
bmi.extra.cols <- setdiff(colnames(BMI.category), c(bmi.id.cols, bmi.expected.cols))
BMI.category <- BMI.category[, c(bmi.id.cols, bmi.expected.cols, bmi.extra.cols)]
dim(BMI.category)

In [None]:
# Turn each category column into a Yes/No flag: a non-missing entry means the
# subject belongs to that category.
BMI.category.cols <- c('BMI_median_Underweight', 'BMI_median_Obesity', 'BMI_median_Healthy_Weight', 'BMI_median_Overweight')

to.yes.no <- function(values) ifelse(is.na(values), 'No', 'Yes')
for (col in BMI.category.cols){
    BMI.category[[col]] <- to.yes.no(BMI.category[[col]])
}

### BMI window time: 6 months, 12 months, 24 months before and after plasma collection date

In [None]:
# BMI 6 months window using bmi.median.day:
# keep per-day medians measured within 180 days before or after plasma collection
bmi.6M <- bmi.median.day %>% filter(Days_Difference <= 180 & Days_Difference >= -180)
dim(bmi.6M)

# check how many individuals have BMI 6 month window
length(unique(bmi.6M$Subject_Id)) # only 858 individuals

In [None]:
# Result table for the window medians: one row per subject that has any BMI record
BMI.median.windows <- data.frame(Subject_Id = unique(bmi.median.day$Subject_Id))

# Median BMI within 6 months before/after plasma collection.
# Subjects without a measurement in the window get NA (median of an empty vector).
BMI.median.windows$BMI_median_6M <- vapply(
    BMI.median.windows$Subject_Id,
    function(subject.id) median(bmi.6M$BMI_median_per_day[bmi.6M$Subject_Id == subject.id]),
    numeric(1),
    USE.NAMES = FALSE
)

dim(BMI.median.windows)
# check missing values of each column
colSums(is.na(BMI.median.windows))

In [None]:
# Sanity check: number of distinct subjects in the window table
length(unique(BMI.median.windows$Subject_Id))

In [None]:
# BMI 12 months window using bmi.median.day:
# keep per-day medians measured within 365 days before or after plasma collection
bmi.12M <- bmi.median.day %>% filter(Days_Difference <= 365 & Days_Difference >= -365)
dim(bmi.12M)

# check how many individuals have BMI 12 month window
length(unique(bmi.12M$Subject_Id)) # only 891 individuals

In [None]:
# Median BMI within 12 months before/after plasma collection.
# Subjects without a measurement in the window get NA (median of an empty vector).
BMI.median.windows$BMI_median_12M <- vapply(
    BMI.median.windows$Subject_Id,
    function(subject.id) median(bmi.12M$BMI_median_per_day[bmi.12M$Subject_Id == subject.id]),
    numeric(1),
    USE.NAMES = FALSE
)

dim(BMI.median.windows)
# check missing values of each column
colSums(is.na(BMI.median.windows))
length(unique(BMI.median.windows$Subject_Id))

In [None]:
# BMI 24 months window using bmi.median.day:
# keep per-day medians measured within 730 days before or after plasma collection
bmi.24M <- bmi.median.day %>% filter(Days_Difference <= 730 & Days_Difference >= -730)
dim(bmi.24M)

# check how many individuals have BMI 24 month window
length(unique(bmi.24M$Subject_Id)) # only 910 individuals

In [None]:
# Median BMI within 24 months before/after plasma collection.
# Subjects without a measurement in the window get NA (median of an empty vector).
BMI.median.windows$BMI_median_24M <- vapply(
    BMI.median.windows$Subject_Id,
    function(subject.id) median(bmi.24M$BMI_median_per_day[bmi.24M$Subject_Id == subject.id]),
    numeric(1),
    USE.NAMES = FALSE
)

dim(BMI.median.windows)
# check missing values
colSums(is.na(BMI.median.windows))

In [None]:
# Print summary statistics for each window-median column under a readable title
BMI.values <- c("BMI_median_6M","BMI_median_12M","BMI_median_24M")
BMI.col.titles <- c( "BMI median 6 months", "BMI median 12 months", "BMI median 24 months")

for (i in seq_along(BMI.values)){
    window.col <- BMI.values[i]
    print(BMI.col.titles[i])
    print(summary(BMI.median.windows[,window.col]))
}

In [None]:
# Sanity check: number of distinct subjects in the window table
length(unique(BMI.median.windows$Subject_Id))

### BMI closest to plasma collection date (before or after)

In [None]:
# Absolute gap in days between measurement and plasma collection
bmi.median.day$Days_Difference_Abs <- as.numeric(abs(bmi.median.day$Days_Difference))

In [None]:
# Distribution of signed and absolute gaps
summary(as.numeric(bmi.median.day$Days_Difference))
summary(as.numeric(bmi.median.day$Days_Difference_Abs))

In [None]:
# A measurement N days before and another N days after collection share the
# same absolute gap; collapse such pairs into one row using their median.
bmi.abs_date <- bmi.median.day %>% group_by(Subject_Id, Days_Difference_Abs) %>% 
                   summarise_at(vars('BMI_median_per_day'), c(median))
colnames(bmi.abs_date) <- c('Subject_Id', 'Days_Difference_Abs', 
                               'BMI_median_per_day_abs')
dim(bmi.abs_date)

In [None]:
length(unique(bmi.abs_date$Subject_Id)) # how many unique IDs
colSums(is.na(bmi.abs_date)) # check missing value

In [None]:
# Smallest absolute gap (in days) between any BMI measurement and the plasma
# collection date, per subject -- before or after collection.
abs.closest.collect.date <- bmi.abs_date %>%
  group_by(Subject_Id) %>%
  summarise(abs_closest_date_collect_date = min(Days_Difference_Abs))
dim(abs.closest.collect.date)

In [None]:
# Distribution of the smallest gap per subject
summary(abs.closest.collect.date$abs_closest_date_collect_date)

In [None]:
# Look up the BMI value that belongs to each subject's smallest gap by joining
# back on (Subject_Id, gap)
bmi.abs.closest.collect.date <- merge(abs.closest.collect.date, bmi.abs_date, 
                                      by.x = c('Subject_Id', 'abs_closest_date_collect_date'),
                                      by.y = c('Subject_Id', 'Days_Difference_Abs'))
dim(bmi.abs.closest.collect.date)

In [None]:
# Final column names: subject, gap in days, BMI at the closest measurement
colnames(bmi.abs.closest.collect.date) <- c('Subject_Id', 'BMI_closest_date_collect_date_gap', 
                                            'BMI_median_closest_measure_date_to_collect_date')
dim(bmi.abs.closest.collect.date)

In [None]:
# Dimensions of the three tables about to be merged
dim(BMI.category)
# BMI median within 6, 12, 24 months include plasma collection date
dim(BMI.median.windows)

# BMI closest to plasma collection date (before and after)
dim(bmi.abs.closest.collect.date)

In [None]:
# Combine all BMI tables into one row per subject, keeping every subject in
# BMI.category (left joins)
data.bmi <- BMI.category %>% left_join(BMI.median.windows, by = 'Subject_Id') %>%
                                     left_join(bmi.abs.closest.collect.date, by = 'Subject_Id')
                                        
dim(data.bmi)
# Missing values per column (window medians are NA when no measurement fell in the window)
colSums(is.na(data.bmi))

In [None]:
# Number of distinct subjects in the merged BMI table
length(unique(data.bmi$Subject_Id))

In [None]:
# add column: Yes or No for available BMI value
# (every subject in data.bmi has at least one BMI record, hence a constant 'Yes')
data.bmi$Any_BMI_record_Existence_Yes_No <- 'Yes' # yes for available BMI value

# Smoking status

In [None]:
# Subset the Phy records that describe smoking / tobacco status.
#
# The regular expressions are assembled from vectors instead of one string
# literal broken across lines: a line break inside a quoted pattern becomes
# part of the pattern (newline plus indentation), so the alternatives that
# followed the break ('Smoking' and 'Tobacco Pack') could never match, and the
# 'Tobacco Pack' exclusion was silently inactive.
smoke.include.pattern <- paste(c('Smokeless', 'smokeless', 'Tobacco', 'tobacco',
                                 'Smoking', 'smoking', 'Smoker', 'smoker', 'smoke', 'Smoke'),
                               collapse = '|')

# Concept names that mention smoking but do not describe a status
smoke.exclude.pattern <- paste(c('Ready to Quit Smoking', 'quit date', 'Quit Date',
                                 'Start Date', 'start date',
                                 'Tobacco Pack', 'Tobacco Used Years', 'Smoking status'),
                               collapse = '|')

smoke.data <- phy.data.ID %>%
    filter(str_detect(Concept_Name, smoke.include.pattern)) %>%
    filter(!str_detect(Concept_Name, smoke.exclude.pattern))
dim(smoke.data)

# make sure Concept_Name has only smoking status
unique(smoke.data$Concept_Name)

# check how many individuals have smoking status
length(unique(smoke.data$Subject_Id)) # 899

In [None]:
# Normalise Concept_Name so that words are joined by single underscores.
# The substitutions run in this exact order: spaces and hyphens become
# underscores, triple underscores (from ' - ') collapse, commas become
# underscores, and finally double underscores (from ', ') collapse.
for (separator in c(' ', '-', '___', ',', '__')){
    smoke.data$Concept_Name <- gsub(separator, '_', smoke.data$Concept_Name)
}
dim(smoke.data)

In [None]:
# make sure Concept_Name has only smoking status with only underscore
unique(smoke.data$Concept_Name)

In [None]:
# Keep only the columns needed for the smoking analysis
smoke.data.selected.cols <- smoke.data %>% select(Subject_Id, Date, Concept_Name, Result)
dim(smoke.data.selected.cols)

In [None]:
# Number of fully duplicated rows
dim(smoke.data.selected.cols[duplicated(smoke.data.selected.cols), ]) # 5607

In [None]:
# remove duplicated rows
smoke.data.no.dup <- smoke.data.selected.cols[!duplicated(smoke.data.selected.cols), ]
dim(smoke.data.no.dup)

## Smokeless tobacco

In [None]:
# subset records whose Concept_Name mentions smokeless or chewing tobacco
smokeless.tobacco <- smoke.data.no.dup %>% filter(str_detect(Concept_Name, 'Smokeless|Chewing_Tobacco'))
dim(smokeless.tobacco)

# make sure Concept_Name has only smokeless tobacco use in Concept_Name
unique(smokeless.tobacco$Concept_Name)
# check missing values
colSums(is.na(smokeless.tobacco)) # no missing
# check how many individuals have smokeless tobacco use records
length(unique(smokeless.tobacco$Subject_Id)) # 899

In [None]:
# Number of records per smokeless-tobacco concept
table(smokeless.tobacco$Concept_Name)

## Smoking tobacco

In [None]:
# subset records whose Concept_Name describes smoking tobacco use
smoking.tobacco <- smoke.data.no.dup %>% filter(str_detect(Concept_Name, 'Smoking|smoking|Smoker|smoker|Tobacco_User'))
dim(smoking.tobacco)
# make sure Concept_Name has only smoking.tobacco use in Concept_Name
unique(smoking.tobacco$Concept_Name)
# check missing values
colSums(is.na(smoking.tobacco)) # no missing
# check how many individuals have smoking tobacco use records
length(unique(smoking.tobacco$Subject_Id)) # 899

In [None]:
# Number of records per smoking-tobacco concept
table(smoking.tobacco$Concept_Name)

In [None]:
# Which concepts carry a literal 'No' in the Result column
result.no <- smoking.tobacco %>% filter(Result == 'No') # mean no smoker
table(result.no$Concept_Name) # only smoker has No in result col

In [None]:
# Map every record to one of four smoking categories
# (Current_Smoker, Former_Smoker, Never_Smoker, Unknown).

# Step 1: default from the Result column ('No' / 'Yes'); anything else stays NA.
result.to.category <- c(No = 'Never_Smoker', Yes = 'Current_Smoker')
smoking.tobacco$Smoking_category <- unname(result.to.category[as.character(smoking.tobacco$Result)])

# Step 2: a recognised Concept_Name overrides the Result-based default.
concept.to.category <- c(
    Smoking_Tobacco_Use_Current_Every_Day_Smoker             = 'Current_Smoker',
    Smoking_Tobacco_Use_Current_Some_Day_Smoker              = 'Current_Smoker',
    Smoking_Tobacco_Use_Former_Smoker                        = 'Former_Smoker',
    Smoking_Tobacco_Use_Heavy_Tobacco_Smoker                 = 'Current_Smoker',
    Smoking_Tobacco_Use_Light_Tobacco_Smoker                 = 'Current_Smoker',
    Smoking_Tobacco_Use_Never_Assessed                       = 'Unknown',
    Smoking_Tobacco_Use_Never_Smoker                         = 'Never_Smoker',
    Smoking_Tobacco_Use_Passive_Smoke_Exposure_Never_Smoker  = 'Never_Smoker',
    Smoking_Tobacco_Use_Smoker_Current_Status_Unknown        = 'Unknown',
    Smoking_Tobacco_Use_Unknown_if_Ever_Smoked               = 'Unknown',
    Tobacco_User_Never                                       = 'Never_Smoker',
    Tobacco_User_Not_Asked                                   = 'Unknown',
    Tobacco_User_Passive                                     = 'Never_Smoker',
    Tobacco_User_Quit                                        = 'Former_Smoker',
    Tobacco_User_Yes                                         = 'Current_Smoker'
)
category.from.concept <- unname(concept.to.category[as.character(smoking.tobacco$Concept_Name)])
has.concept.rule <- !is.na(category.from.concept)
smoking.tobacco$Smoking_category[has.concept.rule] <- category.from.concept[has.concept.rule]

In [None]:
# Records per derived smoking category
table(smoking.tobacco$Smoking_category)

In [None]:
# Records per raw Result value
table(smoking.tobacco$Result)

# Smoking tobacco status

In [None]:
# Number of fully duplicated rows
dim(smoking.tobacco[duplicated(smoking.tobacco), ]) 

In [None]:
# Keep only subject, date and derived category (drops Concept_Name and Result)
smoking.tobacco.cat <- smoking.tobacco %>% select(Subject_Id, Date, Smoking_category)
dim(smoking.tobacco.cat)

In [None]:
# Different concepts can map to the same category on the same day
dim(smoking.tobacco.cat[duplicated(smoking.tobacco.cat), ]) # 45261 dup

In [None]:
# remove duplication
smoking.tobacco.no.dup <- smoking.tobacco.cat[!duplicated(smoking.tobacco.cat), ]
dim(smoking.tobacco.no.dup)

In [None]:
# Subject/date pairs that still carry more than one (conflicting) category
dim(smoking.tobacco.no.dup[duplicated(smoking.tobacco.no.dup[,c('Subject_Id', 'Date')]), ]) # 28

In [None]:
# Rank the categories so that conflicting records on the same day can be
# resolved by taking the highest rank:
# Unknown (0) < Never_Smoker (1) < Former_Smoker (2) < Current_Smoker (3).
table(smoking.tobacco.no.dup$Smoking_category)

category.rank <- c(Unknown = 0, Never_Smoker = 1, Former_Smoker = 2, Current_Smoker = 3)
smoking.tobacco.no.dup$Cat_prior <- unname(category.rank[as.character(smoking.tobacco.no.dup$Smoking_category)])

table(smoking.tobacco.no.dup$Cat_prior)

In [None]:
# Highest rank present (returns NA if any record has no category/rank)
max(smoking.tobacco.no.dup$Cat_prior)

In [None]:
# Resolve conflicting categories on the same day: keep the highest-ranked
# category per subject and date
smoking.cat.prior <- smoking.tobacco.no.dup %>%                                 # Get max by group
  group_by(Subject_Id, Date) %>%
  summarise_at(vars(Cat_prior),
               list(Cat_prior = max))
dim(smoking.cat.prior)
# from 45390 to 45362

In [None]:
# Records per rank after resolution
table(smoking.cat.prior$Cat_prior)

In [None]:
# Confirm there is now exactly one row per subject and date
dim(smoking.cat.prior[duplicated(smoking.cat.prior[,c('Subject_Id', 'Date')]), ]) # no dup

In [None]:
# Translate the numeric rank back into the smoking category label
# (0 = Unknown, 1 = Never_Smoker, 2 = Former_Smoker, 3 = Current_Smoker).
table(smoking.cat.prior$Cat_prior)

# ranks start at 0, R vectors at 1 -> shift by one; an NA rank stays NA
rank.labels <- c('Unknown', 'Never_Smoker', 'Former_Smoker', 'Current_Smoker')
smoking.cat.prior$Smoking_category <- rank.labels[smoking.cat.prior$Cat_prior + 1]

table(smoking.cat.prior$Smoking_category)

### Count number of smoking tobacco counts

In [None]:
# Create one indicator (dummy) column per value of Smoking_category; the new
# columns are referred to below as 'Smoking_category_<category>'
smoking.tobacco.category.dummy <- dummy_cols(smoking.cat.prior,
                   select_columns = "Smoking_category")
dim(smoking.tobacco.category.dummy)

In [None]:
# Position of the first dummy column
which(colnames(smoking.tobacco.category.dummy) == 'Smoking_category_Current_Smoker')

In [None]:
# Tabulate every dummy column. The columns are picked by their
# 'Smoking_category_' prefix rather than by a hard-coded position, so the cell
# keeps working if smoking.cat.prior gains columns (it does further down, when
# the plasma collection date is merged in) or the notebook is re-run out of order.
smoking.tobacco.category.cols <- grep('^Smoking_category_',
                                      colnames(smoking.tobacco.category.dummy),
                                      value = TRUE)
for (i in seq_along(smoking.tobacco.category.cols)){
  print(smoking.tobacco.category.cols[i])
  print(table(smoking.tobacco.category.dummy[, smoking.tobacco.category.cols[i]]))
}

In [None]:
# Count, per subject, the number of days recorded in each smoking category.
# Subject_Id and the dummy columns are selected by name instead of dropping
# columns 2-4 by position, which breaks silently once the column layout of
# the input changes.
smoking.tobacco.category.dummy.group <- smoking.tobacco.category.dummy %>%
                                                    select(Subject_Id, starts_with('Smoking_category_'))
sum.smoking.tobacco.category <- smoking.tobacco.category.dummy.group %>% group_by(Subject_Id) %>% 
                                                    summarise(across(everything(), sum), .groups = 'drop') %>%
                                                    as.data.frame()
dim(sum.smoking.tobacco.category)

In [None]:
# Remove the "Smoking_category_" prefix from all column names
names(sum.smoking.tobacco.category) <- gsub("Smoking_category_", "", names(sum.smoking.tobacco.category))
dim(sum.smoking.tobacco.category)

In [None]:
# add suffix "_count" after each category; the suffix is applied to every
# column, so the ID column name is restored afterwards
colnames(sum.smoking.tobacco.category) <- paste(colnames(sum.smoking.tobacco.category),"count",sep="_")
colnames(sum.smoking.tobacco.category)[1] = 'Subject_Id'

In [None]:
# Derive a Yes/No flag per smoking category: 'Yes' when the subject has at
# least one record in that category (count > 0)
exist.sum.smoking.tobacco.category <- sum.smoking.tobacco.category
exist.sum.smoking.tobacco.category[,-1] <- ifelse(exist.sum.smoking.tobacco.category[,-1] > 0, 'Yes', 'No')
# Rename '<category>_count' to '<category>_Existence_Yes_No' (all columns except Subject_Id)
colnames(exist.sum.smoking.tobacco.category)[-1] <- str_replace(colnames(exist.sum.smoking.tobacco.category[,-1]),"_count", '')
colnames(exist.sum.smoking.tobacco.category)[-1] <- paste(colnames(exist.sum.smoking.tobacco.category[,-1]),"_Existence_Yes_No", sep = '')

In [None]:
# Combine the per-category counts with the per-category Yes/No flags
sum.smoking.tobacco.category.final <- merge(sum.smoking.tobacco.category, exist.sum.smoking.tobacco.category, by = 'Subject_Id')
head(sum.smoking.tobacco.category.final)
dim(sum.smoking.tobacco.category.final)

In [None]:
# add column: Yes or No for smoking tobacco record
# (every subject in this table has at least one record, hence a constant 'Yes')
sum.smoking.tobacco.category.final$Any_Smoking_Tobacco_Status_Existence_Yes_No <- 'Yes' # yes for available record

In [None]:
# find the most frequent smoking status per subject
smoking.tobacco.cat.count <- sum.smoking.tobacco.category.final[,c('Current_Smoker_count', 
                                       'Former_Smoker_count',
                                       'Never_Smoker_count',
                                       'Unknown_count')]

# which.max returns the first maximum, so ties are resolved in the column
# order above (Current, Former, Never, Unknown)
sum.smoking.tobacco.category.final$Most_freq_smoking_status <- colnames(smoking.tobacco.cat.count)[apply(smoking.tobacco.cat.count,1,which.max)]

# remove suffix _count so only the category name remains
sum.smoking.tobacco.category.final <- sum.smoking.tobacco.category.final %>% mutate_at("Most_freq_smoking_status", str_replace, "_count", "")

### Calculate days difference

In [None]:
# Attach each subject's plasma collection date.
# NOTE(review): smoking.cat.prior is overwritten in place, so this cell is not
# idempotent -- running it twice would merge Plasma_collect_date a second time.
smoking.cat.prior <- merge(smoking.cat.prior, data.id[,c('Subject_Id', 'Plasma_collect_date')], by = 'Subject_Id')
dim(smoking.cat.prior)

In [None]:
# calculate date difference
# Convert the record date to Date.
# NOTE(review): Date was already converted from "%m/%d/%Y" on phy.data.ID
# earlier in the notebook, so this call looks redundant -- confirm.
smoking.cat.prior$Date <- as.Date(smoking.cat.prior$Date, format = "%m/%d/%Y")
typeof(smoking.cat.prior$Date)
head(smoking.cat.prior$Date)

# plasma collect date
smoking.cat.prior$Plasma_collect_date <- as.Date(smoking.cat.prior$Plasma_collect_date, format = "%Y -%m -%d")
head(smoking.cat.prior$Plasma_collect_date)

# Days from the smoking record to plasma collection (collection date minus
# record date): positive when the record precedes collection
smoking.cat.prior["Days_Difference"] <- as.numeric(difftime(smoking.cat.prior$Plasma_collect_date, 
                                                 smoking.cat.prior$Date, units = "days"))

# absolute values
smoking.cat.prior["Days_Difference_Abs"] <- as.numeric(abs(smoking.cat.prior$Days_Difference))

In [None]:
# Distribution of absolute and signed gaps
summary(smoking.cat.prior$Days_Difference_Abs)
summary(smoking.cat.prior$Days_Difference)

In [None]:
# Working table for the closest-date analysis: subject, record date, absolute
# gap and category
smoking.tobacco.date.closest.collect <- smoking.cat.prior %>% select(Subject_Id, Date, 
                                                                     Days_Difference_Abs, Smoking_category)

### Closest date before or after plasma collect date

In [None]:
# Smallest absolute gap to plasma collection per subject AND smoking category
smoking.tobacco.closest.collect.date <- smoking.tobacco.date.closest.collect %>%                # Get min by group
  group_by(Subject_Id, Smoking_category) %>%
  summarise_at(vars(Days_Difference_Abs),
               list(closest_collect_date_gap = min))
dim(smoking.tobacco.closest.collect.date)

In [None]:
dim(smoking.tobacco.closest.collect.date[duplicated(smoking.tobacco.closest.collect.date), ]) # no duplication

In [None]:
# Reshape to one row per subject with one gap column per smoking category
# (NA when the subject has no record in that category)
smoking.tobacco.closest.collect.date.wide <- smoking.tobacco.closest.collect.date %>% group_by(Subject_Id, 
                                                                                               Smoking_category) %>%
                                                                pivot_wider(names_from = Smoking_category, 
                                                                            values_from = closest_collect_date_gap)
dim(smoking.tobacco.closest.collect.date.wide)

In [None]:
# add suffix "_closest_collect_date_gap" after each category; the suffix is
# applied to every column, so the ID column name is restored afterwards
colnames(smoking.tobacco.closest.collect.date.wide) <- paste(colnames(smoking.tobacco.closest.collect.date.wide),
                                                             "closest_collect_date_gap",sep="_")
colnames(smoking.tobacco.closest.collect.date.wide)[1] = 'Subject_Id'
dim(smoking.tobacco.closest.collect.date.wide)

In [None]:
# For every smoking category, add the calendar date that corresponds to the
# closest gap: the wide table is joined back to the per-record table on
# (Subject_Id, gap) one category at a time.
diag.names <- unique(smoking.tobacco.date.closest.collect$Smoking_category)
for (diag.name in diag.names){
    # records of the current category only
    smoking.tobacco.date.closest.collect.each <- smoking.tobacco.date.closest.collect %>% filter(Smoking_category == diag.name)


    # drop the category column (it is constant within this subset)
    smoking.tobacco.date.closest.collect.each.1 <- smoking.tobacco.date.closest.collect.each %>% select(-c(Smoking_category))


    # rename Date to '<category>_closest_collect_date'
    names(smoking.tobacco.date.closest.collect.each.1)[names(smoking.tobacco.date.closest.collect.each.1) == 'Date'] <- paste(diag.name, '_closest_collect_date', sep = '')


    # left join on (Subject_Id, gap). If a subject has two records of this
    # category with the same absolute gap (one before, one after collection),
    # the join yields two rows for that subject.
    smoking.tobacco.closest.collect.date.wide <- merge(smoking.tobacco.closest.collect.date.wide, smoking.tobacco.date.closest.collect.each.1, 
                                                        by.x = c('Subject_Id', paste(diag.name, '_closest_collect_date_gap', sep = '')), 
                                                        by.y = c('Subject_Id', 'Days_Difference_Abs'), all.x = TRUE)
    }


dim(smoking.tobacco.closest.collect.date.wide)

In [None]:
# remove duplication: keep the first row per combination of Subject_Id and
# the gap columns (rows that differ only in their date columns are dropped)
check.unique.smoking.tobacco <- colnames(smoking.tobacco.closest.collect.date.wide)[grepl("Subject_Id|date_gap",colnames(smoking.tobacco.closest.collect.date.wide))]
smoking.tobacco.closest.collect.date.wide <- smoking.tobacco.closest.collect.date.wide[!duplicated(smoking.tobacco.closest.collect.date.wide[,check.unique.smoking.tobacco]), ]

In [None]:
dim(smoking.tobacco.closest.collect.date.wide)

In [None]:
# Find the smoking status recorded closest to the plasma collection date.
#
# which.min() returns the FIRST minimum, so when two categories are equally
# close the column order decides. The columns are therefore listed in the
# notebook's own priority order (Current > Former > Never > Unknown, matching
# the Cat_prior ranking); with 'Unknown' listed first, a tie would be resolved
# in favour of the least informative status.
smoking.status.closest.collect.date.cols <- smoking.tobacco.closest.collect.date.wide[,c('Current_Smoker_closest_collect_date_gap',
                                                'Former_Smoker_closest_collect_date_gap',
                                                'Never_Smoker_closest_collect_date_gap',
                                                'Unknown_closest_collect_date_gap')]


# which.min ignores NA (categories the subject has no record for)
smoking.tobacco.closest.collect.date.wide$Closest_collect_date_smoking_status <- colnames(smoking.status.closest.collect.date.cols)[apply(data.matrix(smoking.status.closest.collect.date.cols),1,which.min)]

# remove suffix _closest_collect_date_gap so only the category name remains
smoking.tobacco.closest.collect.date.wide <- smoking.tobacco.closest.collect.date.wide %>% mutate_at("Closest_collect_date_smoking_status", str_replace, "_closest_collect_date_gap", "")

In [None]:
# Gap (in days) between the closest smoking record and the collection date.
# min.na.rm: minimum of x ignoring NA values.
min.na.rm <- function(x){
    min(x, na.rm = TRUE)
}

# Row-wise minimum over the per-category gap columns
smoking.tobacco.closest.collect.date.wide$Closest_collect_date_smoking_status_gap <- apply(data.matrix(smoking.status.closest.collect.date.cols),1,min.na.rm)
# place the gap column right after the status column
smoking.tobacco.closest.collect.date.wide <- smoking.tobacco.closest.collect.date.wide %>% relocate(Closest_collect_date_smoking_status_gap, .after = Closest_collect_date_smoking_status)

### Smoking: First date

In [None]:
# Storage type of the Date column (a Date is stored as "double")
typeof(smoking.cat.prior$Date)

In [None]:
# Earliest record date per subject and smoking category
first.smoking.tobacco.date <- smoking.cat.prior %>%                                       # Get min by group
  group_by(Subject_Id, Smoking_category) %>%
  summarise_at(vars(Date),
               list(first_date = min))
dim(first.smoking.tobacco.date)

In [None]:
# Reshape to one row per subject with one first-date column per category
first.smoking.tobacco.date.wide <- first.smoking.tobacco.date %>% group_by(Subject_Id, Smoking_category) %>%
                                                    pivot_wider(names_from = Smoking_category, 
                                                                values_from = first_date)
dim(first.smoking.tobacco.date.wide)

In [None]:
# add suffix "_first_date" after each category; the suffix is applied to every
# column, so the ID column name is restored afterwards
colnames(first.smoking.tobacco.date.wide) <- paste(colnames(first.smoking.tobacco.date.wide),"first_date",sep="_")
colnames(first.smoking.tobacco.date.wide)[1] = 'Subject_Id'
dim(first.smoking.tobacco.date.wide)

### Smoking: Recent date

In [None]:
# Latest record date per subject and smoking category
recent.smoking.tobacco.date <- smoking.cat.prior %>%                                       # Get max by group
  group_by(Subject_Id, Smoking_category) %>%
  summarise_at(vars(Date),
               list(recent_date = max))
dim(recent.smoking.tobacco.date)

In [None]:
# Reshape to one row per subject with one recent-date column per category
recent.smoking.tobacco.date.wide <- recent.smoking.tobacco.date %>% group_by(Subject_Id, Smoking_category) %>%
                                                    pivot_wider(names_from = Smoking_category, 
                                                                values_from = recent_date)
dim(recent.smoking.tobacco.date.wide)

In [None]:
# add suffix "_recent_date" after each category; the suffix is applied to every
# column, so the ID column name is restored afterwards
colnames(recent.smoking.tobacco.date.wide) <- paste(colnames(recent.smoking.tobacco.date.wide),"recent_date",sep="_")
colnames(recent.smoking.tobacco.date.wide)[1] = 'Subject_Id'
dim(recent.smoking.tobacco.date.wide)

In [None]:
# Find the most recent informative smoking status (Never / Former / Current).
smoking.status.recent.date.cols <- recent.smoking.tobacco.date.wide[,c('Never_Smoker_recent_date', 
                                                'Former_Smoker_recent_date',
                                                'Current_Smoker_recent_date')]

# Make sure every column is a Date (values that are already Dates pass through)
convert.date <- function(x) as.Date(x, format = "%Y-%m-%d")
smoking.status.recent.date.cols <- data.frame(lapply(smoking.status.recent.date.cols, convert.date))

# Subjects with no Never/Former/Current record at all. After the sentinel fill
# below, all three of their dates are equal and which.max() returns the first
# column, which would label them 'Never_Smoker'. Remember them so they can be
# labelled 'Unknown' instead.
no.known.status <- rowSums(!is.na(smoking.status.recent.date.cols)) == 0

# Sentinel date so which.max() always has a value to compare; real records are
# always later than 1900-01-01.
smoking.status.recent.date.cols[is.na(smoking.status.recent.date.cols)] <- as.Date('1900-01-01', format = "%Y-%m-%d")
recent.smoking.tobacco.date.wide$Most_recent_smoking_status <- colnames(smoking.status.recent.date.cols)[apply(data.matrix(smoking.status.recent.date.cols),1,which.max)]

# The '_recent_date' suffix is stripped below, leaving 'Unknown'
recent.smoking.tobacco.date.wide$Most_recent_smoking_status[no.known.status] <- 'Unknown_recent_date'

# remove suffix _recent_date so only the category name remains
recent.smoking.tobacco.date.wide <- recent.smoking.tobacco.date.wide %>% mutate_at("Most_recent_smoking_status", str_replace, "_recent_date", "")

In [None]:
# Dimensions of the four per-subject smoking tables before merging
# recent date
dim(recent.smoking.tobacco.date.wide) # 899

# first date 
dim(first.smoking.tobacco.date.wide) # 899

# count
dim(sum.smoking.tobacco.category.final) # 899

# closest to collect date
dim(smoking.tobacco.closest.collect.date.wide)

In [None]:
# Merge all smoking tobacco tables on Subject_Id, keeping every subject of the
# count table (left joins)
smoking.tobacco.merged <- sum.smoking.tobacco.category.final %>%
                                            left_join(first.smoking.tobacco.date.wide, by = 'Subject_Id') %>%
                                            left_join(recent.smoking.tobacco.date.wide, by = 'Subject_Id') %>%
                                            left_join(smoking.tobacco.closest.collect.date.wide, by = 'Subject_Id')
dim(smoking.tobacco.merged)

In [None]:
# Fix the final column order: overall summary columns first, then for each
# category (Current, Former, Never, Unknown) its existence flag, count, first
# date, recent date, closest date and closest gap. select() also drops any
# column not listed here.
smoking.tobacco.merged <- smoking.tobacco.merged %>% select(Subject_Id,
                                    Any_Smoking_Tobacco_Status_Existence_Yes_No,
                                    Current_Smoker_Existence_Yes_No,
                                    Most_freq_smoking_status,
                                    Most_recent_smoking_status,
                                    Closest_collect_date_smoking_status,
                                    Closest_collect_date_smoking_status_gap,
                                    Current_Smoker_count,
                                    Current_Smoker_first_date,
                                    Current_Smoker_recent_date,
                                    Current_Smoker_closest_collect_date,
                                    Current_Smoker_closest_collect_date_gap,
                                    Former_Smoker_Existence_Yes_No,
                                    Former_Smoker_count,
                                    Former_Smoker_first_date,
                                    Former_Smoker_recent_date,
                                    Former_Smoker_closest_collect_date,
                                    Former_Smoker_closest_collect_date_gap,
                                    Never_Smoker_Existence_Yes_No,
                                    Never_Smoker_count,
                                    Never_Smoker_first_date,
                                    Never_Smoker_recent_date,
                                    Never_Smoker_closest_collect_date,
                                    Never_Smoker_closest_collect_date_gap,
                                    Unknown_Existence_Yes_No,
                                    Unknown_count,
                                    Unknown_first_date,
                                    Unknown_recent_date,
                                    Unknown_closest_collect_date,
                                    Unknown_closest_collect_date_gap)
dim(smoking.tobacco.merged)