In [None]:
# load R packages
library(readxl)
packageVersion('readxl')
library(dplyr)
packageVersion('dplyr')
library(stringr)
packageVersion('stringr')
library(fastDummies)
packageVersion('fastDummies')
library(tidyr)
packageVersion('tidyr')
library(lubridate)
packageVersion('lubridate')
library(ggplot2)
packageVersion('ggplot2')

In [None]:
# Directory layout used by the rest of the script (placeholder values here).
project.dir <- "..."
data.dir <- "..."
regeps.dir <- "..."
# Sub-directories located under regeps.dir.
raw.rpdr.dir <- file.path(regeps.dir, "...")
cleaned.rpdr.dir <- file.path(regeps.dir, "...")
Qingwen.data.dir <- file.path(regeps.dir, "...")

In [None]:
# Read Med.txt as plain text lines to see how many lines the raw file holds.
med.lines <- readLines(con = file.path(raw.rpdr.dir, "Med.txt"))
length(x = med.lines)

In [None]:
# Parse the pipe-delimited medication file into a data frame.
#   quote = ""   : quote characters are treated as ordinary text
#   fill = FALSE : short rows are not padded with blank fields
med.data <- read.delim(
  file = file.path(raw.rpdr.dir, "Med.txt"),
  sep = "|",
  quote = "",
  fill = FALSE
)
dim(med.data)                  # rows x columns
length(unique(med.data$EMPI))  # number of distinct EMPI values
head(med.data)

In [None]:
# Convert Medication_Date from month/day/year text into Date values.
med.data$Medication_Date <- as.Date(med.data$Medication_Date, format = "%m/%d/%Y")
# class() rather than typeof(): a Date is stored as a double, so typeof()
# reports "double" and cannot confirm that the conversion produced a Date.
class(med.data$Medication_Date)
# as.Date() yields NA for values that do not match the format; show how many.
sum(is.na(med.data$Medication_Date))

In [None]:
# Distribution of Medication_Date_Detail before filtering.
# useNA = "ifany" also lists missing values, which table() hides by default.
table(med.data$Medication_Date_Detail, useNA = "ifany")

In [None]:
# The 'Removed' value denotes that a medication was removed from a patient's
# medication list -> exclude 'Removed'.
# filter() drops rows whose condition evaluates to NA, so rows with a missing
# Medication_Date_Detail are kept explicitly: they are not marked 'Removed'.
med.listed <- med.data %>%
  filter(is.na(Medication_Date_Detail) | Medication_Date_Detail != 'Removed')
dim(med.listed) # previously observed: from 1463941 to 1415384 rows

In [None]:
# Check after removing: 'Removed' should no longer appear.
table(med.listed$Medication_Date_Detail, useNA = "ifany")

In [None]:
# Load the identifier table that carries both EMPI and Subject_Id.
data.id <- read.csv(file = file.path(cleaned.rpdr.dir, 'Subject_Id.csv'))
dim(data.id)
# Join the listed medications to Subject_Id through EMPI. merge() defaults to
# an inner join, so rows without a matching EMPI on either side are dropped.
med.data.ID <- merge(
  x = med.listed,
  y = data.id[, c('Subject_Id', 'EMPI')],
  by = 'EMPI'
)
dim(med.data.ID)

In [None]:
head(x = med.data.ID)

In [None]:
# Read the list of contraceptive medication names (biobank portal list).
met.dir <- '...'
contraceptives.list <- read.csv(
  file = file.path(met.dir, 'Contraceptives_list_biobank.csv')
)
head(x = contraceptives.list)

In [None]:
# Keep only medication rows whose name appears in the contraceptive list.
contraceptives.med <- filter(
  med.data.ID,
  Medication %in% contraceptives.list$Medication_name
)
dim(contraceptives.med)
length(unique(contraceptives.med$Medication)) #168

In [None]:
# Narrow to the four columns used downstream, then order by subject and date.
contraceptives.med.selected.cols <- select(
  contraceptives.med,
  Subject_Id, Medication_Date, Medication, Additional_Info
)
contraceptives.med.selected.cols <- arrange(
  contraceptives.med.selected.cols,
  Subject_Id, Medication_Date
)
dim(contraceptives.med.selected.cols)
length(unique(contraceptives.med.selected.cols$Subject_Id)) # 219
head(contraceptives.med.selected.cols, 10)

In [None]:
# Extract the "ROUTE=..." token (everything up to the next ';') from
# Additional_Info; rows without such a token get NA.
contraceptives.med.selected.cols <- contraceptives.med.selected.cols %>%
  mutate(Route = str_extract(Additional_Info, "ROUTE=[^;]+"))
# Tabulate the extracted routes.
table(contraceptives.med.selected.cols$Route)

In [None]:
# Size of the set of rows that exactly repeat an earlier row.
dim(contraceptives.med.selected.cols[duplicated(contraceptives.med.selected.cols), , drop = FALSE])

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each;
# dimensions are shown before and after.
dim(contraceptives.med.selected.cols)
contraceptives.no.dup.rows <- unique(contraceptives.med.selected.cols)
dim(contraceptives.no.dup.rows)

In [None]:
# Reduce to (Subject_Id, Medication_Date) pairs and report how many pairs repeat.
contraceptives.med.group <- select(contraceptives.no.dup.rows, Subject_Id, Medication_Date)
dim(contraceptives.med.group[duplicated(contraceptives.med.group), , drop = FALSE])

In [None]:
# Keep one row per (Subject_Id, Medication_Date) pair.
dim(contraceptives.med.group)
contraceptives.no.dup <- unique(contraceptives.med.group)
dim(contraceptives.no.dup)

In [None]:
# Every remaining (subject, date) row counts as one prescription.
contraceptives.no.dup[["Prescription"]] <- 1
head(contraceptives.no.dup)

In [None]:
# Total number of prescriptions per subject: sum the per-row indicator
# within each Subject_Id and return an ungrouped table.
contraceptives.prescr.counts.per.ind <- summarise(
  group_by(contraceptives.no.dup, Subject_Id),
  Contraceptives_total_number_of_prescriptions = sum(Prescription),
  .groups = 'drop'
)
dim(contraceptives.prescr.counts.per.ind)

In [None]:
# Summary statistics (min, quartiles, mean, max) of the per-subject
# total prescription counts.
summary(contraceptives.prescr.counts.per.ind$Contraceptives_total_number_of_prescriptions)
# Histogram of the same counts; the title and x-axis label are derived from
# the expression text passed to hist().
hist(contraceptives.prescr.counts.per.ind$Contraceptives_total_number_of_prescriptions)

In [None]:
# Attach each subject's plasma collection date to the de-duplicated
# prescription rows, as the basis for the date differences computed later.
# The lookup is de-duplicated first: if data.id holds the same
# (Subject_Id, Plasma_collect_date) pair on more than one row, merge() would
# copy every prescription row once per repeat and inflate the per-subject
# counts. With one row per subject the result is unchanged.
contraceptives.group.w.date <- merge(
  contraceptives.no.dup,
  unique(data.id[, c('Subject_Id', 'Plasma_collect_date')]),
  by = 'Subject_Id'
)
dim(contraceptives.group.w.date)

In [None]:
# Make sure both date columns are of class Date before subtracting them.
# Medication_Date: if the column is already a Date, as.Date() returns it
# unchanged; otherwise it is parsed as month/day/year text.
contraceptives.group.w.date$Medication_Date <- as.Date(contraceptives.group.w.date$Medication_Date, format = "%m/%d/%Y")
# class() rather than typeof(): typeof() reports "double" for any Date and so
# cannot confirm the conversion.
class(contraceptives.group.w.date$Medication_Date)
# Values that fail to parse become NA and would silently drop out of the
# later date-window filter; show how many there are.
sum(is.na(contraceptives.group.w.date$Medication_Date))
head(contraceptives.group.w.date$Medication_Date)

# plasma collect date
# NOTE(review): whitespace in the format is expected to match zero or more
# spaces in the input, so this should accept both "YYYY-MM-DD" and
# "YYYY -MM -DD" - confirm against the NA count below.
contraceptives.group.w.date$Plasma_collect_date <- as.Date(contraceptives.group.w.date$Plasma_collect_date, format = "%Y -%m -%d")
class(contraceptives.group.w.date$Plasma_collect_date)
sum(is.na(contraceptives.group.w.date$Plasma_collect_date))
head(contraceptives.group.w.date$Plasma_collect_date)

In [None]:
# Signed gap in days: plasma collection date minus medication date
# (positive when the medication date precedes the collection date).
contraceptives.group.w.date$Days_Difference <- as.numeric(
  difftime(
    contraceptives.group.w.date$Plasma_collect_date,
    contraceptives.group.w.date$Medication_Date,
    units = "days"
  )
)
head(contraceptives.group.w.date$Days_Difference, 10)
# Unsigned gap in days, regardless of which date came first.
contraceptives.group.w.date$Days_Difference_Abs <- abs(contraceptives.group.w.date$Days_Difference)
head(contraceptives.group.w.date$Days_Difference_Abs, 10)

In [None]:
# Storage type of the two difference columns.
typeof(contraceptives.group.w.date$Days_Difference)
typeof(contraceptives.group.w.date$Days_Difference_Abs)

In [None]:
# Prescriptions dated within 1826 days (5 * 365 + 1, about five years) of the
# plasma collection date.
# NOTE(review): the filter uses the absolute difference, so it keeps
# prescriptions up to 1826 days before AND after collection - confirm that a
# symmetric window is intended rather than "the last 5 years" only.
contraceptives.5y <- filter(contraceptives.group.w.date, Days_Difference_Abs <= 1826)
dim(contraceptives.5y)

# Number of individuals with at least one contraceptive prescription inside
# that window.
length(unique(contraceptives.5y$Subject_Id)) # 143 individuals

In [None]:
# Range of the signed differences that survived the window filter.
summary(as.numeric(contraceptives.5y$Days_Difference))

In [None]:
# Per-subject number of prescriptions inside the 5-year window: sum the
# per-row Prescription indicator within each Subject_Id, ungrouped result.
contraceptives.5y.counts.per.ind <- summarise(
  group_by(contraceptives.5y, Subject_Id),
  Contraceptives_total_number_of_prescriptions_within_5y = sum(Prescription),
  .groups = 'drop'
)
dim(contraceptives.5y.counts.per.ind)

In [None]:
# Summary statistics of the per-subject prescription counts within the window.
summary(contraceptives.5y.counts.per.ind$Contraceptives_total_number_of_prescriptions_within_5y)
# Index plot of the same counts: one point per row of the per-subject table,
# with the y-axis label derived from the expression text.
plot(contraceptives.5y.counts.per.ind$Contraceptives_total_number_of_prescriptions_within_5y)

In [None]:
# Preview the first rows of the per-subject table that is written out next.
head(contraceptives.5y.counts.per.ind)

In [None]:
# Write the per-subject 5-year prescription counts to CSV without row names.
# NOTE(review): met.dir is reassigned to an empty string here, which makes
# file.path() produce "/Contraceptives_prescription_5Y.csv" - confirm the
# intended output folder before running.
met.dir <- ''
write.csv(
  x = contraceptives.5y.counts.per.ind,
  file = file.path(met.dir, 'Contraceptives_prescription_5Y.csv'),
  row.names = FALSE
)