In [None]:
# load R packages
library(readxl)
packageVersion('readxl')
library(dplyr)
packageVersion('dplyr')
library(stringr)
packageVersion('stringr')
library(fastDummies)
packageVersion('fastDummies')
library(tidyr)
packageVersion('tidyr')
library(lubridate)
packageVersion('lubridate')
library(ggplot2)
packageVersion('ggplot2')

In [None]:
# Directory layout. The raw and cleaned RPDR folders are both built
# underneath regeps.dir.
project.dir <- "..."
data.dir <- "..."
regeps.dir <- "..."
raw.rpdr.dir <- file.path(regeps.dir, "...")
cleaned.rpdr.dir <- file.path(regeps.dir, "...")

# Demographics file (Dem.txt)

In [None]:
# Inspect the raw demographic file (Dem.txt, pipe-delimited).
# Count the physical lines first so the total can be compared with the
# number of rows that read.delim() parses below.
dem.lines <- readLines(con = file.path(raw.rpdr.dir, "Dem.txt"))
length(dem.lines) # 929 includes header

# Parse the same file into a data frame and look at its shape and contents.
dem.data <- read.delim(file = file.path(raw.rpdr.dir, "Dem.txt"), sep = "|")
dim(dem.data)
head(dem.data)
summary(dem.data)

In [None]:
# Keep only the identifier, sex/gender, birth/death and race columns.
selected.dem.data <- select(
  dem.data,
  EMPI,
  Sex_At_Birth, Gender_Legal_Sex, Gender_Identity,
  Date_of_Birth, Age, Date_Of_Death, Vital_status,
  Race1, Race2, Race_Group
)

In [None]:
# Read Subject_Id.csv, which supplies the biobank subject IDs.
data.id <- read.csv(file = file.path(cleaned.rpdr.dir, "Subject_Id.csv"))
dim(data.id)
head(data.id)

In [None]:
# Join the selected demographics with the ID table on EMPI.
# merge() keeps only EMPIs present in both tables (all = FALSE by default).
data.dem <- merge(x = selected.dem.data, y = data.id, by = "EMPI")
head(data.dem)
# number of distinct subjects after the merge
length(unique(data.dem[["Subject_Id"]]))

In [None]:
# calculate age (in years) at the plasma collection date
date.of.birth <- as.Date(data.dem$Date_of_Birth, format = "%m /%d /%Y")
# collection date normalised to "YYYY-MM-DD" text
plasma.collect.date <- format(as.Date(data.dem$Plasma_collect_date), "%Y-%m-%d")
# Compare Date with Date. difftime() converts a character date in the
# session's local time zone but a Date at UTC midnight, which would add the
# UTC offset (a few hours) to every age and make the result depend on the
# machine's time zone.
age.plasma.collection.date <- difftime(as.Date(plasma.collect.date), date.of.birth,
                                       units = "days")
# whole days -> years, using the mean Gregorian year length
age.plasma.collection.date <- as.numeric(age.plasma.collection.date, units = "days") / 365.2425
sum(is.na(age.plasma.collection.date))
summary(age.plasma.collection.date)
# store the age rounded to 2 decimals; the unrounded vector is kept as well
data.dem["Age_at_plasma_collection_date"] <- round(age.plasma.collection.date, 2)
head(age.plasma.collection.date)
head(data.dem$Age_at_plasma_collection_date)

In [None]:
# summary statistics of age at plasma collection
summary(age.plasma.collection.date)
# na.rm = TRUE so that a missing age does not turn the SD into NA
# (summary() above already tolerates NAs)
sd(age.plasma.collection.date, na.rm = TRUE)

# check individuals younger than 18
sum(age.plasma.collection.date < 18, na.rm = TRUE) # 2 less than 18

# List those individuals. Filter on the same unrounded ages used for the
# count: the stored column is rounded to 2 decimals, so e.g. 17.996 would be
# counted above but missed by a filter on the rounded value.
data.dem %>% filter(age.plasma.collection.date < 18)

In [None]:
# Pick and order the analysis columns: IDs, sex/gender, dates and vital
# status, collection date and age, then race.
data.dem.selected.cols <- select(
  data.dem,
  Subject_Id, EMPI,
  Sex_At_Birth, Gender_Legal_Sex, Gender_Identity,
  Date_of_Birth, Date_Of_Death, Vital_status,
  Plasma_collect_date, Age_at_plasma_collection_date,
  Race1, Race2, Race_Group
)
head(data.dem.selected.cols)

In [None]:
# Derive Race_White from Race_Group. Labels not listed below are left
# unchanged.
data.dem.selected.cols$Race_White <- data.dem.selected.cols$Race_Group

# named non-white groups -> "Non_White"
data.dem.selected.cols$Race_White[
  data.dem.selected.cols$Race_White %in%
    c("American Indian or Alaska Native", "Black", "Other", "Asian")
] <- "Non_White"

# missing, declined or multi-race entries -> "Unknown"
# NOTE(review): "Two or More" is grouped with Unknown rather than Non_White;
# confirm this is intended.
data.dem.selected.cols$Race_White[
  data.dem.selected.cols$Race_White %in%
    c("Unknown/Missing", "Declined", "Two or More")
] <- "Unknown"

table(data.dem.selected.cols$Race_White)

In [None]:
# frequency of each Sex_At_Birth value
table(data.dem.selected.cols[["Sex_At_Birth"]])

In [None]:
# frequency of each Gender_Legal_Sex value
table(data.dem.selected.cols[["Gender_Legal_Sex"]])

In [None]:
# frequency of each Gender_Identity value
table(data.dem.selected.cols[["Gender_Identity"]])

In [None]:
# Flag whether sex at birth equals legal sex. The flag is stored as the
# strings "TRUE"/"FALSE" (NA where either value is missing).
data.dem.selected.cols$sex_at_birth_vs_legal <- with(
  data.dem.selected.cols,
  ifelse(Sex_At_Birth == Gender_Legal_Sex, "TRUE", "FALSE")
)
table(data.dem.selected.cols$sex_at_birth_vs_legal)

In [None]:
# list subjects whose sex at birth and legal sex disagree
data.dem.selected.cols %>%
  filter(sex_at_birth_vs_legal == "FALSE") %>%
  select(Subject_Id, Sex_At_Birth, Gender_Legal_Sex, Gender_Identity)
# mostly unknown

In [None]:
# same mismatch listing, restricted to subjects whose sex at birth is
# neither "Unknown" nor "Chose not to disclose"
data.dem.selected.cols %>%
  filter(
    sex_at_birth_vs_legal == "FALSE",
    Sex_At_Birth != "Unknown",
    Sex_At_Birth != "Chose not to disclose"
  ) %>%
  select(Subject_Id, Sex_At_Birth, Gender_Legal_Sex, Gender_Identity)