In [None]:
# load library
library(data.table)
packageVersion('data.table')
library(readr)
packageVersion('readr')
library(dplyr)
packageVersion('dplyr')
library(stringr)
packageVersion('stringr')
library(ggplot2)
packageVersion('ggplot2')
library(tidyr)
packageVersion('tidyr')

In [None]:
# Base directories used by the cells below (placeholder paths):
#   data.dir   - genotype ID file
#   rpdr.dir   - RPDR demographic export
#   plink1.dir - PLINK input/output files
data.dir <- '...'
rpdr.dir <- file.path('...')
plink1.dir <- '...'

In [None]:
# Read the RPDR demographics table and keep only the subject ID and the
# recorded sex at birth.
dem <- read.csv(file.path(rpdr.dir, "Demographic_data.csv"))
gender <- select(dem, Subject_Id, Sex_At_Birth)
gender %>% head()

In [None]:
# Frequency of each Sex_At_Birth value across all subjects in the demographics table
table(gender[["Sex_At_Birth"]])

In [None]:
# Read the genotype ID file, drop exact duplicate rows, and give the three
# columns the names used by the rest of the notebook.
id <- read.csv(file.path(data.dir, "genotype_ID.csv")) %>%
  distinct()
names(id) <- c("Subject_Id", "sample_info", "batch")
dim(id)

In [None]:
# Attach sex at birth to every genotyped sample. A left join keeps all rows of
# `id`; samples with no demographics record get NA in Sex_At_Birth.
gender.698 <- left_join(id, gender, by = "Subject_Id")
gender.698 %>% head()

In [None]:
# Distribution of Sex_At_Birth among the genotyped samples
table(gender.698[["Sex_At_Birth"]])

In [None]:
# Code sex at birth the way PLINK expects it: 1 = male, 2 = female,
# 0 = unknown (anything else).
# match() returns the position of each value in c('Male', 'Female'), i.e. 1 or
# 2, and `nomatch` for everything else. Unlike the nested
# ifelse(x == 'Male', ...) it replaces, a missing Sex_At_Birth (e.g. a sample
# with no demographics row after the left join) is coded 0 instead of NA, so
# no NA can reach the .fam file that is written from this column further down.
gender.698$Sex_at_birth_code <- match(gender.698$Sex_At_Birth,
                                      c('Male', 'Female'),
                                      nomatch = 0L)
table(gender.698$Sex_at_birth_code)
head(gender.698)

In [None]:
# Split sample_info at "_" into the two PLINK identifiers: the part before the
# separator becomes FID, the part after it IID.
gender.698.1 <- gender.698 %>%
  separate(sample_info, into = c("FID", "IID"), sep = "_")
gender.698.1 %>% head()

In [None]:
# Build a new .fam file that carries the sex information.
# Input is the original PLINK .fam, which per the original note has to be
# renamed to chrs_X_poly_SNPs_maf_005_hwe_1e-05_org.fam beforehand.
# The file has no header line; the six columns are named here.
fam.698 <- setNames(
  read.table(
    file.path(plink1.dir, "chrs_X_poly_SNPs_maf_005_hwe_1e-05_org.fam"),
    header = FALSE,
    stringsAsFactors = FALSE
  ),
  c("FID", "IID", "Paternal_ID", "Maternal_ID", "Sex", "Phenotype")
)
fam.698 %>% head()

In [None]:
# FID is turned into character so that it can be joined against the character
# FID produced by separate(); then the coded sex is attached to every .fam row
# by (FID, IID).
fam.698 <- fam.698 %>%
  mutate(FID = as.character(FID))
fam.698.new <- left_join(fam.698, gender.698.1, by = c("FID", "IID"))
dim(fam.698.new)
fam.698.new %>% head()

In [None]:
# Keep the six .fam columns in PLINK order, taking the sex code derived from
# the demographics data in place of the original Sex column.
fam.698.new.1 <- fam.698.new %>%
  select(FID, IID, Paternal_ID, Maternal_ID, Sex_at_birth_code, Phenotype)
names(fam.698.new.1)[names(fam.698.new.1) == "Sex_at_birth_code"] <- "Sex"
# A .fam row with no match in the left join (or with a missing code) has NA
# here, which write.table() would emit as the text "NA". Recode it to 0, the
# "unknown" value of the 1/2/0 coding used in this notebook.
fam.698.new.1$Sex[is.na(fam.698.new.1$Sex)] <- 0
head(fam.698.new.1)

In [None]:
# Write the sex-annotated table to the PLINK directory as a tab-separated
# file without header, row names or quoting.
write.table(
  fam.698.new.1,
  file = file.path(plink1.dir, "chrs_X_poly_SNPs_maf_005_hwe_1e-05.fam"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# Load the sex-check result file chrs_X_SNP_cleaned_checksex.sexcheck
# (it has a header line).
sex.check <- read.table(
  file.path(plink1.dir, "chrs_X_SNP_cleaned_checksex.sexcheck"),
  header = TRUE
)
dim(sex.check)
sex.check %>% head()

In [None]:
# FID is turned into character to match gender.698.1, then the subject-level
# columns (Subject_Id, sex at birth, ...) are attached to every sex-check row
# by (FID, IID).
sex.check <- sex.check %>%
  mutate(FID = as.character(FID))
sex.check.new <- left_join(sex.check, gender.698.1, by = c("FID", "IID"))
dim(sex.check.new)
sex.check.new %>% head()

In [None]:
# Add the two other gender fields from the demographics table
# (Gender_Legal_Sex and Gender_Identity), matched on Subject_Id.
sex.check.new.1 <- left_join(
  sex.check.new,
  dem[, c("Subject_Id", "Gender_Legal_Sex", "Gender_Identity")],
  by = "Subject_Id"
)
sex.check.new.1 %>% head()

In [None]:
# Code Gender_Legal_Sex on the PLINK scale: 1 = male, 2 = female,
# 0 = unknown (anything else).
# match() gives the position in c('Male', 'Female') and `nomatch` otherwise,
# so a missing Gender_Legal_Sex is coded 0 as the scheme above says; the nested
# ifelse(x == 'Male', ...) it replaces returned NA for those rows.
sex.check.new.1$Gender_Legal_Sex_code <- match(sex.check.new.1$Gender_Legal_Sex,
                                               c('Male', 'Female'),
                                               nomatch = 0L)
table(sex.check.new.1$Gender_Legal_Sex_code)
head(sex.check.new.1)

In [None]:
# Code Gender_Identity on the PLINK scale: 1 = male, 2 = female,
# 0 = unknown (anything else).
# match() gives the position in c('Male', 'Female') and `nomatch` otherwise,
# so a missing Gender_Identity is coded 0 as the scheme above says; the nested
# ifelse(x == 'Male', ...) it replaces returned NA for those rows.
sex.check.new.1$Gender_Identity_code <- match(sex.check.new.1$Gender_Identity,
                                              c('Male', 'Female'),
                                              nomatch = 0L)
table(sex.check.new.1$Gender_Identity_code)
head(sex.check.new.1)

In [None]:
# Reduce to the columns needed for the comparison: subject ID, the sex-check
# fields (PEDSEX, SNPSEX, STATUS) and the two coded gender fields.
sex.check.new.2 <- select(
  sex.check.new.1,
  Subject_Id, PEDSEX, SNPSEX, STATUS,
  Gender_Legal_Sex_code, Gender_Identity_code
)
sex.check.new.2 %>% head()

In [None]:
# Flag, as the strings 'TRUE' / 'FALSE', whether SNPSEX equals the coded
# legal sex.
sex.check.new.2 <- sex.check.new.2 %>%
  mutate(
    SNPSEX_LegalSex = ifelse(SNPSEX == Gender_Legal_Sex_code, 'TRUE', 'FALSE')
  )
table(sex.check.new.2[["SNPSEX_LegalSex"]])

In [None]:
# Rows where SNPSEX is non-zero but disagrees with the coded legal sex
# (original note: 7 such rows)
filter(sex.check.new.2, SNPSEX_LegalSex == 'FALSE', SNPSEX != 0)

In [None]:
# Rows with STATUS 'PROBLEM' where both SNPSEX and PEDSEX are non-zero; these
# subjects are removed below (original note: 4 such rows).
id.problem <- filter(
  sex.check.new.2,
  STATUS == 'PROBLEM', SNPSEX != 0, PEDSEX != 0
)
id.problem

In [None]:
# Drop every row whose Subject_Id appears in id.problem; dimensions are shown
# before and after to confirm how many rows were removed.
dim(sex.check.new.2)
sex.check.remove.problem <- filter(
  sex.check.new.2,
  !(Subject_Id %in% id.problem[["Subject_Id"]])
)
dim(sex.check.remove.problem)

In [None]:
# Sanity check: this should return zero rows now that the problem subjects are removed
filter(sex.check.remove.problem, Subject_Id %in% id.problem[["Subject_Id"]])

In [None]:
# Start the imputed sex from PEDSEX; wherever it is 0, take SNPSEX instead.
# The table is printed before and after the substitution.
sex.check.remove.problem[["Gender_impute"]] <- sex.check.remove.problem[["PEDSEX"]]
table(sex.check.remove.problem[["Gender_impute"]])
sex.check.remove.problem <- sex.check.remove.problem %>%
  mutate(Gender_impute = ifelse(Gender_impute == 0, SNPSEX, Gender_impute))
table(sex.check.remove.problem[["Gender_impute"]])

In [None]:
# Rows whose imputed sex is still 0 after falling back to SNPSEX
filter(sex.check.remove.problem, Gender_impute == 0)

In [None]:
# Rows still at 0 (original note: 83 rows without SNP sex) take the coded
# legal sex instead.
sex.check.remove.problem <- sex.check.remove.problem %>%
  mutate(
    Gender_impute = ifelse(Gender_impute == 0, Gender_Legal_Sex_code, Gender_impute)
  )
table(sex.check.remove.problem[["Gender_impute"]])

In [None]:
# Translate the imputed code back to a label: 1 -> 'Male', 2 -> 'Female',
# any other code -> 'Unknown'.
# The fallback used to be the number 0, which ifelse() coerces to the string
# "0" in this character result. 'Unknown' is the label that the later
# replacement step in this notebook looks for, so "0" rows were never picked up
# by it. A missing code still yields NA.
sex.check.remove.problem$Gender_impute_info <- ifelse(sex.check.remove.problem$Gender_impute == 1, 'Male',
                      ifelse(sex.check.remove.problem$Gender_impute == 2, 'Female', 'Unknown'))
table(sex.check.remove.problem$Gender_impute_info)
head(sex.check.remove.problem)

In [None]:
# The genotype data covers only a subset of subjects (698 per the original
# note), so start from the full demographics table and left-join the
# sex-check results onto it; subjects without genotype data get NA there.
gender.all <- select(dem, Subject_Id, Sex_At_Birth, Gender_Legal_Sex, Gender_Identity)
gender.all.1 <- left_join(gender.all, sex.check.remove.problem, by = "Subject_Id")
dim(gender.all.1)
gender.all.1 %>% head()

In [None]:
# Subjects with no imputed label; Sex_At_Birth is used for these in the next step
filter(gender.all.1, is.na(Gender_impute_info))

In [None]:
# Final label for every subject: the imputed label where there is one,
# otherwise Sex_At_Birth. The table is printed before and after the fill.
gender.all.1[["Gender_impute_all"]] <- gender.all.1[["Gender_impute_info"]]
table(gender.all.1[["Gender_impute_all"]])
gender.all.1 <- gender.all.1 %>%
  mutate(
    Gender_impute_all = ifelse(is.na(Gender_impute_info), Sex_At_Birth, Gender_impute_all)
  )
table(gender.all.1[["Gender_impute_all"]])

In [None]:
# Subjects still labelled 'Unknown' (original note: 84) take Gender_Legal_Sex
# instead.
# %in% is used for the test because it never returns NA. With `==`, a missing
# Gender_impute_all puts an NA into the logical index and the subscripted
# assignment stops with "NAs are not allowed in subscripted assignments".
# Missing values are left as they are.
gender.all.1 <- gender.all.1 %>%
  mutate(
    Gender_impute_all = ifelse(Gender_impute_all %in% 'Unknown',
                               Gender_Legal_Sex,
                               Gender_impute_all)
  )
table(gender.all.1$Gender_impute_all)

In [None]:
# Final size and first rows of the combined table
gender.all.1 %>% dim()
gender.all.1 %>% head()