In [None]:
# load library
library(data.table)
packageVersion('data.table')
library(readr)
packageVersion('readr')
library(dplyr)
packageVersion('dplyr')
library(stringr)
packageVersion('stringr')
library(ggplot2)
packageVersion('ggplot2')
library(tidyr)
packageVersion('tidyr')
library(caret)
packageVersion('caret')

In [None]:
# Directory locations used by the rest of the notebook.
data.dir <- '...'             # holds genotype_ID.csv
rpdr.dir <- file.path('...')  # holds Demographic_data.csv
plink1.dir <- '...'           # holds the PLINK .eigenvec / .eigenval outputs

In [None]:
# Read the demographic table and preview the first rows.
dem <- file.path(rpdr.dir, 'Demographic_data.csv') %>%
  read.csv()
head(dem)

In [None]:
# Keep the subject identifier together with the race-related columns only.
race <- select(dem, Subject_Id, Race1, Race2, Race_Group, Race_White)
head(race)

In [None]:
# Column names for the eigenvector file: two ID columns followed by pca1..pca10.
columns <- c('fid', 'Sample.name', paste0('pca', 1:10))

In [None]:
# Read the whitespace-delimited eigenvector file (no header row) and keep the
# 12 named columns.
ld.pruned.pca <- read.table(
  file.path(plink1.dir, 'chrs_1_22_poly_SNPs_mind_005_maf_005_hwe_1e-05_pruned_pca.eigenvec'),
  sep = '',
  header = FALSE,
  col.names = columns
)[, 1:12]
head(ld.pruned.pca)

In [None]:
# Build the join key by gluing the first two columns together with '_'.
ld.pruned.pca$sample_info <- with(ld.pruned.pca, paste(fid, Sample.name, sep = '_'))
head(ld.pruned.pca)

In [None]:
# Read the genotype ID mapping and drop exact duplicate rows.
id <- file.path(data.dir, 'genotype_ID.csv') %>%
  read.csv() %>%
  distinct()
dim(id)
head(id)

In [None]:
# Rename the three columns by position, then attach race to every ID row.
# NOTE(review): the renaming is positional - assumes the file's column order is
# subject ID, sample ID, batch; confirm against the CSV.
names(id) <- c('Subject_Id', 'sample_info', 'batch')
race.698 <- left_join(id, race, by = 'Subject_Id')
head(race.698)

In [None]:
# PC1 vs PC2 for all genotyped samples, uncoloured.
ggplot(ld.pruned.pca, aes(x = pca1, y = pca2)) +
  geom_point() +
  theme_bw() +
  labs(x = 'PC1', y = 'PC2')

In [None]:
# Attach race labels to each PCA row via the sample_info key.
ld.pruned.pca.race <- left_join(ld.pruned.pca, race.698, by = 'sample_info')
head(ld.pruned.pca.race)

In [None]:
# Same scatter, coloured by Race_Group.
ggplot(ld.pruned.pca.race, aes(x = pca1, y = pca2, color = Race_Group)) +
  geom_point() +
  theme_bw() +
  labs(x = 'PC1', y = 'PC2')

In [None]:
# Same scatter, coloured by Race_White.
ggplot(ld.pruned.pca.race, aes(x = pca1, y = pca2, color = Race_White)) +
  geom_point() +
  theme_bw() +
  labs(x = 'PC1', y = 'PC2')

In [None]:
# Sample counts per Race_Group (NA values are not counted by table()).
table(ld.pruned.pca.race[['Race_Group']])

In [None]:
# Read the eigenvalue file (one value per row, no header).
eigenvalues <- read.table(
  file.path(plink1.dir, 'chrs_1_22_poly_SNPs_mind_005_maf_005_hwe_1e-05_pruned_pca.eigenval'),
  header = FALSE
)
dim(eigenvalues)
head(eigenvalues)

In [None]:
# Reduce the one-column table to a plain numeric vector.
eigenvalues <- eigenvalues[['V1']]
eigenvalues

# Share of each eigenvalue relative to the sum of the eigenvalues in the file
# (i.e. relative to the components that were read, not to total variance).
pve <- prop.table(eigenvalues)
pve

# Running total of those shares.
cumsum(pve)

In [None]:
# Scree plot: eigenvalue against component index, used to choose how many
# principal components to keep.
# seq_along() is used instead of 1:length(): with an empty vector 1:length()
# yields c(1, 0) and data.frame() fails on mismatched lengths, whereas
# seq_along() yields a zero-row frame.
scree_data <- data.frame(
  Component = seq_along(eigenvalues),
  Eigenvalue = eigenvalues
)

# Points joined by a line, one per component.
ggplot(scree_data, aes(x = Component, y = Eigenvalue)) +
  geom_point() +
  geom_line() +
  labs(title = 'Scree Plot', x = 'Component', y = 'Eigenvalue') +
  theme_minimal()

**Select 4 PCs for KNN prediction**

### KNN prediction of unknown races

In [None]:
# Recode the 'Unknown' label to NA; the tables before and after show the effect.
table(ld.pruned.pca.race[['Race_White']])
ld.pruned.pca.race$Race_White[which(ld.pruned.pca.race$Race_White == 'Unknown')] <- NA
table(ld.pruned.pca.race[['Race_White']])

In [None]:
# Training set: first four PCs plus the label, labelled samples only.
train <- ld.pruned.pca.race %>%
  filter(!is.na(Race_White)) %>%
  select('pca1', 'pca2', 'pca3', 'pca4', 'Race_White')
train

In [None]:
# Prediction set: the same columns for every sample, labelled or not.
test <- select(ld.pruned.pca.race, 'pca1', 'pca2', 'pca3', 'pca4', 'Race_White')
test

In [None]:
# The classifier needs a factor outcome.
train$Race_White <- factor(train$Race_White)

In [None]:
# Resampling scheme: 10-fold cross-validation, repeated 3 times.
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

In [None]:
# Fix the RNG seed so the fold assignment - and therefore the resampled
# accuracy stored in knn_fit$results - is the same on every run.
set.seed(123)

# KNN classifier on the first four PCs. Predictors are centered and scaled
# (mean 0, sd 1) before distances are computed; k (number of neighbours) is
# fixed at 5 rather than tuned.
knn_fit <- train(
  Race_White ~ pca1 + pca2 + pca3 + pca4,
  data = train,
  method = "knn",
  trControl = trctrl,
  preProcess = c("center", "scale"),
  tuneGrid = data.frame(k = 5)
)

In [None]:
# Predict a label for every sample (labelled and unlabelled) and store it
# next to the recorded label.
test_pred <- predict(knn_fit, newdata = test)
ld.pruned.pca.race$predict_race <- test_pred

In [None]:
# Flag agreement between recorded and predicted label as the strings
# 'TRUE' / 'FALSE'. Rows with no recorded label compare to NA and are left
# out of the table. The factor prediction is converted to character so the
# comparison is string-to-string.
ld.pruned.pca.race$compare <- ifelse(
  ld.pruned.pca.race$Race_White == as.character(ld.pruned.pca.race$predict_race),
  'TRUE',
  'FALSE'
)
table(ld.pruned.pca.race$compare)

In [None]:
# Agreement rate among labelled samples, computed from the data instead of
# hand-copied counts so it stays correct when the inputs change.
# This is in-sample accuracy: `test` contains the same labelled rows the model
# was trained on, so the figure is optimistic.
mean(ld.pruned.pca.race$compare == 'TRUE', na.rm = TRUE)
# Cross-validated performance from the resampling is the held-out estimate.
knn_fit$results

In [None]:
# Labelled samples whose prediction disagrees with the recorded label.
ld.pruned.pca.race %>% filter(compare == 'FALSE')

In [None]:
# Samples without a recorded label (the ones the prediction is needed for).
ld.pruned.pca.race %>% filter(is.na(Race_White))

In [None]:
# Store predictions as plain strings so they can be copied into a character column.
ld.pruned.pca.race <- ld.pruned.pca.race %>%
  mutate(predict_race = as.character(predict_race))

In [None]:
# Race_White_KNN = recorded label where present, KNN prediction where missing.
ld.pruned.pca.race$Race_White_KNN <- ld.pruned.pca.race$Race_White
ld.pruned.pca.race$Race_White_KNN[is.na(ld.pruned.pca.race$Race_White)] <-
  ld.pruned.pca.race$predict_race[is.na(ld.pruned.pca.race$Race_White)]

In [None]:
# Inspect the rows that had no recorded label.
filter(ld.pruned.pca.race, is.na(Race_White))

In [None]:
# Number of samples still without a label after filling.
sum(is.na(ld.pruned.pca.race[['Race_White_KNN']]))

In [None]:
# Label counts after and before filling.
table(ld.pruned.pca.race[['Race_White_KNN']])
table(ld.pruned.pca.race[['Race_White']])

In [None]:
# PC1 vs PC2 coloured by the filled-in label.
ggplot(ld.pruned.pca.race, aes(x = pca1, y = pca2, color = Race_White_KNN)) +
  geom_point() +
  theme_bw() +
  labs(x = 'PC1', y = 'PC2')

In [None]:
head(ld.pruned.pca.race)

In [None]:
# Check that the filled column agrees with the recorded label wherever one
# exists (rows without a recorded label compare to NA and are not counted).
with(ld.pruned.pca.race, table(Race_White == Race_White_KNN))
dim(ld.pruned.pca.race)

In [None]:
# Per-subject lookup of the filled-in label.
race.info <- select(ld.pruned.pca.race, Subject_Id, Race_White_KNN)
dim(race.info)

In [None]:
head(race.info)

In [None]:
# Attach the KNN-filled label to the full demographic cohort.
dim(race)
head(race)
race.info.all <- left_join(race, race.info, by = 'Subject_Id')
dim(race.info.all)
head(race.info.all)

In [None]:
sum(is.na(race.info.all[['Race_White_KNN']])) # missing because not all inds in data cohort have genotype data


In [None]:
# Recorded label counts across the full cohort.
table(race.info.all[['Race_White']])

In [None]:
# Subjects whose recorded label is 'Unknown'.
filter(race.info.all, Race_White == 'Unknown')

In [None]:
# Final label: the recorded Race_White, except that 'Unknown' is replaced by
# the KNN-filled value. Subjects with 'Unknown' and no genotype-based value
# end up as NA.
# %in% is used for the mask because it never returns NA; with `==`, any NA in
# Race_White puts NA into the logical index and the subscripted assignment
# fails ("NAs are not allowed in subscripted assignments").
race.info.all$Race_White_KNN_impute_missing <- race.info.all$Race_White
unknown.race <- race.info.all$Race_White %in% 'Unknown'
race.info.all$Race_White_KNN_impute_missing[unknown.race] <-
  race.info.all$Race_White_KNN[unknown.race]
table(race.info.all$Race_White_KNN_impute_missing)

In [None]:
# Subjects that still have no usable label.
# NOTE(review): original note here was "remove" - presumably these rows are
# dropped downstream; confirm.
sum(is.na(race.info.all$Race_White_KNN_impute_missing)) # remove

In [None]:
# Keep the identifier, the original race columns and the final imputed label.
race.info.all <- race.info.all %>%
  select(Subject_Id, Race1, Race2, Race_Group, Race_White, Race_White_KNN_impute_missing)
dim(race.info.all)
head(race.info.all)