In [2]:
library("dplyr")
library("tibble")
library("readr")
library("purrr")
library("modelr")

In [6]:
## Compute rmse function
compute_rmse = function(raw,fitted){
rmse = rep(0,ncol(raw))
for (i in 1:ncol(raw)){
  rmse[i] = ((fitted - raw)[,i])^2%>%mean(na.rm = TRUE)%>%sqrt() 
  }
return(rmse)
}

## Compute r2 function
compute_r2 = function(raw,fitted){
  r2 = rep(0,ncol(raw))
  for (j in 1:ncol(raw)){
   r2[j] = summary(lm( as.matrix(fitted[,j]) ~ as.matrix(raw[,j]) ))$adj.r.sq
  }
  return(r2)
}

## Compute r2 raw

compute_r2_raw = function(raw,fitted){
  r2 = rep(0,ncol(raw))
  for (j in 1:ncol(raw)){
    r2[j] =  cor(as.matrix(fitted[,j])[which(!is.na(raw[,j]))],raw[,j]%>%na.omit())^2
  }
  return(r2)
}

## Get P.value
compute_pval = function(raw,fitted){
  pval = rep(0,ncol(raw))
  for (k in 1:ncol(raw)){
    pval[k] = summary(lm( fitted[,k]%>%as.matrix ~ raw[,k]%>%as.matrix ))$coef[2,4]
  }
  return(pval)
}



###Functions to compute MAF and missing genotype rate
compute_maf <- function(geno){
  f <- mean(geno,na.rm = TRUE)/2
  return(min(f, 1-f))
}

compute_missing <- function(geno){
  miss <- sum(is.na(geno))/length(geno)
  return(miss)
}

mean_impute <- function(geno){
  f <- apply(geno, 2, function(x) mean(x,na.rm = TRUE))
  for (i in 1:length(f)) geno[,i][which(is.na(geno[,i]))] <- f[i]
  return(geno)
}

is_zero_variance <- function(x) {
  if (length(unique(x))==1) return(T)
  else return(F)
}
### Filter X matrix
filter_X <- function(X, missing_rate_thresh, maf_thresh) {
  rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
  if (length(rm_col)) X <- X[, -rm_col]
  rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
  if (length(rm_col)) X <- X[, -rm_col]
  rm_col <- which(apply(X, 2, is_zero_variance))
  if (length(rm_col)) X <- X[, -rm_col]
  return(mean_impute(X))
}
### Produce CV dataset
cv_data_gen = function(X,Y,times,test_prop){
# Merged the X and Y for producing testing and training set for modelr cv
cv_df_raw = cbind(X,Y)%>%as_tibble() 
cv_df = crossv_mc(cv_df_raw, times ,test = test_prop)%>%mutate(
  train_X = map(train,~as_tibble(.x)[1:ncol(X)]%>%as.matrix),
  train_Y = map(train,~as_tibble(.x)[(ncol(X)+1):(ncol(X)+ncol(Y))]%>%as.matrix),
  test_X = map(test,~as_tibble(.x)[1:ncol(X)]%>%as.matrix),
  test_Y = map(test,~as_tibble(.x)[(ncol(X)+1):(ncol(X)+ncol(Y))]%>%as_tibble)
)  

# Filter Train X with maf and missing, filter test X with the same col as Train X
cv_df = cv_df%>%mutate(
train_X = map(train_X,~filter_X(.x,0.1,0.01)),
test_X = map2(test_X,train_X,~.x%>%as_tibble()%>%select(colnames(.y))%>%as.matrix())
)
return(cv_df)
}
             

In [17]:
# Load Data
full_model = attach('/mnt/mfs/statgen/neuro-twas/JAZF1_uni/Whole_Blood/susie/GTEx_V8.ENSG00000153814.susie.model.RData')$full_model
X = full_model$X
Y = full_model$Y%>%as.tibble()

# Create cv dataset
times = 2
test_prop = 0.2
cv_df = cv_data_gen(X,Y,2,0.2)

The following object is masked _by_ .GlobalEnv:

    full_model


The following object is masked from file:/mnt/mfs/statgen/neuro-twas/JAZF1_uni/Whole_Blood/susie/GTEx_V8.ENSG00000153814.susie.model.RData (pos = 3):

    full_model


The following object is masked from file:/mnt/mfs/statgen/neuro-twas/JAZF1_uni/Whole_Blood/susie/GTEx_V8.ENSG00000153814.susie.model.RData (pos = 4):

    full_model


The following object is masked from file:/mnt/mfs/statgen/neuro-twas/JAZF1_uni/Whole_Blood/susie/GTEx_V8.ENSG00000153814.susie.model.RData (pos = 5):

    full_model




In [21]:
cv_df$test_Y

value
<dbl>
-0.87087001
-0.07347318
0.49547089
1.38314220
0.74124490
-0.24133417
-0.90980297
0.21815186
0.06215395
0.22200721

value
<dbl>
0.52543046
-0.08480167
-0.07347318
0.71679205
0.52112254
0.05084257
1.13672718
0.05461225
-0.01694113
0.21815186
