In [1]:
library("dplyr")
library("tibble")
library("readr")
library("purrr")
library("modelr")



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




“unable to access index for repository https://cran.r-project.org/src/contrib:
  cannot open URL 'https://cran.r-project.org/src/contrib/PACKAGES'”
“package ‘devtools’ is not available (for R version 3.6.3)”


In [2]:
# Define functions

## Compute rmse function
compute_rmse = function(raw,fitted){
rmse = rep(0,ncol(raw))
for (i in 1:ncol(raw)){
  rmse[i] = ((fitted - raw)[,i])^2%>%mean(na.rm = TRUE)%>%sqrt() 
  }
return(rmse)
}

## Compute r2 function
compute_r2 = function(raw,fitted){
  r2 = rep(0,ncol(raw))
  for (j in 1:ncol(raw)){
   r2[j] = summary(lm( as.matrix(fitted[,j]) ~ as.matrix(raw[,j]) ))$adj.r.sq
  }
  return(r2)
}

## Compute r2 raw

compute_r2_raw = function(raw,fitted){
  r2 = rep(0,ncol(raw))
  for (j in 1:ncol(raw)){
    r2[j] =  cor(as.matrix(fitted[,j])[which(!is.na(raw[,j]))],raw[,j]%>%na.omit())^2
  }
  return(r2)
}

## Get P.value
compute_pval = function(raw,fitted){
  pval = rep(0,ncol(raw))
  for (k in 1:ncol(raw)){
    pval[k] = summary(lm( fitted[,k]%>%as.matrix ~ raw[,k]%>%as.matrix ))$coef[2,4]
  }
  return(pval)
}



###Functions to compute MAF and missing genotype rate
compute_maf <- function(geno){
  f <- mean(geno,na.rm = TRUE)/2
  return(min(f, 1-f))
}

compute_missing <- function(geno){
  miss <- sum(is.na(geno))/length(geno)
  return(miss)
}

mean_impute <- function(geno){
  f <- apply(geno, 2, function(x) mean(x,na.rm = TRUE))
  for (i in 1:length(f)) geno[,i][which(is.na(geno[,i]))] <- f[i]
  return(geno)
}

is_zero_variance <- function(x) {
  if (length(unique(x))==1) return(T)
  else return(F)
}
### Filter X matrix
filter_X <- function(X, missing_rate_thresh, maf_thresh) {
  rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
  if (length(rm_col)) X <- X[, -rm_col]
  rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
  if (length(rm_col)) X <- X[, -rm_col]
  rm_col <- which(apply(X, 2, is_zero_variance))
  if (length(rm_col)) X <- X[, -rm_col]
  return(mean_impute(X))
}

### Produce CV dataset
cv_data_gen = function(X,Y,times,test_prop){
# Merged the X and Y for producing testing and training set for modelr cv
cv_df_raw = cbind(X,Y)%>%as_tibble() 
cv_df = crossv_mc(cv_df_raw, times ,test = test_prop)%>%mutate(
  train_X = map(train,~as_tibble(.x)[1:ncol(X)]%>%as.matrix),
  train_Y = map(train,~as_tibble(.x)[(ncol(X)+1):(ncol(X)+ncol(Y))]%>%as.matrix),
  test_X = map(test,~as_tibble(.x)[1:ncol(X)]%>%as.matrix),
  test_Y = map(test,~as_tibble(.x)[(ncol(X)+1):(ncol(X)+ncol(Y))]%>%as_tibble)
)  

# Filter Train X with maf and missing, filter test X with the same col as Train X
cv_df = cv_df%>%mutate(
train_X = map(train_X,~filter_X(.x,0.1,0.01)),
test_X = map2(test_X,train_X,~.x%>%as_tibble()%>%select(colnames(.y))%>%as.matrix())
)
return(cv_df)
}



In [3]:
# Load Data
full_model = attach('/home/hs3163/Project/Alz/Alz_AC_SNP_CV/susie/geneTpmResidualsAgeGenderAdj_rename.ENSG00000264538.susie.model.RData')
full_model = full_model$fitted1
X = full_model$X
Y = full_model$Y$V6%>%as_tibble()

In [33]:
# Create cv dataset

cv_df = cv_data_gen(X,Y,2,0.2)
cv_df[1,]$test_X[[1]] -> D

value
<dbl>
0.16055100
0.05537180
-0.22287500
0.03545450
0.01807610
0.15978500
-0.32289500
-0.00355084
0.13177000
0.12971200


In [25]:
# Actual cv

cv_df = cv_df%>%mutate(


## Do susie

  susie = pmap(list(train_X,train_Y),function(first,second)(

    susie(first, second,
    L = 10,
    estimate_residual_variance = TRUE, 
    estimate_prior_variance = FALSE,
    scaled_prior_variance = 0.1)
    )))

ERROR: Error in susie(first, second, L = 10, estimate_residual_variance = TRUE, : could not find function "susie"


In [None]:




# Extract data 

cv_df = cv_df%>%mutate(
  weight = map(susie,~
  (coef(.x)[2:length(coef(.x))])
  ),
  test_fitted = map2(susie,test_X,~predict(.x,.y)%>%as_tibble),
  rmse = map2(test_Y,test_fitted,~compute_rmse(.x,.y)),
  r2 = map2(test_Y,test_fitted,~compute_r2(.x,.y)),
  r2_raw = map2(test_Y,test_fitted,~compute_r2_raw(.x,.y)),
  pval = map2(test_Y,test_fitted,~compute_pval(.x,.y))
)

# Calculate metrics

mean_rmse = cv_df%>%pull(rmse)%>%as.data.frame()%>%t()%>%as_tibble()%>%colMeans()
mean_r2 = cv_df%>%pull(r2)%>%as.data.frame()%>%t()%>%as_tibble()%>%colMeans()
mean_r2_raw = cv_df%>%pull(r2_raw)%>%as.data.frame()%>%t()%>%as_tibble()%>%colMeans()
mean_pval = cv_df%>%pull(pval)%>%as.data.frame()%>%t()%>%as_tibble()%>%colMeans()


# Save metrics
full_model$rmse = mean_rmse
full_model$r2 = mean_r2 
full_model$r2_raw = mean_r2_raw    
full_model$pval = mean_pval    
fitted1 = full_model
# Save the CV data
save(cv_df,file = "/home/hs3163/Project/Alz/Alz_AC_SNP_CV/result/geneTpmResidualsAgeGenderAdj_rename.ENSG00000264538.susie.model.cv_diag.RData")

#Output
save(fitted1,file = "/home/hs3163/Project/Alz/Alz_AC_SNP_CV/result/geneTpmResidualsAgeGenderAdj_rename.ENSG00000264538.susie.model.cv.RData")

