# CV Summaries

In [None]:
%%writefile cv_summary_task_matrix.R

# Build the dsub task matrix for the continuous-trait CV summary jobs.
# Each row is one trait; each column name is a dsub flag plus the variable
# name the job script will see, and each cell is the bucket path bound to it.
bucket <- 'gs://fc-secure-4cd71c22-4c0a-45c8-ab6a-aab6e509c6bd/data/'

# Assemble the single task row for one trait.
summary_task_row <- function(trait) {
    cv_dir <- paste0(bucket, 'Continuous/', trait, '_CV/')
    data.frame(
        '--input-recursive PACKAGE' = paste0(bucket, 'Package2/'),
        '--input UTILS' = paste0(bucket, 'Package2/utils.cpp'),
        '--input PLINK_RDS' = paste0(bucket, 'Analysis/arrays.rds'),
        '--input PLINK_BK' = paste0(bucket, 'Analysis/arrays.bk'),
        '--input MAP' = paste0(bucket, 'Analysis/aou_ukb_map.txt'),
        '--input PHENO' = paste0(bucket, 'Continuous/aou_', trait, '_pheno.tsv'),
        '--input COV' = paste0(bucket, 'Continuous/aou_', trait, '_cov.tsv'),
        '--input RELATED' = paste0(bucket, 'relatedness_flagged_samples.tsv'),
        '--input TRAIN_FOLD' = paste0(cv_dir, trait, '_train_fold.txt'),
        '--input ANCESTRY' = paste0(bucket, 'pca_ancestry.txt'),
        '--input PCA' = paste0(bucket, 'Analysis/aou_pca.sscore'),
        '--output OUT' = paste0(cv_dir, trait, '_summaries.RDS'),
        '--output Esum' = paste0(cv_dir, trait, '_Esum.RDS'),
        # Keep the flag-style column names verbatim (spaces and dashes).
        check.names = FALSE
    )
}

# Stack one row per trait.
tasks <- do.call(rbind, lapply(c('LDL', 'MCH'), summary_task_row))

# Echo the header so the generated flags can be checked by eye.
colnames(tasks)

# Tab-separated, unquoted, header only: the layout passed to --tasks.
write.table(
    tasks,
    file = "cv_summary_task_matrix.txt",
    sep = '\t',
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE
)

In [None]:
!Rscript cv_summary_task_matrix.R

In [None]:
%%bash --out Continuous_sum_batch

# Launch the continuous-trait CV summary jobs: one task per row of
# cv_summary_task_matrix.txt, each running aou_bigsummary_CV.R.
# Pattern reference:
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

# Provides the aou_dsub command used below.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'

# Collect the flags in an array so each one sits on its own line without
# backslash continuations.
submit_args=(
  --image "${image}"
  --disk-size 512
  --boot-disk-size 100
  --min-ram 10
  --min-cores 5
  --timeout "1d"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script aou_bigsummary_CV.R
  --tasks cv_summary_task_matrix.txt
)

aou_dsub "${submit_args[@]}"

# Phecodes (in R)

Adapted from "All-by-All Phecode Curation" workspace

In [None]:
suppressPackageStartupMessages({
    library(dplyr)
    library(data.table)
    library(tidyr)
})

In [None]:
# Load the phecode table from the working directory as a data.table.
data <- fread(file = 'mcc2_phecode_table.csv')

In [None]:
# Phenotype table with a constant FID column, person_id renamed to IID, and
# the three phecode columns renamed to trait labels.
# NOTE(review): FID = 0 with IID looks like a PLINK-style ID pair - confirm.
phecode_cols <- c(Breast = '174.1', Prostate = '185', T2D = '250.2')

pheno <- data %>%
    mutate(FID = 0) %>%
    select(FID, IID = person_id, all_of(phecode_cols))

# Compute Summaries

## Logistic

In [None]:
%%writefile cv_summary_task_matrix.R

# Build the dsub task matrix for the binary-trait (logistic) CV summary jobs.
# Each row is one trait; each column name is a dsub flag plus the variable
# name the job script will see, and each cell is the bucket path bound to it.

# This file is executed by a fresh `Rscript` process, so `bucket` must be
# defined here; nothing from other notebook cells or scripts is visible.
# Same value as the continuous-trait task-matrix script.
bucket <- 'gs://fc-secure-4cd71c22-4c0a-45c8-ab6a-aab6e509c6bd/data/'

# Assemble the single task row for one trait.
summary_task_row <- function(trait) {
    data.frame(
        '--input-recursive PACKAGE' = paste0(bucket, 'Package_NEW/'),
        '--input UTILS' = paste0(bucket, 'Package_NEW/utils.cpp'),
        '--input PLINK_RDS' = paste0(bucket, 'Analysis/arrays.rds'),
        '--input PLINK_BK' = paste0(bucket, 'Analysis/arrays.bk'),
        '--input MAP' = paste0(bucket, 'Analysis/aou_ukb_map.txt'),
        '--input PHENO' = paste0(bucket, 'Binary/', trait, '/', trait, '_allpheno.tsv'),
        '--input COV' = paste0(bucket, 'Binary/', trait, '/', trait, '_allcov.tsv'),
        '--input RELATED' = paste0(bucket, 'relatedness_flagged_samples.tsv'),
        '--input TRAIN_FOLD' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_train_fold.txt'),
        '--input ANCESTRY' = paste0(bucket, 'pca_ancestry.txt'),
        '--input PCA' = paste0(bucket, 'Analysis/aou_pca.sscore'),
        '--output OUT' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_summaries.RDS'),
        '--output Esum' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_Esum.RDS'),
        # Keep the flag-style column names verbatim (spaces and dashes).
        check.names = FALSE
    )
}

# Stack one row per trait (avoids growing a data frame inside a loop).
tasks <- do.call(rbind, lapply(c('T2D', 'Breast', 'Prostate'), summary_task_row))

# Echo the header so the generated flags can be checked by eye.
colnames(tasks)

# Tab-separated, unquoted, header only: the layout passed to --tasks.
write.table(tasks,
            file = "cv_summary_task_matrix.txt",
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [None]:
!Rscript cv_summary_task_matrix.R

In [None]:
%%bash --out Binary_sum_batch

# Launch the binary-trait CV summary jobs: one task per row of
# cv_summary_task_matrix.txt, each running aou_bigsummary_binary_CV.R.
# Pattern reference:
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

# Provides the aou_dsub command used below.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'

# Collect the flags in an array so each one sits on its own line without
# backslash continuations.
submit_args=(
  --image "${image}"
  --disk-size 512
  --boot-disk-size 100
  --min-ram 20
  --min-cores 5
  --timeout "1d"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script aou_bigsummary_binary_CV.R
  --tasks cv_summary_task_matrix.txt
)

aou_dsub "${submit_args[@]}"

## Linear

In [None]:
%%writefile cv_summary_task_matrix.R

# Build the dsub task matrix for the binary-trait CV summary jobs used by the
# "Linear" section of the notebook.
# Each row is one trait; each column name is a dsub flag plus the variable
# name the job script will see, and each cell is the bucket path bound to it.
#
# NOTE(review): the --output paths here are identical to those written by the
# "Logistic" summary task matrix, so running both sets of jobs writes to the
# same objects - confirm that overwrite is intended.

# This file is executed by a fresh `Rscript` process, so `bucket` must be
# defined here; nothing from other notebook cells or scripts is visible.
# Same value as the continuous-trait task-matrix script.
bucket <- 'gs://fc-secure-4cd71c22-4c0a-45c8-ab6a-aab6e509c6bd/data/'

# Assemble the single task row for one trait.
summary_task_row <- function(trait) {
    data.frame(
        '--input-recursive PACKAGE' = paste0(bucket, 'Package_NEW/'),
        '--input UTILS' = paste0(bucket, 'Package_NEW/utils.cpp'),
        '--input PLINK_RDS' = paste0(bucket, 'Analysis/arrays.rds'),
        '--input PLINK_BK' = paste0(bucket, 'Analysis/arrays.bk'),
        '--input MAP' = paste0(bucket, 'Analysis/aou_ukb_map.txt'),
        '--input PHENO' = paste0(bucket, 'Binary/', trait, '/', trait, '_allpheno.tsv'),
        '--input COV' = paste0(bucket, 'Binary/', trait, '/', trait, '_allcov.tsv'),
        '--input RELATED' = paste0(bucket, 'relatedness_flagged_samples.tsv'),
        '--input TRAIN_FOLD' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_train_fold.txt'),
        '--input ANCESTRY' = paste0(bucket, 'pca_ancestry.txt'),
        '--input PCA' = paste0(bucket, 'Analysis/aou_pca.sscore'),
        '--output OUT' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_summaries.RDS'),
        '--output Esum' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_Esum.RDS'),
        # Keep the flag-style column names verbatim (spaces and dashes).
        check.names = FALSE
    )
}

# Stack one row per trait (avoids growing a data frame inside a loop).
tasks <- do.call(rbind, lapply(c('T2D', 'Breast', 'Prostate'), summary_task_row))

# Echo the header so the generated flags can be checked by eye.
colnames(tasks)

# Tab-separated, unquoted, header only: the layout passed to --tasks.
write.table(tasks,
            file = "cv_summary_task_matrix.txt",
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [None]:
!Rscript cv_summary_task_matrix.R

In [None]:
%%bash --out Binary_sum_batch

# Launch the CV summary jobs for the "Linear" section: one task per row of
# cv_summary_task_matrix.txt, each running aou_bigsummary_CV.R.
# Pattern reference:
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

# Provides the aou_dsub command used below.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'

# Collect the flags in an array so each one sits on its own line without
# backslash continuations.
submit_args=(
  --image "${image}"
  --disk-size 512
  --boot-disk-size 100
  --min-ram 20
  --min-cores 5
  --timeout "1d"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script aou_bigsummary_CV.R
  --tasks cv_summary_task_matrix.txt
)

aou_dsub "${submit_args[@]}"

# Run SPLENDID-CMSA

## Logistic

In [None]:
%%writefile binary_task_matrix.R

# Build the dsub task matrix for the SPLENDID logistic CV runs.
# One row is emitted per (trait, fold) whose result file is not yet listed in
# the bucket, so re-running this script only schedules the missing folds.

# This file is executed by a fresh `Rscript` process, so `bucket` must be
# defined here; nothing from other notebook cells or scripts is visible.
# Same value as the continuous-trait task-matrix script.
bucket <- 'gs://fc-secure-4cd71c22-4c0a-45c8-ab6a-aab6e509c6bd/data/'

tasks <- data.frame(check.names = FALSE)

for (trait in c('T2D', 'Breast', 'Prostate')) {
    for (FOLD_ID in 1:5) {

        # List the fold's result object. ${WORKSPACE_BUCKET} is expanded by
        # the shell that system() starts. With intern = TRUE the listing is
        # returned as a character vector (empty when nothing is printed).
        # NOTE(review): if system() itself raises an error, try() returns a
        # length-1 "try-error" string, so the fold is treated as already done
        # - confirm that is the intended fallback.
        x <- try(system(paste0('gsutil ls ${WORKSPACE_BUCKET}/data/Binary/', trait, '_CV/Results_Logistic/cross_fold', FOLD_ID, '.RDS'),
                        intern = TRUE))

        # No listing output: schedule this fold.
        if (length(x) < 1) {
            cat(trait, FOLD_ID, '\n')
            tasks <- rbind(tasks,
                           data.frame('--env DFMAX' = 10000,
                                      '--env FOLD_ID' = FOLD_ID,
                                      '--input-recursive PACKAGE' = paste0(bucket, 'Package/'),
                                      '--input UTILS' = paste0(bucket, 'Package/utils.cpp'),
                                      '--input LIN' = paste0(bucket, 'Package/logistic.cpp'),
                                      '--input PLINK_RDS' = paste0(bucket, 'Analysis/arrays.rds'),
                                      '--input PLINK_BK' = paste0(bucket, 'Analysis/arrays.bk'),
                                      '--input MAP' = paste0(bucket, 'Analysis/aou_ukb_map_final.txt'),
                                      '--input PHENO' = paste0(bucket, 'Binary/', trait, '/', trait, '_allpheno.tsv'),
                                      '--input COV' = paste0(bucket, 'Binary/', trait, '/', trait, '_allcov.tsv'),
                                      '--input RELATED' = paste0(bucket, 'relatedness_flagged_samples.tsv'),
                                      '--input ANCESTRY' = paste0(bucket, 'pca_ancestry.txt'),
                                      '--input PCA' = paste0(bucket, 'Analysis/aou_pca.sscore'),
                                      '--input TRAIN_FOLD' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_train_fold.txt'),
                                      '--input TEST_FOLD' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_test_fold.txt'),
                                      '--input Esum' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_Esum.RDS'),
                                      '--input SUMMARY' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_summaries.RDS'),
                                      '--input META' = paste0(bucket, 'Binary/', trait, '/', trait, '_interaction_gwas.txt'),
                                      '--output-recursive OUT' = paste0(bucket, 'Binary/', trait, '_CV/Results_Logistic'),
                                      # Keep the flag-style column names verbatim.
                                      check.names = FALSE))
        }
    }
}

# Echo the header so the generated flags can be checked by eye.
colnames(tasks)

# Tab-separated, unquoted, header only: the layout passed to --tasks.
write.table(tasks,
            file = "binary_task_matrix.txt",
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [None]:
!Rscript binary_task_matrix.R

In [None]:
%%bash --out Binary_analysis_batch

# Launch the SPLENDID logistic CV jobs: one task per row of
# binary_task_matrix.txt, each running aou_bigL0L1_binary_CV.R.
# Pattern reference:
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

# Provides the aou_dsub command used below.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'

# Collect the flags in an array so each one sits on its own line without
# backslash continuations.
submit_args=(
  --image "${image}"
  --disk-size 300
  --min-ram 20
  --min-cores 1
  --timeout '1d'
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script "aou_bigL0L1_binary_CV.R"
  --tasks "binary_task_matrix.txt"
)

aou_dsub "${submit_args[@]}"

## Linear

In [None]:
%%writefile binary_task_matrix.R

# Build the dsub task matrix for the SPLENDID linear CV runs on binary traits.
# One row is emitted per (trait, fold) whose result file is not yet listed in
# the bucket, so re-running this script only schedules the missing folds.

# This file is executed by a fresh `Rscript` process, so `bucket` must be
# defined here; nothing from other notebook cells or scripts is visible.
# Same value as the continuous-trait task-matrix script.
bucket <- 'gs://fc-secure-4cd71c22-4c0a-45c8-ab6a-aab6e509c6bd/data/'

tasks <- data.frame(check.names = FALSE)

for (trait in c('T2D', 'Breast', 'Prostate')) {
    for (FOLD_ID in 1:5) {

        # List the fold's result object. ${WORKSPACE_BUCKET} is expanded by
        # the shell that system() starts. With intern = TRUE the listing is
        # returned as a character vector (empty when nothing is printed).
        # NOTE(review): if system() itself raises an error, try() returns a
        # length-1 "try-error" string, so the fold is treated as already done
        # - confirm that is the intended fallback.
        x <- try(system(paste0('gsutil ls ${WORKSPACE_BUCKET}/data/Binary/', trait, '_CV/Results_Linear/cross_fold', FOLD_ID, '.RDS'),
                        intern = TRUE))

        # No listing output: schedule this fold.
        if (length(x) < 1) {
            cat(trait, FOLD_ID, '\n')
            tasks <- rbind(tasks,
                           data.frame('--env DFMAX' = 10000,
                                      '--env FOLD_ID' = FOLD_ID,
                                      '--input-recursive PACKAGE' = paste0(bucket, 'Package/'),
                                      '--input UTILS' = paste0(bucket, 'Package/utils.cpp'),
                                      # NOTE(review): the linear run is handed logistic.cpp as LIN,
                                      # same as the logistic task matrix - confirm this is intended.
                                      '--input LIN' = paste0(bucket, 'Package/logistic.cpp'),
                                      '--input PLINK_RDS' = paste0(bucket, 'Analysis/arrays.rds'),
                                      '--input PLINK_BK' = paste0(bucket, 'Analysis/arrays.bk'),
                                      '--input MAP' = paste0(bucket, 'Analysis/aou_ukb_map_final.txt'),
                                      '--input PHENO' = paste0(bucket, 'Binary/', trait, '/', trait, '_allpheno.tsv'),
                                      '--input COV' = paste0(bucket, 'Binary/', trait, '/', trait, '_allcov.tsv'),
                                      '--input RELATED' = paste0(bucket, 'relatedness_flagged_samples.tsv'),
                                      '--input ANCESTRY' = paste0(bucket, 'pca_ancestry.txt'),
                                      '--input PCA' = paste0(bucket, 'Analysis/aou_pca.sscore'),
                                      '--input TRAIN_FOLD' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_train_fold.txt'),
                                      '--input TEST_FOLD' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_test_fold.txt'),
                                      '--input Esum' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_Esum.RDS'),
                                      '--input SUMMARY' = paste0(bucket, 'Binary/', trait, '_CV/', trait, '_summaries.RDS'),
                                      '--input META' = paste0(bucket, 'Binary/', trait, '/', trait, '_interaction_gwas.txt'),
                                      '--output-recursive OUT' = paste0(bucket, 'Binary/', trait, '_CV/Results_Linear'),
                                      # Keep the flag-style column names verbatim.
                                      check.names = FALSE))
        }
    }
}

# Echo the header so the generated flags can be checked by eye.
colnames(tasks)

# Tab-separated, unquoted, header only: the layout passed to --tasks.
write.table(tasks,
            file = "binary_task_matrix.txt",
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

In [None]:
!Rscript binary_task_matrix.R

In [None]:
%%bash --out Binary_analysis_batch

# Launch the SPLENDID linear CV jobs: one task per row of
# binary_task_matrix.txt, each running aou_bigL0L1_continuous_CV.R.
# Pattern reference:
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

# Provides the aou_dsub command used below.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'

# Collect the flags in an array so each one sits on its own line without
# backslash continuations.
submit_args=(
  --image "${image}"
  --disk-size 300
  --min-ram 20
  --min-cores 1
  --timeout '1d'
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script "aou_bigL0L1_continuous_CV.R"
  --tasks "binary_task_matrix.txt"
)

aou_dsub "${submit_args[@]}"

# Compile Results (in R)

In [None]:
# For each trait, average the selected logistic-model coefficient matrix over
# the CV folds and save the rows that have at least one non-zero coefficient.
# A fold is used only when its recorded interaction count matches the count
# recomputed from the coefficient matrix; otherwise it is reported as skipped.
for (trait in c('T2D', 'Breast', 'Prostate')) {
    print(trait)

    beta <- NULL
    n_folds <- 0
    for (fold in 1:5) {
        print(fold)
        cross <- readRDS(paste0(trait, '_CV/Results_Logistic/cross_fold', fold, '.RDS'))

        # Choose the entry of `cross` whose smallest metric is lowest overall,
        # then the position of that smallest metric within the entry.
        best_par <- which.min(unlist(lapply(cross, FUN = function(x) min(x$metrics, na.rm = TRUE))))
        best <- cross[[best_par]]
        best_ix <- which.min(best$metrics)

        # Coefficients as a numeric matrix; NAs and numerically-zero values
        # are set to exactly 0 so the non-zero counts below are meaningful.
        beta_cv <- data.matrix(best$beta)
        beta_cv[is.na(beta_cv)] <- 0
        beta_cv[abs(beta_cv) < 1e-20] <- 0

        # Rows with any non-zero value in columns 2:6.
        # NOTE(review): columns 2:6 are presumably the interaction terms and
        # column 1 the main effect - confirm against the fitting script.
        # drop = FALSE keeps a matrix even when there is a single row.
        n_interact <- sum(rowSums(abs(beta_cv[, 2:6, drop = FALSE]) > 0) > 0)

        if (best$nb_interact[best_ix] == n_interact) {
            rownames(beta_cv) <- best$snp

            # Recorded vs. recomputed counts, printed for a visual check.
            cat(best$nb_active[best_ix], sum(beta_cv[, 1] != 0), '\n')
            cat(best$nb_interact[best_ix], n_interact, '\n')

            # Element-wise running sum; this relies on every fold's matrix
            # having the same shape and row order.
            if (is.null(beta)) {
                beta <- beta_cv
            } else {
                beta <- beta + beta_cv
            }
            n_folds <- n_folds + 1
        } else {
            cat('SKIP ', fold, '\n')
        }

    }

    # With no usable fold there is nothing to average; report it and move on
    # to the next trait instead of failing on an empty result.
    if (n_folds == 0) {
        warning('No usable folds for ', trait, '; nothing saved', call. = FALSE)
        next
    }
    beta <- beta / n_folds

    ix_keep <- which(rowSums(abs(beta) > 0) > 0)

    # drop = FALSE keeps a matrix when exactly one row survives.
    beta <- beta[ix_keep, , drop = FALSE]
    print(dim(beta))
    print(colSums(beta != 0))
    saveRDS(beta, paste0(trait, '/', trait, '_beta_cv_logistic.RDS'))
}

In [None]:
# For each trait, average the selected linear-model coefficient matrix over
# the CV folds and save the rows that have at least one non-zero coefficient.
# A fold is used only when its recorded interaction count matches the count
# recomputed from the coefficient matrix; otherwise it is reported as skipped.
for (trait in c('T2D', 'Breast', 'Prostate')) {
    print(trait)

    beta <- NULL
    n_folds <- 0
    for (fold in 1:5) {
        print(fold)
        cross <- readRDS(paste0(trait, '_CV/Results_Linear/cross_fold', fold, '.RDS'))

        # Choose the entry of `cross` whose smallest metric is lowest overall,
        # then the position of that smallest metric within the entry.
        best_par <- which.min(unlist(lapply(cross, FUN = function(x) min(x$metrics, na.rm = TRUE))))
        best <- cross[[best_par]]
        best_ix <- which.min(best$metrics)

        # Coefficients as a numeric matrix; NAs and numerically-zero values
        # are set to exactly 0 so the non-zero counts below are meaningful.
        beta_cv <- data.matrix(best$beta)
        beta_cv[is.na(beta_cv)] <- 0
        beta_cv[abs(beta_cv) < 1e-20] <- 0

        # Rows with any non-zero value in columns 2:6.
        # NOTE(review): columns 2:6 are presumably the interaction terms and
        # column 1 the main effect - confirm against the fitting script.
        # drop = FALSE keeps a matrix even when there is a single row.
        n_interact <- sum(rowSums(abs(beta_cv[, 2:6, drop = FALSE]) > 0) > 0)

        if (best$nb_interact[best_ix] == n_interact) {
            rownames(beta_cv) <- best$snp

            # Recorded vs. recomputed counts, printed for a visual check.
            cat(best$nb_active[best_ix], sum(beta_cv[, 1] != 0), '\n')
            cat(best$nb_interact[best_ix], n_interact, '\n')

            # Element-wise running sum; this relies on every fold's matrix
            # having the same shape and row order.
            if (is.null(beta)) {
                beta <- beta_cv
            } else {
                beta <- beta + beta_cv
            }
            n_folds <- n_folds + 1
        } else {
            cat('SKIP ', fold, '\n')
        }

    }

    # With no usable fold there is nothing to average; report it and move on
    # to the next trait instead of failing on an empty result.
    if (n_folds == 0) {
        warning('No usable folds for ', trait, '; nothing saved', call. = FALSE)
        next
    }
    beta <- beta / n_folds

    ix_keep <- which(rowSums(abs(beta) > 0) > 0)

    # drop = FALSE keeps a matrix when exactly one row survives.
    beta <- beta[ix_keep, , drop = FALSE]
    print(dim(beta))
    print(colSums(beta != 0))
    saveRDS(beta, paste0(trait, '/', trait, '_beta_cv_linear.RDS'))
}