In [None]:
import os 

bucket = os.getenv("WORKSPACE_BUCKET")
bucket

!echo $GOOGLE_PROJECT

USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}

# DSUB

In [None]:
# Install (or upgrade) the dsub command-line tool that the aou_dsub wrapper calls.
# NOTE(review): the version is not pinned, so re-running this cell may pull a newer dsub.
!pip3 install --upgrade dsub

In [None]:
%%writefile ~/aou_dsub.bash
#!/bin/bash
# aou_dsub: thin wrapper around `dsub` that fills in the project, network,
# service account, region and a default log location, then forwards every
# argument given by the caller ("$@") to dsub unchanged.
function aou_dsub () {

  # Keep only the part of the owner's e-mail before the "@"; the shorter name
  # leaves more characters for the job name.
  local submitter
  submitter="$(cut -d@ -f1 <<< "${OWNER_EMAIL}")"

  # For AoU RWB projects network name is "network".
  local vpc_network=network
  local vpc_subnetwork=subnetwork

  # Account currently configured in gcloud; used as the job's service account.
  local run_as
  run_as="$(gcloud config get-value account)"

  # Default log path: one file per job/task/attempt under a timestamped folder.
  # The {job-name}-style placeholders are passed through literally to dsub.
  local stamp
  stamp="$(date +'%Y%m%d/%H%M%S')"
  local log_target="${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/${stamp}/{job-id}-{task-id}-{task-attempt}.log"

  # Defaults come first so that flags supplied by the caller follow them
  # on the command line.
  local -a defaults=(
      --provider google-cls-v2
      --user-project "${GOOGLE_PROJECT}"
      --project "${GOOGLE_PROJECT}"
      --image 'marketplace.gcr.io/google/ubuntu1804:latest'
      --network "${vpc_network}"
      --subnetwork "${vpc_subnetwork}"
      --service-account "${run_as}"
      --user "${submitter}"
      --regions us-central1
      --logging "${log_target}"
  )

  dsub "${defaults[@]}" "$@"
}

In [None]:
%%bash
# Make new shells load the aou_dsub function.
# The original appended unconditionally, so every re-run of this cell added
# another identical line to ~/.bashrc. Append only when the exact line is absent.
source_line="source ${HOME}/aou_dsub.bash"
touch ~/.bashrc
if ! grep -qxF -- "${source_line}" ~/.bashrc; then
  echo "${source_line}" >> ~/.bashrc
fi

# Create bigsnpr genotype files (`arrays.rds` / `arrays.bk`)

In [None]:
%%writefile Continuous/microarray_bigsnpr.R
#!/usr/bin/env Rscript

# Convert the microarray PLINK bed/bim/fam files into a bigsnpr file-backed
# object, keeping only the SNPs listed in MAP and the individuals listed in ID.
# Every path is read from an environment variable of the same name.
#
# BIM: gs://fc-aou-datasets-controlled/v7/microarray/plink/arrays.bim
# FAM: gs://fc-aou-datasets-controlled/v7/microarray/plink/arrays.fam
# BED: gs://fc-aou-datasets-controlled/v7/microarray/plink/arrays.bed

# MAP: Analysis/aou_ukb_map.txt   (must contain a column named `aou_code`)
# ID: Continuous/continuous_ids.txt   (columns V1/V2 are compared with FAM columns V1/V2)

# OUT_RDS: ${WORKSPACE_BUCKET}/data/Analysis/arrays.rds
# OUT_BK: ${WORKSPACE_BUCKET}/data/Analysis/arrays.bk

# Dump the whole environment into the job log (debugging aid).
Sys.getenv()

library(data.table)
library(dplyr)
library(bigsnpr)

# get index of common SNPs
print('SNPs')
full_bim = fread(Sys.getenv('BIM'))
common_snps = fread(Sys.getenv('MAP')) %>% pull(aou_code)
# Column positions in the bed file whose variant ID (bim column 2) is in MAP.
snp_keep = which(full_bim$V2 %in% common_snps)

cat('full bim:', nrow(full_bim), '\n')
cat('common SNPs:', length(common_snps), '\n')
cat('snp keep:', length(snp_keep), '\n')

rm(common_snps, full_bim); gc()

# get index of phenotype individuals
continuous_ids = fread(Sys.getenv('ID'))
full_fam = fread(Sys.getenv('FAM'))
# Match on the (V1, V2) pair. The original tested the two columns independently,
# which also keeps a sample whose V1 appears in one listed row and whose V2
# appears in a different one.
fam_key = paste(full_fam$V1, full_fam$V2, sep = '\t')
wanted_key = paste(continuous_ids$V1, continuous_ids$V2, sep = '\t')
id_keep = which(fam_key %in% wanted_key)

cat('full fam:', nrow(full_fam), '\n')
cat('continuous ids:', nrow(continuous_ids), '\n')
cat('id keep:', length(id_keep), '\n')

rm(continuous_ids, full_fam, fam_key, wanted_key); gc()

# convert to bigsnpr format
print("start converting")
# Strip only a trailing literal ".bk". The original pattern '.bk' was an
# unanchored regex in which "." matches any character, so it removed every
# "<any char>bk" occurrence anywhere in the path.
backing_name = function(filename) sub('\\.bk$', '', filename)

bigsnpr_name = snp_readBed2(
    bedfile=Sys.getenv('BED'),
    backingfile = backing_name(Sys.getenv('OUT_BK')),
    ind.row = id_keep,
    ind.col = snp_keep,
    ncores = 1
)
print(bigsnpr_name)

print("done converting")

# Try to attach the object stored at `rds_path` and print the genotype matrix
# dimensions, or 'nope' when attaching fails. Failure is reported, not fatal.
report_attach = function(label, rds_path) {
    print(label)
    plink = try(snp_attach(rds_path))
    if (inherits(plink, 'try-error')) {
        print('nope')
    } else {
        print(dim(plink$genotypes))
    }
    invisible(plink)
}

# Sanity checks: the result should be attachable both through the declared
# output path and through the path returned by snp_readBed2().
report_attach('OUT_RDS', Sys.getenv('OUT_RDS'))
report_attach('bigsnpr_name', bigsnpr_name)

In [None]:
%%bash --out microarray_bigsnpr_chr

# Submit Continuous/microarray_bigsnpr.R as a dsub job. Each --input/--output
# name below is the environment variable the R script reads its path from.
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'
plink_prefix='gs://fc-aou-datasets-controlled/v7/microarray/plink/arrays'
analysis_dir="${WORKSPACE_BUCKET}/data/Analysis"

job_args=(
  --image "${image}"
  --disk-size 512
  --boot-disk-size 100
  --min-ram 50
  --timeout "8h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --input BED="${plink_prefix}.bed"
  --input BIM="${plink_prefix}.bim"
  --input FAM="${plink_prefix}.fam"
  --input MAP="${analysis_dir}/aou_ukb_map.txt"
  --input ID="${analysis_dir}/unrelated_wgs_ids.txt"
  --output OUT_BK="${analysis_dir}/arrays.bk"
  --output OUT_RDS="${analysis_dir}/arrays.rds"
  --script Continuous/microarray_bigsnpr.R
)

aou_dsub "${job_args[@]}"

# GWAS

## Task matrix

In [None]:
%%writefile summary_task_matrix.R

# Build summary_task_matrix.txt: a tab-separated table with one row per trait,
# whose column headers are dsub flags (e.g. "--input PHENO") and whose cells
# are the bucket paths for that task.

# `bucket` was never defined in this script (it only exists as a Python
# variable in the notebook), so Rscript stopped with "object 'bucket' not found".
# Derive it from the environment; the "/data/" prefix matches the paths used by
# the other cells of this notebook (e.g. ${WORKSPACE_BUCKET}/data/Analysis/arrays.rds).
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')
if (!nzchar(workspace_bucket)) stop('WORKSPACE_BUCKET is not set')
bucket = paste0(workspace_bucket, '/data/')

tasks = data.frame(check.names = FALSE)

for (trait in c('LDL')) {
    # check.names = FALSE keeps the "--flag NAME" headers verbatim.
    tasks = rbind(tasks, 
                  data.frame(
                      '--input-recursive PACKAGE'=paste0(bucket, 'Package/'),
                      '--input UTILS'=paste0(bucket, 'Package/big_gglassoUtils.cpp'),
                      '--input PLINK_RDS'=paste0(bucket, 'Analysis/arrays.rds'),
                      '--input PLINK_BK'=paste0(bucket, 'Analysis/arrays.bk'),
                      '--input MAP'=paste0(bucket, 'Analysis/aou_ukb_map.txt'),
                      '--input PHENO'=paste0(bucket, 'Continuous/aou_', trait, '_pheno.tsv'),
                      '--input COV'=paste0(bucket, 'Continuous/aou_', trait, '_cov.tsv'),
                      '--input RELATED'=paste0(bucket, 'relatedness_flagged_samples.tsv'),
                      '--input ANCESTRY'=paste0(bucket, 'pca_ancestry.txt'),
                      '--input PCA'=paste0(bucket, 'Analysis/aou_pca.sscore'),
                      '--output OUT'=paste0(bucket, 'Continuous/', trait, '/', trait, '_summaries.RDS'),
                      '--output Esum'=paste0(bucket, 'Continuous/', trait, '/', trait, '_Esum.RDS'),
                      check.names = FALSE
                  ))
}

# Echo the generated flag headers to the log.
colnames(tasks)
write.table(tasks, 
            file="summary_task_matrix.txt", 
            row.names=FALSE, col.names=TRUE, 
            sep='\t', quote=FALSE)

In [None]:
# Run the R script written by the %%writefile cell above; it writes
# summary_task_matrix.txt into the current working directory.
!Rscript summary_task_matrix.R

## Run jobs

In [None]:
%%bash --out Continuous_sum_batch

# Submit aou_bigsummary_full.R once per row of summary_task_matrix.txt.
# The per-task inputs and outputs come from the columns of that file.
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'
log_dir="${WORKSPACE_BUCKET}/data/logging"

job_args=(
  --image "${image}"
  --disk-size 512
  --boot-disk-size 100
  --min-ram 10
  --timeout "1d"
  --logging "${log_dir}"
  --script aou_bigsummary_full.R
  --tasks summary_task_matrix.txt
)

aou_dsub "${job_args[@]}"

# Ancestry-specific GWAS

## PLINK script

In [None]:
%%writefile ancestry_gwas.sh
#!/bin/bash
# Linear-regression GWAS with plink2 for one ancestry group.
# Reads from the environment: input_path, pheno, id, snp, cov, out, anc.

# Stop on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Covariates taken from the covariate file: age, sex indicator and PC1..PC20.
covariate_names='age,female,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20'

plink_args=(
  --bfile "${input_path}/arrays"
  --pheno "${pheno}"
  --keep "${id}"
  --extract "${snp}"
  --covar "${cov}"
  --covar-name "${covariate_names}"
  --covar-variance-standardize
  --linear hide-covar cols=+a1freq
  # Result files are prefixed with the ancestry label.
  --out "${out}/${anc}"
)

plink2 "${plink_args[@]}"

## Run jobs

In [None]:
%%bash

# Submit one ancestry_gwas.sh job per (trait, ancestry) pair.
# Fix: the outer `for trait` loop had no closing `done`, so bash rejected the
# whole cell with "syntax error: unexpected end of file".

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

for trait in LDL; do
  echo "${trait}"

  for anc in afr amr eas eur mid sas; do
    # `anc` is passed both as an environment variable (used for the output
    # prefix in ancestry_gwas.sh) and to select the per-ancestry sample list.
    # NOTE(review): input_path2 is localized but not referenced by
    # ancestry_gwas.sh as written in this notebook.
    aou_dsub \
      --image biocontainer/plink2:alpha2.3_jan2020 \
      --disk-size 512 \
      --boot-disk-size 50 \
      --min-ram 50 \
      --min-cores 4 \
      --logging "${WORKSPACE_BUCKET}/data/logging" \
      --input-recursive input_path="gs://fc-aou-datasets-controlled/v7/microarray/plink" \
      --input-recursive input_path2="${WORKSPACE_BUCKET}/data/Continuous/${trait}" \
      --input snp="${WORKSPACE_BUCKET}/data/Analysis/aou_ukb_map.snp38" \
      --env anc="${anc}" \
      --input id="${WORKSPACE_BUCKET}/data/${anc}.id" \
      --input cov="${WORKSPACE_BUCKET}/data/Continuous/aou_${trait}_cov.tsv" \
      --input pheno="${WORKSPACE_BUCKET}/data/Continuous/aou_${trait}_pheno.tsv" \
      --output-recursive out="${WORKSPACE_BUCKET}/data/Continuous/${trait}" \
      --script ancestry_gwas.sh
  done
done


# Interaction GWAS

## PLINK script

In [None]:
%%writefile interaction_gwas.sh
#!/bin/bash
# Interaction GWAS with plink2 (`--linear interaction`).
# Reads from the environment: input_path, pheno, cov, id, snp, out.

# Stop on the first failing command and on any unset variable.
set -o errexit
set -o nounset

plink_args=(
  --bfile "${input_path}/arrays"
  --pheno "${pheno}"
  --covar "${cov}"
  --keep "${id}"
  --extract "${snp}"
  --linear interaction hide-covar
  --covar-variance-standardize
  # NOTE(review): these indices depend on the column order of the covariate
  # file; confirm that they select the intended terms.
  --parameters 1-23,26-30
  --tests 24-28
  # All result files share the fixed prefix "int".
  --out "${out}/int"
)

plink2 "${plink_args[@]}"

## Run jobs

In [None]:
%%bash

# Submit one interaction_gwas.sh job per trait, restricted to the samples
# listed in eurafr.id.

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

data_root="${WORKSPACE_BUCKET}/data"

for trait in LDL; do

  trait_dir="${data_root}/Continuous/${trait}"

  job_args=(
    --image biocontainer/plink2:alpha2.3_jan2020
    --disk-size 512
    --boot-disk-size 50
    --min-ram 50
    --min-cores 8
    --logging "${data_root}/logging"
    --input-recursive input_path="gs://fc-aou-datasets-controlled/v7/microarray/plink"
    --input-recursive input_path2="${trait_dir}"
    --input snp="${data_root}/Analysis/aou_ukb_map.snp38"
    --input id="${data_root}/eurafr.id"
    --input cov="${data_root}/Continuous/aou_${trait}_cov.tsv"
    --input pheno="${data_root}/Continuous/aou_${trait}_pheno.tsv"
    --output-recursive out="${trait_dir}"
    --script interaction_gwas.sh
  )

  aou_dsub "${job_args[@]}"

done

# SPLENDID summaries

## Task matrix

In [None]:
%%writefile summary_task_matrix.R

# Build summary_task_matrix.txt: a tab-separated table with one row per trait,
# whose column headers are dsub flags (e.g. "--input PHENO") and whose cells
# are the bucket paths for that task.
# NOTE(review): this cell writes the same file name as the earlier
# summary_task_matrix.R cell in this notebook.

# `bucket` was never defined in this script (it only exists as a Python
# variable in the notebook), so Rscript stopped with "object 'bucket' not found".
# Derive it from the environment; the "/data/" prefix matches the paths used by
# the other cells of this notebook (e.g. ${WORKSPACE_BUCKET}/data/Analysis/arrays.rds).
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')
if (!nzchar(workspace_bucket)) stop('WORKSPACE_BUCKET is not set')
bucket = paste0(workspace_bucket, '/data/')

tasks = data.frame(check.names = FALSE)

for (trait in c('LDL')) {
    # check.names = FALSE keeps the "--flag NAME" headers verbatim.
    tasks = rbind(tasks, 
                  data.frame(
                      '--input-recursive PACKAGE'=paste0(bucket, 'Package/'),
                      '--input UTILS'=paste0(bucket, 'Package/big_gglassoUtils.cpp'),
                      '--input PLINK_RDS'=paste0(bucket, 'Analysis/arrays.rds'),
                      '--input PLINK_BK'=paste0(bucket, 'Analysis/arrays.bk'),
                      '--input MAP'=paste0(bucket, 'Analysis/aou_ukb_map.txt'),
                      '--input PHENO'=paste0(bucket, 'Continuous/aou_', trait, '_pheno.tsv'),
                      '--input COV'=paste0(bucket, 'Continuous/aou_', trait, '_cov.tsv'),
                      '--input RELATED'=paste0(bucket, 'relatedness_flagged_samples.tsv'),
                      '--input ANCESTRY'=paste0(bucket, 'pca_ancestry.txt'),
                      '--input PCA'=paste0(bucket, 'Analysis/aou_pca.sscore'),
                      '--output OUT'=paste0(bucket, 'Continuous/', trait, '/', trait, '_summaries.RDS'),
                      '--output Esum'=paste0(bucket, 'Continuous/', trait, '/', trait, '_Esum.RDS'),
                      check.names = FALSE
                  ))
}

# Echo the generated flag headers to the log.
colnames(tasks)
write.table(tasks, 
            file="summary_task_matrix.txt", 
            row.names=FALSE, col.names=TRUE, 
            sep='\t', quote=FALSE)

## Run jobs

In [None]:
%%bash --out Continuous_sum_batch

# Launch the summary-statistics jobs: aou_bigsummary_full.R runs once for each
# row of summary_task_matrix.txt. stdout of this cell is captured in the
# notebook variable Continuous_sum_batch.
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

container='tacantong/polygenicriskscores:v1'

# Machine shape and limits.
resources=(--disk-size 512 --boot-disk-size 100 --min-ram 10 --timeout "1d")
# What to run and for which tasks.
workload=(--script aou_bigsummary_full.R --tasks summary_task_matrix.txt)

aou_dsub \
  --image "${container}" \
  "${resources[@]}" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  "${workload[@]}"

# SPLENDID training

## Task matrix

In [None]:
%%writefile analysis_task_matrix.R

# Build analysis_task_matrix.txt: one row per (trait, p-value threshold,
# lambda index) combination whose result file is not yet in the bucket.
# Column headers are dsub flags; cells are the values for that task.

# `bucket` was never defined in this script (it only exists as a Python
# variable in the notebook). Derive it from the environment; the "/data/"
# prefix matches the gsutil path checked in the loop below.
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')
if (!nzchar(workspace_bucket)) stop('WORKSPACE_BUCKET is not set')
bucket = paste0(workspace_bucket, '/data/')

# NOTE(review): the original referenced `dfmax` without defining it anywhere,
# so the script failed as soon as one task had to be added. The intended value
# is not visible in this notebook; it is now read from the DFMAX environment
# variable (e.g. `DFMAX=<value> Rscript analysis_task_matrix.R`) - confirm.
dfmax = Sys.getenv('DFMAX', unset = NA)

tasks = data.frame(check.names = FALSE)

for (trait in c('LDL')) {
    for (pval in c('0', '5e-8', '5e-6')) {
        for (lambda_ix in 1:5) {
            # List the expected result file; the shell expands ${WORKSPACE_BUCKET}.
            # An empty listing is taken to mean that the model has not been
            # fitted yet.
            x = try(system(paste0('gsutil ls ${WORKSPACE_BUCKET}/data/Continuous/', trait, '/Results/model_pval_', pval, '_lambda', lambda_ix, '.RDS'),
                  intern=T))
            if (length(x) == 0) {
                # DFMAX is only required once a task actually has to be written.
                if (is.na(dfmax)) stop('DFMAX is not set; export DFMAX=<value> before running this script')
                cat(trait, pval, lambda_ix, '\n')
                tasks = rbind(tasks, 
                              data.frame('--env PVAL'=pval,
                                         '--env LAMBDA_IX'=lambda_ix,
                                         '--env DFMAX'=dfmax,
                                         '--input-recursive PACKAGE'=paste0(bucket, 'Package/'),
                                         '--input UTILS'=paste0(bucket, 'Package/big_gglassoUtils.cpp'),
                                         '--input LIN'=paste0(bucket, 'Package/big_gglassoLin.cpp'),
                                         '--input PLINK_RDS'=paste0(bucket, 'Analysis/arrays.rds'),
                                         '--input PLINK_BK'=paste0(bucket, 'Analysis/arrays.bk'),
                                         '--input MAP'=paste0(bucket, 'Analysis/aou_ukb_map_final.txt'),
                                         '--input PHENO'=paste0(bucket, 'Continuous/aou_', trait, '_pheno.tsv'),
                                         '--input COV'=paste0(bucket, 'Continuous/aou_', trait, '_cov.tsv'),
                                         '--input RELATED'=paste0(bucket, 'relatedness_flagged_samples.tsv'),
                                         '--input ANCESTRY'=paste0(bucket, 'pca_ancestry.txt'),
                                         '--input PCA'=paste0(bucket, 'Analysis/aou_pca.sscore'),
                                         '--input SUMMARY'=paste0(bucket, 'Continuous/', trait, '/', trait, '_summaries.RDS'),
                                         '--input META'=paste0(bucket, 'Continuous/', trait, '/', trait, '_meta.txt'),
                                         '--output-recursive OUT'=paste0(bucket, 'Continuous/', trait, '/Results'),
                                         check.names = FALSE))
            }
        }        
    }
}


# Echo the generated flag headers to the log.
colnames(tasks)
write.table(tasks, 
            file="analysis_task_matrix.txt", 
            row.names=FALSE, col.names=TRUE, 
            sep='\t', quote=FALSE)

## Run jobs

In [None]:
%%bash --out Continuous_analysis_batch

# Submit aou_bigL0L1_continuous.R once per row of analysis_task_matrix.txt.
# stdout of this cell is captured in the notebook variable
# Continuous_analysis_batch.
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

docker_image='tacantong/polygenicriskscores:v1'

# The --script path is now quoted, matching the lasso submission cell, so the
# expansion is never word-split or glob-expanded.
aou_dsub \
  --image "${docker_image}" \
  --disk-size 300 \
  --min-ram 10 \
  --timeout '1d' \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script "${WORKSPACE_BUCKET}/data/aou_bigL0L1_continuous.R" \
  --tasks analysis_task_matrix.txt

# iPGS training

In [None]:
%%writefile lasso_task_matrix.R

# Build lasso_task_matrix.txt: a tab-separated table with one row per trait,
# whose column headers are dsub flags and whose cells are the bucket paths
# for that task.

# `bucket` was never defined in this script (it only exists as a Python
# variable in the notebook), so Rscript stopped with "object 'bucket' not found".
# Derive it from the environment; the "/data/" prefix matches the paths used by
# the other cells of this notebook (e.g. ${WORKSPACE_BUCKET}/data/Analysis/arrays.rds).
workspace_bucket = Sys.getenv('WORKSPACE_BUCKET')
if (!nzchar(workspace_bucket)) stop('WORKSPACE_BUCKET is not set')
bucket = paste0(workspace_bucket, '/data/')

tasks = data.frame(check.names = FALSE)

for (trait in c('LDL')) {
    # check.names = FALSE keeps the "--flag NAME" headers verbatim.
    tasks = rbind(tasks, 
                  data.frame('--input-recursive PACKAGE'=paste0(bucket, 'Package/'),
                             '--input UTILS'=paste0(bucket, 'Package/big_gglassoUtils.cpp'),
                             '--input LIN'=paste0(bucket, 'Package/big_gglassoLin.cpp'),
                             '--input PLINK_RDS'=paste0(bucket, 'Analysis/arrays.rds'),
                             '--input PLINK_BK'=paste0(bucket, 'Analysis/arrays.bk'),
                             '--input MAP'=paste0(bucket, 'Analysis/aou_ukb_map_final.txt'),
                             '--input PHENO'=paste0(bucket, 'Continuous/aou_', trait, '_pheno.tsv'),
                             '--input COV'=paste0(bucket, 'Continuous/aou_', trait, '_cov.tsv'),
                             '--input RELATED'=paste0(bucket, 'relatedness_flagged_samples.tsv'),
                             '--input ANCESTRY'=paste0(bucket, 'pca_ancestry.txt'),
                             '--input PCA'=paste0(bucket, 'Analysis/aou_pca.sscore'),
                             '--input SUMMARY'=paste0(bucket, 'Continuous/', trait, '/', trait, '_summaries.RDS'),
                             '--output-recursive OUT'=paste0(bucket, 'Continuous/', trait, '/Results'),
                             check.names = FALSE))
        
}

# Echo the generated flag headers to the log.
colnames(tasks)
write.table(tasks, 
            file="lasso_task_matrix.txt", 
            row.names=FALSE, col.names=TRUE, 
            sep='\t', quote=FALSE)

## Run jobs

In [None]:
%%bash --out Continuous_analysis_batch

# Submit aou_bigLasso_continuous.R once per row of lasso_task_matrix.txt.
# NOTE(review): the captured-output variable has the same name as the one used
# by the SPLENDID training submission cell, so running both overwrites it.
# https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/QueryOfTheMonthClub.html#november-2017

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

image='tacantong/polygenicriskscores:v1'
data_root="${WORKSPACE_BUCKET}/data"

job_args=(
  --image "${image}"
  --disk-size 300
  --min-ram 10
  --logging "${data_root}/logging"
  --script "${data_root}/aou_bigLasso_continuous.R"
  --tasks lasso_task_matrix.txt
)

aou_dsub "${job_args[@]}"