# Common settings

In [None]:
## Python Package Import
import sys
import os
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
## Ensure the dsub command-line tool is up to date.
# NOTE(review): no version is pinned, so re-running this cell may install a newer dsub release.
!pip3 install --upgrade dsub

In [None]:
## Pandas display setting (this line does not configure dsub itself).
# NOTE(review): a max_colwidth of 0 presumably disables truncation of long cell values — confirm against the installed pandas version.
pd.set_option('display.max_colwidth', 0)

In [None]:
## Workspace bucket URL, read from the WORKSPACE_BUCKET environment variable.
# A KeyError is raised here if the variable is not set.
my_bucket = os.environ['WORKSPACE_BUCKET']

In [None]:
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env USER_NAME={USER_NAME}

In [None]:
# Analysis identifier ("radix"). It is embedded in the phenotype, covariate and
# SNP-list file names and in the dsub job names used further down.
# Exactly one assignment must be active; the commented-out lines are presumably
# the identifiers of other analyses (TODO confirm before deleting any of them).
#radix='antidiabetic.HbA1c'
#radix='antidiabetic.HbA1cmyboxcox'
#radix='antidiabetic.HbA1cmyboxcox01'
#radix='antihypertensive.SBP'
#radix='antihypertensive.SBPmyboxcox'
#radix='antihypertensive.SBPmyboxcox01'
#radix='antihypertensive.SBPmyboxcoxv2'
#radix='hypolipidemics.LDL'
#radix='hypolipidemics.LDLmyboxcox'
#radix='hypolipidemics.LDLmyboxcox01'
#radix='hypolipidemics.LDLdelta'
#radix='hypolipidemics.LDLlogratio'
#radix='hypolipidemics.LDLmyboxcoxv2'
#radix='SBPnodrugmyboxcox-05nge4'
#radix='MAPnodrugmyboxcox0nge4'
#radix='PhysicalMeasurementsv2_MAPmyboxcoxnge2' # --phenoExcludeList measurement_datetime
#radix='LabsMeasurementsv2_MAPmyboxcox10yrnge4'
radix='LabsMeasurementsv2_MAPmyboxcox5yrnge4'
#radix='PhysicalMeasurements_HRmyboxcoxnge2'# --phenoExcludeList measurement_datetime #suspended after step1
#radix='LabsMeasurements_HRmyboxcox10yrnge4' #suspended after step1

# Save this Python variable as an environment variable so that it's easier to use within %%bash cells.
%env radix={radix}

In [None]:
# Trait type. The regenie dsub scripts below pass it to regenie as the flag --${trait},
# so the value must be a regenie trait flag name: 'qt' (or 'bt', the alternative noted here).
trait='qt' #bt

# Save this Python variable as an environment variable so that it's easier to use within %%bash cells.
%env trait={trait}

## Run below once per workspace

In [None]:
%%writefile ~/aou_dsub.bash
#!/bin/bash
#
# aou_dsub: wrapper around dsub for the All of Us Researcher Workbench (AoU RWB).
#
# This shell function passes reasonable defaults for several dsub parameters, while
# allowing the caller to override any of them (caller flags are appended last).
# It creates a nice folder structure within the workspace bucket for dsub log files.
#
# --[ Parameters ]--
# any valid dsub parameter flag
#
# --[ Returns ]--
# the job id of the job created by dsub (printed by dsub)
#
# --[ Environment read ]--
# OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET
#
# --[ Details ]--
# The first five parameters below should always be those values when running on AoU RWB.
# Feel free to change the values for --user, --regions, --logging, and --image if you like.
#
# Note that we insert some job data into the logging path.
# https://github.com/DataBiosphere/dsub/blob/main/docs/logging.md#inserting-job-data

function aou_dsub () {

  # Get a shorter username to leave more characters for the job name.
  # Declared separately from the assignment so that `local` does not mask the
  # exit status of the command substitution.
  local DSUB_USER_NAME
  DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

  # For AoU RWB projects the network is named "network" and the subnetwork "subnetwork".
  dsub \
      --provider google-batch \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --image 'ubuntu:latest' \
      --network "global/networks/network" \
      --subnetwork "regions/us-central1/subnetworks/subnetwork" \
      --service-account "$(gcloud config get-value account)" \
      --use-private-address \
      --user "${DSUB_USER_NAME}" \
      --regions us-central1 \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

In [None]:
%%bash

# Make every new shell load aou_dsub. The line is appended to ~/.bashrc only when it
# is not already present, so re-running this cell does not pile up duplicates.
source_line="source ${HOME}/aou_dsub.bash"
grep -qxF "${source_line}" ~/.bashrc 2>/dev/null || echo "${source_line}" >> ~/.bashrc

# dsub for regenie GWAS

## Prepare LD pruned all chr for regenie step 1

In [None]:
%%bash

# Abort at the first failing command. Without this, a failed download or plink2
# step was silently ignored and the loop carried on to the next chromosome.
set -o errexit
set -o pipefail

# Source folder of the per-chromosome pgen filesets.
src_dir=gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/pgen

for chromo in {1..22}
do
echo "${chromo}"

in_prefix="acaf_threshold.chr${chromo}"

# Download the three files of this chromosome's fileset into the working directory.
for ext in pgen psam pvar
do
  gsutil -u "${GOOGLE_PROJECT}" -m cp "${src_dir}/${in_prefix}.${ext}" .
done

# Pass 1: apply the variant/sample filters and write the passing variant IDs
# (qc_pass.snplist) and sample IDs (qc_pass.id, no header line).
plink2 --pfile "${in_prefix}" \
  --maf 0.01 --mac 100 --geno 0.1 --hwe 1e-5 0.001 \
  --mind 0.1 \
  --write-snplist --write-samples --no-id-header \
  --set-missing-var-ids @:# \
  --out qc_pass

# Pass 2: LD-prune the QC-passing variants; the retained IDs go to ldpruned_snplist.prune.in.
plink2 --pfile "${in_prefix}" \
  --extract qc_pass.snplist \
  --keep qc_pass.id \
  --indep-pairwise 200kb 1 0.5 \
  --set-missing-var-ids @:# \
  --out ldpruned_snplist

# Pass 3: write the pruned fileset for this chromosome.
plink2 --pfile "${in_prefix}" \
  --extract ldpruned_snplist.prune.in \
  --keep qc_pass.id \
  --make-pgen \
  --set-missing-var-ids @:# \
  --out "acaf_threshold.ldpruned.chr${chromo}"

# Remove the downloaded input and the intermediate files before the next chromosome.
rm "${in_prefix}.pgen"
rm "${in_prefix}.psam"
rm "${in_prefix}.pvar"
rm qc_pass.*
rm ldpruned_snplist.*

done

`--pmerge-list` does not work here, so the per-chromosome filesets are merged one at a time with `--pmerge` in the next cell.

In [None]:
%%bash

# Abort at the first failing command. Without this, a failed merge was followed by
# the `rm` below, which deleted the running merge and let the loop keep failing.
set -o errexit
set -o pipefail

# The running merge is called foo.<N> (chromosomes 1..N merged).
# Seed it with symlinks to chromosome 1; -f makes the cell re-runnable.
for ext in pgen psam pvar
do
    ln -sf "acaf_threshold.ldpruned.chr1.${ext}" "foo.1.${ext}"
done

for c in {1..21}
do
    d=$((c+1))
    # Merge chromosome d into the running merge and write it as foo.<d>.
    plink2 --pfile "foo.${c}" --pmerge "acaf_threshold.ldpruned.chr${d}" --mind 0.1 \
      --make-pgen --out "foo.${d}"
    # Drop the previous running merge and the foo.<d>-merge.* files
    # (presumably plink2's intermediate merge output — TODO confirm).
    rm -f "foo.${c}".{pgen,psam,pvar} "foo.${d}-merge".{pgen,psam,pvar}
done

# Give the final merge its permanent name.
for ext in pgen psam pvar
do
    mv "foo.22.${ext}" "acaf_threshold.ldpruned.chrall.${ext}"
done

In [None]:
%%bash

# Run plink2 --missing on the merged, LD-pruned fileset.
merged_fileset=acaf_threshold.ldpruned.chrall
plink2 --pfile "${merged_fileset}" --missing

In [None]:
%%bash

# Export the merged, LD-pruned fileset as BGEN v1.3 under the same file prefix,
# keeping only variants with at most two alleles (--max-alleles 2).
fileset=acaf_threshold.ldpruned.chrall

plink2 \
    --pfile "${fileset}" \
    --max-alleles 2 \
    --export bgen-1.3 \
    --out "${fileset}"

In [None]:
%%bash

# Keep the exported .sample file as *.sample.original and write a corrected copy:
# on every line that starts with "0 <digits>", the leading 0 is replaced by the
# digits that follow it, so the first two columns become identical.
mv acaf_threshold.ldpruned.chrall.sample acaf_threshold.ldpruned.chrall.sample.original
perl -ne 's/^0 (\d+)/$1 $1/; print' < acaf_threshold.ldpruned.chrall.sample.original > acaf_threshold.ldpruned.chrall.sample

`gcloud storage cp` does not work here, so the files are uploaded with `gsutil cp` instead.

In [None]:
!gsutil cp acaf_threshold.ldpruned.chrall.bgen $my_bucket"/data/"

In [None]:
!gsutil cp acaf_threshold.ldpruned.chrall.sample $my_bucket"/data/"

## regenie step 1

In [None]:
# Build qc_pass.id (used as the plink2 --keep list in the next cell) from the local
# covariate file: every line containing the text "NA" is dropped.
# NOTE(review): this is a plain substring match, so a line with "NA" inside any other
# value (or in the header) is dropped as well.
! grep -v NA < aou_{radix}_covariates.txt > qc_pass.id

In [None]:
%%bash

# Variant QC restricted to the samples in qc_pass.id; the passing variant IDs are
# written to qc_pass.snplist.
# --no-id-header is now spelled with two dashes like every other flag here.
plink2 --bgen acaf_threshold.ldpruned.chrall.bgen ref-unknown \
  --sample acaf_threshold.ldpruned.chrall.sample \
  --maf 0.01 --mac 100 --geno 0.1 --hwe 1e-5 0.001 \
  --write-snplist --no-id-header \
  --keep qc_pass.id \
  --out qc_pass

In [None]:
# Upload the QC-passing variant list under an analysis-specific name; the regenie
# step 1 submission reads it from this bucket path as snplist_file.
! gsutil cp qc_pass.snplist {my_bucket}/data/aou_{radix}.qc_pass.snplist

In [None]:
## MODIFY FOR FULL DATA RUN 
# Use underscores, not whitespace since it will become part of the bucket path.
JOB_NAME=f'regenie_{radix}' ## add name in quotes, copy name in quotes to 4.1

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}

In [None]:
## Analysis Results Folder 
line_count_results_folder = os.path.join(
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'))

line_count_results_folder

In [None]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

In [None]:
OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

In [None]:
%%writefile ~/Regenie_GWAS_NewPlinkBgens.sh

# regenie step 1, run inside a dsub job.
# Read from the environment (set by dsub --input / --env / --output-recursive):
#   regenie_file, bgen_file, sample_file, pheno_file, cov_file, snplist_file,
#   trait, prefix, OUTPUT_PATH
set -o errexit
set -o pipefail

# Unpack the regenie binary shipped as a zip archive.
cp "${regenie_file}" regenie.zip
unzip regenie.zip
mv regenie_v4.1.gz_x86_64_Linux regenie
chmod 700 regenie

step1_out="${prefix}_step1_out"

# ${trait} is expanded into the trait flag (--qt or --bt).
./regenie \
    --step 1 \
    --bgen "${bgen_file}" \
    --sample "${sample_file}" \
    --phenoFile "${pheno_file}" --phenoExcludeList measurement_datetime \
    --covarFile "${cov_file}" \
    --extract "${snplist_file}" \
    --"${trait}" \
    --bsize 1000 \
    --lowmem \
    --lowmem-prefix regenie_tmp_preds \
    --verbose \
    --out "${step1_out}"

# Bundle every step 1 output file into one archive and hand it to dsub for upload.
zip "${prefix}_step1.zip" "${step1_out}"*
mv "${prefix}_step1.zip" "${OUTPUT_PATH}"

In [None]:
## Modify to personal gs bucket
!gsutil cp /home/jupyter/Regenie_GWAS_NewPlinkBgens.sh {my_bucket}/data/dsub/

In [None]:
# List the shell scripts currently in the bucket's data/dsub/ folder to confirm the upload.
!gsutil ls {my_bucket}/data/dsub/*.sh

The next cell contains the dsub command that will kick off the Regenie GWAS once it is run. Right now, the ```--input``` variable options below point to the paths of the input files used in the original v7 LDL-C Regenie GWAS. If you would like to re-create the original run of the v7 LDL GWAS, change nothing (other than the lower and upper bounds depending on your test region) and run the next cell as is.

If you just created new phenotype and covariate files using notebooks 1-3 and section 2 above, you will need to change the ```--input pheno_file``` and ```--input cov_file``` variable paths to point to your new files in your bucket. 

If you just ran sections 3-6 and have new bgen files, you will need to change the ```--input bgen_file``` and ```--input sample_file``` variable paths to point to your new bgen files in your bucket.

2026.10.24 invalid: --image "gcr.io/bick-aps2/ghcr.io/rgcgithub/regenie/regenie:v3.2.4.gz"

In [None]:
# Check whether the regenie container image that the note above marks as invalid can still be described.
! gcloud container images describe "gcr.io/bick-aps2/ghcr.io/rgcgithub/regenie/regenie:v3.2.4.gz"

In [None]:
# Same check for the image that the regenie submissions below pass to --image.
! gcloud container images describe "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.16"

NOT CHECKED:
You can also search Docker Hub for a suitable image, e.g. `--image shengqh/regenie4:20241127`. Run a small test first before using such an image.

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Defines the aou_dsub wrapper (written to ~/aou_dsub.bash in the
# "Run below once per workspace" section of this notebook).
source ~/aou_dsub.bash

# Submit regenie step 1. Whatever is printed on stdout (the job id returned by
# aou_dsub) is captured into the Python variable LINE_COUNT_JOB_ID by --out above.
# The last option deliberately has no trailing backslash.
aou_dsub \
    --name "${JOB_NAME}" \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
    --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.16" \
    --machine-type "n2-standard-4" \
    --boot-disk-size 1000 \
    --script "${WORKSPACE_BUCKET}/data/dsub/Regenie_GWAS_NewPlinkBgens.sh" \
    --input bgen_file="${WORKSPACE_BUCKET}/data/acaf_threshold.ldpruned.chrall.bgen" \
    --input sample_file="${WORKSPACE_BUCKET}/data/acaf_threshold.ldpruned.chrall.sample" \
    --input pheno_file="${WORKSPACE_BUCKET}/data/aou_${radix}_QT.txt" \
    --input cov_file="${WORKSPACE_BUCKET}/data/aou_${radix}_covariates.txt" \
    --input snplist_file="${WORKSPACE_BUCKET}/data/aou_${radix}.qc_pass.snplist" \
    --input regenie_file="${WORKSPACE_BUCKET}/data/regenie_v4.1.gz_x86_64_Linux.zip" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}" \
    --env prefix="aou_${radix}" \
    --env trait="${trait}"


In [None]:
# Export the stdout captured from the submission cell (LINE_COUNT_JOB_ID) as the
# environment variable JOB_ID so that it's easier to use within %%bash cells.
%env JOB_ID={LINE_COUNT_JOB_ID}

In [None]:
%%bash

# Full status of the step 1 job in JSON form, whatever its state ('*').
dstat \
    --provider google-batch \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs "${JOB_ID}" \
    --users "${USER_NAME}" \
    --status '*' \
    --format json \
    --full

## regenie step 2

In [None]:
## MODIFY FOR FULL DATA RUN 
# Use underscores, not whitespace since it will become part of the bucket path.
JOB_NAME=f'regenie_{radix}' ## add name in quotes, copy name in quotes to 4.1

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}

In [None]:
## Analysis Results Folder 
line_count_results_folder = os.path.join(
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'))

line_count_results_folder

In [None]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

In [None]:
OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

In [None]:
%%writefile ~/Regenie_GWAS_NewPlinkBgens2.sh

# regenie step 2 for one chromosome, run inside a dsub job.
# Read from the environment (set by dsub --input / --env / --output-recursive):
#   regenie_file, step1zip_file, bgen_file, sample_file, pheno_file, cov_file,
#   trait, prefix, chrom, OUTPUT_PATH
set -o pipefail
set -o errexit

# Unpack the regenie binary shipped as a zip archive.
cp "${regenie_file}" regenie.zip
unzip regenie.zip
mv regenie_v4.1.gz_x86_64_Linux regenie
chmod 700 regenie

# Unpack the step 1 outputs; --pred below expects ${prefix}_step1_out_pred.list among them.
cp "${step1zip_file}" foo.zip
unzip foo.zip

# ${trait} is expanded into the trait flag (--qt or --bt).
./regenie \
    --step 2 \
    --bgen   "${bgen_file}" \
    --sample "${sample_file}" \
    --phenoFile "${pheno_file}" --phenoExcludeList measurement_datetime \
    --covarFile "${cov_file}" \
    --"${trait}" \
    --firth --approx --pThresh 0.01 \
    --pred "${prefix}"_step1_out_pred.list \
    --bsize 400 \
    --verbose \
    --ref-first \
    --out "${prefix}"_step2_out_chr"${chrom}"

# Hand every step 2 output file to dsub for upload. OUTPUT_PATH is quoted, as in the step 1 script.
mv "${prefix}"_step2_out_chr"${chrom}"* "${OUTPUT_PATH}"

In [None]:
## Modify to personal gs bucket
!gsutil cp /home/jupyter/Regenie_GWAS_NewPlinkBgens2.sh {my_bucket}/data/dsub/

In [None]:
# List the shell scripts currently in the bucket's data/dsub/ folder to confirm the upload.
!gsutil ls {my_bucket}/data/dsub/*.sh

The next cell contains the dsub command that will kick off the Regenie GWAS once it is run. Right now, the ```--input``` variable options below point to the paths of the input files used in the original v7 LDL-C Regenie GWAS. If you would like to re-create the original run of the v7 LDL GWAS, change nothing (other than the lower and upper bounds depending on your test region) and run the next cell as is.

If you just created new phenotype and covariate files using notebooks 1-3 and section 2 above, you will need to change the ```--input pheno_file``` and ```--input cov_file``` variable paths to point to your new files in your bucket. 

If you just ran sections 3-6 and have new bgen files, you will need to change the ```--input bgen_file``` and ```--input sample_file``` variable paths to point to your new bgen files in your bucket.

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Defines the aou_dsub wrapper (written to ~/aou_dsub.bash in the
# "Run below once per workspace" section of this notebook).
source ~/aou_dsub.bash

# One regenie step 2 job is submitted per chromosome. Everything printed on stdout
# by the submissions is captured into the Python variable LINE_COUNT_JOB_ID.
#
# The loop's upper bound is exclusive (chromo < UPPER):
#   all autosomes  -> LOWER=1, UPPER=23
#   one chromosome -> LOWER=<chromosome of interest>, UPPER=<that chromosome + 1>
LOWER=1
UPPER=23
for ((chromo=$LOWER;chromo<$UPPER;chromo+=1))
do

    # The last option deliberately has no trailing backslash.
    aou_dsub \
    --name "${JOB_NAME}" \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
    --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.16" \
    --machine-type "n2-standard-4" \
    --boot-disk-size 1000 \
    --disk-size 1000 \
    --script "${WORKSPACE_BUCKET}/data/dsub/Regenie_GWAS_NewPlinkBgens2.sh" \
    --input regenie_file="${WORKSPACE_BUCKET}/data/regenie_v4.1.gz_x86_64_Linux.zip" \
    --input step1zip_file="${WORKSPACE_BUCKET}/data/aou_${radix}_step1.zip" \
    --input bgen_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen/chr${chromo}.bgen" \
    --input sample_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen/chr${chromo}.sample" \
    --input pheno_file="${WORKSPACE_BUCKET}/data/aou_${radix}_QT.txt" \
    --input cov_file="${WORKSPACE_BUCKET}/data/aou_${radix}_covariates.txt" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}" \
    --env prefix="aou_${radix}" \
    --env trait="${trait}" \
    --env chrom="${chromo}"

done

In [None]:
# Export the stdout captured from the submission cell (LINE_COUNT_JOB_ID) as the
# environment variable JOB_ID so that it's easier to use within %%bash cells.
# NOTE(review): the submission cell loops over chromosomes, so this value presumably
# holds several job ids, one per line — confirm.
%env JOB_ID={LINE_COUNT_JOB_ID}

In [None]:
%%bash

# Status of the step 2 jobs, whatever their state ('*').
# ${JOB_ID} is intentionally left unquoted: it can hold one job id per submitted
# chromosome, and word splitting passes each id to --jobs as its own argument.
# Quoted, all ids would reach dstat as one argument.
dstat \
    --provider google-batch \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs ${JOB_ID} \
    --users "${USER_NAME}" \
    --status '*'

# dsub for plink hardy, GWAS

In [None]:
## MODIFY FOR FULL DATA RUN 
# Use underscores, not whitespace since it will become part of the bucket path.
JOB_NAME=f'plink_{radix}' ## add name in quotes, copy name in quotes to 4.1

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}

In [None]:
## Analysis Results Folder 
line_count_results_folder = os.path.join(
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'))

line_count_results_folder

In [None]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

In [None]:
OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

In [None]:
%%writefile ~/hardy.sh

# plink2 --hardy for one chromosome, restricted to the samples in keep_file.
# Read from the environment (set by dsub): bgen_file, keep_file, chrom, OUTPUT_PATH.
# NOTE(review): the submitting cell also passes sample_file, but this script never
# reads it; confirm whether --sample is needed for these BGEN files.
set -o pipefail
set -o errexit

# Expansions are quoted so that paths are never word-split or globbed.
plink2 \
  --bgen "${bgen_file}" ref-unknown \
  --keep "${keep_file}" \
  --hardy \
  --out "chr${chrom}"

mv "chr${chrom}.hardy" "${OUTPUT_PATH}"

In [None]:
## Modify to personal gs bucket
!gsutil cp /home/jupyter/hardy.sh {my_bucket}/data/dsub/

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Defines the aou_dsub wrapper (written to ~/aou_dsub.bash in the
# "Run below once per workspace" section of this notebook).
source ~/aou_dsub.bash

# One hardy.sh job is submitted per chromosome; the loop's upper bound is exclusive.
# Everything printed on stdout is captured into the Python variable LINE_COUNT_JOB_ID.
LOWER=1
UPPER=23
for ((chromo=$LOWER;chromo<$UPPER;chromo+=1))
do

    # The last option deliberately has no trailing backslash.
    aou_dsub \
    --name "${JOB_NAME}" \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
    --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.16" \
    --machine-type "n2-standard-4" \
    --boot-disk-size 1000 \
    --disk-size 1000 \
    --script "${WORKSPACE_BUCKET}/data/dsub/hardy.sh" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}" \
    --input bgen_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen/chr${chromo}.bgen" \
    --input sample_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen/chr${chromo}.sample" \
    --input keep_file="${WORKSPACE_BUCKET}/data/eur.plink.ind" \
    --env chrom="${chromo}"

done

In [None]:
# Status check for one specific, already-submitted job.
# NOTE(review): the project, job id and user are hard-coded here; the other dstat cells
# in this notebook read ${GOOGLE_PROJECT}, ${JOB_ID} and ${USER_NAME} instead.
!dstat --provider google-batch --project terra-vpc-sc-9b3acbf3 --location us-central1 --jobs 'plink-labs--fumibaker--251029-203511-73' --users 'fumibaker' --status '*'

plink2 reported that the variance inflation factor for covariate 'PC6' is too high (VIF_TOO_HIGH).
==> The covariates are therefore limited to PC1–PC5.

In [None]:
%%writefile ~/plinkGWAS.sh

# plink2 --glm for one phenotype column (${trait}) on one chromosome, run inside a dsub job.
# Read from the environment (set by dsub): plink2_file, pgen_file, pvar_file, psam_file,
# pheno_file, cov_file, trait, chrom, OUTPUT_PATH.
set -o pipefail
set -o errexit

# Use the plink2 binary passed in as an input file.
cp "${plink2_file}" ./plink2
chmod 700 ./plink2

# Expansions are quoted so that paths are never word-split or globbed.
# Only PC1-PC5 are used as covariates.
./plink2 \
  --pgen "${pgen_file}" \
  --pvar "${pvar_file}" \
  --psam "${psam_file}" \
  --glm \
  --out "${trait}.chr${chrom}" \
  --pheno "${pheno_file}" \
  --pheno-name "${trait}" \
  --covar "${cov_file}" \
  --covar-name PC1 PC2 PC3 PC4 PC5 \
  --covar-variance-standardize \
  --maf 0.01

mv "${trait}.chr${chrom}"* "${OUTPUT_PATH}"

In [None]:
## Modify to personal gs bucket
!gsutil cp /home/jupyter/plinkGWAS.sh {my_bucket}/data/dsub/

In [None]:
# List the shell scripts currently in the bucket's data/dsub/ folder to confirm the upload.
!gsutil ls {my_bucket}/data/dsub/*.sh

In [None]:
# List the pgen folder that the plink GWAS submission below takes its input files from.
!gsutil -u $GOOGLE_PROJECT ls gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/pgen

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Defines the aou_dsub wrapper (written to ~/aou_dsub.bash in the
# "Run below once per workspace" section of this notebook).
# The wrapper supplies provider, project, network, subnetwork, service account, user,
# region and logging, so this cell now submits the same way as the other dsub cells.
# It previously called dsub directly with --provider google-cls-v2, bare network
# names and a stray "$@" that is always empty outside a function.
source ~/aou_dsub.bash

MACHINE_TYPE="n2-standard-4"

# plinkGWAS.sh as uploaded to the workspace bucket by the `gsutil cp` cell above.
BASH_SCRIPT="${WORKSPACE_BUCKET}/data/dsub/plinkGWAS.sh"

# One job per (chromosome, phenotype column) pair.
#LOWER=1
#UPPER=22
#for ((chromo=$LOWER;chromo<$UPPER;chromo+=1))
for chromo in 22
do
#for t in C10AA C10AB C10AC C10AD C10AX06 C10AX09
for t in C10AB
do
    # --image: same image as the regenie and hardy submissions; the regenie image used
    # here before is marked invalid in this notebook. plinkGWAS.sh uses the plink2 binary
    # it receives as plink2_file.
    # NOTE(review): the phenotype/covariate files and the plink2 binary are still
    # hard-coded bucket paths — confirm they are the intended inputs.
    aou_dsub \
    --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.16" \
    --preemptible \
    --boot-disk-size 1000 \
    --machine-type "${MACHINE_TYPE}" \
    --name "${JOB_NAME}" \
    --script "${BASH_SCRIPT}" \
    --env GOOGLE_PROJECT="${GOOGLE_PROJECT}" \
    --input plink2_file="gs://fc-secure-6eb7a615-fd07-4e0e-9394-928a16db191c/data/plink2" \
    --input pgen_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/pgen/acaf_threshold.chr${chromo}.pgen" \
    --input pvar_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/pgen/acaf_threshold.chr${chromo}.pvar" \
    --input psam_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/pgen/acaf_threshold.chr${chromo}.psam" \
    --input pheno_file="gs://fc-secure-7ac6cae0-10b6-47a1-acb4-c71d46ca046e/data/aou_hypolipidemics.LDL_QT.txt" \
    --input cov_file="gs://fc-secure-7ac6cae0-10b6-47a1-acb4-c71d46ca046e/data/aou_hypolipidemics.LDL_covariates.txt" \
    --env trait="${t}" \
    --env chrom="${chromo}" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}"
done
done

# dsub for plink PGS

In [None]:
## MODIFY FOR FULL DATA RUN 
# Use underscores, not whitespace since it will become part of the bucket path.
#JOB_NAME='regenie_hypertensiondrug' ## add name in quotes, copy name in quotes to 4.1
JOB_NAME='score_lipidaemiadrug' ## add name in quotes, copy name in quotes to 4.1

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env JOB_NAME={JOB_NAME}

In [None]:
## Analysis Results Folder 
line_count_results_folder = os.path.join(
    os.getenv('WORKSPACE_BUCKET'),
    'dsub',
    'results',
    JOB_NAME,
    USER_NAME,
    datetime.now().strftime('%Y%m%d'))

line_count_results_folder

In [None]:
## Where the output files will go
output_files = os.path.join(line_count_results_folder, "results")
print(output_files)

In [None]:
OUTPUT_FILES = output_files

# Save this Python variable as an environment variable so that its easier to use within %%bash cells.
%env OUTPUT_FILES={OUTPUT_FILES}

2025.09.29 added option --score cols=+scoresums

In [None]:
%%writefile ~/score.sh

# plink2 --score for one coefficient file (${trait}) on one chromosome, run inside a dsub job.
# Read from the environment (set by dsub): plink2_file, bgen_file, sample_file,
# coeff_file, trait, chrom, OUTPUT_PATH.
set -o pipefail
set -o errexit

# Use the plink2 binary passed in as an input file.
cp "${plink2_file}" ./plink2
chmod 700 ./plink2

# Write a cleaned copy of the .sample file: a line starting with "0<TAB>0<TAB>0.0"
# is rewritten to start with "0<TAB>0<TAB>0", and only the first three
# tab-separated columns are kept.
sed "s/^0\t0\t0\.0/0\t0\t0/" "${sample_file}" | \
  cut -f 1,2,3 > mysample

# Expansions are quoted so that paths are never word-split or globbed.
./plink2 \
  --bgen "${bgen_file}" ref-unknown \
  --sample mysample \
  --score "${coeff_file}" list-variants cols=+scoresums \
  --out "score.${trait}.chr${chrom}"

mv "score.${trait}.chr${chrom}"* "${OUTPUT_PATH}"

In [None]:
%%writefile ~/export.sh

# Export the genotypes of a single variant with plink2 --export A, run inside a dsub job.
# Read from the environment (set by dsub): plink2_file, bgen_file, OUTPUT_PATH.
set -o pipefail
set -o errexit

# Use the plink2 binary passed in as an input file.
cp "${plink2_file}" ./plink2
chmod 700 ./plink2

# The variant ID is also used as the output file prefix.
variant_id="chr10:38839760:G:C"

# Expansions are quoted so that paths are never word-split or globbed.
./plink2 \
  --bgen "${bgen_file}" ref-unknown \
  --snp "${variant_id}" \
  --export A \
  --out "${variant_id}"

mv "${variant_id}"* "${OUTPUT_PATH}"

In [None]:
## Modify to personal gs bucket
!gsutil cp /home/jupyter/score.sh {my_bucket}/data/dsub/

In [None]:
## Modify to personal gs bucket
!gsutil cp /home/jupyter/export.sh {my_bucket}/data/dsub/

In [None]:
# List the shell scripts currently in the bucket's data/dsub/ folder to confirm the uploads.
!gsutil ls {my_bucket}/data/dsub/*.sh

The next cell contains the dsub command that will kick off the Regenie GWAS once it is run. Right now, the ```--input``` variable options below point to the paths of the input files used in the original v7 LDL-C Regenie GWAS. If you would like to re-create the original run of the v7 LDL GWAS, change nothing (other than the lower and upper bounds depending on your test region) and run the next cell as is.

If you just created new phenotype and covariate files using notebooks 1-3 and section 2 above, you will need to change the ```--input pheno_file``` and ```--input cov_file``` variable paths to point to your new files in your bucket. 

If you just ran sections 3-6 and have new bgen files, you will need to change the ```--input bgen_file``` and ```--input sample_file``` variable paths to point to your new bgen files in your bucket.

In [None]:
%%bash --out LINE_COUNT_JOB_ID

# Defines the aou_dsub wrapper (written to ~/aou_dsub.bash in the
# "Run below once per workspace" section of this notebook).
source ~/aou_dsub.bash

# One score.sh job is submitted per (chromosome, coefficient file) pair; the loop's
# upper bound is exclusive. Everything printed on stdout is captured into the
# Python variable LINE_COUNT_JOB_ID.
LOWER=1
UPPER=23
for ((chromo=$LOWER;chromo<$UPPER;chromo+=1))
do
#for t in C10AA C10AB C10AX09
for t in C03 C07 C08 C09
#for t in C10AX09
#for t in PA
do

    # --image: same image as the regenie and hardy submissions; the regenie image used
    # here before is marked invalid in this notebook. score.sh uses the plink2 binary
    # it receives as plink2_file.
    # The last option deliberately has no trailing backslash.
    aou_dsub \
    --name "${JOB_NAME}" \
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
    --image "us.gcr.io/broad-dsp-gcr-public/terra-jupyter-aou:2.2.16" \
    --machine-type "n2-standard-4" \
    --boot-disk-size 1000 \
    --disk-size 1000 \
    --script "${WORKSPACE_BUCKET}/data/dsub/score.sh" \
    --output-recursive OUTPUT_PATH="${OUTPUT_FILES}/${chromo}" \
    --input plink2_file="gs://fc-secure-6eb7a615-fd07-4e0e-9394-928a16db191c/data/plink2" \
    --input bgen_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen/chr${chromo}.bgen" \
    --input sample_file="gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/acaf_threshold/bgen/chr${chromo}.sample" \
    --input coeff_file="${WORKSPACE_BUCKET}/data/coeff.${t}.hg38.txt" \
    --env trait="${t}" \
    --env chrom="${chromo}"

done
done

In [None]:
# Export the stdout captured from the submission cell (LINE_COUNT_JOB_ID) as the
# environment variable JOB_ID so that it's easier to use within %%bash cells.
# NOTE(review): the submission cell loops over chromosomes and coefficient files, so
# this value presumably holds several job ids, one per line — confirm.
%env JOB_ID={LINE_COUNT_JOB_ID}

In [None]:
# Print the stdout log of one specific, already-finished scoring job.
# NOTE(review): the bucket, date and job id in this path are hard-coded; adjust them to the job you want to inspect.
! gsutil cat gs://fc-secure-7ac6cae0-10b6-47a1-acb4-c71d46ca046e/dsub/logs/score-lipidaemiadrug/fumibaker/20250912/043440/score-lipi--fumibaker--250912-043442-45-task-None-stdout.log

In [None]:
%%bash

# Status of the scoring jobs, whatever their state ('*').
# ${JOB_ID} is intentionally left unquoted: it can hold one job id per submitted job,
# and word splitting passes each id to --jobs as its own argument.
# Quoted, all ids would reach dstat as one argument.
dstat \
    --provider google-batch \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs ${JOB_ID} \
    --users "${USER_NAME}" \
    --status '*'