01 Step 1: extract PRS SNPs from the gnomAD HGDP + 1000 Genomes reference panel

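Extract the PRS SNP positions across all chromosomes and apply QC (genotype quality, sample call rate, variant call rate). Save the result as a Hail matrix table to be opened in step 2.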

Run this notebook in the background using Cloud Compute Profile: 4CPUs, 26GB RAM and 100GB DISK; Dataproc with 100 workers, each w/ 4 CPUs, 15GB RAM, and 150GB HDD.

In [None]:
# Capture the wall-clock start time; the last cell subtracts it to report total runtime.
from datetime import datetime

start = datetime.now()
start  # shown as the cell output

In [None]:
# Check that pyspark is importable and print a prominent hint if it is not.
try:
    import pyspark
except ModuleNotFoundError:
    banner = "!" * 100
    message = (
        "In the Researcher Workbench, Hail can only be used on a Dataproc cluster.\n"
        "Please use the 'Cloud Analysis Environment' side panel to update your runtime compute type.\n"
    )
    print(f"{banner}\n\n{message}\n{banner}")

# Load Hail, plus os (used by a later cell to read the workspace environment).
import os

import hail as hl
from hail.plot import show

# idempotent=True turns hl.init into a no-op when Hail has already been initialized,
# so re-running this cell in a live kernel does not raise.
hl.init(default_reference='GRCh38', idempotent=True)
# Route Hail/bokeh plots to inline notebook output.
hl.plot.output_notebook()

In [None]:
# Open the reference matrix table: the gnomAD v3.1.2 HGDP + 1KG dense subset in the
# public gnomAD bucket. The variable names are deliberately left unchanged (per the
# original author) so that code referring to them does not need to be renamed.
path_mt_alpha2_cleaned = 'gs://gcp-public-data--gnomad/release/3.1.2/mt/genomes/gnomad.genomes.v3.1.2.hgdp_1kg_subset_dense.mt'
mt_alpha2 = hl.read_matrix_table(path_mt_alpha2_cleaned)



In [None]:
# Look up the workspace bucket URI from the environment.
# Raises KeyError if WORKSPACE_BUCKET is not set.
workspace_bucket = os.environ["WORKSPACE_BUCKET"]
# To inspect the bucket contents, run: !gsutil ls {workspace_bucket}

In [None]:
# Import the PRS SNP positions as GRCh38 locus intervals.
interval_table = hl.import_locus_intervals(
    'gs://fc-secure-30fdbdfd-a46b-406d-9617-1bc69ae1da9d/CRC-PRS-2021-02-positions.b38.intervals.txt',
    reference_genome='GRCh38',
)

# Keep only the rows whose locus matches an entry in the interval table.
# The original author found that filter_intervals did not work here, so the
# join-based row filter is kept.
# NOTE(review): hl.filter_intervals expects a list/array of intervals rather than a
# Table; passing interval_table.interval.collect() might work -- unverified, confirm
# before switching.
locus_in_prs_intervals = hl.is_defined(interval_table[mt_alpha2.locus])
filtered_mt = mt_alpha2.filter_rows(locus_in_prs_intervals)


In [None]:
# Genotype-level QC: keep only entries with GQ >= 20.
# Optional diagnostics (uncomment to inspect before/after the filter):
#   filtered_mt.alleles.summarize()
#   hl.summarize_variants(filtered_mt)
#   filtered_mt.GQ.summarize()
#   filtered_mt_GQ_QC.GQ.summarize()
gq_at_least_20 = filtered_mt.GQ >= 20
filtered_mt_GQ_QC = filtered_mt.filter_entries(gq_at_least_20)

In [None]:
# Sample-level QC: annotate each sample with hl.sample_qc metrics (stored in the
# `sample_qc` column field), then keep samples whose call rate is at least 0.9.
filtered_mt_GQ_QC = hl.sample_qc(filtered_mt_GQ_QC)
# Optional diagnostic: filtered_mt_GQ_QC.sample_qc.call_rate.summarize()
sample_call_rate = filtered_mt_GQ_QC.sample_qc.call_rate
mt_QC1 = filtered_mt_GQ_QC.filter_cols(sample_call_rate >= 0.9)

In [None]:
# Variant-level QC: annotate each variant with hl.variant_qc metrics (stored in the
# `variant_qc` row field), then keep variants whose call rate is at least 0.9.
mt_QC1 = hl.variant_qc(mt_QC1)
variant_call_rate = mt_QC1.variant_qc.call_rate
filtered2_mt = mt_QC1.filter_rows(variant_call_rate >= 0.9)
# Optional diagnostic: hl.summarize_variants(filtered2_mt)

In [None]:
# Persist the QC'd matrix table to the Google Cloud Storage bucket so that step 2
# can open it.
# overwrite=True keeps the notebook re-runnable end to end: with the default
# (overwrite=False) Hail raises an error when the output path already exists, so a
# second "Run All" would fail at this final step. Note that a re-run therefore
# replaces any existing filtered2.mt at this path.
filtered2_mt.write(
    "gs://fc-secure-30fdbdfd-a46b-406d-9617-1bc69ae1da9d/data/reference/filtered2.mt",
    overwrite=True,
)

In [None]:
# Report the total wall-clock time elapsed since `start` was recorded in the first cell.
stop = datetime.now()
elapsed = stop - start
print(f'total time is: {elapsed}')