In [None]:
####
# This notebook imports the following data sources:
#  - gnomAD v2.1 hail table
#  - ClinVar GRCh37 VCF
# The update to gnomAD v3 made significant changes to the metadata schema, which
# break the handling of AF and populations in this notebook.
# It also renamed the contigs in the ref column, which breaks the join with ClinVar.
#
"""
ClinVar:
- clinvar_variation_id
- clinvar_review_status
- clinvar_significance
- clinvar_significance_interpretations
- locus + alleles
GnomAD:
- faf95
- source (genomes | exomes)
- popmax_pop
- popmax_ac
- popmax_an
- genomic_coordinates
"""

# Name of the GCS bucket to write to; the dataproc service account must have write access.
# Give the bare bucket name: no "gs://" prefix and no trailing slash.
output_bucket = ""
# Fail early, with an explanation, when the bucket is unset or does not follow the
# format above (the helper functions build "gs://<bucket>/<path>" from it).
assert len(output_bucket) > 0, "Set output_bucket to the name of a writable GCS bucket"
assert not output_bucket.startswith("gs://"), 'output_bucket must not include "gs://"'
assert not output_bucket.endswith("/"), "output_bucket must not end with a slash"
# Path of the TSV inside the bucket. May contain slashes like "folder/file.tsv",
# but must not start with one.
output_filename = "report.tsv"
assert not output_filename.startswith("/"), "output_filename must not start with a slash"

# NOTE(review): not referenced by any other cell visible in this notebook.
reference_genome = "GRCh37"

In [None]:
import hail as hl
# `idempotent=True` is useful for running all cells in the notebook: the cell can be
# executed again without failing because Hail was already initialized.
hl.init(idempotent=True)

In [None]:
# utility functions for file placement
import subprocess
import os
import time, datetime

def run_args(args, fail_on_stderr=False, success_codes=(0,)) -> tuple: # (stdout,stderr,returncode)
    """
    Run a command and wait for it to finish, capturing its output.

    args: argument list passed to subprocess.Popen (no shell involved).
    fail_on_stderr: when True, any bytes on stderr count as a failure.
    success_codes: collection of exit codes accepted as success. The default is an
        immutable tuple so it cannot be altered between calls; lists still work.

    Returns (stdout, stderr, returncode); stdout and stderr are bytes.
    Raises RuntimeError when the exit code is not in success_codes, or when
    fail_on_stderr is set and the command wrote to stderr.
    """
    print(args)
    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if (fail_on_stderr and len(stderr) > 0) or (p.returncode not in success_codes):
        # Decode so the message shows the command's text rather than a bytes repr.
        raise RuntimeError("command {} failed with code {}:{}".format(
            args, p.returncode, stderr.decode(errors="replace")))
    return (stdout, stderr, p.returncode)

def local_to_bucket(local_path:str, gcs_path:str):
    """
    Copy a local file to Google Cloud Storage with `gsutil cp`.

    A gcs_path that does not start with "gs://" is treated as an object path
    inside the notebook's output_bucket.
    """
    if gcs_path.startswith("gs://"):
        destination = gcs_path
    else:
        destination = "gs://{}/{}".format(output_bucket, gcs_path)
    run_args(["gsutil", "cp", local_path, destination])
    
def bucket_to_local(gcs_path:str, local_path:str):
    """
    Copy an object from Google Cloud Storage to a local file with `gsutil cp`.

    A gcs_path that does not start with "gs://" is treated as an object path
    inside the notebook's output_bucket.
    """
    if gcs_path.startswith("gs://"):
        source = gcs_path
    else:
        source = "gs://{}/{}".format(output_bucket, gcs_path)
    run_args(["gsutil", "cp", source, local_path])
    
def local_to_hdfs(local_path:str, hdfs_path:str):
    """
    Copy a local file into HDFS, replacing any file already at hdfs_path.

    A local_path that does not start with "/" is resolved against the current
    working directory before being turned into a file:// URI.
    """
    source = local_path if local_path.startswith("/") else os.path.join(os.getcwd(), local_path)
    # Delete the previous copy first; exit code 1 is accepted so the delete is
    # allowed to fail (e.g. nothing to remove).
    run_args(["hdfs", "dfs", "-rm", hdfs_path], success_codes=[0,1])
    # `hadoop fs -cp` is used for the copy (an `hdfs dfs -cp` variant was left disabled).
    run_args(["hadoop", "fs", "-cp", "file://" + source, hdfs_path])
    
def hdfs_to_local(hdfs_path:str, local_path:str):
    """
    Copy a file out of HDFS to the local filesystem, replacing any existing local file.

    local_path may be relative; it is made absolute first, because "file://" followed
    by a relative path is not a valid local-file URI (local_to_hdfs resolves its
    local path the same way).
    """
    local_path = os.path.abspath(local_path)
    # Remove a stale copy up front so the copy does not have to overwrite it.
    if os.path.exists(local_path):
        os.remove(local_path)
    args = ["hdfs", "dfs", "-cp", hdfs_path, "file://" + local_path]
    run_args(args)

In [None]:
import io, re

import os
# os.listdir("/data/gnomad.genomes.r2.1.1.sites.ht")

In [None]:
import io
import re

# Read the gnomAD v2.1.1 genomes sites Hail Table from the public GCP bucket and tag
# every row with its data source.
# The exomes table (.../release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht) is not
# loaded; to use it as well, read it the same way and annotate it with
# source="gnomAD Exomes".
ds_genomes = hl.read_table(
    "gs://gcp-public-data--gnomad/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht"
).annotate(source="gnomAD Genomes")
# Can perform a union here if wanting both (ds = ds1.union(ds2))
def select_necessary_cols(ds):
    """Return `ds` reduced to its freq, faf, vep and source fields via ds.select()."""
    wanted = (ds.freq, ds.faf, ds.vep, ds.source)
    return ds.select(*wanted)

# Keep only the fields needed downstream, then drop VEP, which this notebook does not use.
# (If the exomes table is loaded too, trim it the same way and combine the two with
#  ds_genomes.union(ds_exomes, unify=True).)
ds_genomes = select_necessary_cols(ds_genomes).drop("vep")
gnomad = ds_genomes

# Show the schema of the hail Table
gnomad.describe()

In [None]:
# Report how many partitions the gnomAD table is split into.
print(gnomad.n_partitions())

In [None]:
# # ds.describe()
# testds = ds.filter(ds.locus==hl.locus("16", 2185899))
# testds.show()
# print(testds.collect())

In [None]:
"""
ds.freq has raw frequency information, including AN, AC, and pop label. This is an array of 
structs, at indices determined by the categories in ds.globals.freq_index_dict

ds.faf has filtered allele frequency information, including confidence intervals faf95 and faf99.
This is an array of structs, at indices determined by the category map in ds.globals.faf_index_dict
"""

def add_popmax_af(ds):
    """
    Annotate each row of `ds` with its popmax filtering allele frequency (FAF).

    Candidate populations are the entries of ds.globals.faf_index_dict accepted by
    `default_faf_filter` (by default the keys starting with "gnomad_"). For each row,
    the candidates' ds.faf entries are ranked by faf95 and every population tied for
    the highest faf95 is kept.

    Fields added to the returned table:
      - popmax_faf_count: number of populations tied for the highest faf95.
      - popmax: array of structs, one per tied population, with
          label: the faf_index_dict key (e.g. "gnomad_afr", not just "afr"),
          faf:   that population's entry from ds.faf,
          freq:  that population's entry from ds.freq, found through
                 ds.globals.freq_index_dict with the same key.
    Both are derived from a value that is set to missing when the highest faf95 is 0.0.

    The intermediate fields used to build these are dropped again before returning.
    Prints the selected (label, index) pairs as a side effect.

    Returns the updated ds.
    """
    # Named predicates over one (label, index) item of ds.globals.faf_index_dict.
    # A plain dict is used because functions assigned in an Enum class body become
    # methods rather than enum members, so an Enum adds nothing here.
    faf_filters = {
        # Only the entry for the entire dataset
        "GNOMAD_GLOBAL": lambda item: item[0] == "gnomad",
        # Entries whose label starts with "gnomad_"
        "GNOMAD_SUPERPOP": lambda item: item[0].startswith("gnomad_"),
        # Every entry, including subsets
        "ANY": lambda item: True,
    }

    # By default, filter to superpopulations aggregate faf
    default_faf_filter = faf_filters["GNOMAD_SUPERPOP"]

    # (label, index) pairs of the global faf_index_dict that pass the filter; the index
    # is the position of that population's struct within each row's ds.faf array.
    faf_index_map = [
        (label, index)
        for label, index in hl.eval(ds.globals.faf_index_dict).items()
        if default_faf_filter((label, index))
    ]
    print("faf_index_map:\n" + str(faf_index_map))

    # This only will return the maximum pop FAF for each
    # variant, even if multiple populations meet the criteria.
    # If we want all matching populations, need an explode() call
    # to flatten the pop FAFs into a record per pop per variant
    faf_index_dicts = [hl.struct(label=k, index=v) for (k,v) in faf_index_map]
    print(faf_index_dicts)

    # Add pop fafs sorted desc by faf95
    ds = ds.annotate(
        pop_fafs=hl.sorted(
            # For each selected population, pair its ds.faf entry with its label
            hl.literal(faf_index_dicts).map(
                lambda i: hl.struct(faf=ds.faf[i.index],
                                    label=i.label)
            ),
            # Sort by 95% confidence FAF
            lambda faf_entry: faf_entry.faf.faf95,
            # Sort high to low
            reverse=True
        )
    )

    # Keep only the entries whose faf95 equals the top value (several may tie)
    ds = ds.annotate(
        popmax_fafs=hl.filter(
            lambda faf_entry: faf_entry.faf.faf95 == ds.pop_fafs[0].faf.faf95,
            ds.pop_fafs
        )
    )

    # Turn the tied entries into a dict of label -> faf struct, or missing when the
    # top faf95 is exactly zero.
    ds = ds.annotate(
        popmax_fafs=hl.or_missing(
            ds.popmax_fafs[0].faf.faf95 != 0.0,
            hl.dict(ds.popmax_fafs.map(
                lambda f: hl.tuple([f.label, f.faf])
            ))
        )
    )

    ds = ds.annotate(
        # Number of pops which were popmax (edge case where multiple pops share the
        # same top faf95)
        popmax_faf_count=hl.len(ds.popmax_fafs),

        # Labels (faf_index_dict keys) of this record's popmax populations
        popmax_index_dict_keys=ds.popmax_fafs.keys()
    )

    ds = ds.annotate(
        # Per the original author's note, ds.globals.freq_index_dict uses the same keys
        # as ds.globals.faf_index_dict, so the popmax labels can be used to look up
        # positions in ds.freq.
        # Creates a map of pop label -> ds.freq struct
        popmax_faf_pop_freqs=hl.dict(ds.popmax_index_dict_keys.map(
            lambda key: hl.tuple([key, ds.freq[ds.globals.freq_index_dict[key]]]))))

    # Pull each popmax record together so that label, faf and freq sit side by side.
    # Makes it easier to get to each in case there are multiple popmaxes.
    ds = ds.annotate(
        popmax=ds.popmax_index_dict_keys.map(
            lambda popmax_key: hl.struct(
                label=popmax_key,
                faf=ds.popmax_fafs[popmax_key],
                freq=ds.popmax_faf_pop_freqs[popmax_key])))

    # Drop fields just used for local computation, to reduce clutter
    ds = ds.drop(
        ds.pop_fafs,
        ds.popmax_fafs,
        ds.popmax_index_dict_keys,
        ds.popmax_faf_pop_freqs)
    return ds


# Compute popmax for the whole gnomAD table. To debug on a single site, filter `g`
# before the call, e.g.  g = g.filter(g.locus==hl.locus("1", 155874156))
# Rows with several tied populations can be inspected afterwards with
#   gnomad_with_popmax.filter(gnomad_with_popmax.popmax_faf_count > 1).show()
g = gnomad

# freq and faf are done with once popmax exists; drop them to reduce later clutter in output
gnomad_with_popmax = add_popmax_af(g).drop("freq", "faf")

gnomad_with_popmax.describe()
gnomad_with_popmax.show()

In [None]:
# t = gnomad_with_popmax.filter(gnomad_with_popmax.locus==hl.locus("1", 10108))
# print(t.freq[6].show()) # AC=1 AF=3.55e-3 AN=282
# print(t.collect())

# AC=218 AF=2.23e-2 AN=8710
# t = gnomad_with_popmax.filter(gnomad_with_popmax.locus==hl.locus("1", 155874156))
# t.popmax.show()

In [None]:
# Add gnomad-style coordinates: a single string made of the locus contig, the locus
# position and the first two entries of `alleles`, joined with "-".
# NOTE(review): presumably alleles[0] is the reference and alleles[1] the alternate
# allele, with exactly two alleles per row — confirm against the gnomAD sites schema.
gnomad_formatted = gnomad_with_popmax.annotate(
    genomic_coordinates = hl.format("%s-%s-%s-%s",
        gnomad_with_popmax.locus.contig,
        hl.str(gnomad_with_popmax.locus.position),
        gnomad_with_popmax.alleles[0],
        gnomad_with_popmax.alleles[1]
    )
)

In [None]:
# Download clinvar BGZF
import os, requests, subprocess

# Function to download a file to a localpath.
def download_to_file(url, filepath):
    """
    Stream the content at `url` into the local file `filepath`, overwriting it.

    Raises requests.HTTPError when the server answers with an error status, so an
    error page is never saved as if it were the requested file.
    """
    # The context manager releases the connection even if the download fails midway.
    with requests.get(url, stream=True) as r:
        # Check the status before opening the file, so a failed request leaves any
        # existing file untouched.
        r.raise_for_status()
        with open(filepath, "wb") as fout:
            for chunk in r.iter_content(chunk_size=1024):
                # Skip empty chunks
                if chunk:
                    fout.write(chunk)


# This url always points to the latest dump file, updated periodically by ClinVar,
# so the report depends on the day the notebook is run.
clinvar_vcf_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz"
# NOTE(review): absolute path; presumably the home directory of the user the notebook
# runs as on the dataproc node — confirm before running elsewhere.
clinvar_vcf_localpath = "/home/hail/clinvar.vcf.gz"
# Relative HDFS path the VCF is copied to, and the path later handed to hl.import_vcf.
clinvar_vcf_hdfs = "clinvar.vcf.gz"
# ClinVar VCF is small enough to download to dataproc default local disk.
download_to_file(clinvar_vcf_url, clinvar_vcf_localpath)
# Sanity check that the download produced a file, and show its size for a manual check.
assert(os.path.exists(clinvar_vcf_localpath))
print("Downloaded ClinVar VCF, file size (expecting ~28M): %d" % os.path.getsize(clinvar_vcf_localpath))

# Hail needs the file in HDFS
local_to_hdfs(clinvar_vcf_localpath, clinvar_vcf_hdfs)

In [None]:
# Load the ClinVar VCF into a Hail Table. import_vcf returns a MatrixTable; only its
# rows table (one row per VCF record) is kept, and sample columns are dropped.
# No reference genome is passed, so Hail's default reference is used.
clinvar = hl.import_vcf(
    clinvar_vcf_hdfs,
    force_bgz=True,
    drop_samples=True, 
    skip_invalid_loci=True
).rows()
# count() forces a full pass over the file; the schema and a sample of rows follow.
print("ClinVar row count: " + str(clinvar.count()))
clinvar.describe()
clinvar.show()

# ClinVar VCF export sets ID column to the ClinVar Variation ID (not rsid)
# And sets the RS field of INFO to the rsid if it exists.
# (https://ftp.ncbi.nlm.nih.gov/pub/clinvar/README_VCF.txt)
# Hail then exposes the VCF ID column as the `rsid` field, so clinvar.rsid holds the
# ClinVar Variation ID here.
# After a join, rows without a ClinVar match can be removed by filtering on that
# field being defined, e.g.:
# gnomad_clinvar_ds = gnomad_clinvar_ds.filter(
#     ~hl.is_missing(gnomad_clinvar_ds.clinvar_rsid)
# )

In [None]:
# Load input file to join to.
# The name has no "gs://" prefix, so bucket_to_local reads it from output_bucket; it is
# then copied from local disk into HDFS under the same name so Hail can import it.
filename = "invitae-variant-input.tsv"
bucket_to_local(filename, filename)
local_to_hdfs(filename, filename)
# Tab-separated table. Later cells read its Input_Variant and Chromosomal_Variant columns.
variant_input = hl.import_table(filename, delimiter='\t')
variant_input.show()

In [None]:
# Prepare ClinVar for the join to the input table: one row per HGVS expression,
# keyed by that expression.
# ClinVar HGVS is in clinvar.info.CLNHGVS string array
c = (
    clinvar
    .annotate(hgvs=clinvar.info.CLNHGVS)
    .explode("hgvs")      # one row per entry of the hgvs array
    .key_by("hgvs")
    .repartition(500)
    .persist()
)
print("ClinVar partitions: " + str(c.n_partitions()))

In [None]:
# Join clinvar to variant input.
# Key the input by its Chromosomal_Variant column, which is the value looked up in the
# HGVS-keyed ClinVar table below.
variant_input = (
    variant_input
    .key_by("Chromosomal_Variant")
    .repartition(200)
    .persist()
)
print(variant_input.n_partitions())

# Every input row is kept; `clinvar` is missing where no ClinVar row has that HGVS key.
ds_joined = variant_input.annotate(clinvar=c[variant_input.Chromosomal_Variant])

In [None]:
# Debugging: how many input variants found no ClinVar record in the join
ds_clinvar_missing = ds_joined.filter(~hl.is_defined(ds_joined.clinvar))
print("ClinVar missing from {} input variants".format(ds_clinvar_missing.count()))

In [None]:
# Re-key the joined table by the ClinVar locus and alleles so it can be matched against
# gnomAD. Rows without a ClinVar match have a missing `clinvar` struct.
ds_joined2 = ds_joined.key_by(ds_joined.clinvar.locus, ds_joined.clinvar.alleles)
# Evaluate the re-keying before join
print("Persisting re-keyed ds_joined")
ds_joined2 = ds_joined2.persist()
print("Annotating with gnomad")
# Look up the gnomAD row with the same locus and alleles; `gnomad` is missing where
# there is none.
# NOTE(review): assumes gnomad_formatted is keyed by (locus, alleles) — confirm from
# the describe() output of the gnomAD table.
ds_joined2 = ds_joined2.annotate(
    gnomad=gnomad_formatted[ds_joined2.clinvar.locus, ds_joined2.clinvar.alleles]
)
# Persisting the joined result is left disabled:
# ds_joined2 = ds_joined2.persist()
print("ds_joined2 n_partitions: " + str(ds_joined2.n_partitions()))
print("Describing")
ds_joined2.describe()
# print("Showing")
# ds_joined2.show()

In [None]:
# Inspect the type of the popmax AC expression, which is passed to hl.delimit in the
# output selection. (ds_joined2.describe() / ds_joined2.show() give the full picture.)
ds_joined2.gnomad.popmax.freq.AC.describe()

In [None]:
# Select desired output fields (columns are ordered as provided)
# `gnomad.popmax` can hold several tied populations, so each popmax value is joined
# into one delimited string with hl.delimit; the ClinVar INFO fields are joined with
# an explicit "," delimiter.
output_ds = ds_joined2
output_ds = output_ds.select(
    output_ds.Input_Variant,
    output_ds.Chromosomal_Variant,
    gnomad_source=output_ds.gnomad.source,
    gnomad_popmax_faf95=hl.delimit(output_ds.gnomad.popmax.faf.faf95),
    gnomad_popmax_pop=hl.delimit(output_ds.gnomad.popmax.label),
    gnomad_popmax_raw_ac=hl.delimit(output_ds.gnomad.popmax.freq.AC),
    gnomad_popmax_raw_an=hl.delimit(output_ds.gnomad.popmax.freq.AN),
    gnomad_genomic_coordinates=output_ds.gnomad.genomic_coordinates,
    # The VCF ID column, which ClinVar fills with its Variation ID
    clinvar_variation_id=output_ds.clinvar.rsid,
    clinvar_review_status=hl.delimit(output_ds.clinvar.info["CLNREVSTAT"], ","),
    clinvar_significance=hl.delimit(output_ds.clinvar.info["CLNSIG"], ","),
    clinvar_significance_interpretations=hl.delimit(output_ds.clinvar.info["CLNSIGCONF"], ",")
)

# NOTE(review): no extra nulling of zero-count popmax entries is done here;
# add_popmax_af already makes popmax missing when the top faf95 is 0.0 — confirm that
# this covers the intended "AC = 0" case.

output_ds.describe()
# output_ds.show()

In [None]:
# Debug removal of 0 ac popmaxes
# output_ds.filter(
#     (~hl.is_missing(output_ds.gnomad_source)) 
#     & hl.is_missing(output_ds.gnomad_popmax_pop)
# ).show()

In [None]:
# Export to TSV
# output_filename is a relative path, so the export is written by Hail to its default
# filesystem (HDFS on this cluster); a later cell copies it to local disk.
import time
print("Starting export to %s" % output_filename)
# Wall-clock timing of the export, which is where the whole pipeline is evaluated.
start_time = time.time()
output_ds.export(output_filename)
end_time = time.time()
print("Export took %.2f seconds" % (end_time - start_time))

In [None]:
# The export is in HDFS now, copy to machine-local file
report_localpath = os.path.join(os.getcwd(), output_filename)
# output_filename may contain a folder ("folder/file.tsv"); make sure it exists locally.
os.makedirs(os.path.dirname(report_localpath), exist_ok=True)
# Remove a stale copy directly instead of interpolating the path into a shell command.
if os.path.isfile(report_localpath):
    os.remove(report_localpath)
hdfs_to_local(output_filename, report_localpath)

In [None]:
# Upload to bucket and filepath set at top of notebook.
# The full gs:// URL is built here, so local_to_bucket uses it as given.
gs_output_file = "gs://%s/%s" % (output_bucket, output_filename)
local_to_bucket(report_localpath, gs_output_file)
