In [None]:
# Set the output bucket to write to, dataproc service account must have write access
# Do not include trailing slash or "gs://"
output_bucket = "clingen-dataproc-workspace-kferrite"
# Set the TSV path to write into bucket. Can contain slash like "folder/file.tsv"
# Do not include leading slash
output_filename = "report.tsv"

# Reference genome name. It is compared against the `referenceGenome` value of each
# genomicAlleles entry returned by the ClinGen Allele Registry.
# NOTE(review): the get_gene_intervals(...) calls and the ClinVar VCF URL elsewhere in this
# notebook hard-code GRCh37, so changing this value alone does not switch builds.
reference_genome = "GRCh37"

# Set this to true to limit output variants to be those within transcript coding regions
# False: the gene label comes from the gene interval containing the variant locus.
# True:  one record is produced per VEP transcript consequence and the gene label is taken
#        from that consequence's gene_symbol.
transcript_filter = False


In [None]:
import hail as hl
# `idempotent=True` makes re-running this cell safe, which is useful when running all cells in the notebook
hl.init(idempotent=True)

In [None]:
# Obtain desired thresholds
import io, re

thresholds = """
MYH7	BA1	0.10%	NM_000257.4
MYH7	BS1	0.02%	NM_000257.4
PTPN11	BA1	0.05%	NM_002834.5
PTPN11	BS1	0.03%	NM_002834.5
CDH1	BA1	0.20%	NM_004360.5
CDH1	BS1	0.10%	NM_004360.5
RUNX1	BA1	0.15%	NM_001754.4
RUNX1	BS1	0.015%	NM_001754.4
TP53	BA1	0.10%	NM_000546.5
TP53	BS1	0.03%	NM_000546.5
GJB2	BA1	0.50%	NM_004004.6
GJB2	BS1	0.30%	NM_004004.6
PAH	BA1	1.50%	NM_000277.3
PAH	BS1	0.20%	NM_000277.3
GAA	BA1	1%	NM_000152.5
GAA	BS1	0.50%	NM_000152.5
HRAS	BA1	0.05%	NM_005343.4
HRAS	BS1	0.03%	NM_005343.4
NRAS	BA1	0.05%	NM_002524.5
NRAS	BS1	0.03%	NM_002524.5
"""

thresh_reader = io.StringIO(thresholds)

def parse_thresholds(reader):
    """
    Parse per-gene allele-frequency thresholds from a text reader.

    Expects `reader` to be a file/io reader with a newline delimited list of
    whitespace-separated records:
        <gene-symbol> <thresh-name> <thresh-percent> [refseq]
        ...
    <thresh-percent> may be a plain float (used as-is) or end with "%", in which
    case the number is divided by 100 (e.g. "0.10%" -> 0.001).
    Empty and whitespace-only lines are skipped; leading/trailing whitespace on a
    line is ignored.

    Returns a tuple `(thresholds, gene_refseqs)`:
      thresholds   -- multilayer dictionary gene(str)->threshname(str)->"AF"->fraction(float)
      gene_refseqs -- dictionary gene(str)->refseq(str) for genes whose lines carry
                      the optional 4th column

    Example (for input lines "MYH7 BA1 0.10%" and "MYH7 BS1 0.02%"):
    gene_thresholds = {
        "MYH7": {
            "BA1": {
                "AF": 0.001
            },
            "BS1": {
                "AF": 0.0002
            }
        }
    }

    Raises RuntimeError if a line has fewer than 3 items or if two lines for the
    same gene name different refseqs. Raises ValueError (from float()) if the
    threshold is not numeric.
    """
    thresholds = {}
    gene_refseqs = {} # Fill in any that appear in input, fill in rest later

    # Load whole reader contents, should be small enough
    for line in reader.read().splitlines():
        # str.split() with no argument splits on runs of whitespace and drops
        # leading/trailing whitespace, so stray spaces never create empty terms
        terms = line.split()
        if not terms:
            continue # skip empty lines
        if len(terms) < 3:
            raise RuntimeError("Input lines must contain at least 3 items")
        gene, thresh_name, thresh = terms[0], terms[1], terms[2]

        # Store refseq for the gene if provided
        if len(terms) >= 4:
            refseq = terms[3]
            if gene in gene_refseqs and gene_refseqs[gene] != refseq:
                raise RuntimeError("Gene %s lines did not contain the same refseq" % gene)
            gene_refseqs[gene] = refseq

        # Parse percentages
        if thresh.endswith("%"):
            value = float(thresh[:-1]) / 100.0
        else:
            value = float(thresh)

        thresholds.setdefault(gene, {})[thresh_name] = {"AF": value}
    return thresholds, gene_refseqs

        
gene_thresholds, gene_refseqs = parse_thresholds(thresh_reader)

print("gene_thresholds:\n%s" % gene_thresholds)
print("input refseqs:\n%s" % gene_refseqs)

gene_thresholds_exp = hl.literal(gene_thresholds)


In [None]:
import io
import re

# Read gnomAD data as Hail Tables
# ds_exomes = hl.read_table("gs://gnomad-public/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht")
ds_exomes = hl.read_table(
    "gs://gnomad-public-requester-pays/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht")

ds_exomes = ds_exomes.annotate(
    source="gnomAD Exomes"
)
# ds_genomes = hl.read_table("gs://gnomad-public/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht")
ds_genomes = hl.read_table(
    "gs://gnomad-public-requester-pays/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht")
ds_genomes = ds_genomes.annotate(
    source="gnomAD Genomes"
)

# Reduce each table to the fields used later; the two tables are combined further down with a union (ds = ds1.union(ds2))
def select_necessary_cols(ds):
    """
    Return `ds` narrowed to its freq, faf, vep and source fields
    (via `ds.select`), which are the only ones this notebook reads later.
    """
    return ds.select(ds.freq, ds.faf, ds.vep, ds.source)

ds_exomes = select_necessary_cols(ds_exomes)
ds_genomes = select_necessary_cols(ds_genomes)

ds = ds_genomes.union(ds_exomes, unify=True)

# Show the schema of the hail Table
# ds.describe()

In [None]:
"""
ds.freq has raw frequency information, including AN, AC, and pop label. This is an array of 
structs, at indices determined by the categories in ds.globals.freq_index_dict

ds.faf has filtered allele frequency information, including confidence intervals faf95 and faf99.
This is an array of structs, at indices determined by the category map in ds.globals.faf_index_dict
"""

def add_popmax_af(ds):
    """
    Annotate the ds Hail Table with the population-maximum filtered allele frequency (FAF).

    Three row fields are added:

    popmax_faf is a faf structure from the original ds: the entry of `ds.faf` with the
    highest `faf95` among the entries whose label in `ds.globals.faf_index_dict` passes
    the filter `default_faf_filter_type` (by default, labels starting with "gnomad_").

    popmax_index_dict_key is the label from ds.globals.faf_index_dict which corresponds
    to popmax_faf. This is similar to the ds.popmax_faf.meta["pop"] value but not exactly
    the same (gnomad_afr vs afr)

    popmax_faf_pop_freq is the entry of `ds.freq` found by looking that same label up in
    `ds.globals.freq_index_dict`.

    Returns the updated ds.
    """
    # Identify indices in FAF field that correspond to the entire dataset (not a subset like non-cancer)
    # faf_index_map = [(k,v) for k, v in hl.eval(ds.globals.faf_index_dict).items() if k.startswith("gnomad_")]
    from enum import Enum
    class FafFilterType(Enum):
        # Each corresponds to a filter func for a (k,v) of faf label to value
        # NOTE(review): functions assigned in an Enum body are not turned into enum members,
        # so these names appear to resolve to the plain lambdas. That is what allows
        # `default_faf_filter_type(...)` to be called directly below — confirm this is intended.
        GNOMAD_GLOBAL = lambda t: t[0] == "gnomad"
        GNOMAD_SUPERPOP = lambda t: t[0].startswith("gnomad_")
        ANY = lambda t: True

    # By default, filter to superpopulations aggregate faf
    default_faf_filter_type = FafFilterType.GNOMAD_SUPERPOP

    # Predicate over a (label, index) tuple from faf_index_dict; delegates to the selected filter
    def faf_filter(faf_idx_tuple:tuple):
        return default_faf_filter_type(faf_idx_tuple)

    # Get list of the global faf_index_dict which meets the default_faf_filter criteria
    # This gives the indices of the desired populations, by default will take all top level populations
    # hl.eval(...) evaluates the global dict into a local Python dict so it can be filtered here
    faf_index_map = list(filter(faf_filter, [(k,v) for k,v in hl.eval(ds.globals.faf_index_dict).items()]))
    faf_indices = [v for k,v in faf_index_map]
    faf_labels = [k for k,v in faf_index_map]
    
    # Annotate table with popmax FAF
    
    # This only will return the maximum pop FAF for each
    # variant, even if multiple populations meet the criteria. 
    # If we want all matching populations, need an explode() call
    # to flatten the pop FAFs into a record per pop per variant
    # NOTE(review): both sorted(...)[0] lookups assume faf_indices is non-empty — confirm the
    # filter always matches at least one label.
    
    ds = ds.annotate(
        popmax_faf=hl.sorted(
            # Take only the FAF entries that correspond to the desired populations (faf_indices)
            hl.literal(faf_indices).map(lambda i: ds.faf[i]),
            # Sort by 95% confidence FAF
            lambda faf_entry: faf_entry.faf95,
            # Sort high to low
            reverse=True
        )[0] # Take the first entry with the highest FAF
        ,
        # Label of the faf_index_dict entry for this record's max pop
        popmax_index_dict_key=hl.sorted(
            # List of tuples of (poplabel, faf_index)
            list(zip(list(faf_labels), list(faf_indices))),

            # Same populations and same sort key as popmax_faf above, but the label is kept
            # Sort by 95% confidence FAF
            key=lambda tpl: ds.faf[tpl[1]].faf95,
            # Sort high to low
            reverse=True
        )[0][0] # Take the first entry, which has the highest FAF, then its label
    )
    
    ds = ds.annotate(
#         popmax_faf_pop_freq=ds.freq[ds.globals.freq_index_dict["gnomad_" + ds.popmax_faf.meta.get("pop")]]

        # ds.globals.freq_index_dict uses the same keys as ds.globals.faf_index_dict so
        # we can reuse ds.popmax_index_dict_key created above
        popmax_faf_pop_freq=ds.freq[ds.globals.freq_index_dict[ds.popmax_index_dict_key]] 
    )
    
    return ds


ds = add_popmax_af(ds)

In [None]:
# These next 2 functions override functions from hail.experimental, modified to return a mapping
# of gene_symbols to the intervals they correspond to. Existing methods return unordered list

import operator
import functools
from hail.genetics.reference_genome import reference_genome_type
from hail.typecheck import typecheck, nullable, sequenceof
from hail.utils.java import info
from hail.utils import new_temp_file

def _load_gencode_gtf(gtf_file=None, reference_genome=None):
    """
    Get Gencode GTF (from file or reference genome)

    Parameters
    ----------
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :class:`.Table`
    """
    GTFS = {
        'GRCh37': 'gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz',
        'GRCh38': 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz',
    }
    if reference_genome is None:
        reference_genome = hl.default_reference().name
    else:
        reference_genome = reference_genome.name
    if gtf_file is None:
        gtf_file = GTFS.get(reference_genome)
        if gtf_file is None:
            raise ValueError(
                'get_gene_intervals requires a GTF file, or the reference genome be one of GRCh37 or GRCh38 (when on Google Cloud Platform)')
    ht = hl.experimental.import_gtf(gtf_file, reference_genome=reference_genome,
                                    skip_invalid_contigs=True, min_partitions=12)
    ht = ht.annotate(gene_id=ht.gene_id.split('\\.')[0],
                     transcript_id=ht.transcript_id.split('\\.')[0])
    return ht

@typecheck(gene_symbols=nullable(sequenceof(str)),
           gene_ids=nullable(sequenceof(str)),
           transcript_ids=nullable(sequenceof(str)),
           verbose=bool, reference_genome=nullable(reference_genome_type), gtf_file=nullable(str))
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts, together with the gene they belong to.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.
    Unlike a plain list of intervals, each result also carries the gene symbol, gene ID and
    transcript ID of the matched GTF row.

    On Google Cloud platform:
    Gencode v19 (GRCh37) GTF available at: gs://hail-common/references/gencode/gencode.v19.annotation.gtf.bgz
    Gencode v29 (GRCh38) GTF available at: gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz

    Example
    -------
    >>> hits = get_gene_intervals(gene_symbols=['PCSK9'], reference_genome='GRCh37')  # doctest: +SKIP
    >>> hl.filter_intervals(ht, [h['interval'] for h in hits])  # doctest: +SKIP

    Parameters
    ----------

    gene_symbols : :obj:`list` of :obj:`str`, optional
       Gene symbols (e.g. PCSK9).
    gene_ids : :obj:`list` of :obj:`str`, optional
       Gene IDs (e.g. ENSG00000223972).
    transcript_ids : :obj:`list` of :obj:`str`, optional
       Transcript IDs (e.g. ENSG00000223972).
    verbose : :obj:`bool`
       If ``True``, print which genes and transcripts were matched in the GTF file.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
       Reference genome to use (passed along to import_gtf).
    gtf_file : :obj:`str`
       GTF file to load. If none is provided, but `reference_genome` is one of
       `GRCh37` or `GRCh38`, a default will be used (on Google Cloud Platform).

    Returns
    -------
    :obj:`list` of :obj:`dict`
        One dict per matched GTF row with keys ``gene_symbol``, ``gene_id``,
        ``transcript_id`` and ``interval``.

    Raises
    ------
    ValueError
        If gene_symbols, gene_ids and transcript_ids are all ``None``.
    """
    if gene_symbols is None and gene_ids is None and transcript_ids is None:
        raise ValueError('get_gene_intervals requires at least one of gene_symbols, gene_ids, or transcript_ids')
    ht = _load_gencode_gtf(gtf_file, reference_genome)
    # One boolean row expression per kind of identifier supplied; they are OR-ed together below
    # NOTE(review): an empty list passes the None check above but adds no criterion here, so
    # the reduce() below would raise TypeError if every supplied argument is empty.
    criteria = []
    if gene_symbols:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_name == y), gene_symbols))
    if gene_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'gene') & (ht.gene_id == y.split('\\.')[0]), gene_ids))
    if transcript_ids:
        criteria.append(hl.any(lambda y: (ht.feature == 'transcript') & (ht.transcript_id == y.split('\\.')[0]), transcript_ids))

    ht = ht.filter(functools.reduce(operator.ior, criteria))
    # Collect the matched rows locally as (feature, gene_name, gene_id, transcript_id, interval) tuples
    gene_info = ht.aggregate(hl.agg.collect((ht.feature, ht.gene_name, ht.gene_id, ht.transcript_id, ht.interval)))
    if verbose:
        info(f'get_gene_intervals found {len(gene_info)} entries:\n'
             + "\n".join(map(lambda x: f'{x[0]}: {x[1]} ({x[2] if x[0] == "gene" else x[3]})', gene_info)))
    # intervals = list(map(lambda x: x[-1], gene_info))
    # Keep the identifiers next to each interval instead of returning bare intervals
    intervals = list(map(lambda x: {
        'gene_symbol': x[1],
        'gene_id': x[2],
        'transcript_id': x[3],
        'interval': x[4]
    }, gene_info))
    return intervals

                         
# Look up intervals for the gene symbols in the input thresholds
gene_symbols = [k for k in gene_thresholds.keys()]
intervals = get_gene_intervals(gene_symbols=gene_symbols, reference_genome="GRCh37")

def get_gene_interval(gene_symbol:str):
    """
    Return the interval for `gene_symbol`.

    The module-level `intervals` list is searched first; on a miss the gene is
    looked up with get_gene_intervals (GRCh37), the first result is appended to
    `intervals`, and its interval is returned.
    """
    global intervals
    cached = next(
        (entry for entry in intervals if entry["gene_symbol"] == gene_symbol),
        None)
    if cached is not None:
        return cached["interval"]

    print("Getting new gene interval: %s" % gene_symbol)
    fetched = get_gene_intervals(gene_symbols=[gene_symbol], reference_genome="GRCh37")[0]
    intervals.append(fetched)
    return fetched["interval"]

In [None]:
# Perform some preliminary annotations
ds_crit = ds

# This was removed because we can't assume all gene symbols are the same, a variant can have >1
# ds_crit = ds_crit.annotate(
#     gene_symbol=ds_crit.vep.transcript_consequences.gene_symbol # Can't assume they are all the same
# )

print(intervals)
ivl_struct_list = hl.literal(
    [hl.struct(
        gene_symbol=i["gene_symbol"],
        gene_id=i["gene_id"],
        transcript_id=i["transcript_id"],
        interval=i["interval"]
    ) for i in intervals]
)

# Filter by intervals of genes provided in input criteria
ds_crit = hl.filter_intervals(ds_crit, [i["interval"] for i in intervals])

# Now attach the gene field using 1 of two methods.
# If transcript_filter is true, attach gene label based on transcript_consequences
# If transcript_filter is false, attach based on which gene interval it is contained in
# NOTE(review): `is False` is an identity test, so any value other than the literal False
# (including None or 0) takes the transcript_consequences branch.
if transcript_filter is False:
    ds_crit = ds_crit.annotate(
        gene=ivl_struct_list.find(
            # Check if Interval object contains this variant (start pos)
            # NOTE(review): find() yields a single matching struct, so a locus lying inside
            # more than one gene interval presumably gets only one gene label — confirm.
            lambda ivl: ivl["interval"].contains(ds_crit.locus)
        ).gene_symbol
    )
else:
    # Explode a new record per transcript consequence, each now has 1 gene
    ds_crit = ds_crit.annotate(
        transcript_consequences=ds_crit.vep.transcript_consequences
    )
    ds_crit = ds_crit.explode("transcript_consequences")
    ds_crit = ds_crit.annotate(
        gene=ds_crit.transcript_consequences.gene_symbol
    )


# Sort each gene's criteria thresholds descending by AF so first hl.find is the max
gene_thresholds_sorted = gene_thresholds_exp.map_values(
    lambda gene_criteria: hl.sorted(
        # Transform {"BA1": {"AF": 0.02}} to list of [("BA1", {"AF": 0.02})]
        gene_criteria.keys().map(lambda crit_name: (crit_name, gene_criteria[crit_name])),
        
        # Key to sort the above ArrayExpression
        lambda t: t[1]["AF"],
        
        # Reverse order so we find the max threshold first
        reverse=True
    )
)
print(gene_thresholds_sorted.collect())

# Filter to variants in genes we care about
ds_crit = ds_crit.filter(
    gene_thresholds_exp.keys().contains(ds_crit.gene)
)

ds_crit = ds_crit.annotate(
    # Get the max AF threshold which is less or equal to popmax_faf.faf95
    # The result is the criteria name (ex: BA1), or missing when the condition below is false
    criteria_satisfied=hl.or_missing(
        # Condition
        # NOTE(review): ds_crit was already filtered to genes present in gene_thresholds_exp
        # by the preceding filter, so this condition looks redundant — confirm before removing.
        gene_thresholds_exp.keys().any(
            lambda threshold_gene: ds_crit.gene == threshold_gene
        ),
        
        # If this gene is in criteria, find max criteria (already reverse sorted, find gets first)
        hl.find(
            lambda tpl: tpl[1]["AF"] <= ds_crit.popmax_faf.faf95,
            # gene_thresholds[ds_crit.gene][crit_name]["AF"] <= ds_crit.popmax_faf.faf95,

            # List of (crit_name, {"AF": 0.02})
            gene_thresholds_sorted[ds_crit.gene]
        )[0] # returns the criteria name (ex: BA1)
        
    )
)

# Filter to variants which meet a criteria
ds_crit = ds_crit.filter(
    ~hl.is_missing(ds_crit.criteria_satisfied)
)


filtered_ds = ds_crit.select(
    criteria_satisfied = ds_crit.criteria_satisfied,
    source = ds_crit.source,
    gene = ds_crit.gene,
    popmax_pop = ds_crit.popmax_faf.meta["pop"],
    popmax_ac = ds_crit.popmax_faf_pop_freq.AC,
    popmax_an = ds_crit.popmax_faf_pop_freq.AN,
    faf95 = ds_crit.popmax_faf.faf95,
    genomic_coordinates = hl.format("%s-%s-%s-%s",
        ds_crit.locus.contig,
        hl.str(ds_crit.locus.position),
        ds_crit.alleles[0],
        ds_crit.alleles[1]
    )
)
# filtered_ds.show()

In [None]:
# Import ClinVar VCF as Hail Table
# clinvar = hl.import_vcf("/path/to/clinvar.vcf.gz", force_bgz=True, drop_samples=True, skip_invalid_loci=True).rows()

# Download clinvar BGZF
import os, requests, subprocess

# Function to download a file to a localpath. ClinVar VCF is small enough to download to dataproc default local disk.
def download_to_file(url, filepath):
    """
    Stream the resource at `url` into the local file `filepath`.

    The body is written in 1024-byte chunks so the whole download is never held
    in memory. The HTTP connection is released when the function returns.

    Raises requests.HTTPError if the server answers with a 4xx/5xx status, so an
    error page is never saved in place of the expected file.
    """
    with requests.get(url, stream=True) as r:
        # Fail before creating/truncating the output file if the request did not succeed
        r.raise_for_status()
        with open(filepath, "wb") as fout:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    fout.write(chunk)
# This url always points to the latest dump file, updated periodically by ClinVar
clinvar_vcf_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz"
clinvar_vcf_localpath = "/home/hail/clinvar.vcf.gz"
clinvar_vcf_hdfs = "clinvar.vcf.gz"
download_to_file(clinvar_vcf_url, clinvar_vcf_localpath)
assert(os.path.exists(clinvar_vcf_localpath))
print("Downloaded ClinVar VCF, file size (expecting ~28M): %d" % os.path.getsize(clinvar_vcf_localpath))

# Hail needs the file in HDFS
# -f overwrites a copy left by a previous run of this cell; without it the copy fails when
# the destination exists and Hail would read the stale file.
p = subprocess.Popen(["hdfs", "dfs", "-cp", "-f", "file://" + clinvar_vcf_localpath, clinvar_vcf_hdfs])
print(p.communicate())
# Stop here if the copy failed rather than letting import_vcf fail (or read old data) later
if p.returncode != 0:
    raise RuntimeError("hdfs dfs -cp of %s failed with exit code %s" % (
        clinvar_vcf_localpath, p.returncode))


clinvar = hl.import_vcf(
    clinvar_vcf_hdfs,
    force_bgz=True,
    drop_samples=True, 
    skip_invalid_loci=True
).rows()
print(clinvar.count())

# Join ClinVar table to gnomAD table. ClinVar fields available under the table.clinvar struct
gnomad_clinvar_ds = filtered_ds.annotate(
    clinvar=clinvar[filtered_ds.locus, filtered_ds.alleles]
)

# ClinVar VCF export sets ID column to the ClinVar Variation ID (not rsid)
# And sets the RS field of INFO to the rsid if it exists.
# (https://ftp.ncbi.nlm.nih.gov/pub/clinvar/README_VCF.txt)
# Hail then sets this ClinVar ID as the rsid column of the clinvar struct
# We can filter to only the variants that exist in clinvar with:
# gnomad_clinvar_ds = gnomad_clinvar_ds.filter(
#     ~hl.is_missing(gnomad_clinvar_ds.clinvar_rsid)
# )

In [None]:
print("Loading ClinGen Allele Registry gene info")

In [None]:
# Map genes to MANE Preferred ref seqs
# example: http://reg.genome.network/genes?name=MYH7
import json

gene_url_templ = "http://reg.genome.network/genes?name={gene_name}"

genes_MANE_pref_map = {}

def list_flatten(L):
    """Flatten one level of nesting: return a list of the items of each iterable in `L`, in order."""
    flattened = []
    for sub in L:
        flattened.extend(sub)
    return flattened

# input_genes = list_flatten([t.keys() for t in gene_thresholds.collect()])
input_genes = list(gene_thresholds.keys())

# For every input gene, ask the Allele Registry /genes endpoint for its record and remember
# the MANE Preferred RefSeq accession in genes_MANE_pref_map.
for gene_name in input_genes:
    gene_url = gene_url_templ.format(gene_name=gene_name)
    response = requests.get(gene_url)
    if response.status_code != 200:
        # Raise directly: the handle_http_error helper is only defined in a later cell, so
        # calling it here would fail with NameError on a fresh kernel.
        raise RuntimeError("Request failed (status code %s), response: %s" % (
            response.status_code, response.text))
    r = json.loads(response.text)
    # Filter to only those which have this gene symbol (discard overlappings)
    # .get() is used so entries without an HGNC record are discarded instead of raising KeyError
    r = [e for e in r
         if e.get("externalRecords", {}).get("HGNC", {}).get("symbol") == gene_name]
    
    if len(r) == 0:
        raise RuntimeError("Gene name %s not found in allele registry /genes" % gene_name)
    elif len(r) > 1:
        print("Warning: gene name %s returned multiple entries from allele registry /genes: %s" % (
            gene_name, json.dumps(r, indent=2)))
    g = r[0] # Select first gene sequence object
    # The filter above guarantees this record's HGNC symbol equals gene_name
    external_records = g["externalRecords"]
    mane_pref_refseq = external_records["MANEPrefRefSeq"]["id"]
    
    # Add to mapping
    genes_MANE_pref_map[gene_name] = mane_pref_refseq


# For genes not in gene_refseqs (from user input), set refseq to MANE Preferred
for gene_name in input_genes:
    if gene_name not in gene_refseqs:
        print("Updating gene %s refseq to MANE Preferred %s" % (
            gene_name, genes_MANE_pref_map[gene_name]))
        gene_refseqs[gene_name] = genes_MANE_pref_map[gene_name]

# This is just for debugging/checking later, can be removed if later block referencing it is removed

print("Processing genes_MANE_pref_map names and versions")
genes_MANE_pref_map_split = {}
for gene_name, refseq in genes_MANE_pref_map.items():
    r_id, r_version = refseq.split(".")
    genes_MANE_pref_map_split[gene_name] = {
        "refseq": refseq,
        "refseq_id": r_id,
        "refseq_version": r_version
    }
genes_MANE_pref_map_split_hl = hl.literal(genes_MANE_pref_map_split)

print(json.dumps(gene_refseqs, indent=2))

In [None]:
print("Downloading ClinGen Allele Registry variant info")

In [None]:
# Add information from the ClinGen Allele Registry
import requests
import json
allele_registry_baseurl = "https://reg.clinicalgenome.org"

# If the collected gnomad_ids rows can't fit into memory, we would need to write them out
# to a file and connect the request to the file stream. As is, this is generally a few 100KiB or a few MiB
# def write_col_to_file(ds, colname:str, filename:str):
#     ds.select(colname).export(filename, header=False)
# gnomad_id_file = "gnomad-ids.txt"
# write_col_to_file(gnomad_clinvar_ds, "genomic_coordinates", gnomad_id_file)

allele_registry_ds = gnomad_clinvar_ds

def handle_http_error(response):
    """Raise RuntimeError describing a failed HTTP `response` (its status code and body text)."""
    details = (response.status_code, response.text)
    raise RuntimeError("Request failed (status code %s), response: %s" % details)

gnomad_ids = allele_registry_ds.select("genomic_coordinates").collect()
# The above returns structs which still contain locus and position keys
gnomad_ids = [e.genomic_coordinates for e in gnomad_ids]
post_body = "\n".join(gnomad_ids)

    

In [None]:
response = requests.post(
    allele_registry_baseurl + "/alleles?file=gnomAD.id",
    data=post_body
)
if response.status_code != 200:
    handle_http_error(response)

print("Downloaded %d bytes from allele registry" % len(response.text))
reg_entries_d = json.loads(response.text) # list

# Remove response from memory after parsing
del response

In [None]:
# Register missing variants in the ClinGen Allele Registry
import hashlib, time

# def register_alleles(identifiers:list, identifier_type:str):
#     url = allele_registry_baseurl
#     url = "https://reg.clinicalgenome.org"
#     login = "PROVIDE_CREDENTIALS"
#     password = "PROVIDE_CREDENTIALS"
#     identity = hashlib.sha1((login + password).encode('utf-8')).hexdigest()
    
#     gbTime = str(int(time.time()))
    
#     token = hashlib.sha1((url + identity + gbTime).encode('utf-8')).hexdigest()
#     url_params = { # values must be url safe
#         "file": "gnomAD.id",
#         "gbLogin": login,
#         "gbTime": gbTime,
#         "gbToken": token
#     }
#     url = url + "/alleles?" + "&".join(p[0]+"="+p[1] for p in url_params.items())
#     print(url)
#     response = requests.put(
#         url,
#         data="\n".join(identifiers))
#     return response



In [None]:
# Manipulate the response from the Allele Registry to filter it down and map it to keys
# that we can use later in annotations

# Store the alleles which the ClinGen Allele Registry does not have registered
unregistered_gnomad_ids = []

# Map gnomad IDs to allele registry entries
gnomad_reg_map = {}

# Walk the Allele Registry response: entries without genomicAlleles but with an errorType are
# recorded as unregistered; every other entry must have a genomic allele on `reference_genome`
# and is stored in gnomad_reg_map under its gnomAD id.
for reg_entry in reg_entries_d:
    if "genomicAlleles" not in reg_entry:
        if reg_entry.get("errorType", ""):
            # print("Unregistered gnomAD id: " + reg_entry["inputLine"])
            # inputLine is the field of the error entry that is reported as the unregistered id
            unregistered_gnomad_ids.append(reg_entry["inputLine"])
#             gnomad_reg_map[reg_entry["inputLine"]] = {}
            continue
        else:
            raise RuntimeError("genomicAlleles field not in response entry from allele registry:\n%s" % reg_entry)
    desired_genomic_allele = None
    for genomic_allele in reg_entry["genomicAlleles"]:
        if "referenceGenome" not in genomic_allele:
            raise RuntimeError("referenceGenome is not in genomicAllele entry")
        if genomic_allele["referenceGenome"] == reference_genome:
            desired_genomic_allele = genomic_allele
            # Key the entry by the id of its first gnomAD external record
            gnomad_id = str(reg_entry["externalRecords"]["gnomAD"][0]["id"])
            gnomad_reg_map[gnomad_id] = reg_entry
            break
    # No genomic allele on the configured reference genome is treated as a hard error
    if desired_genomic_allele is None:
        raise RuntimeError("Did not find genomic allele with reference %s" % reference_genome)

if len(unregistered_gnomad_ids) > 0:
    print("gnomAD ids not registered in Allele Registry:")
    for u in unregistered_gnomad_ids:
        print(u)


# Pre-process the accession identifier and version
print("Processing gene_refseqs names and versions")
gene_refseqs_split = {}
for gene_name, refseq in gene_refseqs.items():
    r_id, r_version = refseq.split(".")
    gene_refseqs_split[gene_name] = {
        "refseq": refseq,
        "refseq_id": r_id,
        "refseq_version": r_version
    }


# Use gnomad ID -> allele registry entry important info
gnomad_hgvs_map = {}

# For each registered allele, collect the transcript HGVS expressions that sit on the refseq
# accession chosen for the transcript's gene, together with the transcript's protein effect.
for gnomad_id, reg_entry in gnomad_reg_map.items():
    if "transcriptAlleles" not in reg_entry:
        print("transcriptAlleles not in reg_entry:\n%s" % json.dumps(reg_entry))
        continue

    for transcript_allele in reg_entry["transcriptAlleles"]:
        transcript_gene = transcript_allele.get("geneSymbol", None)
        if transcript_gene not in input_genes:
            continue

        # Get protein change
        protein_effect = "" # Empty string default for compatibility with string operations
        if "proteinEffect" in transcript_allele:
            protein_effect = transcript_allele["proteinEffect"]["hgvs"]

        # Match on "<accession>." rather than the bare accession so that a longer accession
        # sharing the same leading characters is not picked up by mistake.
        accession_prefix = gene_refseqs_split[transcript_gene]["refseq_id"] + "."
        for hgvs in transcript_allele["hgvs"]:
            # Take the entry if it has an hgvs expression with the same refseq_id
            # even if the version is different. We collapse to 1 preferred in a later step
            if not hgvs.startswith(accession_prefix):
                continue

            # "NM_x.4:c.1A>G" -> ["NM_x", "4", "c", "1A>G"]; the first two terms are id and version
            h_terms = hgvs.replace(":", ".").split(".")
            h_id, h_version = h_terms[0], h_terms[1]

            gnomad_hgvs_map.setdefault(gnomad_id, []).append({
                "refseq": hgvs,
                "refseq_id": h_id,
                "refseq_version": h_version,
                "protein_effect": protein_effect
            })
    print("Added %s entries for gnomad_id %s" % (len(gnomad_hgvs_map.get(gnomad_id, [])), gnomad_id))

print("Freeing gnomad_reg_map")
del gnomad_reg_map

print("gnomad_hgvs_map len: %d" % (len(gnomad_hgvs_map)))
print("was expecting: %d" % len(gnomad_ids))
# gnomad_hgvs_hl = hl.literal(gnomad_hgvs_map)

# gene_refseqs_hl = hl.literal(gene_refseqs)
print("Converting objects to hail expressions")
gene_refseqs_split_hl = hl.literal(gene_refseqs_split)
input_genes_hl = hl.literal(input_genes)

In [None]:
print(gnomad_hgvs_map)

In [None]:
print("Adding annotations")

reg_ds = gnomad_clinvar_ds

import pandas as pd
# from pyspark.sql import Row, SparkSession
import numpy as np
print("Converting gnomad_hgvs_map to Hail Table")

print(type(gnomad_hgvs_map))

def convert_gnomad_hgvs_map_to_hail_table1():
    """
    Alternative converter: build the Hail Table from gnomad_hgvs_map through an
    in-memory pandas DataFrame (one row per HGVS entry, keyed by gnomad_id).

    Had some issue with type conversion here but may be resolved. It may be better to use this
    instead of going through HDFS directly as in the other version of this function
    """
    entry_fields = ["refseq", "refseq_id", "refseq_version", "protein_effect"]
    records = []
    for variant_id, entries in gnomad_hgvs_map.items():
        for entry in entries:
            record = [variant_id] + [entry[field] for field in entry_fields]
            records.append(record)
            print(record)

    frame = pd.DataFrame(records, columns=["gnomad_id"] + entry_fields)
    # Debug output: the frame itself, its column dtypes and summary statistics
    for debug_view in (frame, frame.dtypes, frame.describe()):
        print(debug_view)
    return hl.Table.from_pandas(frame, key=["gnomad_id"])

def convert_gnomad_hgvs_map_to_hail_table():
    """
    This dumps the stripped-down allele registry response with gnomad-ids to
    HDFS and imports it as a Hail Table for doing a join to the overall table

    Converting it to an in-memory DictExpression for the annotations results
    in higher memory usage which can crash the Java Spark backend

    The TSV has one row per HGVS entry of gnomad_hgvs_map with the columns
    gnomad_id, refseq, refseq_id, refseq_version and protein_effect.

    Returns the imported Hail Table keyed by gnomad_id.
    Raises RuntimeError if either `hdfs dfs` command exits with a non-zero status.
    """
    temp_file = "gnomad-hgvs.tsv"
    headers = [
        "gnomad_id", "refseq", "refseq_id", "refseq_version", "protein_effect"]

    # Opening with "w" truncates any file left by a previous run, so no separate delete is needed
    with open(temp_file, "w") as f_out:
        # Write headers
        f_out.write("\t".join(headers))
        f_out.write("\n")
        for gnomad_id, hgvs_list in gnomad_hgvs_map.items():
            for hgvs_entry in hgvs_list:
                row = [gnomad_id] + [hgvs_entry[column] for column in headers[1:]]
                f_out.write("\t".join(row))
                f_out.write("\n")

    def run_hdfs(arguments):
        # Run one `hdfs dfs` command; judge success by exit status, since the tool may
        # write warnings to stderr even when the command succeeds
        p = subprocess.Popen(["hdfs", "dfs"] + arguments,
                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            raise RuntimeError("stdout:\n%s\nstderr:\n%s\n" % (stdout, stderr))

    # Copy to HDFS, removing any copy from an earlier run first
    temp_file_path = os.path.join(os.getcwd(), temp_file)
    run_hdfs(["-rm", "-f", temp_file])
    run_hdfs(["-cp", "file://"+temp_file_path, temp_file])

    ht = hl.import_table(temp_file)
    ht = ht.key_by(ht.gnomad_id)
    return ht

ht = convert_gnomad_hgvs_map_to_hail_table()

refseqs_array_hl = gene_refseqs_split_hl.values()

# ht.filter(ht.gnomad_id=="1-115248097-T-C").show()

# Only take the preferred ref sequence or if preferred ref version isn't present,
# take the highest version for the same sequence identifier
#
# NOTE(review): two things here do not seem to match that intent — please confirm:
#  1. ht.refseq is filled from the full HGVS expression when gnomad_hgvs_map is built, while
#     the "refseq" values in refseqs_array_hl are bare "<accession>.<version>" strings, so the
#     exact-match clause looks like it can never be true.
#  2. The max() in the second clause is taken over refseqs_array_hl (the preferred refseqs),
#     not over the versions present in ht, so that clause keeps rows whose version equals the
#     preferred version for the accession rather than the highest version available.
ht = ht.filter(
    (
        # If exact match, keep
        refseqs_array_hl.any(
            lambda r: r["refseq"] == ht.refseq
        ) |
        # If not exact match and is the highest for this sequence identifier
        (
            ~refseqs_array_hl.any(lambda r: r["refseq"] == ht.refseq)) \
             & \
            (ht.refseq_version == hl.str(hl.max(refseqs_array_hl.filter(
                lambda r: r["refseq_id"] == ht.refseq_id
            ).map(
                lambda r: hl.int(r["refseq_version"])
            )))
        )
    )
)

# ht.filter(ht.gnomad_id=="1-115248097-T-C").show()

def get_hgvs_by_coordinates(genomic_coordinates):
    """
    Look up `genomic_coordinates` in the module-level `gnomad_hgvs_map`.

    Returns the value stored under that key.
    Raises RuntimeError("None on <genomic_coordinates>") when the lookup
    yields None (key absent, or present with a None value).
    """
    hgvs_entries = gnomad_hgvs_map.get(genomic_coordinates)
    if hgvs_entries is not None:
        return hgvs_entries
    raise RuntimeError("None on " + genomic_coordinates)

# Attach the matching `ht` row to every `reg_ds` row as the field `hgvs`, by
# indexing `ht` (keyed on gnomad_id) with reg_ds.genomic_coordinates.
# This is only reliable while reg_ds.genomic_coordinates <-> ht.gnomad_id is a
# 1:1 mapping; with several matches one of them may be selected arbitrarily.
# Use Table.join instead if a proper one-to-many join is needed.
reg_ds = reg_ds.annotate(hgvs=ht[reg_ds.genomic_coordinates])


In [None]:
# Some quick sanity checking. These do not affect the report file.

# reg_ds.describe()


# # Records missing protein_effect
# no_protein_effect_ht = reg_ds.filter(
#     hl.is_missing(reg_ds.hgvs.protein_effect) | (reg_ds.hgvs.protein_effect.length() == 0))
# print("%s records were missing protein_effect" % (no_protein_effect_ht.count()))
# no_protein_effect_ht.show()


# Records missing hgvs refseq 
# no_hgvs_refseq_ht = reg_ds.filter(
#     hl.is_missing(reg_ds.hgvs.refseq) | (reg_ds.hgvs.refseq.length() == 0))
# print("%s out of %s records were missing refseq" % (no_hgvs_refseq_ht.count(), reg_ds.count()))
# no_hgvs_refseq_ht.show()


# Records with refseq included for different version from MANE preferred
# refseq_version_mismatch = reg_ds.filter(
#     (genes_MANE_pref_map_split_hl[reg_ds.gene]["refseq_id"] == reg_ds.hgvs.refseq_id) \
#     & (genes_MANE_pref_map_split_hl[reg_ds.gene]["refseq_version"] != reg_ds.hgvs.refseq_version)
# )
# print("%s records had refseq preferred version mismatch" % (refseq_version_mismatch.count()))
# refseq_version_mismatch.show()


In [None]:
# Old annotations
# gnomad_clinvar_allele_reg_ds = gnomad_clinvar_ds.annotate(
#     hgvs=hl.or_missing(
#         input_genes_hl.contains(gnomad_clinvar_ds.gene),
        
#         # If the record is in one of the input genes, do the following
#         hl.or_else(
#             # First check if there is an exact match for the preferred gene refseq version
#             gnomad_hgvs_hl.get(gnomad_clinvar_ds.genomic_coordinates).find(
#                 lambda entry: entry["refseq"] == gene_refseqs_split_hl.get(gnomad_clinvar_ds.gene)["refseq"]
#             ),
            
#             # If not found, use the highest refseq with same identifier ignoring version
#             # if none, this returns missing
#             hl.sorted(
#                 # Collection of possible entries (filtered to those with same unversioned seq identifier)
#                 gnomad_hgvs_hl.values().filter(
#                     lambda e: e["refseq_id"] == gene_refseqs_split_hl.get(gnomad_clinvar_ds.gene)["refseq_id"]
#                 ),
#                 # Key to sort on (version integer)
#                 key=lambda e: int(e["refseq_version"]),
#                 # Reverse for descending order
#                 reverse=True
#             )[0] # take highest
#         ).get("refseq") # Use whole accession.version value
#     ),
#     # Same as the hgvs field above, but takes the other value from the gnomad_hgvs_hl element
#     protein_effect=hl.or_missing(
#         input_genes_hl.contains(gnomad_clinvar_ds.gene),
        
#         # If the record is in one of the input genes, do the following
#         hl.or_else(
#             # First check if there is an exact match for the preferred gene refseq version
#             gnomad_hgvs_hl.get(gnomad_clinvar_ds.genomic_coordinates).find(
#                 lambda entry: entry["refseq"] == gene_refseqs_split_hl.get(gnomad_clinvar_ds.gene)["refseq"]
#             ),
            
#             # If not found, use the highest refseq with same identifier ignoring version
#             # if none, this returns missing
#             hl.sorted(
#                 # Collection of possible entries (filtered to those with same unversioned seq identifier)
#                 gnomad_hgvs_hl.values().filter(
#                     lambda e: e["refseq_id"] == gene_refseqs_split_hl.get(gnomad_clinvar_ds.gene)["refseq_id"]
#                 ),
#                 # Key to sort on (version integer)
#                 key=lambda e: int(e["refseq_version"]),
#                 # Reverse for descending order
#                 reverse=True
#             )[0] # take highest
#         ).get("protein_effect")
#     )
# )

In [None]:
# Progress marker emitted after the transcript HGVS / protein annotation cells above.
print("Finished transcript hgvs and protein annotation")

In [None]:
# Build the report table and export it as a TSV.
import time

output_ds = reg_ds

# Fields carried over under their existing names, in output column order.
passthrough_fields = [
    "criteria_satisfied",
    "gene",
    "faf95",
    "source",
    "popmax_pop",
    "popmax_ac",
    "popmax_an",
    "genomic_coordinates",
]

# ClinVar INFO struct; the array-valued entries read from it are flattened
# into comma-delimited strings for the TSV.
clinvar_info = output_ds.clinvar.info

# Columns are ordered as provided: pass-through fields first, then the
# derived/renamed ones.
output_ds = output_ds.select(
    *passthrough_fields,
    hgvs=output_ds.hgvs.refseq,
    protein_effect=output_ds.hgvs.protein_effect,
    clinvar_variation_id=output_ds.clinvar.rsid,
    clinvar_review_status=hl.delimit(clinvar_info["CLNREVSTAT"], ","),
    clinvar_significance=hl.delimit(clinvar_info["CLNSIG"], ","),
    clinvar_significance_interpretations=hl.delimit(clinvar_info["CLNSIGCONF"], ",")
)

# output_ds.describe()

# Export to TSV, reporting how long the export call took.
report_filename = "report.tsv"
print("Starting export to %s" % report_filename)
start_time = time.time()
output_ds.export(report_filename)
end_time = time.time()
print("Export took %.2f seconds" % (end_time - start_time))

In [None]:
# The export is in HDFS now, copy it to a machine-local file in the current
# working directory.
report_localpath = os.path.join(os.getcwd(), report_filename)

# Remove any copy left over from a previous run so the destination is clean
# (`hdfs dfs -cp` below is invoked without `-f`). os.remove avoids building a
# shell command from the path, and a missing file is simply not an error.
try:
    os.remove(report_localpath)
except FileNotFoundError:
    pass

p = subprocess.Popen(["hdfs", "dfs", "-cp", report_filename, "file://" + report_localpath],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
print((stdout, stderr))
# Fail loudly instead of letting the upload step run without a local report file.
if p.returncode != 0:
    raise RuntimeError("stdout:\n%s\nstderr:\n%s\n" % (stdout, stderr))

In [None]:
# Upload the local report to the bucket and object path set at the top of the
# notebook (`output_bucket` / `output_filename`).
gs_output_file = "gs://%s/%s" % (output_bucket, output_filename)
p = subprocess.Popen(["gsutil", "cp", report_localpath, gs_output_file],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
print((stdout, stderr))
# Check the exit status rather than stderr so that a failed upload is not
# mistaken for success and ordinary stderr output does not trigger an error.
if p.returncode != 0:
    raise RuntimeError("stdout:\n%s\nstderr:\n%s\n" % (stdout, stderr))