This notebook uploads two outputs to Google Cloud Storage buckets.

1) The inputs array, converted to a VCF-style Hail table.
2) A newline-delimited JSON file of the input variants converted to GA4GH GKS VA/VRS annotations using gnomAD data.

It can read from a locally staged Hail table directory or from one in GCS. The table must have a pre-computed `.info.VRS` field, which is computed using `vrs_annotation_batch.py` from `tgg_methods`; that script in turn uses `vcf_annotation.py` from `vrs-python`.

https://github.com/broadinstitute/tgg_methods/blob/master/vrs/vrs_annotation_batch.py (last checked at `a0002f02fbd5dd25487b261e94081a3daec29c64`)

This validates gnomad_methods functions `gnomad_gks` and `get_gks`, off the branch in this pull request: https://github.com/broadinstitute/gnomad_methods/pull/556

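The JSON schema is taken from the `aw-lb-drafting` branch of https://github.com/ga4gh/va-spec (last checked at `bf3b5aa`). Note that the clone cell below checks out the head of that branch, not this exact commit.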

The cell that monkey-patches the `gnomad_gks` function (currently commented out) can be removed if this notebook's Python environment has `gnomad_methods` installed from the code in PR 556.

In [None]:
# Configuration for data outputs.

# Destination for the Hail table produced by joining the inputs array against
# the source table. The write is skipped when this value is falsy (None or "").
# This can be useful for other testing, using this hail table as input without reconstructing it
inputs_ht_destination_url = "gs://clingen-public-requesterpays/gnomad-gks-larry-filtered.ht"

# Destination for the output annotations, written as newline-delimited JSON.
# The copy is skipped when this value is falsy (None or "").
outputs_destination_file = "gs://clingen-public-requesterpays/gnomad-gks-qc/outputs.ndjson"

In [None]:
import hail as hl

In [None]:
# Import the gnomad package and the two submodules whose functions are
# exercised by this notebook.
import gnomad
import gnomad.utils.annotations
import gnomad.resources.grch38.gnomad

# Reload both submodules so that re-running this cell picks up modifications
# made to their source files on disk without restarting the kernel.
import importlib
importlib.reload(gnomad.utils.annotations)
importlib.reload(gnomad.resources.grch38.gnomad)

# Bind the function names only after the reload: a `from ... import` executed
# before it would keep referring to the pre-reload function objects.
from gnomad.utils.annotations import get_gks
from gnomad.resources.grch38.gnomad import gnomad_gks

In [None]:
import shutil
import subprocess

# Start from a clean slate so a stale or partial clone left by an earlier run
# is never reused. ignore_errors=True keeps the `rm -rf` semantics of not
# failing when the directory does not exist yet.
shutil.rmtree("va-spec", ignore_errors=True)

# Clone the va-spec repository; check=True raises CalledProcessError if git fails.
subprocess.run(["git", "clone", "https://github.com/ga4gh/va-spec"], check=True)

# Switch to the branch that holds the draft schema. Running git with
# cwd="va-spec" avoids spawning a bash shell just to `cd` first.
subprocess.run(["git", "checkout", "aw-lb-drafting"], cwd="va-spec", check=True)

In [None]:
import json

# Load the cohort allele frequency JSON schema from the va-spec checkout.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("va-spec/schema/cohortAlleleFreq.json", encoding="utf-8") as f:
    schema = json.load(f)

In [None]:
# Test variants for gnomAD 3.1.2, expressed on GRCh38.
# "gnomad" is a chromosome-position-ref-alt string (no "chr" prefix);
# "clinvar" is presumably the matching ClinVar variation ID (TODO confirm).
# The final entry has no "clinvar" key.
inputs = [
    {"clinvar": "422227", "gnomad": "16-68737366-C-T"},
    {"clinvar": "619", "gnomad": "12-102843683-C-T"},
    {"clinvar": "808527", "gnomad": "19-38442434-C-T"},
    {"clinvar": "695823", "gnomad": "1-11130641-G-A"},
    # Not in gnomAD 3.1.2
    # {"clinvar": "251162", "gnomad": "19-11105249-C-T"},
    {"clinvar": "143344", "gnomad": "X-154030690-C-T"},
    {"clinvar": "137390", "gnomad": "14-28767768-C-T"},
    {"clinvar": "129997", "gnomad": "15-89317458-C-G"},
    {"clinvar": "89198", "gnomad": "2-47799491-C-G"},
    {"clinvar": "43032", "gnomad": "14-23416241-G-A"},
    {"clinvar": "2356", "gnomad": "1-216247118-C-A"},
    {"clinvar": "3038", "gnomad": "11-108248927-T-G"},
    {"clinvar": "256155", "gnomad": "7-107701083-T-A"},
    # example variant from va-spec simple_result_example.yaml
    {"gnomad": "1-55051215-G-GA"},
]


In [None]:
# ht_url can be a gs:// path, or a file:// local path.
# The known table locations are listed once here and the active one is chosen
# by key, instead of reassigning ht_url several times and relying on the last
# assignment winning.
HT_URL_CHOICES = {
    # Publicly readable, but doesn't have all gnomad variants in it
    "public_downsampled": "gs://clingen-public-requesterpays/downsample_to_100k_full_release.ht",
    # Can refer to a local hail table directory
    "local_downsampled": "../downsample_to_100k_full_release.ht",
    # gnomad-vrs-io-finals is a private bucket, needs a service account with read access
    "private_full_release": "gs://gnomad-vrs-io-finals/ht-outputs/release_0426_dmtest_v3.1.2-Full-ht-release-output-updated-schema-050523.ht",
}

# Change this key to read from one of the other locations.
ht_url = HT_URL_CHOICES["private_full_release"]

ht = hl.read_table(ht_url)

In [None]:
# Build a small Hail table keyed by (locus, alleles) from the input alleles.
# Joining the full gnomad dataset against this keyed table filters it rapidly,
# whereas a .filter() that checks membership in the input set is slow because
# it needs to do a table scan.
# Too slow:
# ht_with_coords.filter(input_gnomad_expressions.contains(ht_with_coords.genomic_coordinates))

import pandas

input_gnomad_expressions = hl.literal([x["gnomad"] for x in inputs])

# Each gnomad string is "contig-position-ref-alt"; split it into its parts.
input_terms = [x["gnomad"].split("-") for x in inputs]

df = pandas.DataFrame(
    {
        # The input strings carry no "chr" prefix, so add it here.
        "contig": [f"chr{term[0]}" for term in input_terms],
        "position": [int(term[1]) for term in input_terms],
        "ref": [term[2] for term in input_terms],
        "alt": [term[3] for term in input_terms],
    }
)
inp_ht = hl.Table.from_pandas(df)

# Derive the key fields from the plain columns, then drop the plain columns.
locus_expr = hl.locus(inp_ht.contig, inp_ht.position, reference_genome="GRCh38")
alleles_expr = hl.array([inp_ht.ref, inp_ht.alt])
inp_ht = inp_ht.annotate(locus=locus_expr, alleles=alleles_expr)
inp_ht = inp_ht.drop("contig", "position", "ref", "alt")
inp_ht = inp_ht.key_by("locus", "alleles")

inp_ht.show()

In [None]:
# Add a "contig-position-ref-alt" string to every row of the source table,
# in the same form as the "gnomad" values of the inputs.
variant_locus = ht.locus
variant_alleles = ht.alleles
ht_with_coords = ht.annotate(
    genomic_coordinates=hl.format(
        "%s-%s-%s-%s",
        variant_locus.contig[3:],  # Remove 'chr'
        hl.str(variant_locus.position),
        variant_alleles[0],
        variant_alleles[1],
    )
)

# Keyed join against the table built from the inputs. This is an inner join
# (the default, spelled out here), so inputs that are absent from the source
# table do not appear in the result.
ht_filtered = inp_ht.join(ht_with_coords, how="inner")
ht_filtered.genomic_coordinates.show()

# Persist the filtered table only when a destination has been configured.
if inputs_ht_destination_url:
    ht_filtered.write(inputs_ht_destination_url, overwrite=True)

In [None]:
# variant = ht_filtered.genomic_coordinates.take(1)[0]
# print(variant)

In [None]:
# # Patch gnomad_gks using code copied from local development environment

# # need these references for monkey patch of gnomad_gks
# from typing import Union
# from gnomad.resources.grch38.gnomad import coverage, POPS
# from gnomad.sample_qc.ancestry import POP_NAMES

# def gnomad_gks(
#     version: str,
#     variant: str,
#     data_type: str = "genomes",
#     by_ancestry_group: bool = False,
#     by_sex: bool = False,
#     vrs_only: bool = False,
#     ht: Union[str,hl.Table] = None,
# ) -> dict:
#     """
#     Call get_gks() and return VRS information and frequency information for the specified gnomAD release version and variant.

#     :param version: String of version of gnomAD release to use.
#     :param variant: String of variant to search for (chromosome, position, ref, and alt, separated by '-'). Example for a variant in build GRCh38: "chr5-38258681-C-T".
#     :param data_type: String of either "exomes" or "genomes" for the type of reads that are desired.
#     :param by_ancestry_group: Boolean to pass to obtain frequency information for each ancestry group in the desired gnomAD version.
#     :param by_sex: Boolean to pass if want to return frequency information for each ancestry group split by chromosomal sex.
#     :param vrs_only: Boolean to pass if only want VRS information returned (will not include allele frequency information).
#     :param ht: Path of a Hail Table, or an already-loaded Hail Table, to parse if different from what the public_release() method would return for the version.
#     :return: Dictionary containing VRS information (and frequency information split by ancestry groups and sex if desired) for the specified variant.

#     """
#     # If ht is not already a table,
#     # read in gnomAD release table to filter to chosen variant.
#     if not isinstance(ht, hl.Table):
#         if ht:
#             ht = hl.read_table(ht)
#         else:
#             ht = hl.read_table(public_release(data_type).versions[version].path)

#     high_level_version = f"v{version.split('.')[0]}"

#     # Read coverage statistics.

#     if high_level_version == "v3":
#         coverage_version = "3.0.1"
#     else:
#         raise NotImplementedError(
#             "gnomad_gks() is currently only implemented for gnomAD v3."
#         )

#     coverage_ht = hl.read_table(coverage(data_type).versions[coverage_version].path)

#     # Retrieve ancestry groups from the imported POPS dictionary.
#     pops_list = list(POPS[high_level_version]) if by_ancestry_group else None

#     # Throw warnings if contradictory arguments passed.
#     if by_ancestry_group and vrs_only:
#         logger.warning(
#             "Both 'vrs_only' and 'by_ancestry_groups' have been specified. Ignoring"
#             " 'by_ancestry_groups' list and returning only VRS information."
#         )
#     elif by_sex and not by_ancestry_group:
#         logger.warning(
#             "Splitting whole database by sex is not yet supported. If using 'by_sex',"
#             " please also specify 'by_ancestry_group' to stratify by."
#         )

#     # Call and return get_gks() for chosen arguments.
#     gks_info = get_gks(
#         ht=ht,
#         variant=variant,
#         label_name="gnomAD",
#         label_version=version,
#         coverage_ht=coverage_ht,
#         ancestry_groups=pops_list,
#         ancestry_groups_dict=POP_NAMES,
#         by_sex=by_sex,
#         vrs_only=vrs_only,
#     )

#     return gks_info
# gnomad.resources.grch38.gnomad.gnomad_gks = gnomad_gks

In [None]:
# Parameters for gnomad_gks/get_gks

# Import the submodule explicitly: `import gnomad` alone does not load
# gnomad.sample_qc.ancestry, so the attribute access below would otherwise
# only work if another module happened to import it first.
import gnomad.sample_qc.ancestry

# Short ancestry group identifiers for gnomAD v3.
ancestry_group_short_names = gnomad.resources.grch38.gnomad.POPS["v3"]
# Lookup of ancestry group names, passed to get_gks as ancestry_groups_dict.
ancestry_groups_full_name_map = gnomad.sample_qc.ancestry.POP_NAMES
# Version string passed to get_gks as label_version.
# NOTE(review): the inputs cell is labelled gnomAD 3.1.2 while this label is
# 3.1.4 - confirm which version is intended.
gnomad_version_label = "3.1.4"

In [None]:
# Test one variant in gnomad_gks
# variant_gks = gnomad.resources.grch38.gnomad.gnomad_gks(
#     version="3.1.4",
#     variant="chr1-55051215-G-GA",
#     data_type="genomes",
#     by_ancestry_group=True,
#     by_sex=True,
#     vrs_only=False,
#     ht=ht_filtered
# )

In [None]:
# Test one variant in get_gks
# variant_gks = get_gks(
#     ht=ht_filtered,
#     variant="chr1-55051215-G-GA",
#     label_name="gnomAD",
#     label_version=gnomad_version_label,
#     coverage_ht=None,
#     ancestry_groups=ancestry_group_short_names,
#     ancestry_groups_dict=ancestry_groups_full_name_map,
#     by_sex=False,
#     vrs_only=False
# )

In [None]:
# json.dumps(variant_gks)

In [None]:
# Run get_gks once per filtered variant and collect the returned annotations.
gks_annotations = []

# Arguments that are identical for every variant.
shared_gks_kwargs = dict(
    ht=ht_filtered,
    label_name="gnomAD",
    label_version=gnomad_version_label,
    coverage_ht=None,
    ancestry_groups=ancestry_group_short_names,
    ancestry_groups_dict=ancestry_groups_full_name_map,
    by_sex=False,
    vrs_only=False,
)

variant_strs = ht_filtered.genomic_coordinates.collect()
print(variant_strs)

for coords in variant_strs:
    print("calling get_gks on: " + coords)
    # genomic_coordinates had the "chr" prefix removed; get_gks is given it back.
    gks_annotations.append(get_gks(variant="chr" + coords, **shared_gks_kwargs))

In [None]:
import jsonschema

# Check each generated annotation against the cohortAlleleFreq schema loaded
# earlier; the first annotation that does not conform raises and stops the loop.
for annotation in gks_annotations:
    jsonschema.validate(instance=annotation, schema=schema)


In [None]:
import os
import hailtop.fs as hlfs

# Write the annotations to a file in the user's home directory, one JSON
# document per line.
homedir = os.path.expanduser("~")
local_ndjson_path = homedir + "/gnomad_gks_annotations.ndjson"

with open(local_ndjson_path, "w") as f:
    for annotation in gks_annotations:
        f.write(json.dumps(annotation) + "\n")

# Copy the local file to the configured destination, if one is set.
if outputs_destination_file:
    hlfs.copy("file://" + local_ndjson_path, outputs_destination_file)
    print("Copied annotations to: " + outputs_destination_file)