In [1]:
from clinvar_gk_pilot.gcs import (
    _local_file_path_for,
    download_to_local_file,
    already_downloaded,
)

# Change the cwd from `notebooks` to the parent (the project root) so we download the files there.
# Only step up when the kernel is actually sitting in `notebooks`: this keeps the cell idempotent,
# so re-running it (or starting the kernel at the project root) does not climb further up the tree.
import os

if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir(os.path.dirname(os.getcwd()))

catvar_blob_uri = (
    "gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz"
)
scv_blob_uri = "gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz"


# Resolve the local path for each blob up front so it is available whether or not a
# download is needed on this run.
variation_local_file_path = _local_file_path_for(catvar_blob_uri)
if not already_downloaded(catvar_blob_uri):
    print(f"Downloading {catvar_blob_uri} to {variation_local_file_path}")
    dl_variation_local_file_path = download_to_local_file(catvar_blob_uri)

scv_local_file_path = _local_file_path_for(scv_blob_uri)
if not already_downloaded(scv_blob_uri):
    print(f"Downloading {scv_blob_uri} to {scv_local_file_path}")
    dl_scv_local_file_path = download_to_local_file(scv_blob_uri)

# The query cells below read from these local copies of the two files.
catvar_file = variation_local_file_path
scv_file = scv_local_file_path

2024-07-15 12:28:21 - gcs - INFO - Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz


Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz


2024-07-15 12:30:27 - gcs - INFO - Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz


Downloading gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz to buckets/clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz


In [2]:
def print_scvs(scvs):
    """
    Print the ID, classification label and condition label of each GKS-modeled
    ClinVar SCV record in `scvs`, followed by a blank line per record.

    Each record is read as `scv["id"]`, `scv["classification"]["label"]` and
    `scv["condition"]["label"]`.
    """
    for entry in scvs:
        # Resolve both labels before emitting anything for this record.
        class_label = entry["classification"]["label"]
        cond_label = entry["condition"]["label"]
        report = (
            f"SCV: {entry['id']} ",
            f"  Classification: {class_label}",
            f"  Condition: {cond_label}",
            "",  # trailing empty item yields the blank separator line
        )
        print(*report, sep="\n")

Our ClinVar datasets are available as both id-keyed JSON files and NDJSON files. For each format there is a variation file and an SCV file. The demos in this notebook use the NDJSON-formatted files. The records of the variation file are `CategoricalVariation` objects, and the records of the SCV file are `VariationPathogenicity` objects (a subclass of `Statement`).

In [3]:
import os
import gzip
import json

################################
# Query the SCV file for a VRS ID using vanilla Python
#
# - for a given ClinVar Variation ID, find the corresponding GA4GH CatVar record in the CatVar
#   file and find the SCVs which reference that variant in the SCV file
#
#   (NOTE: the SCV file also contains the full CatVar definition in the "variation" field, but
#    this example illustrates how to query across both files, since the SCV file can be
#    relationally normalized to extract that redundant entity and refer to the variant
#    by the CatVar ID as a foreign key)
#
# - print the SCV interpretations for that variant
#
################################
################################
# Inputs

################################
# A CanonicalAllele
## For searching based on the GKS Categorical Variation (Cat-VRS) ID
clinvar_id_canonicalallele = "2769522"
catvar_id_canonicalallele = f"clinvar:{clinvar_id_canonicalallele}"
## For searching based on the GA4GH VRS Variation ID
vrs_id_canonicalallele = "ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9"


################################
# A CategoricalCnv
## For searching based on the GKS Categorical Variation (Cat-VRS) ID
clinvar_id_categoricalcnv = "599353"
catvar_id_categoricalcnv = f"clinvar:{clinvar_id_categoricalcnv}"
## For searching based on the GA4GH VRS Variation ID
vrs_id_categoricalcnv = "ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo"  # GRCh38 member

################################
# Fail fast, naming the missing path, if the input files are not on disk yet.
assert os.path.exists(catvar_file), f"CatVar file not found: {catvar_file}"
assert os.path.exists(scv_file), f"SCV file not found: {scv_file}"

In [4]:
################################
# Query the SCV file for the matching VRS ID
################################


def query_scvs_by_vrs_id(vrs_id: str, scv_file_name: str):
    scvs = []
    # catvars = []
    with gzip.open(scv_file_name, "rt") as f:
        for line in f:
            record = json.loads(line)
            variation = record["variation"]
            processing_errors = [
                e
                for e in variation.get("extensions", [])
                if e["name"] == "vrs processing errors"
            ]
            if len(processing_errors) > 0:
                # print(f"Skipping SCV record with VRS processing errors: {line}")
                continue

            match variation["type"]:
                case "CategoricalCnv":
                    if "members" not in variation:
                        # Unsupported?
                        # e.g. "clinvar:1878325"
                        # "NC_000018.9:g.(48556994_48573289)_48573471dup"
                        # raise ValueError(f"CategoricalCnv missing members field: {line}")
                        continue
                    members = variation["members"]
                    member_vrs_ids = [m["id"] for m in members]
                    if vrs_id in member_vrs_ids:
                        scvs.append(record)

                case "CanonicalAllele":
                    if "definingContext" not in variation:
                        # Unsupported allele type?
                        # e.g. clinvar:215984
                        # "NM_000251.2(MSH2):c.212-?_366+?dup"
                        # raise ValueError(f"CanonicalAllele missing definingContext field: {line}")
                        continue
                    if variation["definingContext"]["id"] == vrs_id:
                        scvs.append(record)
                case "DescribedVariation":
                    # not an error in processing, but does not have any VRS IDs
                    continue
                    # raise ValueError(f"DescribedVariation not yet implemented: {line}")
                case _:
                    raise ValueError(
                        f"Unexpected variation type ({variation['type']}): {line}"
                    )
    return scvs

In [5]:
################################
# Query the SCV file for the matching CatVar ID
################################


def query_scvs_by_catvar_id(catvar_id: str, scv_file_name: str) -> list[dict]:
    """
    Collect the SCV records in a gzipped NDJSON file whose variation has ID `catvar_id`.

    A record matches when `record["variation"]["id"]` equals `catvar_id`.
    Blank lines in the file are ignored.

    Args:
        catvar_id: The variation ID to search for (e.g. "clinvar:599353").
        scv_file_name: Path to the gzip-compressed NDJSON SCV file.

    Returns:
        The matching records (parsed JSON objects), in file order.
    """
    scvs = []
    # JSON text is UTF-8; pin the encoding instead of relying on the platform default.
    with gzip.open(scv_file_name, "rt", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            record = json.loads(line)
            if record["variation"]["id"] == catvar_id:
                scvs.append(record)

    return scvs

In [6]:
# Look up the SCVs for the example CanonicalAllele by its VRS ID, then print their core fields.
scvs_by_vrs_id_canonicalallele = query_scvs_by_vrs_id(vrs_id_canonicalallele, scv_file)

print_scvs(scvs_by_vrs_id_canonicalallele)

SCV: SCV004334569.1 
  Classification: Likely benign
  Condition: BAP1-related tumor predisposition syndrome



In [7]:
# Look up the SCVs for the example CategoricalCnv by the VRS ID of one of its members,
# then print their core fields.
scvs_by_vrs_id_categoricalcnv = query_scvs_by_vrs_id(vrs_id_categoricalcnv, scv_file)

print_scvs(scvs_by_vrs_id_categoricalcnv)

SCV: SCV000864190.1 
  Classification: Likely pathogenic
  Condition: Squalene synthase deficiency



In [8]:
# Look up the SCVs for the example CanonicalAllele by its CatVar ID ("clinvar:<variation id>"),
# then print their core fields.
scvs_by_catvar_id_canonicalallele = query_scvs_by_catvar_id(
    catvar_id_canonicalallele, scv_file
)

print_scvs(scvs_by_catvar_id_canonicalallele)

SCV: SCV004334569.1 
  Classification: Likely benign
  Condition: BAP1-related tumor predisposition syndrome



In [9]:
# Look up the SCVs for the example CategoricalCnv by its CatVar ID ("clinvar:<variation id>"),
# then print their core fields.
scvs_by_catvar_id_categoricalcnv = query_scvs_by_catvar_id(
    catvar_id_categoricalcnv, scv_file
)

print_scvs(scvs_by_catvar_id_categoricalcnv)

SCV: SCV000864190.1 
  Classification: Likely pathogenic
  Condition: Squalene synthase deficiency

