In [4]:
import duckdb
import gzip
import json
import os

from clinvar_gk_pilot.gcs import (
    _local_file_path_for,
    download_to_local_file,
    already_downloaded,
)

# variation_blob_uri = "gs://clingen-public/clinvar-gk-pilot/2024-04-07/dev/clinvar-variation-20240407.json.gz"
# scv_blob_uri = (
#     "gs://clingen-public/clinvar-gk-pilot/2024-04-07/dev/clinvar-scvs-20240407.json.gz"
# )

catvar_blob_uri = (
    "gs://clinvar-gk-pilot/2024-04-07/dev/combined-catvar_output.ndjson.gz"
)
scv_blob_uri = "gs://clinvar-gk-pilot/2024-04-07/dev/combined-scv_output.ndjson.gz"

catvar_file = "combined-catvar_output.ndjson.gz"


variation_local_file_path = _local_file_path_for(catvar_blob_uri)
if not already_downloaded(catvar_blob_uri):
    print(f"Downloading {catvar_blob_uri} to {variation_local_file_path}")
    dl_variation_local_file_path = download_to_local_file(catvar_blob_uri)
    assert dl_variation_local_file_path == variation_local_file_path

scv_local_file_path = _local_file_path_for(scv_blob_uri)
if not already_downloaded(scv_blob_uri):
    print(f"Downloading {scv_blob_uri} to {scv_local_file_path}")
    dl_scv_local_file_path = download_to_local_file(scv_blob_uri)
    assert dl_scv_local_file_path == scv_local_file_path

catvar_file = variation_local_file_path
scv_file = scv_local_file_path

Our ClinVar datasets are available as NDJSON files. There is a variation file and an SCV file. The records of the variation file are CategoricalVariation objects, and the records of the SCV file are `VariationPathogenicity` (sub-class of `Statement`)

In [5]:
################################
# Query the SCV file for a VRS ID using vanilla Python
#
# - for a given ClinVar Variation ID, find the corresponding GA4GH CatVar record in the CatVar
#   file and find the SCVs which reference that variant in the SCV file
#
#   (NOTE: the SCV file also contains the full CatVar definition in the "variation" field, but
#    this example illustrates how to query across both files, since the SCV file can be
#    relationally normalized to extract that redundant entity and refer to the variant
#    by the CatVar ID as a foreign key)
#
# - print the SCV interpretations for that variant
#
################################
################################
# Inputs

################################
# A CanonicalAllele
## For searching based on the GKS Categorical Variation (CatVrs) ID
clinvar_id = "563765"
## For searching based on the GA4GH VRS Variation ID
vrs_id = "ga4gh:VA.hf_va4AnlG99NuOjtaXJzh_XvszWWOO9"


################################
# A CategoricalCnv
## For searching based on the GKS Categorical Variation (CatVrs) ID
clinvar_id = "599353"
## For searching based on the GA4GH VRS Variation ID
vrs_id = "ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo"  # GRCh38 member


catvar_file = "combined-catvar_output.ndjson.gz"
scv_file = "combined-scv_output.ndjson.gz"
################################
assert os.path.exists(catvar_file)
assert os.path.exists(scv_file)
catvar_id = f"clinvar:{clinvar_id}"


While these can be read with vanilla python by iterating lines and parsing each as JSON, there are also libraries which can make querying the files simpler and potentially more performant.

One option is DuckDB. DuckDB achieves fast speeds and low memory utilization by memory mapping files and dropping rows from memory that don't match filter criteria. It also has the benefit of being able to query GZIP-compressed NDJSON files directly, so disk storage stays minimal, and presenting a SQL interface to the data, with full support of nested structs so we can access fields from nested JSON objects without manipulating the files. Another benefit is that it gracefully handles heterogeneous record schemas, automatically nulling values that don't exist in particular rows.

In [7]:
################################
# Query the SCV file for the matching VRS ID using DuckDB as a query
# engine to obtain the list of SCVs we are interested in.
################################


def query_scvs_by_vrs_id(vrs_id: str, scv_file: str):
    scvs = []

    sql = f"""
    SELECT id, definingContext_id, member.id as member_id, a
    FROM
    (
        SELECT
            id,
            variation.definingContext.id as definingContext_id,
            unnest(variation.members) as member,
            a
        FROM read_json('{scv_file}', format='newline_delimited', ignore_errors=true) a
    )
    WHERE definingContext_id = '{vrs_id}'
    OR member_id = '{vrs_id}'
    """

    sql = f"""
    SELECT id, definingContext_id, member_id, a
    FROM
    (
        SELECT
            id,
            variation.definingContext.id as definingContext_id,
            unnest(json_extract(variation, '$.members[*].id')) as member_id,
            a
        FROM read_json('{scv_file}', format='newline_delimited', ignore_errors=true) a
    )
    WHERE definingContext_id = '{vrs_id}'
    OR member_id = '{vrs_id}'
    """
    # LIMIT 10

    scv_ids = []
    results = duckdb.sql(sql)
    while batch := results.fetchmany(100):
        for row in batch:
            id, defining_context_id, member, rec = row
            d = {"id": id, "definingContext_id": defining_context_id, "member": member}
            print(d)
            scv_ids.append(row[0])
    print(f"Found {len(scv_ids)} SCVs for VRS id {vrs_id}")
    print(f"SCV IDs: {scv_ids}")

    # # Get the actual SCV records
    # with gzip.open(scv_file, "rt") as f:
    #     for line in f:
    #         # Do simple string contains check before parsing the line as JSON.
    #         # With a small set of scv_ids, this is significantly faster than
    #         # parsing every line as JSON first.
    #         for scv_id in scv_ids:
    #             if scv_id in line:
    #                 record = json.loads(line)
    #                 if record["id"] in scv_ids:
    #                     scvs.append(record)

    # # Get the actual SCV records with a second duckdb query
    # sql = f"""
    # SELECT *
    # FROM read_json('{scv_file}', format='newline_delimited', ignore_errors=true)
    # WHERE id IN ({','.join([f"'{id}'" for id in scv_ids])})
    # """
    # results = duckdb.sql(sql)
    # while batch := results.fetchmany(100):
    #     for row in batch:
    #         scvs.append(row)

    return scvs


scvs = query_scvs_by_vrs_id(vrs_id, scv_file)

ConversionException: Conversion Error: Malformed JSON at byte 0 of input: unexpected character.  Input: ga4gh:CX.5iqyOA4L5njh5FpymTPcwQ8oHTilQFmo

In [None]:
print(scvs)

In [None]:
for scv in scvs:
    classification = scv["classification"]["label"]
    condition = scv["condition"]["label"]
    print(f"SCV: {scv['id']} ")
    print(f"  Classification: {classification}")
    print(f"  Condition: {condition}")
    print()