Notebook for creating CIViC TR submissions to ClinVar. 

Assumes you have a `.env` file in which `CLINVAR_API_KEY` is set to your ClinVar Submission API key.

In [1]:
import json
import logging
import os
import sys
from pathlib import Path

import requests

# Make the parent directory importable so the `utils` import below can be resolved
# (assumes `utils` lives one level above this notebook -- TODO confirm).
module_path = os.path.abspath(os.path.join("../"))
if module_path not in sys.path:
    sys.path.append(module_path)
from utils import dry_run_test_api  # noqa: E402

# Send all log records at DEBUG level and above to a local log file; the helper
# functions in this notebook log the reason a study is skipped at debug level.
logging.basicConfig(filename="gks-clinvar-civic.log", level=logging.DEBUG)

In [2]:
# Download the dereferenced CIViC studies snapshot and parse it as JSON.
# NOTE: despite the `s3` in the variable name, the URL points at the
# ga4gh/gk-pilot repository on raw.githubusercontent.com.
fn = "civic_studies_20240402.json"
civic_cdm_s3_url = f"https://raw.githubusercontent.com/ga4gh/gk-pilot/main/datasets/civic/dereferenced/{fn}"
response = requests.get(civic_cdm_s3_url, timeout=5)
# Fail loudly on an HTTP error status instead of trying to parse an error page
response.raise_for_status()
civic_gks_data = response.json()

In [3]:
all_civic_combo_studies = []
all_civic_ta_studies = []

for civic_gks in civic_gks_data:
  if civic_gks["therapeutic"]["type"] == "CombinationTherapy":
    all_civic_combo_studies.append(civic_gks)
  elif civic_gks["therapeutic"]["type"] == "TherapeuticAgent":
    all_civic_ta_studies.append(civic_gks)

In [4]:
# Number of combination therapy studies and therapeutic agent studies, in that order
len(all_civic_combo_studies), len(all_civic_ta_studies)

(99, 610)

In [5]:
def _get_condition_set_and_concept_id(study: dict, study_id: str) -> tuple[dict, str] | None:
  """Get normalized tumor type data from extensions

  :param study: GKS study
  :param study_id: ID for ``study``
  :return: Get condition set data and normalized concept ID if found. Otherwise, ``None``
  """
  tumor_type_exts = study["tumorType"]["extensions"]
  mondo_id = None
  tumor_type_normalized_id = None
  for ext in tumor_type_exts:
    if ext["name"] == "disease_normalizer_data":
      ext_val = ext["value"]
      mondo_id = ext_val["mondo_id"]
      tumor_type_normalized_id = ext_val["normalized_id"]
      break

  if not mondo_id:
    logging.debug("%s: no mondo_id", study_id)
    return None

  if not tumor_type_normalized_id:
    logging.debug("%s: no tumor_type_normalized_id", study_id)
    return None

  condition_set = {
    "condition": [{
      "db": "MONDO",
      "id": mondo_id
    }]
  }

  return condition_set, tumor_type_normalized_id

In [6]:
def _is_within_indication(reg_approval: dict, tumor_type_normalized_id: str) -> bool:
  """Determine if drug is FDA approved within indication

  Each coding found under ``has_indications`` -> ``mappings`` is rendered as
  ``"<system>:<code>"`` and compared against ``tumor_type_normalized_id``.

  :param reg_approval: therapy-normalizer extension for regulatory approval
  :param tumor_type_normalized_id: Normalized tumor type concept ID from disease-normalizer
  :return: ``True`` if drug is FDA approved within indication. Otherwise, ``False``
  """
  for ind in reg_approval.get("has_indications") or []:
    for mapping in ind.get("mappings") or []:
      coding = mapping.get("coding")
      if coding and f"{coding['system']}:{coding['code']}" == tumor_type_normalized_id:
        # Stop at the first match: a ``break`` here would only leave the inner
        # loop and keep scanning the remaining indications for nothing
        return True
  return False

In [7]:
def _get_ta_ind_and_drug_label(therapeutic: dict, tumor_type_normalized_id: str, study_id: str) -> tuple[bool | None, str | None]:
  """Get indication and drug label information for therapeutic agent

  :param therapeutic: Therapeutic data
  :param tumor_type_normalized_id: Normalized tumor type concept ID from disease-normalizer
  :param study_id: ID of study
  :return: Tuple containing indication and drug label information. Or tuple containing
    ``None`` values if no regulatory approval or drug label information is found
  """
  drug_for_ta = None
  reg_approval = None
  therapy_exts = therapeutic["extensions"]

  for ext in therapy_exts:
    if ext["name"] == "therapy_normalizer_data":
      drug_for_ta = ext["value"]["label"]
    elif ext["name"] == "regulatory_approval":
      reg_approval = ext["value"]

  if not drug_for_ta:
    logging.debug("%s: no drug_for_ta", study_id)
    return None, None

  if not reg_approval:
    logging.debug("%s: no regulatory_approval", study_id)
    return None, None

  within_indication = _is_within_indication(reg_approval, tumor_type_normalized_id)

  return within_indication, drug_for_ta

In [8]:
def _get_ind_and_drug_label(study: dict, study_id: str, tumor_type_normalized_id: str) -> tuple[bool, str] | None:
  """Get indication and drug label information for therapeutic procedure

  :param study: GKS study
  :param study_id: ID for ``study``
  :param tumor_type_normalized_id: Normalized tumor type concept ID from disease-normalizer
  :return: Tuple containing indication and drug label if all components found. Otherwise,
    ``None``
  """
  therapeutic = study["therapeutic"]
  drug_for_ta = None
  within_indication = None

  if therapeutic["type"] == "TherapeuticAgent":
    within_indication, drug_for_ta = _get_ta_ind_and_drug_label(
      therapeutic, tumor_type_normalized_id, study_id
    )
  elif therapeutic["type"] == "CombinationTherapy":
    drug_for_ta_list = []
    within_indication_list = []

    for comp in therapeutic["components"]:
      _within_indication, _drug_for_ta = _get_ta_ind_and_drug_label(
        comp, tumor_type_normalized_id, study_id
      )
      if not _drug_for_ta:
        return None

      drug_for_ta_list.append(_drug_for_ta)
      within_indication_list.append(_within_indication)

    drug_for_ta = ";".join(drug_for_ta_list)
    within_indication = all(within_indication_list)

  if not drug_for_ta:
    return None

  return within_indication, drug_for_ta

In [9]:
def _get_clinical_impact_class_descr(vicc_concept_label: str, within_indication: bool, study_id: str) -> str:
  """Get clinical impact description given vicc concept label

  :param vicc_concept_label: VICC concept label
  :param within_indication: Whether drug is FDA approved within indication
  :param study_id: ID of GKS study
  :return: ClinVar clinical impact description
  """
  clinical_impact_class_desc = None
  if vicc_concept_label in {"authoritative evidence", "FDA recognized evidence"}:
    if within_indication:
      clinical_impact_class_desc = "Tier I - Strong"
    else:
      clinical_impact_class_desc = "Tier II - Potential"
  elif vicc_concept_label in {"professional guideline evidence", "clinical cohort evidence"}:
    clinical_impact_class_desc = "Tier I - Strong"
  elif vicc_concept_label in {"case study evidence", "observational study evidence", "interventional study evidence", "preclinical evidence"}:
    clinical_impact_class_desc = "Tier II - Potential"
  else:
    logging.debug("%s: no clinical_impact_class_desc", study_id)
  return clinical_impact_class_desc

In [10]:
def _get_citations(study: dict, study_id: str) -> list[dict]:
  """Get PubMed citation data for a study

  Only documents in ``isReportedIn`` that carry a non-empty ``pmid`` contribute
  a citation.

  :param study: GKS study
  :param study_id: ID for ``study``. Only used in log messages
  :return: List of PubMed citations (PMID rendered as a string). Empty list, with
    a debug log entry, if no document has a PMID
  """
  pubmed_citations = [
    {"db": "PubMed", "id": str(document["pmid"])}
    for document in study["isReportedIn"]
    if document.get("pmid")
  ]
  if not pubmed_citations:
    logging.debug("%s: no citations", study_id)
  return pubmed_citations


In [11]:
def _get_assertion_type_for_clinical_impact(study: dict, study_id: str) -> str | None:
  """Get assertion type for clinical impact

  :param study: GKS study
  :param study_id: ID for ``study``
  :return: Assertion type for clinical impact data if all components found. Otherwise,
    ``None``
  """
  assertion_type_for_clinical_impact = None
  if study["predicate"] == "predictsSensitivityTo":
    assertion_type_for_clinical_impact = "therapeutic: sensitivity/response"
  elif study["predicate"] == "predictsResistanceTo":
    assertion_type_for_clinical_impact = "therapeutic: resistance"
  else:
    logging.debug("%s: no assertion_type_for_clinical_impact", study_id)

  return assertion_type_for_clinical_impact


In [12]:
def _get_variant_set(study: dict, study_id: str) -> dict:
  """Get variant set data

  The variant is described by its ``hgvs.c`` expression and the NCBI gene ID of
  the study's gene context. If several ``hgvs.c`` expressions or several
  ``ncbigene`` mappings are present, the last one of each is used.

  :param study: GKS study
  :param study_id: ID for ``study``. Only used in log messages
  :return: Variant set data if all components found. Otherwise, empty dict
  """
  hgvs_c = None
  for expr in study["variant"]["definingContext"].get("expressions") or []:
    if expr["syntax"] == "hgvs.c":
      hgvs_c = expr["value"]
  if not hgvs_c:
    logging.debug("%s: no hgvs_c", study_id)
    return {}

  ncbi_gene = None
  for mapping in study["qualifiers"]["geneContext"]["mappings"]:
    # ``partition`` instead of unpacking ``split(":")``: a code that does not
    # contain exactly one colon is simply not an ncbigene code, and must not
    # raise ValueError for an otherwise usable study
    source, _, code = mapping["coding"]["code"].partition(":")
    if source == "ncbigene":
      ncbi_gene = int(code)
  if not ncbi_gene:
    logging.debug("%s: no ncbi_gene", study_id)
    return {}

  return {
    "variant": [{
      "hgvs": hgvs_c,
      "gene": [{"id": ncbi_gene}]
    }]
  }

In [13]:
def create_clinvar_sub(study: dict) -> dict | None:
  """Create a ClinVar clinical impact submission for CIViC data

  Every component of the submission is required: as soon as one of them cannot
  be derived from ``study``, ``None`` is returned (the helper that could not find
  its data logs the reason at debug level).

  :param study: CIViC study in GKS format
  :return: ClinVar impact submission if able to create from study data. Otherwise,
    ``None``
  """
  study_id = study["id"]
  clinvar_impact_sub = {
    "recordStatus": "novel",
    "localID": study["variant"]["id"],
    "localKey": study_id,
    "observedIn": [{
      "alleleOrigin": "somatic",
      "affectedStatus": "yes",
      "collectionMethod": "curation"
    }]
  }

  # conditionSet
  condition_set_data = _get_condition_set_and_concept_id(study, study_id)
  if not condition_set_data:
    return None
  condition_set, tumor_type_normalized_id = condition_set_data
  clinvar_impact_sub["conditionSet"] = condition_set

  # drugForTherapeuticAssertion
  ind_and_drug_label_data = _get_ind_and_drug_label(study, study_id, tumor_type_normalized_id)
  if not ind_and_drug_label_data:
    return None
  within_indication, drug_for_ta = ind_and_drug_label_data

  # clinicalImpactClassificationDescription
  # ``or {}``: a study without a (non-null) ``strength`` must be skipped like any
  # other incomplete study instead of raising AttributeError on ``None.get``
  vicc_concept_label = (study.get("strength") or {}).get("label")
  clinical_impact_class_desc = _get_clinical_impact_class_descr(
    vicc_concept_label, within_indication, study_id
  )
  if not clinical_impact_class_desc:
    return None

  # citations
  citations = _get_citations(study, study_id)
  if not citations:
    return None

  # assertionTypeForClinicalImpact
  assertion_type_for_clinical_impact = _get_assertion_type_for_clinical_impact(study, study_id)
  if not assertion_type_for_clinical_impact:
    return None

  # variantSet
  variant_set = _get_variant_set(study, study_id)
  if not variant_set:
    return None
  clinvar_impact_sub["variantSet"] = variant_set

  clinvar_impact_sub["clinicalImpactClassification"] = {
    "clinicalImpactClassificationDescription": clinical_impact_class_desc,
    "assertionTypeForClinicalImpact": assertion_type_for_clinical_impact,
    "drugForTherapeuticAssertion": drug_for_ta,
    "comment": study["description"],
    "citation": citations
  }

  return clinvar_impact_sub

In [14]:
# Every submission that could be created, per therapeutic type
# (filled by ``add_tr_sub``)
valid_civic_ta_studies = []
valid_civic_combo_studies = []

# Small example payloads, also filled by ``add_tr_sub``
civic_eid_2997 = []  # submission(s) whose localKey is civic.eid:2997
combo_sub = []  # first combination therapy submission (other than civic.eid:2997)
batch_sub_mp12 = []  # submissions whose localID is civic.mpid:12, limited by the cutoff
mpids = set()  # localIDs already present in ``batch_sub``
batch_sub = []  # remaining submissions, at most one per localID, limited by the cutoff

In [15]:
def add_tr_sub(studies: list[dict], valid_studies: list[dict], is_combo: bool) -> None:
  """Add TR submissions to associated lists

  Every study that yields a submission is appended to ``valid_studies``. The
  submission is then offered to the module-level example lists, in this order
  (first match wins):

  1. ``civic_eid_2997``: the submission's ``localKey`` is ``civic.eid:2997``
  2. ``combo_sub``: ``is_combo`` is set and ``combo_sub`` is still empty
  3. ``batch_sub_mp12``: below the cutoff and ``localID`` is ``civic.mpid:12``
  4. ``batch_sub``: below the cutoff and ``localID`` not yet recorded in ``mpids``

  :param studies: Studies for therapeutic agent or combination therapy
  :param valid_studies: List of valid studies for therapeutic agent or combination
    therapy. This will be mutated
  :param is_combo: Whether ``studies`` are combination therapy studies. Selects the
    cutoff for the batch lists (5 if ``True``, otherwise 2)
  """
  cutoff = 5 if is_combo else 2

  for study in studies:
    sub = create_clinvar_sub(study)
    if not sub:
      continue

    valid_studies.append(sub)

    if sub["localKey"] == "civic.eid:2997":
      civic_eid_2997.append(sub)
    elif is_combo and not combo_sub:
      combo_sub.append(sub)
    elif len(batch_sub_mp12) < cutoff and sub["localID"] == "civic.mpid:12":
      batch_sub_mp12.append(sub)
    elif len(batch_sub) < cutoff and sub["localID"] not in mpids:
      mpids.add(sub["localID"])
      batch_sub.append(sub)

In [16]:
# Therapeutic agent studies are processed first, so they get first pick of the
# shared example lists (``batch_sub_mp12``, ``batch_sub``) before combination studies
add_tr_sub(all_civic_ta_studies, valid_civic_ta_studies, False)
add_tr_sub(all_civic_combo_studies, valid_civic_combo_studies, True)
# Number of combination therapy and therapeutic agent studies that produced a submission
len(valid_civic_combo_studies), len(valid_civic_ta_studies)

(71, 434)

In [17]:
# Validate each example payload with a dry run and write it to disk only if the
# dry run answers with status 204; any other status aborts with the response body.
for fn, sub_data in {
  "therapeutic_evidence.json": civic_eid_2997,
  "combination_therapy_evidence.json": combo_sub,
  "batch_mp12_therapy_evidence.json": batch_sub_mp12,
  "batch_therapeutic_evidence.json": batch_sub,
}.items():
  clin_impact_sub = {"clinicalImpactSubmission": sub_data}
  response = dry_run_test_api(clin_impact_sub)

  if response.status_code != 204:
    msg = f"{fn} is not valid: {response.json()}"
    raise Exception(msg)

  with Path(fn).open("w") as wf:
    wf.write(json.dumps(clin_impact_sub, indent=2))
    wf.write("\n")