This notebook demonstrates the mechanics of translating an HGVS expression to a VR representation for educational purposes. Users who wish to translate HGVS or other expressions routinely should use ga4gh.vr.extras.translator.

In [1]:
# We'll translate this expression to VR:
hgvs_expr = "NC_000013.11:g.32936732G>C"

In [2]:
import json
def dj(o):
    print(json.dumps(o.as_dict(), indent=2, sort_keys=True))

In [3]:
# 1. Translate the HGVS expression manually

from ga4gh.vr import models

allele = models.Allele(
    location = models.SequenceLocation(
        sequence_id = "refseq:NC_000013.11",
        interval = models.SimpleInterval(
            start = 32936731,
            end = 32936732
        )
    ),
    state = models.SequenceState(
        sequence = "C"
    )
)

allele.as_dict()

{'location': {'interval': {'end': 32936732,
   'start': 32936731,
   'type': 'SimpleInterval'},
  'sequence_id': 'refseq:NC_000013.11',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [4]:
# 2. Replace the RefSeq sequence with a GA4GH sequence id
# Implementations choose how to provide sequence and sequence accession services
# The following uses the seqrepo REST interface (https://github.com/biocommons/seqrepo-rest-service/)

from ga4gh.vr.extras.dataproxy import SeqRepoRESTDataProxy
seqrepo_rest_service_url = "http://localhost:5000/seqrepo"
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

# In general, one identifier may be related to many others in another namespace
# Therefore, translate_sequence_identifier() returns a list.
# Because there will be only 1 ga4gh sequence digest, we choose the first
# and then replace the sequence id in allele.location.

refseq_ir = str(allele.location.sequence_id)
ga4gh_ir = dp.translate_sequence_identifier(refseq_ir, "ga4gh")[0]
allele.location.sequence_id = ga4gh_ir
allele.as_dict()

{'location': {'interval': {'end': 32936732,
   'start': 32936731,
   'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [5]:
# Sidebar: serialization
from ga4gh.core import ga4gh_serialize
ga4gh_serialize(allele)

b'{"location":"v9K0mcjQVugxTDIcdi7GBJ_R6fZ1lsYq","state":{"sequence":"C","type":"SequenceState"},"type":"Allele"}'

In [6]:
import hashlib, base64
blob = ga4gh_serialize(allele)
base64.urlsafe_b64encode(hashlib.sha512(blob).digest()[:24])

b'n9ax-9x6gOC0OEt73VMYqCBfqfxG1XUH'

In [7]:
# 3. Generated the computed identifier
# ga4gh_identify() serializes the object and computes the identifier
# (ga4gh_serialize and ga4gh_digest are called internally)

from ga4gh.core import ga4gh_identify
ga4gh_identify(allele)

'ga4gh:VA.n9ax-9x6gOC0OEt73VMYqCBfqfxG1XUH'

In [8]:
allele._id = ga4gh_identify(allele)
dj(allele)

{
  "_id": "ga4gh:VA.n9ax-9x6gOC0OEt73VMYqCBfqfxG1XUH",
  "location": {
    "interval": {
      "end": 32936732,
      "start": 32936731,
      "type": "SimpleInterval"
    },
    "sequence_id": "ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "C",
    "type": "SequenceState"
  },
  "type": "Allele"
}


In [9]:
allele_d = allele.as_dict()
allele_d["id"] = ga4gh_identify(allele)
allele_d

{'_id': 'ga4gh:VA.n9ax-9x6gOC0OEt73VMYqCBfqfxG1XUH',
 'location': {'interval': {'end': 32936732,
   'start': 32936731,
   'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele',
 'id': 'ga4gh:VA.n9ax-9x6gOC0OEt73VMYqCBfqfxG1XUH'}

# Using ga4gh.vr.extras.translator

The VR Translator imports HGVS, SPDI, Beacon, and VCF formats, and appropriate handles more complex cases than shown above.

In [19]:
from ga4gh.vr.extras.translator import Translator
tlr = Translator(data_proxy=dp)

In [26]:
# Translate HGVS → VR Allele
allele = tlr.from_hgvs(hgvs_expr)
allele.as_dict()

{'location': {'interval': {'end': 32936735,
   'start': 32936731,
   'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [33]:
# And translate VR Allele → HGVS
# Because a GA4GH sequence identifier may have many aliases, we return a list
# of HGVS expressions for each of the aliases
tlr.to_hgvs(allele)

['a5437debe2ef9c9ef8f3ea2874ae1d82:g.32936732_32936735delinsC',
 'SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT:g.32936732_32936735delinsC',
 'GS__0wi-qoDrvram155UmcSC-zA5ZK4fpLT:g.32936732_32936735delinsC',
 'da80c1b72d32295f701e8ee083e06df9f3e08b9a:g.32936732_32936735delinsC',
 'ff4c22faaa03aefada9b5e795267120becc0e592b87e92d3:g.32936732_32936735delinsC',
 'CM000675.2:g.32936732_32936735delinsC',
 'NC_000013.11:g.32936732_32936735delinsC',
 '2oDBty0yKV9wHo7gg+Bt+fPgi5o:g.32936732_32936735delinsC']

In [34]:
# Most commonly, we'll want expressions from a well-known authority like RefSeq
# Again, there might in general be multiple `refseq` expressions
tlr.to_hgvs(allele, "refseq")

['NC_000013.11:g.32936732_32936735delinsC']

In [36]:
# GRC namespaces is handled as a special case: Because aliases are shared 
# between GRCh releases, they're shown only on request
tlr.to_hgvs(allele, "GRCh38")

['13:g.32936732_32936735delinsC', 'chr13:g.32936732_32936735delinsC']

In [40]:
# Round-trip test: HGVS → VR Allele → HGVS[]
for hgvs_expr in (
    "NC_000013.11:g.32936732C>G",
    "NC_000013.11:g.32936732_32936733del",
    "NC_000013.11:g.32936732_32936737del",
    "NC_000013.11:g.32936732_32936733insC",
    "NC_000013.11:g.32936732_32936733delinsC",
    "NC_000013.11:g.32936732_32936735delinsC",
):
    a = tlr.from_hgvs(hgvs_expr)
    he = tlr.to_hgvs(a, "refseq")
    chk = "✔" if hgvs_expr in he else "✘"
    print(f"{chk} {hgvs_expr}\n  → {ga4gh_identify(a)}\n  → {he}")

✔ NC_000013.11:g.32936732C>G
  → ga4gh:VA.gvCtR5KLdng5G31DwajXiH6S3Gjhm5fh
  → ['NC_000013.11:g.32936732C>G']
✔ NC_000013.11:g.32936732_32936733del
  → ga4gh:VA.yOoxi7-uUnJyn4QkQ23h6RJuT4Zqarow
  → ['NC_000013.11:g.32936732_32936733del']
✔ NC_000013.11:g.32936732_32936737del
  → ga4gh:VA.nJqbt_W7xV07irZ_F5mtsh5e5dkq9dBW
  → ['NC_000013.11:g.32936732_32936737del']
✔ NC_000013.11:g.32936732_32936733insC
  → ga4gh:VA.JEUN0DVx2gySgRhNDqlKYqehZxgKKlsY
  → ['NC_000013.11:g.32936732_32936733insC']
✔ NC_000013.11:g.32936732_32936733delinsC
  → ga4gh:VA.cT0SNJb9bxB_KIhu2s6j37ZbTWaU4ozJ
  → ['NC_000013.11:g.32936732_32936733delinsC']
✔ NC_000013.11:g.32936732_32936735delinsC
  → ga4gh:VA.6ZgsF2lSBqMKcGL-xV-SUSrwN_UQTndJ
  → ['NC_000013.11:g.32936732_32936735delinsC']
