In [1]:
from ga4gh.core import sha512t24u
from ga4gh.vrs import __version__, ga4gh_digest, ga4gh_identify, ga4gh_serialize, models, normalize
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs.extras.clinvarparser import clinvar_open

# Requires that a seqrepo REST service is running at this URL (e.g., using the docker image)
seqrepo_rest_service_url = "http://localhost:5000/seqrepo"

# This is an excerpt of data from
# ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2019-04.xml.gz
clinvar_fn = "ClinVarVariationRelease_2019-04-excerpt.xml.gz"

In [2]:
# Data proxy that talks to the seqrepo REST service at seqrepo_rest_service_url
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)
# Translator that uses the proxy above as its data source
tlr = Translator(data_proxy=dp)

In [9]:
def convert1(v):
    """Convert one variant expression string into a VRS object.

    The string is first passed to ``tlr.from_hgvs``. If that call raises
    any ordinary exception, the input is wrapped in ``models.Text`` so that
    the caller always gets an object back.

    Args:
        v: the variant expression, e.g. an HGVS string.

    Returns:
        Whatever ``tlr.from_hgvs(v)`` returns on success, otherwise
        ``models.Text(definition=v)``.
    """
    try:
        return tlr.from_hgvs(v)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate; otherwise
        # interrupting a long conversion loop would be silently turned
        # into a Text fallback.
        return models.Text(definition=v)

In [10]:
# A parseable HGVS expression is translated into an Allele
convert1("NC_000013.11:g.32936732G>C").as_dict()

{'location': {'interval': {'end': 32936732,
   'start': 32936731,
   'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [12]:
# A string that cannot be translated falls back to a Text object
convert1("bogus").as_dict()

{'definition': 'bogus', 'type': 'Text'}

In [13]:
# Open the ClinVar excerpt and pull the first item from the resulting stream
cvstream = clinvar_open(clinvar_fn)
va = next(cvstream)

In [14]:
va.acv

'VCV000242445.1'

In [15]:
va.hgvs_expressions

['NC_000023.11:g.149483005T>A',
 'NC_000023.10:g.148564536T>A',
 'NM_000202.7:c.1394A>T',
 'NP_000193.1:p.Gln465Leu',
 'NG_011900.3:g.27330A>T']

In [18]:
# Collect every HGVS expression of this record into one DiscreteVariationSet;
# each expression goes through convert1 first.
dvs = models.DiscreteVariationSet(
    members=[convert1(expr) for expr in va.hgvs_expressions],
)

In [19]:
dvs.as_dict()

{'members': [{'location': {'interval': {'end': 149483005,
     'start': 149483004,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'A', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 148564536,
     'start': 148564535,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'A', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 1394,
     'start': 1393,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.qLuwvoQyo4EGQLFFmvmKqLJfcNkU5Es-',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'T', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 465,
     'start': 464,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.7SMuBtw_QpEk8CEj2bDLadv_NPVfbaFS',
    't

In [31]:
# Build one DiscreteVariationSet per ClinVar record and index it two ways:
#   vcv_to_id:      ClinVar accession (va.acv)  -> identifier from ga4gh_identify
#   variation_sets: identifier from ga4gh_identify -> DiscreteVariationSet
vcv_to_id = {}
variation_sets = {}
for va in clinvar_open(clinvar_fn):
    vs = models.DiscreteVariationSet(
        members=[convert1(hgvs) for hgvs in va.hgvs_expressions]
    )
    # Named vs_id rather than `id` so the builtin id() is not shadowed in the
    # kernel namespace for every cell that runs afterwards.
    vs_id = ga4gh_identify(vs)
    vcv_to_id[va.acv] = vs_id
    variation_sets[vs_id] = vs

In [32]:
vcv_to_id

{'VCV000242445.1': 'ga4gh:DVS.AcbOlOhNq4IZ84QY2UvvYaID4zU1QLba',
 'VCV000242814.1': 'ga4gh:DVS.qiFbLE7zzmmTo3tcP6eq36TVj8IvcAEv',
 'VCV000264664.1': 'ga4gh:DVS.uM5PY6biqGNQ90OuCOYG_ZS3vU4xSVcv',
 'VCV000279615.1': 'ga4gh:DVS.SkVgVNGFNAhmvXibRmxLgEACHhG32-PF',
 'VCV000004605.1': 'ga4gh:DVS.TuJH_qB4ULozyf_jPowZrZ_0aYgNLV8W',
 'VCV000327870.1': 'ga4gh:DVS.3IMH6pcZhphfq1-zumzBhxXyHkI-VCue',
 'VCV000446183.1': 'ga4gh:DVS.96xvvm4oaxSrbUXIVSNwIjkiPhxP9f8O',
 'VCV000386079.2': 'ga4gh:DVS.feHxxM1Wv6ZWJdu-hqMaaCyB69ECpvJ9',
 'VCV000386924.2': 'ga4gh:DVS.TfobpGWTLSGr_0fZAKQwKPd1Kw68h__I',
 'VCV000426571.2': 'ga4gh:DVS.dmzF-fyPDdTAnV80elwCaDckiONnXgEh',
 'VCV000534103.1': 'ga4gh:DVS.LAeLQNcnKB6RW-hIkf-ghmovNuAugWMX',
 'VCV000575037.1': 'ga4gh:DVS.58_PK65hzRyTk6c0mIaSLo6XP62Xt_tD',
 'VCV000242430.1': 'ga4gh:DVS.9S_dKAQKiRadQDthwcK7XUbWr_XhAhy5',
 'VCV000351296.3': 'ga4gh:DVS.E1Z6qJIwzimH1eatdrEYffxQ7Ae67UNW',
 'VCV000582963.2': 'ga4gh:DVS.2p_2dBLZXsT8_oTNPWUyW_i0gNIhIAyo',
 'VCV000242590.1': 'ga4gh

In [34]:
variation_sets[vcv_to_id['VCV000242445.1']].as_dict()

{'members': [{'location': {'interval': {'end': 149483005,
     'start': 149483004,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'A', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 148564536,
     'start': 148564535,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.v7noePfnNpK8ghYXEqZ9NukMXW7YeNsm',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'A', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 1394,
     'start': 1393,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.qLuwvoQyo4EGQLFFmvmKqLJfcNkU5Es-',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'T', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 465,
     'start': 464,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.7SMuBtw_QpEk8CEj2bDLadv_NPVfbaFS',
    't