# GA4GH VR Validation Tests
This notebook is used to craft the validation tests. The tests themselves live in `vr-schema/validation`.

In [1]:
from ga4gh.vr import ga4gh_digest, identify, models, serialize, __version__
__version__

'0.2.1.dev7+gff304e9'

In [2]:
import yaml
def filter_keys(o, exclude_keys):
    """Return a copy of `o` with every key in `exclude_keys` removed, recursively.

    Only dicts are filtered: each retained value is itself passed through
    `filter_keys` with the same `exclude_keys`. Any other value (str, int,
    list, None, ...) is returned unchanged.

    An explicit type check is used instead of a bare `except:` so that
    non-dict values are never rebuilt as dicts (e.g. "" or [] becoming {})
    and unrelated errors are not silently swallowed.
    """
    if not isinstance(o, dict):
        return o
    return {k: filter_keys(v, exclude_keys)
            for k, v in o.items()
            if k not in exclude_keys}
def dy(f, o):
    """Print a YAML validation-test stanza for applying `f` to the object `o`.

    The stanza is keyed by `o.type._value` and holds one entry with:
      * `in`:  `o.as_dict()`
      * `out`: the result of `f(o)`, keyed by `f.__name__`
    Every `id` key is stripped from the structure before it is dumped.

    `f` may return bytes or str: bytes-like results are decoded to text,
    anything else is emitted as-is. (An unconditional `.decode()` would raise
    AttributeError for functions that already return str.)
    """
    r = {"in": o.as_dict()}
    result = f(o)
    if isinstance(result, (bytes, bytearray)):
        result = result.decode()
    r["out"] = {f.__name__: result}
    # "-" is a placeholder key: after dumping, the quoted key is rewritten to a
    # bare "-" so the entry is printed as a list item under the type name.
    print(yaml.dump(filter_keys({o.type._value: {"-": r}}, ["id"])).replace("'-':","-"))

# Functions
## ga4gh_digest

In [11]:
# Digest of the empty byte string.
ga4gh_digest(b"")

'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'

In [12]:
# Digest of a short, non-empty byte string.
ga4gh_digest(b"ACGT")

'aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'

## translate_sequence_identifier

In [5]:
from ga4gh.vr.extras.dataproxy import SeqRepoRESTDataProxy

# Base URL of the seqrepo REST service. The service must already be running
# at this address for the data-proxy lookups in this notebook to work.
seqrepo_rest_service_url = "http://localhost:5000/seqrepo"

# Data proxy used for identifier translation, metadata and sequence lookups.
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

def translate_sequence_identifier(ir):
    """Translate the sequence identifier `ir` using the notebook-level data proxy `dp`.

    Thin convenience wrapper: the lookup is delegated to
    `dp.translate_sequence_identifier` and its result is returned unchanged.
    """
    translated = dp.translate_sequence_identifier(ir)
    return translated

In [7]:
# Translate a RefSeq accession through the data proxy.
translate_sequence_identifier("refseq:NC_000019.10")

2019-05-29 21:19:35 snafu ga4gh.vr.extras.dataproxy[25334] INFO Fetching http://localhost:5000/seqrepo/1/metadata/RefSeq:NC_000019.10


'ga4gh:SQIIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'

In [8]:
# Fetch the metadata (added date, aliases, ...) for a sequence by its ga4gh identifier.
dp.get_metadata('ga4gh:SQIIB53T8CNeJJdUqzn9V_JnRtQadwWCbl')

2019-05-29 21:20:15 snafu ga4gh.vr.extras.dataproxy[25334] INFO Fetching http://localhost:5000/seqrepo/1/metadata/VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl


{'added': '2016-08-24T08:19:02Z',
 'aliases': ['Ensembl-79:19',
  'Ensembl-80:19',
  'Ensembl-81:19',
  'Ensembl-82:19',
  'Ensembl-83:19',
  'Ensembl-84:19',
  'Ensembl-85:19',
  'GRCh38:19',
  'GRCh38:chr19',
  'GRCh38.p1:19',
  'GRCh38.p1:chr19',
  'GRCh38.p10:19',
  'GRCh38.p10:chr19',
  'GRCh38.p11:19',
  'GRCh38.p11:chr19',
  'GRCh38.p12:19',
  'GRCh38.p12:chr19',
  'GRCh38.p2:19',
  'GRCh38.p2:chr19',
  'GRCh38.p3:19',
  'GRCh38.p3:chr19',
  'GRCh38.p4:19',
  'GRCh38.p4:chr19',
  'GRCh38.p5:19',
  'GRCh38.p5:chr19',
  'GRCh38.p6:19',
  'GRCh38.p6:chr19',
  'GRCh38.p7:19',
  'GRCh38.p7:chr19',
  'GRCh38.p8:19',
  'GRCh38.p8:chr19',
  'GRCh38.p9:19',
  'GRCh38.p9:chr19',
  'MD5:b0eba2c7bb5c953d1e06a508b5e487de',
  'RefSeq:NC_000019.10',
  'SEGUID:AHxM5/L8jIX08UhBBkKXkiO5rhY',
  'SHA1:007c4ce7f2fc8c85f4f148410642979223b9ae16',
  'VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
  'ga4gh:SQIIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
  'TRUNC512:208079dd3f0235e249754ab39fd57f26746d41a7705826e5',
  '

# Models

In [3]:
# A SimpleInterval with start=42 and end=43; reused by later examples.
simple_interval = models.SimpleInterval(start=42, end=43)
simple_interval.as_dict()

{'end': 43, 'start': 42, 'type': 'SimpleInterval'}

In [4]:
# A NestedInterval is described by an inner and an outer SimpleInterval.
nested_interval = models.NestedInterval(
    inner=models.SimpleInterval(start=29, end=30),
    outer=models.SimpleInterval(start=30, end=39),
)
nested_interval.as_dict()

{'inner': {'end': 30, 'start': 29, 'type': 'SimpleInterval'},
 'outer': {'end': 39, 'start': 30, 'type': 'SimpleInterval'},
 'type': 'NestedInterval'}

In [5]:
# A SequenceLocation based on a SimpleInterval
sequence_location_si = models.SequenceLocation(
    sequence_id="NM_0001234.5",
    interval=simple_interval)
# Store the computed identifier on the object so that as_dict() includes it.
sequence_location_si.id = identify(sequence_location_si)
sequence_location_si.as_dict()

{'id': 'ga4gh:SL8KJJStVL_dJigtK_AHyVp5AAipy1pMh8',
 'interval': {'end': 43, 'start': 42, 'type': 'SimpleInterval'},
 'sequence_id': 'NM_0001234.5',
 'type': 'SequenceLocation'}

In [6]:
# A SequenceLocation based on a NestedInterval
sequence_location_ni = models.SequenceLocation(
    sequence_id="NM_0001234.5",
    interval=nested_interval)
# Store the computed identifier on the object so that as_dict() includes it.
sequence_location_ni.id = identify(sequence_location_ni)
sequence_location_ni.as_dict()

{'id': 'ga4gh:SLwezY6icAjjBfaJMJ_Wf6BUsoxoMcmRdS',
 'interval': {'inner': {'end': 30, 'start': 29, 'type': 'SimpleInterval'},
  'outer': {'end': 39, 'start': 30, 'type': 'SimpleInterval'},
  'type': 'NestedInterval'},
 'sequence_id': 'NM_0001234.5',
 'type': 'SequenceLocation'}

In [7]:
# A CytobandLocation on chr 11 running from q22.3 to q23.1, with its computed identifier attached.
cytoband_location = models.CytobandLocation(chr="11", start="q22.3", end="q23.1")
cytoband_location.id = identify(cytoband_location)
cytoband_location.as_dict()

{'chr': '11',
 'end': 'q23.1',
 'id': 'ga4gh:CLR2RiNOcD_3F-NNEQUrIst3M84LTsVQWF',
 'start': 'q22.3',
 'type': 'CytobandLocation'}

In [8]:
# A GeneLocation that refers to a gene by identifier, with its computed identifier attached.
gene_location = models.GeneLocation(gene="HGNC:MSH2")
gene_location.id = identify(gene_location)
gene_location.as_dict()

{'gene': 'HGNC:MSH2',
 'id': 'ga4gh:GLHUswIoUpNqPZa2rBwJR_32At9A3wnWJJ',
 'type': 'GeneLocation'}

In [9]:
# A Text variation defined by a free-text string; no identifier is assigned in this cell.
text_variation = models.Text(definition="PTEN loss")
text_variation.as_dict()

{'definition': 'PTEN loss', 'type': 'Text'}

In [10]:
# An Allele pairs a location (the SimpleInterval-based SequenceLocation)
# with a state (here the literal sequence "A").
sequence_state = models.SequenceState(sequence="A")
allele = models.Allele(location=sequence_location_si, state=sequence_state)
# Store the computed identifier on the object so that as_dict() includes it.
allele.id = identify(allele)
allele.as_dict()

{'id': 'ga4gh:VAV_LNMpD_7BfTlhBpvndEghNZlrQA7yXi',
 'location': {'id': 'ga4gh:SL8KJJStVL_dJigtK_AHyVp5AAipy1pMh8',
  'interval': {'end': 43, 'start': 42, 'type': 'SimpleInterval'},
  'sequence_id': 'NM_0001234.5',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'A', 'type': 'SequenceState'},
 'type': 'Allele'}

### serialize()

In [13]:
# This is the allele defined above. Notice that `location` is defined inline,
# i.e. as a complete nested object that carries its own `id`.
allele.as_dict()

{'id': 'ga4gh:VAV_LNMpD_7BfTlhBpvndEghNZlrQA7yXi',
 'location': {'id': 'ga4gh:SL8KJJStVL_dJigtK_AHyVp5AAipy1pMh8',
  'interval': {'end': 43, 'start': 42, 'type': 'SimpleInterval'},
  'sequence_id': 'NM_0001234.5',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'A', 'type': 'SequenceState'},
 'type': 'Allele'}

In [14]:
# This is the serialized form (a bytes value). Notice that the inline `Location`
# instance was replaced with its identifier and that the Allele id is not included.
serialize(allele)

b'{"location":"ga4gh:SL8KJJStVL_dJigtK_AHyVp5AAipy1pMh8","state":{"sequence":"A","type":"SequenceState"},"type":"Allele"}'

### identify()
VR computed identifiers are built from the digest of an object's serialized form: the digest is prefixed with the `ga4gh:` namespace and a type-specific code (for example, `VA` for an Allele).

In [15]:
# Applying ga4gh_digest to the serialized allele returns a base64url-encoded digest.
ga4gh_digest(serialize(allele))

'V_LNMpD_7BfTlhBpvndEghNZlrQA7yXi'

In [16]:
# identify() uses this digest to construct a CURIE-formatted identifier.
# The VA prefix identifies this object as a Variation Allele.
# The result matches the `id` stored on the allele when it was created.
identify(allele)

'ga4gh:VAV_LNMpD_7BfTlhBpvndEghNZlrQA7yXi'

In [18]:
# Metadata can also be requested by RefSeq accession.
dp.get_metadata("refseq:NM_000551.3")

2019-05-29 19:06:57 snafu ga4gh.vr.extras.dataproxy[2487] INFO Fetching http://localhost:5000/seqrepo/1/metadata/RefSeq:NM_000551.3


{'added': '2016-08-24T05:03:11Z',
 'aliases': ['MD5:215137b1973c1a5afcf86be7d999574a',
  'RefSeq:NM_000551.3',
  'SEGUID:T12L0p2X5E8DbnL0+SwI4Wc1S6g',
  'SHA1:4f5d8bd29d97e44f036e72f4f92c08e167354ba8',
  'VMC:GS_v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_',
  'ga4gh:SQv_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_',
  'TRUNC512:bff413735a7e31461d82b46fe0b313e81c9720eb1dc370bf',
  'gi:319655736'],
 'alphabet': 'ACGT',
 'length': 4560}

In [19]:
# Fetch only the slice start=0, end=50 of the sequence; the appended "..." just
# marks in the displayed string that the sequence continues.
dp.get_sequence("ga4gh:SQv_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_", start=0, end=50) + "..."

2019-05-29 19:06:57 snafu ga4gh.vr.extras.dataproxy[2487] INFO Fetching http://localhost:5000/seqrepo/1/sequence/VMC:GS_v_QTc1p-MUYdgrRv4LMT6ByXIOsdw3C_


'CCTCGCCTCCGTTACAACGGCCTACGGTGCTGGAGGATCCTTCTGCGCAC...'

In [20]:
from ga4gh.vr.extras.translator import Translator
# Translator for converting other variant notations into VR objects; it is
# given the data proxy `dp` defined earlier in this notebook.
tlr = Translator(data_proxy=dp)

2019-05-29 19:06:57 snafu hgvs[2487] INFO hgvs 1.3.0.post0; released: False


In [21]:
# from_hgvs: translate an HGVS expression.
# Note in the output that position 32936732 is represented as the
# interval start=32936731, end=32936732.
a = tlr.from_hgvs("NC_000013.11:g.32936732G>C")
a.as_dict()

2019-05-29 19:06:57 snafu ga4gh.vr.extras.translator[2487] INFO Creating  parser
2019-05-29 19:06:59 snafu ga4gh.vr.extras.dataproxy[2487] INFO Fetching http://localhost:5000/seqrepo/1/metadata/RefSeq:NC_000013.11


{'id': 'ga4gh:VAJBgSI1HBdpOYUNCWtRGwzhLtNrcdXAk8',
 'location': {'id': 'ga4gh:SL0FXQTd1CoM6ElQtD7qK1Ge6XGYhH6OZt',
  'interval': {'end': 32936732, 'start': 32936731, 'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ_0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [22]:
# from_beacon: translate from Beacon's "chr : position ref > alt" form.
# The displayed result has the same id as the HGVS translation of this variant.
a = tlr.from_beacon("13 : 32936732 G > C")
a.as_dict()

2019-05-29 19:06:59 snafu ga4gh.vr.extras.dataproxy[2487] INFO Fetching http://localhost:5000/seqrepo/1/metadata/GRCh38:13


{'id': 'ga4gh:VAJBgSI1HBdpOYUNCWtRGwzhLtNrcdXAk8',
 'location': {'id': 'ga4gh:SL0FXQTd1CoM6ElQtD7qK1Ge6XGYhH6OZt',
  'interval': {'end': 32936732, 'start': 32936731, 'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ_0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [23]:
# from_spdi: translate an SPDI expression.
# SPDI uses 0-based coordinates, so the position is written as 32936731 here.
a = tlr.from_spdi("NC_000013.11:32936731:1:C")
a.as_dict()

{'id': 'ga4gh:VAJBgSI1HBdpOYUNCWtRGwzhLtNrcdXAk8',
 'location': {'id': 'ga4gh:SL0FXQTd1CoM6ElQtD7qK1Ge6XGYhH6OZt',
  'interval': {'end': 32936732, 'start': 32936731, 'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ_0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [24]:
# from_vcf: translate a "chr-position-ref-alt" expression.
a = tlr.from_vcf("13-32936732-G-C")   # gnomAD-style expression
a.as_dict()

{'id': 'ga4gh:VAJBgSI1HBdpOYUNCWtRGwzhLtNrcdXAk8',
 'location': {'id': 'ga4gh:SL0FXQTd1CoM6ElQtD7qK1Ge6XGYhH6OZt',
  'interval': {'end': 32936732, 'start': 32936731, 'type': 'SimpleInterval'},
  'sequence_id': 'ga4gh:SQ_0wi-qoDrvram155UmcSC-zA5ZK4fpLT',
  'type': 'SequenceLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

# Validation examples

In [87]:
# Print a `serialize` validation stanza for each example object built earlier.
for o in [
    simple_interval,
    nested_interval,
    sequence_location_si,
    sequence_location_ni,
    cytoband_location,
    gene_location,
    allele,
]:
    dy(serialize, o)

SimpleInterval:
  -
    in:
      end: 43
      start: 42
      type: SimpleInterval
    out:
      serialize: '{"end":43,"start":42,"type":"SimpleInterval"}'

NestedInterval:
  -
    in:
      inner:
        end: 30
        start: 29
        type: SimpleInterval
      outer:
        end: 39
        start: 30
        type: SimpleInterval
      type: NestedInterval
    out:
      serialize: '{"inner":{"end":30,"start":29,"type":"SimpleInterval"},"outer":{"end":39,"start":30,"type":"SimpleInterval"},"type":"NestedInterval"}'

SequenceLocation:
  -
    in:
      interval:
        end: 43
        start: 42
        type: SimpleInterval
      sequence_id: NM_0001234.5
      type: SequenceLocation
    out:
      serialize: '{"interval":{"end":43,"start":42,"type":"SimpleInterval"},"sequence_id":"NM_0001234.5","type":"SequenceLocation"}'

SequenceLocation:
  -
    in:
      interval:
        inner:
          end: 30
          start: 29
          type: SimpleInterval
        outer:
          en