# GA4GH VR Validation Tests
This notebook is used to *generate* the tests in vr-spec/validation.yaml

In [1]:
from ga4gh.core import ga4gh_digest, ga4gh_identify, ga4gh_serialize, sha512t24u
from ga4gh.vr import __version__, models, normalize
__version__

'0.2.2.dev8+gfb89bc2.d20190731'

In [30]:
import json
import yaml
def filter_dict(d):
    """Return a copy of `d` with "private" keys removed, recursively.

    Dict entries whose key is a string starting with "_" are dropped and the
    remaining values are filtered the same way. Lists are filtered
    element-wise. Any other value (str, bytes, int, None, ...) is returned
    unchanged.

    The container type is checked explicitly instead of iterating blindly
    under a bare ``except``: the duck-typed form turned "", "_" and [] into
    {} (iterating them yields no surviving keys), left dicts nested inside
    lists unfiltered, and hid unrelated errors.
    """
    if isinstance(d, dict):
        return {k: filter_dict(v)
                for k, v in d.items()
                # non-string keys cannot be "private"; keep them
                if not (isinstance(k, str) and k.startswith("_"))}
    if isinstance(d, list):
        return [filter_dict(v) for v in d]
    return d
def as_str(s):
    """Return `s` as text: a str is passed through, anything else is decoded via `.decode()`."""
    if isinstance(s, str):
        return s
    return s.decode()
def dj(o):
    """Print a VR object as pretty-formatted JSON.

    The object's `as_dict()` form is passed through `filter_dict` and then
    dumped with sorted keys and a 2-space indent. Nothing is returned.
    """
    public_fields = filter_dict(o.as_dict())
    rendered = json.dumps(public_fields, indent=2, sort_keys=True)
    print(rendered)
def dy(fns, o):
    """Print a YAML block describing one validation test for `o`.

    The block is keyed by `o.type._value` and holds a single entry with
    "in" (`o.as_dict()`) and "out" (each function in `fns` applied to `o`,
    keyed by the function's `__name__` and converted with `as_str`).
    The whole structure is passed through `filter_dict` before dumping.
    Nothing is returned; the YAML text is written to stdout.
    """
    test_input = o.as_dict()
    expected = {}
    for fn in fns:
        expected[fn.__name__] = as_str(fn(o))
    case = {"in": test_input, "out": expected}
    document = filter_dict({o.type._value: {"-": case}})
    # The placeholder key is dumped as the quoted `'-':`; rewriting it to `-`
    # gives the list-item look in the printed block.
    print(yaml.dump(document).replace("'-':","-"))

----
# Models

## Locations

### SimpleInterval

In [31]:
# Shared example: reused below by sequence_location_si and by the allele rebuilt
# in the digest-serialization section.
simple_interval = models.SimpleInterval(start=44908821, end=44908822)
dj(simple_interval)

{
  "end": 44908822,
  "start": 44908821,
  "type": "SimpleInterval"
}


### NestedInterval

In [32]:
# inner evaluates to start=44908796, end=44908821 and outer to start=44908822,
# end=44908847 (offsets of 25 from the SimpleInterval example's start and end).
# NOTE(review): as written, `outer` starts after `inner` ends, so the two ranges
# are disjoint rather than one enclosing the other — confirm this is the
# intended NestedInterval example.
nested_interval = models.NestedInterval(
    inner=models.SimpleInterval(start=44908821-25,end=44908821),
    outer=models.SimpleInterval(start=44908822,end=44908822+25))
dj(nested_interval)

{
  "inner": {
    "end": 44908821,
    "start": 44908796,
    "type": "SimpleInterval"
  },
  "outer": {
    "end": 44908847,
    "start": 44908822,
    "type": "SimpleInterval"
  },
  "type": "NestedInterval"
}


### SequenceLocation

In [26]:
# A SequenceLocation based on a SimpleInterval
# (reused below as the location of the Allele example)
sequence_location_si = models.SequenceLocation(
    sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    interval=simple_interval)
dj(sequence_location_si)

{
  "interval": {
    "end": 44908822,
    "start": 44908821,
    "type": "SimpleInterval"
  },
  "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
  "type": "SequenceLocation"
}


In [5]:
# A SequenceLocation based on a NestedInterval
# NOTE(review): unlike the other model cells, this one displays the raw
# as_dict() result (Python repr) instead of calling dj(); consider
# dj(sequence_location_ni) for consistent JSON output.
sequence_location_ni = models.SequenceLocation(sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", 
                                               interval=nested_interval)
sequence_location_ni.as_dict()

{'interval': {'inner': {'end': 44908821,
   'start': 44908796,
   'type': 'SimpleInterval'},
  'outer': {'end': 44908847, 'start': 44908822, 'type': 'SimpleInterval'},
  'type': 'NestedInterval'},
 'sequence_id': 'ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
 'type': 'SequenceLocation'}

## State

### SequenceState

In [28]:
# Standalone SequenceState example. The Allele cells below construct their own
# SequenceState(sequence="T") rather than reusing this variable.
sequence_state = models.SequenceState(sequence="T")
dj(sequence_state)

{
  "sequence": "T",
  "type": "SequenceState"
}


## Variation

### Text

In [33]:
# Text variation example. `text_variation` is not passed to dy() in the visible
# part of the validation cell at the end of this notebook.
text_variation = models.Text(definition="APOE loss")
dj(text_variation)

{
  "definition": "APOE loss",
  "type": "Text"
}


### Allele

In [34]:
# Allele built from sequence_location_si (the SimpleInterval-based location
# defined above) and a SequenceState of "T".
allele = models.Allele(location=sequence_location_si,
                       state=models.SequenceState(sequence="T"))
dj(allele)

{
  "location": {
    "interval": {
      "end": 44908822,
      "start": 44908821,
      "type": "SimpleInterval"
    },
    "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "SequenceState"
  },
  "type": "Allele"
}


----
# Functions

### Digest serialization (`ga4gh_serialize`)

The GA4GH digest serialization form is like JSON, but the specification ensures that all implementations will produce the same binary payload.

In [36]:
# Rebinds `allele` using the same values as the Allele example above, with the
# SequenceLocation constructed inline instead of reusing sequence_location_si.
# In the recorded output the "location" field holds a single string
# (presumably the location's digest) rather than the nested location fields.
allele = models.Allele(location=models.SequenceLocation(
    sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    interval=simple_interval),
    state=models.SequenceState(sequence="T"))
ga4gh_serialize(allele)

b'{"location":"u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx","state":{"sequence":"T","type":"SequenceState"},"type":"Allele"}'

In [38]:
# Full JSON form of the same allele, for comparison with the serialized bytes above.
dj(allele)

{
  "location": {
    "interval": {
      "end": 44908822,
      "start": 44908821,
      "type": "SimpleInterval"
    },
    "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "SequenceState"
  },
  "type": "Allele"
}


### Truncated Digest (`ga4gh_digest`)
VR computed identifiers are constructed from digests on serialized objects by prefixing a VR digest with a type-specific code.

In [10]:
# ga4gh_digest is applied to the allele object itself (not to its serialized
# bytes) and returns a base64url-encoded digest string
ga4gh_digest(allele)

'EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_'

In [11]:
# Which is equivalent to serializing explicitly and applying sha512t24u to the
# resulting bytes (the recorded output matches the previous cell)
sha512t24u(ga4gh_serialize(allele))

'EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_'

### Computed Identifier (`ga4gh_identify`)

In [12]:
# In the recorded output, the identifier is the digest shown above prefixed with "ga4gh:VA."
ga4gh_identify(allele)

'ga4gh:VA.EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_'

----
# Validation examples

## models.yaml

In [18]:
# Intervals and locations: emit a test block containing only the serialization.
for o in [simple_interval, nested_interval, sequence_location_si, sequence_location_ni]:
    dy([ga4gh_serialize], o)
# Allele: additionally emit the digest and the computed identifier.
for o in [allele]:
    dy([ga4gh_serialize, ga4gh_digest, ga4gh_identify], o)

SimpleInterval:
  -
    in:
      end: 44908822
      start: 44908821
      type: SimpleInterval
    out:
      ga4gh_serialize: '{"end":44908822,"start":44908821,"type":"SimpleInterval"}'

NestedInterval:
  -
    in:
      inner:
        end: 44908821
        start: 44908796
        type: SimpleInterval
      outer:
        end: 44908847
        start: 44908822
        type: SimpleInterval
      type: NestedInterval
    out:
      ga4gh_serialize: '{"inner":{"end":44908821,"start":44908796,"type":"SimpleInterval"},"outer":{"end":44908847,"start":44908822,"type":"SimpleInterval"},"type":"NestedInterval"}'

SequenceLocation:
  -
    in:
      interval:
        end: 44908822
        start: 44908821
        type: SimpleInterval
      sequence_id: ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl
      type: SequenceLocation
    out:
      ga4gh_serialize: '{"interval":{"end":44908822,"start":44908821,"type":"SimpleInterval"},"sequence_id":"IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl","type":"SequenceLocatio