# Examples and Validation Tests

This notebook is used to generate [vr-spec](https://vr-spec.readthedocs.io/) examples and the tests in [vr-spec/validation/](https://github.com/ga4gh/vr-spec/tree/master/validation).

To Do:
* use real examples from ApoE for Alleles, Haplotypes, and Genotypes; e.g., 4 Alleles, 2 Haplotypes, 1 Genotype
* test order invariance
* test inline v. reference objects


In [1]:
from ga4gh.core import ga4gh_digest, ga4gh_identify, ga4gh_serialize, sha512t24u
from ga4gh.vrs import __version__, models, normalize
__version__

Removing allOf attribute from CopyNumber to avoid python-jsonschema-objects error.
Removing allOf attribute from SequenceInterval to avoid python-jsonschema-objects error.
Removing allOf attribute from RepeatedSequenceExpression to avoid python-jsonschema-objects error.


'0.7.7.dev1+g92313b5.d20230223'

In [2]:
import json
import yaml
def filter_dict(d):
    """Return a copy of d with every "_"-prefixed key removed, recursively.

    Only dict values are descended into. Any other value (str, bytes,
    numbers, lists, None, ...) is returned unchanged, so empty strings and
    lists are no longer collapsed into an empty dict. Non-string keys are
    kept as-is.
    """
    if not isinstance(d, dict):
        return d
    return {k: filter_dict(v)
            for k, v in d.items()
            if not (isinstance(k, str) and k.startswith("_"))}
def as_str(s):
    """Return s as text: str passes through, anything else is decoded with .decode()."""
    if isinstance(s, str):
        return s
    return s.decode()
def dj(o):
    """Print a VR object as pretty-formatted JSON (indented, keys sorted, "_" keys filtered out)."""
    public_fields = filter_dict(o.as_dict())
    rendered = json.dumps(public_fields, indent=2, sort_keys=True)
    print(rendered)
def dy(fns, o):
    """Print a YAML block representing a validation test for o.

    The block is keyed by the object's type and contains one entry whose
    "in" is the object as a dict and whose "out" maps the name of each
    function f in fns to the string form of f(o).
    """
    test_case = {"in": o.as_dict()}
    outputs = {}
    for f in fns:
        outputs[f.__name__] = as_str(f(o))
    test_case["out"] = outputs
    document = yaml.dump(filter_dict({o.type._value: {"-": test_case}}))
    # The quoted placeholder key '-' is rewritten to a bare "-" in the printed text.
    print(document.replace("'-':","-"))

----
# Generate sample objects

In [3]:
# SimpleInterval
simple_interval = models.SimpleInterval(start=44908821, end=44908822, type="SimpleInterval")
dj(simple_interval)

{
  "end": 44908822,
  "start": 44908821,
  "type": "SimpleInterval"
}


In [4]:
# SequenceLocation based on a SimpleInterval
sequence_location = models.SequenceLocation(
    sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    interval=simple_interval,
    type="SequenceLocation")
dj(sequence_location)

{
  "interval": {
    "end": 44908822,
    "start": 44908821,
    "type": "SimpleInterval"
  },
  "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
  "type": "SequenceLocation"
}


In [5]:
# SequenceState
sequence_state = models.SequenceState(sequence="T", type="SequenceState")
dj(sequence_state)

{
  "sequence": "T",
  "type": "SequenceState"
}


In [6]:
# Text
text_variation = models.Text(definition="APOE loss", type="Text")
dj(text_variation)

{
  "definition": "APOE loss",
  "type": "Text"
}


In [7]:
# Allele
allele = models.Allele(location=sequence_location,
                       state=models.SequenceState(sequence="T", type="SequenceState"),
                       type="Allele")
dj(allele)

{
  "location": {
    "interval": {
      "end": 44908822,
      "start": 44908821,
      "type": "SimpleInterval"
    },
    "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "SequenceState"
  },
  "type": "Allele"
}


In [8]:
# Haplotype
haplotype = models.Haplotype(members=[allele], type="Haplotype")
dj(haplotype)

{
  "members": [
    {
      "location": {
        "interval": {
          "end": 44908822,
          "start": 44908821,
          "type": "SimpleInterval"
        },
        "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        "type": "SequenceLocation"
      },
      "state": {
        "sequence": "T",
        "type": "SequenceState"
      },
      "type": "Allele"
    }
  ],
  "type": "Haplotype"
}


----
# Functions

### Truncated Digest (sha512t24u)

In [9]:
sha512t24u(b"")

'z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc'

In [10]:
sha512t24u(b"ACGT")

'aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2'

### Digest Serialization (`ga4gh_serialize`)

The ga4gh digest serialization form is like JSON, but the specification ensures that all implementations will produce the same binary payload.

In [11]:
# Build an Allele with its location and state defined inline, then serialize it.
allele = models.Allele(
    location=models.SequenceLocation(
        sequence_id="ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        interval=simple_interval,
        type="SequenceLocation",
    ),
    state=models.SequenceState(sequence="T", type="SequenceState"),
    type="Allele",
)
ga4gh_serialize(allele)

b'{"location":"u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx","state":{"sequence":"T","type":"SequenceState"},"type":"Allele"}'

In [12]:
dj(allele)

{
  "location": {
    "interval": {
      "end": 44908822,
      "start": 44908821,
      "type": "SimpleInterval"
    },
    "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "SequenceState"
  },
  "type": "Allele"
}


### Object Digest (`ga4gh_digest`)
VR computed identifiers are constructed from digests on serialized objects by prefixing a VR digest with a type-specific code.

In [13]:
# applying ga4gh_digest to the allele returns a base64url-encoded digest of its serialized form
ga4gh_digest(allele)

'EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_'

In [14]:
# Which is equivalent to
sha512t24u(ga4gh_serialize(allele))

'EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_'

### Object Computed Identifier (`ga4gh_identify`)

In [15]:
ga4gh_identify(allele)

'ga4gh:VA.EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_'

----
## External Data

In [16]:
# Create a data proxy that talks to a SeqRepo REST service at the URL below.
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
seqrepo_rest_service_url = "https://services.genomicmedlab.org/seqrepo"
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

In [17]:
def get_sequence(identifier, start=None, end=None):
    """Fetch the sequence for identifier via the data proxy.

    start and end optionally restrict the result to the interbase
    <start, end> interval.
    """
    sequence = dp.get_sequence(identifier, start, end)
    return sequence
def get_sequence_length(identifier):
    """Return the "length" entry of the data proxy's metadata for identifier."""
    metadata = dp.get_metadata(identifier)
    return metadata["length"]
def translate_sequence_identifier(identifier, namespace):
    """Return the *list* of identifiers in namespace that are equivalent to identifier."""
    equivalents = dp.translate_sequence_identifier(identifier, namespace)
    return equivalents

In [18]:
get_sequence_length("ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl")

58617616

In [19]:
# Widen the 44908821-44908822 interval by 25 bases on each side before fetching
start, end = 44908821-25, 44908822+25
get_sequence("ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", start, end)

'CCGCGATGCCGATGACCTGCAGAAGCGCCTGGCAGTGTACCAGGCCGGGGC'

In [20]:
translate_sequence_identifier("GRCh38:19", "ga4gh")

['ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl']

In [21]:
translate_sequence_identifier("ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", "GRCh38")

['GRCh38:chr19', 'GRCh38:19']

----
# Validation examples

## models.yaml

In [22]:
# SimpleInterval gets a serialization test only; the other objects also get
# digest and computed-identifier tests.
dy([ga4gh_serialize], simple_interval)
for o in (sequence_location, allele, haplotype):
    dy([ga4gh_serialize, ga4gh_digest, ga4gh_identify], o)

SimpleInterval:
  -
    in:
      end: 44908822
      start: 44908821
      type: SimpleInterval
    out:
      ga4gh_serialize: '{"end":44908822,"start":44908821,"type":"SimpleInterval"}'

SequenceLocation:
  -
    in:
      interval:
        end: 44908822
        start: 44908821
        type: SimpleInterval
      sequence_id: ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl
      type: SequenceLocation
    out:
      ga4gh_digest: u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx
      ga4gh_identify: ga4gh:VSL.u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx
      ga4gh_serialize: '{"interval":{"end":44908822,"start":44908821,"type":"SimpleInterval"},"sequence_id":"IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl","type":"SequenceLocation"}'

Allele:
  -
    in:
      location:
        interval:
          end: 44908822
          start: 44908821
          type: SimpleInterval
        sequence_id: ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl
        type: SequenceLocation
      state:
        sequence: T
        type: SequenceState
      type