In [1]:
import json
import string
import yaml

from IPython.display import display, Markdown

from ga4gh.core import ga4gh_digest, ga4gh_identify, ga4gh_serialize, is_identifiable, sha512t24u
from ga4gh.vrs import __version__, models, normalize, vrs_enref, vrs_deref
__version__

Removing allOf attribute from CopyNumber to avoid pjs error.
Removing allOf attribute from SequenceInterval to avoid pjs error.
Removing allOf attribute from RepeatedSequenceExpression to avoid pjs error.


'0.7.0rc4.dev4+gd806dd4.d20210628'

In [17]:
def filter_dict(d) -> dict:
    """Recursively drop keys that start with an underscore.

    Args:
        d: Usually a (possibly nested) dict, e.g. the result of ``as_dict()``.
            Any non-dict value is returned unchanged, which is what ends the
            recursion at leaf values.

    Returns:
        A new dict without the string keys that start with ``"_"``, with the
        same filter applied to every dict-valued entry. Non-dict input
        (strings, numbers, lists, ...) is returned as-is; dicts nested inside
        lists are therefore not filtered.
    """
    # An explicit type check replaces the former bare ``except:``, which
    # turned empty strings/lists into ``{}``, returned a dict completely
    # unfiltered if any key was not a string, and swallowed every exception
    # (including KeyboardInterrupt).
    if not isinstance(d, dict):
        return d
    return {k: filter_dict(v)
            for k, v in d.items()
            if not (isinstance(k, str) and k.startswith("_"))}
def dump_json(o) -> str:
    """Render a VRS object as pretty-printed JSON text.

    The object's ``as_dict()`` form is passed through ``filter_dict`` first,
    then serialized with sorted keys and a two-space indent.
    """
    visible_fields = filter_dict(o.as_dict())
    return json.dumps(visible_fields, sort_keys=True, indent=2)
def dump_tests(o, fns=None) -> str:
    """Build a YAML test definition for a VRS object.

    Args:
        o: Object to describe; its ``as_dict()`` form becomes the "in" entry.
        fns: Functions to apply to ``o``. When omitted, ``ga4gh_serialize`` is
            used, plus ``ga4gh_digest`` and ``ga4gh_identify`` if
            ``is_identifiable(o)`` is true.

    Returns:
        YAML text keyed by ``o.type._value``, holding the input dict and an
        "out" mapping of function name to result (bytes results are decoded
        to str).
    """
    if fns is None:
        fns = [ga4gh_serialize]
        if is_identifiable(o):
            fns += [ga4gh_digest, ga4gh_identify]

    test_input = o.as_dict()
    test_output = {}
    for fn in fns:
        result = fn(o)
        # Results may be str or bytes; normalize to str for the YAML dump.
        test_output[fn.__name__] = result if isinstance(result, str) else result.decode()

    test_case = {"in": test_input, "out": test_output}
    document = yaml.dump(filter_dict({o.type._value: {"-": test_case}}))
    # The "-" key is a placeholder: rewriting the quoted key makes the entry
    # read as a YAML list item.
    return document.replace("'-':","-")

# Accumulates the YAML test definition of every object passed to output().
all_yaml = ""
def output(o) -> None:
    """Display an object as JSON together with its YAML validation test.

    Side effect: the generated YAML is also appended to the module-level
    ``all_yaml`` string.
    """
    global all_yaml
    test_yaml = dump_tests(o)
    all_yaml += test_yaml

    # Two fenced sections separated by a blank line.
    lines = ["**example object**", "```", dump_json(o), "```", ""]
    lines.extend(["**validation test**", "```", test_yaml, "```"])
    display(Markdown("\n".join(lines)))
def pj(o):
    """Print the pretty JSON form of ``o`` as produced by ``dump_json``."""
    print(dump_json(o))

----
# External Data

In [18]:
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
# Base URL of the SeqRepo REST service; this points at a local instance
# (presumably it has to be running for the data-proxy calls below to work).
seqrepo_rest_service_url = "http://localhost:5000/seqrepo"
# Shared data proxy used by the sequence helper functions in the next cell.
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

In [19]:
def get_sequence(identifier, start=None, end=None):
    """Fetch the sequence for ``identifier`` through the data proxy ``dp``.

    ``start`` and ``end`` optionally restrict the result to an interbase
    interval; both default to ``None`` and are forwarded as given.
    """
    sequence = dp.get_sequence(identifier, start, end)
    return sequence
def get_sequence_length(identifier):
    """Return the "length" entry of the metadata ``dp`` reports for ``identifier``."""
    metadata = dp.get_metadata(identifier)
    return metadata["length"]
def translate_sequence_identifier(identifier, namespace):
    """Return the *list* of identifiers in ``namespace`` equivalent to ``identifier``.

    The lookup is delegated to the data proxy ``dp``.
    """
    equivalents = dp.translate_sequence_identifier(identifier, namespace)
    return equivalents

In [20]:
# Example variant built step by step in the cells below (HGVS): NC_000019.10:g.44908822C>T

In [21]:
# Look up the ga4gh-namespace identifiers for the RefSeq accession and keep
# the first one; the bare name on the last line displays it.
GRCh38chr19 = translate_sequence_identifier("NC_000019.10","ga4gh")[0]
GRCh38chr19

'ga4gh:GS.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'

In [23]:
# Interval for the example variant: start and end differ by one
# (presumably interbase coordinates spanning the single residue 44908822).
interval = models.SequenceInterval(
    start=models.Number(value=44908821),
    end=models.Number(value=44908822))
pj(interval)

{
  "end": {
    "type": "Number",
    "value": 44908822
  },
  "start": {
    "type": "Number",
    "value": 44908821
  },
  "type": "SequenceInterval"
}


In [26]:
# Place the interval on a sequence, referenced here by its RefSeq accession.
location = models.SequenceLocation(
    sequence_id="refseq:NC_000019.10",
    interval=interval)
pj(location)

{
  "interval": {
    "end": {
      "type": "Number",
      "value": 44908822
    },
    "start": {
      "type": "Number",
      "value": 44908821
    },
    "type": "SequenceInterval"
  },
  "sequence_id": "refseq:NC_000019.10",
  "type": "SequenceLocation"
}


In [28]:
# The allele state as a literal sequence: the "T" of the C>T example variant.
lse = models.LiteralSequenceExpression(sequence="T")
pj(lse)

{
  "sequence": "T",
  "type": "LiteralSequenceExpression"
}


In [30]:
# Combine the location and the state into the Allele object.
a = models.Allele(location=location, state=lse)
pj(a)

{
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 44908822
      },
      "start": {
        "type": "Number",
        "value": 44908821
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "refseq:NC_000019.10",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}


In [37]:
# Serialize the allele; the result is bytes, and in the output below the
# nested location appears as a digest string rather than as a full object.
s = ga4gh_serialize(a)
s

b'{"location":"esDSArZQC-Sx-96ZZzHnzAVNOc439oE5","state":{"sequence":"T","type":"LiteralSequenceExpression"},"type":"Allele"}'

In [31]:
# Compute the identifier for the allele (a "ga4gh:VA."-prefixed string in the output below).
ir = ga4gh_identify(a)
ir

'ga4gh:VA._YNe5V9kyydfkGU0NRyCMHDSKHL4YNvc'

In [35]:
# Attach the computed identifier to the allele (mutates `a` in place).
a._id = ir
# json.dumps is used directly instead of pj(): pj() goes through filter_dict,
# which would drop the underscore-prefixed "_id" key we want to show here.
print(json.dumps(a.as_dict(), indent=2, sort_keys=True))

{
  "_id": "ga4gh:VA._YNe5V9kyydfkGU0NRyCMHDSKHL4YNvc",
  "location": {
    "interval": {
      "end": {
        "type": "Number",
        "value": 44908822
      },
      "start": {
        "type": "Number",
        "value": 44908821
      },
      "type": "SequenceInterval"
    },
    "sequence_id": "refseq:NC_000019.10",
    "type": "SequenceLocation"
  },
  "state": {
    "sequence": "T",
    "type": "LiteralSequenceExpression"
  },
  "type": "Allele"
}
