# Associating Annotations with VR Objects

This notebook demonstrates how to associate information with VR objects.

Information is never embedded within VR objects. Instead, it is associated with those objects by means of their ids.  This approach to annotations scales better in size and distributes better across multiple data sources.

In [1]:
import collections
from ga4gh.vr import ga4gh_identify, models
from ga4gh.vr.extras.dataproxy import SeqRepoRESTDataProxy
from ga4gh.vr.extras.translator import Translator

# Requires a SeqRepo REST service to be running at this URL (e.g., using the docker image)
seqrepo_rest_service_url = "http://localhost:5000/seqrepo"
# Data proxy pointed at the REST service above; it is handed to the translator below
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

# Translator used in later cells to build alleles from HGVS expressions (tlr.from_hgvs)
tlr = Translator(data_proxy=dp)

In [2]:
# (label, HGVS expression) pairs for the alleles annotated in this notebook.
# The label suffix names the base that the HGVS expression yields at that
# position: "=" keeps the reference base, "X>Y" yields Y. For rs429358 the
# substitution is T>C, so the reference ("=") allele is rs429358T and the
# substituted allele is rs429358C.
data = (
    ("rs7412C",   "NC_000019.10:g.44908822="),
    ("rs7412T",   "NC_000019.10:g.44908822C>T"),
    ("rs429358T", "NC_000019.10:g.44908684="),
    ("rs429358C", "NC_000019.10:g.44908684T>C"),
)

In [3]:
# Build three lookup tables, all keyed by the computed allele id, so that each
# piece of information refers to an allele by id instead of being embedded in it.
rs_names = {}                                # allele id -> rs label
hgvs_names = collections.defaultdict(dict)   # allele id -> {"GRCh38": HGVS expression}
alleles = {}                                 # allele id -> allele object from the translator
for rs, hgvs_expr in data:
    allele = tlr.from_hgvs(hgvs_expr)
    allele_id = ga4gh_identify(allele)
    rs_names[allele_id] = rs
    hgvs_names[allele_id]["GRCh38"] = hgvs_expr
    alleles[allele_id] = allele

# Reverse index of rs_names: rs label -> allele id
rs_to_id = {label: vr_id for vr_id, label in rs_names.items()}

In [4]:
# Frequencies filed under source "gnomad" and population "global".
# Each entry is keyed by allele id, looked up from its rs label.
freqs = {
    "gnomad": {
        "global": {
            rs_to_id[label]: frequency
            for label, frequency in (
                ("rs7412C", 0.9385),
                ("rs7412T", 0.0615),
                ("rs429358C", 0.1385),
                ("rs429358T", 0.8615),
            )
        }
    }
}

In [5]:
# Gather the alleles and the id-keyed annotation tables into a single document
doc = dict(
    alleles=alleles,
    hgvs_names=hgvs_names,
    rs_names=rs_names,
    freqs=freqs,
)

In [6]:
# For the benefit of pretty printing, let's replace the allele objects with their dict representations.
# Convert from the original `alleles` mapping rather than from doc["alleles"], so that
# re-running this cell does not call as_dict() on values that are already plain dicts.
doc["alleles"] = {allele_id: allele.as_dict() for allele_id, allele in alleles.items()}

In [7]:
import json

# Serialize the assembled document as indented JSON, then print it
doc_json = json.dumps(doc, indent=2)
print(doc_json)

{
  "alleles": {
    "ga4gh:VA.UUvQpMYU5x8XXBS-RhBhmipTWe2AALzj": {
      "_digest": "UUvQpMYU5x8XXBS-RhBhmipTWe2AALzj",
      "location": {
        "_digest": "u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx",
        "interval": {
          "end": 44908822,
          "start": 44908821,
          "type": "SimpleInterval"
        },
        "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        "type": "SequenceLocation"
      },
      "state": {
        "sequence": "C",
        "type": "SequenceState"
      },
      "type": "Allele"
    },
    "ga4gh:VA.EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_": {
      "_digest": "EgHPXXhULTwoP4-ACfs-YCXaeUQJBjH_",
      "location": {
        "_digest": "u5fspwVbQ79QkX6GHLF8tXPCAXFJqRPx",
        "interval": {
          "end": 44908822,
          "start": 44908821,
          "type": "SimpleInterval"
        },
        "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        "type": "SequenceLocation"
      },
      "state": {
        "sequence": "T"