# ApoE Example

The ApoE gene is associated with risks of Alzheimer's disease (AD) and hypercholesterolemia. Risk of AD is attributed to haplotypes composed of alleles at two locations, rs429358 and rs7412, both of which are C/T transitions. This notebook demonstrates how to represent alleles, haplotypes, and genotypes and associated annotations.

```
                             rs7412 
                             NC_000019.10:g.44908822
                             C          T
rs429358                 C   APOE-ε4    APOE-ε1
NC_000019.10:g.44908684  T   APOE-ε3    APOE-ε2
```

(Source: http://snpedia.com/index.php/APOE)

## Setup

In [1]:
import collections
import datetime
import json

import jsonschema

from vmcdemo import models, computed_id, serialize, schema_path
from vmcdemo.digest import id_to_ir
from vmcdemo.seqrepo import get_vmc_sequence_id, _sr
print(_sr)

def ppj(o):
    """Pretty-print a model object as JSON.

    `o` must provide a ``serialize()`` method that returns a JSON string. The
    string is parsed and printed again with 4-space indentation and sorted
    keys. Nothing is returned.
    """
    parsed = json.loads(o.serialize())
    print(json.dumps(parsed, indent=4, sort_keys=True))

SeqRepo(root_dir=/usr/local/share/seqrepo/master, writeable=False)


## Identifiers

In [2]:
# Maps a computed VMC id to the list of models.Identifier objects (external
# names) recorded for it. Later cells append to these lists.
identifiers = collections.defaultdict(list)

## Sequences
A description of sequence variation, with VMC or otherwise, requires the availability of sequences in order to define coordinate systems.  Typically sequences are referred to with an accession like NC_000019.10.  There are two issues with using sequence accessions:

* Identical sequences have different names (e.g., "NC_000019.10" == "CM000681.2" == (GRCh38) "19" == (GRCh38 UCSC) "chr19").  Naive comparison of the same allele defined using different sequence names will fail.
* With graph genomes, it will become infeasible to assign sequence identifiers.

For these reasons, VMC encourages (but doesn't require) the use of computed identifiers based on a SHA512 digest, truncated to 24 bytes, and URL-safe base64 encoded.

`get_vmc_sequence_id` returns the computed sequence identifier for a given accession.

In [26]:
# Name the reference sequence by namespace + accession, then obtain its
# computed VMC sequence id. The bare last line displays that id.
ir = models.Identifier(namespace="NCBI", accession="NC_000019.10")
sequence_id = get_vmc_sequence_id(ir)
sequence_id

'VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'

In [27]:
# Record the NCBI accession as an external name for the computed sequence id.
# NOTE: this appends, so re-running the cell adds a duplicate entry.
identifiers[sequence_id].append(ir)

## Intervals and Locations
An Interval consists of start and end positions in interbase coordinates.
A Location refers to a continuous span within a sequence, where the sequence is identified by Id and the span is defined by an Interval.

In [28]:
# Interbase (start, end) coordinates of each SNP on the reference sequence.
snp_intervals = {
    "rs429358": (44908683, 44908684),
    "rs7412": (44908821, 44908822),
}
locations_by_name = {
    rsid: models.Location(
        sequence_id=sequence_id,
        interval=models.Interval(start=start, end=end),
    )
    for rsid, (start, end) in snp_intervals.items()
}

# Give each location its computed id and record the rsID as a name for it.
for rsid, location in locations_by_name.items():
    location.id = computed_id(location)
    identifiers[location.id].append(models.Identifier(accession=rsid))

In [6]:
# serialize() returns the string that is hashed to generate a computed
# identifier; shown here for the rs429358 location.
serialize(locations_by_name["rs429358"])

'<Location:<Identifier:VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl>:<Interval:44908683:44908684>>'

In [7]:
# JSON form of the rs429358 location, including the id assigned above.
ppj(locations_by_name["rs429358"])

{
    "id": "VMC:GL_9Jht-lguk_jnBvG-wLJbjmBw5v_v7rQo",
    "interval": {
        "end": 44908684,
        "start": 44908683
    },
    "sequence_id": "VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"
}


## Alleles

In [8]:
# One allele per (SNP, base) pair; keys have the form "<rsid><base>".
alleles_by_name = {
    rsid + base: models.Allele(location_id=locations_by_name[rsid].id, state=base)
    for rsid in ("rs429358", "rs7412")
    for base in ("T", "C")
}

# Assign computed ids and record each human-readable name for its allele.
for allele_name, allele in alleles_by_name.items():
    allele.id = computed_id(allele)
    identifiers[allele.id].append(models.Identifier(accession=allele_name))

In [9]:
# The string hashed for this allele's computed id; it embeds the location id
# rather than raw coordinates.
serialize(alleles_by_name["rs429358C"])

'<Allele:<Identifier:VMC:GL_9Jht-lguk_jnBvG-wLJbjmBw5v_v7rQo>:C>'

In [10]:
# JSON form of the rs429358 "C" allele.
ppj(alleles_by_name["rs429358C"])

{
    "id": "VMC:GA_8vT5C3XyPLVz4_AXCI5P-J0gobxoGdxY",
    "location_id": "VMC:GL_9Jht-lguk_jnBvG-wLJbjmBw5v_v7rQo",
    "state": "C"
}


## Haplotypes

In [11]:
# Each ApoE haplotype pairs one rs429358 allele with one rs7412 allele.
haplotype_alleles = {
    "ε1": ("rs429358C", "rs7412T"),
    "ε2": ("rs429358T", "rs7412T"),
    "ε3": ("rs429358T", "rs7412C"),
    "ε4": ("rs429358C", "rs7412C"),
}
haplotypes_by_name = {
    hap_name: models.Haplotype(
        allele_ids=[alleles_by_name[allele_name].id for allele_name in allele_names],
        completeness="COMPLETE",
    )
    for hap_name, allele_names in haplotype_alleles.items()
}

# Assign computed ids and record the ε-name for each haplotype.
for hap_name, haplotype in haplotypes_by_name.items():
    haplotype.id = computed_id(haplotype)
    identifiers[haplotype.id].append(models.Identifier(accession=hap_name))

In [12]:
# The string hashed for the ε1 haplotype's computed id; it lists the member
# allele ids.
serialize(haplotypes_by_name["ε1"])

'<Haplotype:COMPLETE:[<Identifier:VMC:GA_8vT5C3XyPLVz4_AXCI5P-J0gobxoGdxY>;<Identifier:VMC:GA_FABxPGRP7dT3sKot_91vXQrPvzfNYCbX>]>'

In [13]:
# JSON form of the ε4 haplotype (rs429358 C + rs7412 C).
ppj(haplotypes_by_name["ε4"])

{
    "allele_ids": [
        "VMC:GA_8vT5C3XyPLVz4_AXCI5P-J0gobxoGdxY",
        "VMC:GA_Bdf7gntpo-snoQdY356RrpCuyrrzYynA"
    ],
    "completeness": "COMPLETE",
    "id": "VMC:GH_d3UvMyD-ArHLi-ZucGWxURhfeALz7arO"
}


In [14]:
# Reversing allele ids results in the same digest (that's good!)
h_ε4r = models.Haplotype(
    allele_ids=[alleles_by_name["rs7412C"].id, alleles_by_name["rs429358C"].id],
    completeness="COMPLETE",
)
h_ε4r.id = computed_id(h_ε4r)

# Enforce the claim instead of relying on a visual comparison of two outputs.
assert h_ε4r.id == haplotypes_by_name["ε4"].id, "haplotype id depends on allele order"

ppj(h_ε4r)

{
    "allele_ids": [
        "VMC:GA_Bdf7gntpo-snoQdY356RrpCuyrrzYynA",
        "VMC:GA_8vT5C3XyPLVz4_AXCI5P-J0gobxoGdxY"
    ],
    "completeness": "COMPLETE",
    "id": "VMC:GH_d3UvMyD-ArHLi-ZucGWxURhfeALz7arO"
}


## Genotypes

In [15]:
# Each genotype is a pair of haplotypes. ε2/ε3 and ε3/ε2 are both included to
# show the effect of haplotype order.
genotype_haplotypes = {
    "ε2/ε3": ("ε2", "ε3"),
    "ε3/ε2": ("ε3", "ε2"),
    "ε4/ε4": ("ε4", "ε4"),
}
genotypes_by_name = {
    genotype_name: models.Genotype(
        haplotype_ids=[haplotypes_by_name[hap_name].id for hap_name in hap_names],
        completeness="COMPLETE",
    )
    for genotype_name, hap_names in genotype_haplotypes.items()
}

# Assign computed ids and record each genotype's name.
for genotype_name, genotype in genotypes_by_name.items():
    genotype.id = computed_id(genotype)
    identifiers[genotype.id].append(models.Identifier(accession=genotype_name))

In [16]:
# Homozygous genotype: the same haplotype id appears twice in the string that
# is hashed.
serialize(genotypes_by_name["ε4/ε4"])

'<Genotype:COMPLETE:[<Identifier:VMC:GH_d3UvMyD-ArHLi-ZucGWxURhfeALz7arO>;<Identifier:VMC:GH_d3UvMyD-ArHLi-ZucGWxURhfeALz7arO>]>'

In [17]:
# Built with haplotypes in the order ε2, ε3; compare the "id" with the ε3/ε2
# genotype.
ppj(genotypes_by_name["ε2/ε3"])

{
    "completeness": "COMPLETE",
    "haplotype_ids": [
        "VMC:GH_exlsvXjQFFhoMxc5IKUvdgOnMAbZ2oBh",
        "VMC:GH_SF_ZVWlwehopjxKDIF__paB1Q2DwjB4B"
    ],
    "id": "VMC:GG_ISiZFONyC1HHaBxi2kBklfDQEdb5CRRe"
}


In [18]:
# Same two haplotypes in the opposite order; compare the "id" with the ε2/ε3
# genotype.
ppj(genotypes_by_name["ε3/ε2"])

{
    "completeness": "COMPLETE",
    "haplotype_ids": [
        "VMC:GH_SF_ZVWlwehopjxKDIF__paB1Q2DwjB4B",
        "VMC:GH_exlsvXjQFFhoMxc5IKUvdgOnMAbZ2oBh"
    ],
    "id": "VMC:GG_ISiZFONyC1HHaBxi2kBklfDQEdb5CRRe"
}


## Bundle Serialization, Validation, and Roundtripping

In [19]:
def _by_id(objects):
    """Index model objects by their id, converting each to a plain dict."""
    return {obj.id: obj.as_dict() for obj in objects}


# NOTE(review): datetime.now() is naive local time, so generated_at carries no
# UTC offset -- confirm that this is what bundle consumers expect.
bundle = models.Vmcbundle(
    meta=models.Meta(
        generated_at=datetime.datetime.now().isoformat(),
        vmc_version=0,
    ),
    locations=_by_id(locations_by_name.values()),
    alleles=_by_id(alleles_by_name.values()),
    haplotypes=_by_id(haplotypes_by_name.values()),
    genotypes=_by_id(genotypes_by_name.values()),
    identifiers={
        vmc_id: [alias.as_dict() for alias in aliases]
        for vmc_id, aliases in identifiers.items()
    },
)

In [20]:
# Pretty-printed JSON for the whole bundle.
ppj(bundle)

{
    "alleles": {
        "VMC:GA_8vT5C3XyPLVz4_AXCI5P-J0gobxoGdxY": {
            "id": "VMC:GA_8vT5C3XyPLVz4_AXCI5P-J0gobxoGdxY",
            "location_id": "VMC:GL_9Jht-lguk_jnBvG-wLJbjmBw5v_v7rQo",
            "state": "C"
        },
        "VMC:GA_Bdf7gntpo-snoQdY356RrpCuyrrzYynA": {
            "id": "VMC:GA_Bdf7gntpo-snoQdY356RrpCuyrrzYynA",
            "location_id": "VMC:GL_LStELzYmlIQP3Zan9FhibgiFGAgSM7CI",
            "state": "C"
        },
        "VMC:GA_FABxPGRP7dT3sKot_91vXQrPvzfNYCbX": {
            "id": "VMC:GA_FABxPGRP7dT3sKot_91vXQrPvzfNYCbX",
            "location_id": "VMC:GL_LStELzYmlIQP3Zan9FhibgiFGAgSM7CI",
            "state": "T"
        },
        "VMC:GA_xXBYkzzu1AH0HRbLeFESvllmAKUNN1MF": {
            "id": "VMC:GA_xXBYkzzu1AH0HRbLeFESvllmAKUNN1MF",
            "location_id": "VMC:GL_9Jht-lguk_jnBvG-wLJbjmBw5v_v7rQo",
            "state": "T"
        }
    },
    "genotypes": {
        "VMC:GG_ISiZFONyC1HHaBxi2kBklfDQEdb5CRRe": {
            "completene

### Validate against schema

In [21]:
# JSON text of the bundle: same content as ppj(bundle) above, but not pretty
# printed. Reused below for the round-trip check and for saving to disk.
s = bundle.serialize()

In [22]:
# Load the JSON schema. The context manager closes the file promptly instead
# of leaving the handle open until garbage collection.
with open(schema_path) as schema_file:
    schema = json.load(schema_file)

# Raises jsonschema.ValidationError if the bundle does not conform.
jsonschema.validate(bundle.as_dict(), schema)

### Verify that bundle roundtrips to same structure

In [23]:
# Rebuild a bundle from the serialized text: parse the JSON into a dict and
# pass its top-level keys as keyword arguments.
bundle_round_trip = models.Vmcbundle(**json.loads(s))

In [24]:
# Displays True when the rebuilt bundle compares equal to the original.
bundle == bundle_round_trip

True

### Save bundle
This will be used in the VMC Bundle Example

In [25]:
# Write the serialized bundle. The context manager guarantees the file is
# flushed and closed, and the explicit encoding keeps the output independent
# of the platform default.
with open("ApoE Example.vmc.json", "w", encoding="utf-8") as bundle_file:
    chars_written = bundle_file.write(s)
chars_written  # number of characters written, kept as the cell's output

3571