# Aggregate Variation

In [1]:
from ga4gh.vr import models, class_refatt_map
from ga4gh.core import ga4gh_identify, ga4gh_serialize, ga4gh_digest, ga4gh_enref, ga4gh_deref

## Setup Sample Alleles

In [2]:
# Build three alleles that differ only in their interval (10-11, 20-21, 30-31);
# all share the same sequence_id and the same "C" SequenceState.
a1, a2, a3 = [
    models.Allele(
        location=models.SequenceLocation(
            sequence_id="ga4gh:SQ.01234abcde",
            interval=models.SimpleInterval(start=start, end=start + 1),
        ),
        state=models.SequenceState(sequence="C"),
    )
    for start in (10, 20, 30)
]

In [3]:
# Collect the sample alleles in a list; the bare name on the last line displays it.
alleles = [a1, a2, a3]

alleles

[<Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 11> start=<Literal<int> 10> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>,
 <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 21> start=<Literal<int> 20> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>,
 <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 31> start=<Literal<int> 30> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<s

In [4]:
# Compute an identifier for every sample allele, in the same order as `alleles`.
allele_ids = [ga4gh_identify(allele) for allele in alleles]

allele_ids

['ga4gh:VA.6xjH0Ikz88s7MhcyN5GJTa1p712-M10W',
 'ga4gh:VA.7k2lyIsIsoBgRFPlfnIOeCeEgj_2BO7F',
 'ga4gh:VA.ikcK330gH3bYO2sw9QcTsoptTFnk_Xjh']

## DiscreteVariationSet

In [5]:
# Build the set with the Allele objects themselves as members (inlined, not given by id).
dvs_inlined = models.DiscreteVariationSet(members=[a1,a2,a3]) 

In [6]:
# Show the plain-dict form; each member appears as a fully expanded Allele.
dvs_inlined.as_dict()

{'members': [{'location': {'interval': {'end': 11,
     'start': 10,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 21, 'start': 20, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 31, 'start': 30, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'}],
 'type': 'DiscreteVariationSet'}

In [8]:
# Compute the identifier of the set that was built from inlined members.
ga4gh_identify(dvs_inlined)

'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP'

In [9]:
# computed id does not depend on order of members
dvs_inlined2 = models.DiscreteVariationSet(members=[a3, a2, a1])
dvs_inlined2_id = ga4gh_identify(dvs_inlined2)
# Check the claim in code instead of relying on the reader comparing two outputs by eye.
assert dvs_inlined2_id == ga4gh_identify(dvs_inlined)
dvs_inlined2_id

'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP'

In [11]:
# computed id is the same when members are defined by id
# Here the members are the identifier strings from allele_ids rather than Allele objects.
dvs_referenced = models.DiscreteVariationSet(members=allele_ids)
dvs_referenced.as_dict()

{'members': ['ga4gh:VA.6xjH0Ikz88s7MhcyN5GJTa1p712-M10W',
  'ga4gh:VA.7k2lyIsIsoBgRFPlfnIOeCeEgj_2BO7F',
  'ga4gh:VA.ikcK330gH3bYO2sw9QcTsoptTFnk_Xjh'],
 'type': 'DiscreteVariationSet'}

In [12]:
# Identifier of the set whose members are given by id; it must equal the
# identifier of the set built from inlined members, so assert that explicitly.
dvs_referenced_id = ga4gh_identify(dvs_referenced)
assert dvs_referenced_id == ga4gh_identify(dvs_inlined)
dvs_referenced_id

'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP'

### Intentional error: members must be unique (a set)

In [13]:
# Expected to fail: a3 is listed twice, and the recorded output is a ValidationError
# reporting duplicate elements while setting 'members'.
dvs = models.DiscreteVariationSet(members=[a1,a2,a3,a3])

ValidationError: [<Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 11> start=<Literal<int> 10> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 21> start=<Literal<int> 20> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 31> start=<Literal<int> 30> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 31> start=<Literal<int> 30> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>] has duplicate elements, but uniqueness required 
while setting 'members' in DiscreteVariationSet

## Enref / Deref

In [14]:
# Shared dict handed to ga4gh_enref / ga4gh_deref by the two wrappers below.
object_store = {}


def vr_enref(o):
    """Call ga4gh_enref on `o` using class_refatt_map and the shared object_store."""
    return ga4gh_enref(o, cra_map=class_refatt_map, object_store=object_store)


def vr_deref(o):
    """Call ga4gh_deref on `o` using class_refatt_map and the shared object_store."""
    return ga4gh_deref(o, cra_map=class_refatt_map, object_store=object_store)

In [15]:
# Start from a set with fully inlined members and show its dict form.
dvs = models.DiscreteVariationSet(members=[a1,a2,a3]) 
dvs.as_dict()

{'members': [{'location': {'interval': {'end': 11,
     'start': 10,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 21, 'start': 20, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 31, 'start': 30, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'}],
 'type': 'DiscreteVariationSet'}

In [22]:
# "enref" recursively identifies and stores the embedded objects in the object store
# The recorded output lists the members by identifier instead of as inlined Alleles.
dvs2 = vr_enref(dvs)
dvs2.as_dict()

{'members': ['ga4gh:VA.6xjH0Ikz88s7MhcyN5GJTa1p712-M10W',
  'ga4gh:VA.7k2lyIsIsoBgRFPlfnIOeCeEgj_2BO7F',
  'ga4gh:VA.ikcK330gH3bYO2sw9QcTsoptTFnk_Xjh'],
 'type': 'DiscreteVariationSet'}

In [23]:
# object_store now contains the fully-referenced forms of all objects, recursively
# The recorded keys are identifiers with the VSL, VA and DVS prefixes.
list(object_store.keys())

['ga4gh:VSL.EIy4ssWCI2YW3XDTSaf26A75Zjxqu0qD',
 'ga4gh:VA.6xjH0Ikz88s7MhcyN5GJTa1p712-M10W',
 'ga4gh:VSL.SHAyou8BM660a9u9OXzn7h-DYOX9OSMD',
 'ga4gh:VA.7k2lyIsIsoBgRFPlfnIOeCeEgj_2BO7F',
 'ga4gh:VSL.FEJTkuL6G4U2WUJ2LgejLm--ZUDnCiV7',
 'ga4gh:VA.ikcK330gH3bYO2sw9QcTsoptTFnk_Xjh',
 'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP']

In [24]:
# "deref" reconstitutes the fully inlined objects
# The recorded output shows the members expanded back into full Allele dicts.
dvs3 = vr_deref(dvs2)
dvs3.as_dict()

{'members': [{'location': {'interval': {'end': 11,
     'start': 10,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 21, 'start': 20, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 31, 'start': 30, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'}],
 'type': 'DiscreteVariationSet'}

In [25]:
# Round-trip check: the enref -> deref result compares equal to the original set.
dvs == dvs3

True