# Aggregate Variation

In [1]:
from ga4gh.vr import models
from ga4gh.core import ga4gh_identify, ga4gh_serialize, ga4gh_digest

## Setup Sample Alleles

In [2]:
def make_allele(start, sequence="C", sequence_id="ga4gh:SQ.01234abcde"):
    """Build one sample Allele covering the interval [start, start + 1].

    Args:
        start: Value passed as the SimpleInterval ``start``; ``end`` is
            always ``start + 1``.
        sequence: Value passed as the SequenceState ``sequence``.
        sequence_id: Value passed as the SequenceLocation ``sequence_id``.

    Returns:
        A ``models.Allele`` built from the arguments above.
    """
    return models.Allele(
        location=models.SequenceLocation(
            sequence_id=sequence_id,
            interval=models.SimpleInterval(start=start, end=start + 1),
        ),
        state=models.SequenceState(sequence=sequence),
    )


# Three alleles that differ only in their position on the same sequence.
a1 = make_allele(10)
a2 = make_allele(20)
a3 = make_allele(30)

In [3]:
# Keep the sample alleles together so later cells can iterate over them.
alleles = [a1, a2, a3]

alleles

[<Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 11> start=<Literal<int> 10> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>,
 <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 21> start=<Literal<int> 20> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>,
 <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 31> start=<Literal<int> 30> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>]

In [4]:
# One identifier per allele, in the same order as `alleles`.
allele_ids = list(map(ga4gh_identify, alleles))

allele_ids

['ga4gh:VA.6xjH0Ikz88s7MhcyN5GJTa1p712-M10W',
 'ga4gh:VA.7k2lyIsIsoBgRFPlfnIOeCeEgj_2BO7F',
 'ga4gh:VA.ikcK330gH3bYO2sw9QcTsoptTFnk_Xjh']

## DiscreteVariationSet

In [5]:
# Build the set from the Allele objects themselves.
dvs = models.DiscreteVariationSet(members=[a1, a2, a3])

In [6]:
# Show the set as a plain dict; each member appears as a full inline Allele.
dvs.as_dict()

{'members': [{'location': {'interval': {'end': 11,
     'start': 10,
     'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 21, 'start': 20, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'},
  {'location': {'interval': {'end': 31, 'start': 30, 'type': 'SimpleInterval'},
    'sequence_id': 'ga4gh:SQ.01234abcde',
    'type': 'SequenceLocation'},
   'state': {'sequence': 'C', 'type': 'SequenceState'},
   'type': 'Allele'}],
 'type': 'DiscreteVariationSet'}

In [7]:
# Serialize the set; in the output the inline members are replaced by strings
# that match the allele identifiers computed earlier, minus the "ga4gh:VA." prefix.
ga4gh_serialize(dvs)

b'{"members":["6xjH0Ikz88s7MhcyN5GJTa1p712-M10W","7k2lyIsIsoBgRFPlfnIOeCeEgj_2BO7F","ikcK330gH3bYO2sw9QcTsoptTFnk_Xjh"],"type":"DiscreteVariationSet"}'

In [8]:
# Compute the identifier of the set built from [a1, a2, a3].
ga4gh_identify(dvs)

'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP'

### Defined in a different order

Listing the same members in reverse order produces the same identifier as before.

In [9]:
# Same three members as before, listed in reverse order.
dvs = models.DiscreteVariationSet(members=[a3, a2, a1])

In [10]:
# Identifier of the reversed-order set, for comparison with the earlier result.
ga4gh_identify(dvs)

'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP'

### Defined by IDs rather than objects

Passing the allele identifiers instead of the Allele objects also produces the same identifier.

In [11]:
# Members are given as the identifier strings computed earlier, not as objects.
dvs = models.DiscreteVariationSet(
    members=allele_ids,
)
ga4gh_identify(dvs)

'ga4gh:DVS.1KROloaQq-Hlzja6fzAQhn_tY4Vt4_hP'

### Intentional error: members must be unique (a set)

In [12]:
# a3 is listed twice on purpose: constructing the set is expected to fail.
# The error is caught and displayed so the notebook still runs top to bottom
# under "Restart & Run All" instead of stopping on an uncaught exception.
try:
    dvs = models.DiscreteVariationSet(members=[a1, a2, a3, a3])
except Exception as err:  # broad on purpose: this cell only reports the failure
    print("{}: {}".format(type(err).__name__, err))

ValidationError: [<Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 11> start=<Literal<int> 10> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 21> start=<Literal<int> 20> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 31> start=<Literal<int> 30> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SimpleInterval end=<Literal<int> 31> start=<Literal<int> 30> type=<Literal<str> SimpleInterval>> sequence_id=<Literal<str> ga4gh:SQ.01234abcde> type=<Literal<str> SequenceLocation>> state=<SequenceState sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>] has duplicate elements, but uniqueness required 
while setting 'members' in DiscreteVariationSet