# Haplotypes

This notebook demonstrates how to build a VRS Haplotype from individual Alleles, using two variants on the HLA-A transcript NM_002116.7 as the example.

# Setup

In [1]:
from ga4gh.vrs import models, vrs_deref, vrs_enref
from ga4gh.core import ga4gh_identify, ga4gh_serialize, ga4gh_digest, ga4gh_deref

import json
def ppo(o, indent=2):
    """Print an object as key-sorted, indented JSON.

    Args:
        o: Object exposing an ``as_dict()`` method; the returned value must
            be serializable by ``json.dumps``.
        indent: Indentation width forwarded to ``json.dumps``.
    """
    payload = o.as_dict()
    rendered = json.dumps(payload, indent=indent, sort_keys=True)
    print(rendered)
    
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
from biocommons.seqrepo import SeqRepo
    


Removing allOf attribute from AbsoluteCopyNumber to avoid python-jsonschema-objects error.
Removing allOf attribute from SequenceInterval to avoid python-jsonschema-objects error.
Removing allOf attribute from RepeatedSequenceExpression to avoid python-jsonschema-objects error.


In [2]:
# Base URL of the SeqRepo REST service that backs the data proxy.
seqrepo_rest_service_url = "https://services.genomicmedlab.org/seqrepo"
# Data proxy used by the cells below to translate sequence identifiers.
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

In [3]:
# Translate the RefSeq transcript accession into the "ga4gh" namespace and keep
# the first entry of the result.
sequence_id = dp.translate_sequence_identifier("NM_002116.7", "ga4gh")[0]

In [4]:
# Reverse lookup: map the GA4GH sequence identifier back to the "refseq" namespace.
dp.translate_sequence_identifier("ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi", "refseq")

['refseq:NM_002116.7']

In [5]:
# Display the GA4GH sequence identifier resolved for NM_002116.7.
sequence_id

'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi'

In [6]:
# Two single-residue SequenceLocations on the transcript identified by
# `sequence_id`, keyed by the label the Allele cell uses to look them up.
# Each (start, end) pair becomes a SequenceInterval of Number bounds.
hla_locations = {
    label: models.SequenceLocation(
        sequence_id=sequence_id,
        interval=models.SequenceInterval(
            start=models.Number(value=span_start, type="Number"),
            end=models.Number(value=span_end, type="Number"),
            type="SequenceInterval",
        ),
        type="SequenceLocation",
    )
    for label, (span_start, span_end) in {
        "a1_loc": (97, 98),
        "a2_loc": (120, 121),
    }.items()
}

In [7]:
# Scope note: this example exercise models only the first two Alleles of the
#             Haplotype. For the exhaustive list of variants, see
#             https://www.ebi.ac.uk/cgi-bin/ipd/pl/hla/get_allele_hgvs.cgi?A*01:02:01:01

# Each Allele pairs one of the locations defined above with the sequence
# observed at that location.
hla_alleles = {
    allele_key: models.Allele(
        location=hla_locations[location_key],
        state=models.SequenceState(sequence=observed, type="SequenceState"),
        type="Allele",
    )
    for allele_key, location_key, observed in (
        ("a1_allele", "a1_loc", "C"),
        ("a2_allele", "a2_loc", "A"),
    )
}
    

In [8]:
# A Haplotype groups the Alleles that occur together; here both example
# Alleles become its members, in a1-then-a2 order.
hla_haplotype = {
    "hla_hap_1": models.Haplotype(
        members=[hla_alleles[allele_key] for allele_key in ("a1_allele", "a2_allele")]
    ),
}

In [9]:
# Show the Haplotype's raw object repr (verbose; the as_dict() view in the
# following cell is easier to read).
hla_haplotype['hla_hap_1']

<Haplotype _id=None members=<#/definitions/Haplotype/members_<anonymous_field>=[<Allele _id=None location=<SequenceLocation _id=None interval=<SequenceInterval end=<Number type=<Literal<str> Number> value=<Literal<int> 98>> start=<Number type=<Literal<str> Number> value=<Literal<int> 97>> type=<Literal<str> SequenceInterval>> sequence_id=<Literal<str> ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi> type=<Literal<str> SequenceLocation>> state=<LiteralSequenceExpression sequence=<Literal<str> C> type=<Literal<str> SequenceState>> type=<Literal<str> Allele>>, <Allele _id=None location=<SequenceLocation _id=None interval=<SequenceInterval end=<Number type=<Literal<str> Number> value=<Literal<int> 121>> start=<Number type=<Literal<str> Number> value=<Literal<int> 120>> type=<Literal<str> SequenceInterval>> sequence_id=<Literal<str> ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi> type=<Literal<str> SequenceLocation>> state=<LiteralSequenceExpression sequence=<Literal<str> A> type=<Literal<str> Sequenc

In [10]:
# Render the Haplotype as a plain nested dict.
hla_haplotype['hla_hap_1'].as_dict()

{'type': 'Haplotype',
 'members': [{'type': 'Allele',
   'location': {'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 97},
     'end': {'type': 'Number', 'value': 98}}},
   'state': {'type': 'SequenceState', 'sequence': 'C'}},
  {'type': 'Allele',
   'location': {'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 120},
     'end': {'type': 'Number', 'value': 121}}},
   'state': {'type': 'SequenceState', 'sequence': 'A'}}]}

In [13]:
# Regions of the transcript that were profiled, expressed as SequenceLocations.
# NOTE(review): the 0-100 / 100-200 bounds look like illustrative placeholders
# rather than real HLA-A exon coordinates - confirm before reuse.
exon1, exon2 = (
    models.SequenceLocation(
        sequence_id=sequence_id,
        interval=models.SequenceInterval(
            start=models.Number(value=region_start, type="Number"),
            end=models.Number(value=region_end, type="Number"),
            type="SequenceInterval",
        ),
    )
    for region_start, region_end in ((0, 100), (100, 200))
)

# Bundle the Haplotype, the regions it was profiled over, and its label into
# one plain-dict record.
min_information_obj = {
    "variation": hla_haplotype['hla_hap_1'].as_dict(),
    "profiled_regions": [region.as_dict() for region in (exon1, exon2)],
    "label": "A*01:02:01:01"
}

In [14]:
# Display the assembled record (variation, profiled regions, label).
min_information_obj

{'variation': {'type': 'Haplotype',
  'members': [{'type': 'Allele',
    'location': {'type': 'SequenceLocation',
     'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
     'interval': {'type': 'SequenceInterval',
      'start': {'type': 'Number', 'value': 97},
      'end': {'type': 'Number', 'value': 98}}},
    'state': {'type': 'SequenceState', 'sequence': 'C'}},
   {'type': 'Allele',
    'location': {'type': 'SequenceLocation',
     'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
     'interval': {'type': 'SequenceInterval',
      'start': {'type': 'Number', 'value': 120},
      'end': {'type': 'Number', 'value': 121}}},
    'state': {'type': 'SequenceState', 'sequence': 'A'}}]},
 'profiled_regions': [{'type': 'SequenceLocation',
   'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
   'interval': {'type': 'SequenceInterval',
    'start': {'type': 'Number', 'value': 0},
    'end': {'type': 'Number', 'value': 100}}},
  {'type': 'SequenceLocation',
   '

In [34]:
from copy import deepcopy

# haplo1 is another name for the Haplotype built earlier (same object).
haplo1 = hla_haplotype['hla_hap_1']
# haplo2 starts out as an independent deep copy of that Haplotype; a later
# cell rebinds it to a single-member Haplotype.
haplo2 = deepcopy(haplo1)

In [40]:
# Inspect haplo2 as a plain dict to compare it with the original Haplotype.
haplo2.as_dict()

{'type': 'Haplotype',
 'members': [{'type': 'Allele',
   'location': {'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 97},
     'end': {'type': 'Number', 'value': 98}}},
   'state': {'type': 'SequenceState', 'sequence': 'C'}},
  {'type': 'Allele',
   'location': {'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 120},
     'end': {'type': 'Number', 'value': 121}}},
   'state': {'type': 'SequenceState', 'sequence': 'A'}}]}

In [41]:
# Rebind haplo2 (replacing the deep copy made earlier) to a Haplotype that
# contains only the first Allele, so its members are a subset of haplo1's.
haplo2 = models.Haplotype(members=[hla_alleles["a1_allele"]])


In [42]:
# Inspect haplo2 as a plain dict after rebinding it to the single-member Haplotype.
haplo2.as_dict()

{'type': 'Haplotype',
 'members': [{'type': 'Allele',
   'location': {'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.5YTAZHEuLC_-Mee3VqsnsGt9YZ_WFaTi',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 97},
     'end': {'type': 'Number', 'value': 98}}},
   'state': {'type': 'SequenceState', 'sequence': 'C'}}]}

In [43]:
# Reduce each Haplotype to the set of computed identifiers of its member
# Alleles, so the two Haplotypes can be compared with plain set operations.
set2 = {ga4gh_identify(member) for member in haplo2.members}
set1 = {ga4gh_identify(member) for member in haplo1.members}

In [44]:
# Identifiers of the Alleles in haplo1.
set1

{'ga4gh:VA.FO7GQBJ7w2CIh5PWV6or955HH0CvYOfC',
 'ga4gh:VA.bvtGNh3LVB_wwjRiZ3Fj-1HHaDxUYzvZ'}

In [45]:
# Identifiers of the Alleles in haplo2.
set2

{'ga4gh:VA.bvtGNh3LVB_wwjRiZ3Fj-1HHaDxUYzvZ'}

In [46]:
# `<` on sets is the proper-subset test: True when every identifier in set2 is
# also in set1 and set1 has at least one more.
set2 < set1

True

In [49]:
# Intersection: Allele identifiers present in both Haplotypes.
set1 & set2

{'ga4gh:VA.bvtGNh3LVB_wwjRiZ3Fj-1HHaDxUYzvZ'}

In [48]:
# Test one specific Allele identifier for membership in each set, in
# (set1, set2) order.
['ga4gh:VA.FO7GQBJ7w2CIh5PWV6or955HH0CvYOfC' in x for x in (set1, set2)]

[True, False]