In [39]:
import ga4gh.core
from ga4gh.vrs import models
from ga4gh.core import sha512t24u

In [40]:
# Plain-dict description of an Allele: a SequenceLocation spanning
# 55181319-55181320 on one refget sequence, with literal state "T".
allele_dict = {
    "location": {
        "end": 55181320,
        "start": 55181319,
        "sequenceReference": {
            "type": "SequenceReference",
            "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
        },
        "type": "SequenceLocation",
    },
    "state": {
        "sequence": "T",
        "type": "LiteralSequenceExpression",
    },
    "type": "Allele",
}

# Turn the dict into an Allele model instance.
a = models.Allele(**allele_dict)

In [41]:
# Serialize the allele to a JSON string. In the recorded output the nested
# location is shown as a bare digest string rather than as a full object.
a.model_dump_json()

'{"location":"_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd","state":{"sequence":"T","type":"LiteralSequenceExpression"},"type":"Allele"}'

In [42]:
# Serialize the location on its own; the recorded output is the complete
# SequenceLocation JSON (start, end, sequenceReference, type).
a.location.model_dump_json()

'{"end":55181320,"sequenceReference":{"refgetAccession":"SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul","type":"SequenceReference"},"start":55181319,"type":"SequenceLocation"}'

In [43]:
# Hand-written JSON string for a Haplotype with two member digests.
s = '{"members":["CvJUnTllC5zQ-M1Hbj9oj6BQitKw67J9","QZGrlXd07EPr1mUVyhfaEN8mJVmN1PGF"],"type":"Haplotype"}'
# Digest the UTF-8 bytes of that string with sha512t24u.
sha512t24u(s.encode('utf-8'))

'INEbVdrxv2YgfkREl0A1suBM6AL-3Fo3'

In [44]:
# Build an IRI via model_construct from a full "ga4gh:VA.<digest>" identifier.
iri = models.IRI.model_construct("ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE")
# In the recorded output the JSON dump contains only the trailing digest part.
iri.model_dump_json()

'"Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE"'

In [45]:
from ga4gh.core import GA4GH_IR_REGEXP

In [46]:
# Match the IRI's root string against the GA4GH identifier regexp and pull out
# its named 'digest' group.
GA4GH_IR_REGEXP.match(iri.root)['digest']

'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE'

## Schema testing

In [47]:
from ga4gh.vrs import models

In [48]:
# Names of the fields declared on the SequenceLocation model.
# `model_fields` is the pydantic v2 accessor, matching the v2 API used in the
# rest of this notebook (model_dump, model_construct); `__fields__` is the
# deprecated v1 spelling.
set(models.SequenceLocation.model_fields)

{'description',
 'digest',
 'end',
 'extensions',
 'id',
 'label',
 'sequenceReference',
 'start',
 'type'}

In [49]:
# Look the class up by name on the models module; the third argument makes
# getattr return False instead of raising if the attribute is missing.
getattr(models, 'SequenceLocation', False)

ga4gh.vrs._internal.models.SequenceLocation

In [50]:
from pathlib import Path
import os
import yaml

In [51]:
# ROOT_DIR is the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
# Schema directory: <ROOT_DIR>/submodules/vrs/schema
VRS_SCHEMA_DIR = ROOT_DIR.joinpath('submodules', 'vrs', 'schema')

In [52]:

# Start with an empty list of class names; it is populated in a later cell.
concrete_class_names = list()
# Parse vrs.yaml from the schema directory. The encoding is given explicitly so
# the file is decoded the same way regardless of the platform's default locale.
with open(VRS_SCHEMA_DIR / 'vrs.yaml', encoding='utf-8') as vrs_yaml:
    vrs_schema = yaml.safe_load(vrs_yaml)


In [53]:
# Collect the name of every schema definition that declares `properties`.
# The list is rebuilt from scratch (instead of appending to the existing one)
# so re-running this cell does not add duplicate entries.
concrete_class_names = [
    class_name
    for class_name, definition in vrs_schema['$defs'].items()
    if 'properties' in definition
]

In [54]:
# Look up the Allele class by name on the models module (no default supplied,
# so a missing attribute would raise AttributeError).
getattr(models, 'Allele')

ga4gh.vrs._internal.models.Allele

In [55]:
# Same by-name lookup for LengthExpression; the result is bound to `p` and not
# displayed.
p = getattr(models, 'LengthExpression')

## Enref and deref

In [56]:
from ga4gh.vrs import vrs_enref, vrs_deref

In [57]:
# Empty mapping that the enref calls further down store objects into.
obj_store = {}
# Make sure the allele has its GA4GH identifier, then show the model as a dict
# with None-valued fields left out.
a.get_or_create_ga4gh_identifier()
a.model_dump(exclude_none=True)

{'id': 'ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE',
 'type': 'Allele',
 'digest': 'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE',
 'location': {'type': 'SequenceLocation',
  'digest': '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd',
  'sequenceReference': {'type': 'SequenceReference',
   'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'},
  'start': 55181319,
  'end': 55181320},
 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}

In [58]:
# Enref the allele and, separately, its location, passing obj_store to both
# calls. In the recorded run obj_store afterwards holds one entry keyed by the
# ga4gh:SL identifier and one keyed by the ga4gh:VA identifier.
a_enref = vrs_enref(a, obj_store)
sl_enref = vrs_enref(a.location, obj_store)

In [59]:
# Dump the enref'd allele. In the recorded output 'location' is the identifier
# string 'ga4gh:SL....' instead of a nested object, and pydantic prints a
# serializer warning about receiving a `str` for that field.
a_enref.model_dump(exclude_none=True)

  Expected `Union[definition-ref, plain_function[ga4gh_serialize]]` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


{'id': 'ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE',
 'type': 'Allele',
 'digest': 'Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE',
 'location': 'ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd',
 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}

In [60]:
# Inspect the store after the enref calls (identifier -> stored object).
obj_store

{'ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd': SequenceLocation(id='ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd', label=None, description=None, extensions=None, type='SequenceLocation', digest='_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd', sequenceReference=SequenceReference(id=None, label=None, description=None, extensions=None, type='SequenceReference', refgetAccession='SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', residueAlphabet=None), start=55181319, end=55181320),
 'ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE': Allele(id='ga4gh:VA.Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE', label=None, description=None, extensions=None, type='Allele', digest='Hy2XU_-rp4IMh6I_1NXNecBo8Qx8n0oE', expressions=None, location='ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd', state=LiteralSequenceExpression(id=None, label=None, description=None, extensions=None, type='LiteralSequenceExpression', sequence=SequenceString(root='T')))}

In [61]:
# Dereference the enref'd allele against obj_store. The recorded run also
# prints a "SequenceLocation not in cra_map" message while doing so.
a_deref = vrs_deref(a_enref, obj_store)

SequenceLocation not in cra_map {'Allele': ['location'], 'Haplotype': ['members'], '_CopyNumber': ['location'], 'CopyNumberCount': ['location'], 'CopyNumberChange': ['location']}


In [62]:
# After deref the location dumps as a full SequenceLocation dict again
# (see the recorded output), not as an identifier string.
a_deref.location.model_dump(exclude_none=True)

{'id': 'ga4gh:SL._G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd',
 'type': 'SequenceLocation',
 'digest': '_G2K0qSioM74l_u3OaKR0mgLYdeTL7Xd',
 'sequenceReference': {'type': 'SequenceReference',
  'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'},
 'start': 55181319,
 'end': 55181320}

## Duplications

In [63]:
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
from biocommons.seqrepo import SeqRepo
from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs import models

import os

# Directory of the local SeqRepo snapshot. The default is the path this
# notebook was written against; set SEQREPO_INSTANCE_DIR to use a snapshot
# stored elsewhere.
# NOTE(review): SEQREPO_INSTANCE_DIR is an override name local to this
# notebook, not a variable that seqrepo itself reads.
seqrepo_dir = os.environ.get("SEQREPO_INSTANCE_DIR", "/usr/local/share/seqrepo/2021-01-29")

# Data proxy backed by that SeqRepo, plus a Translator that uses it.
data_proxy = SeqRepoDataProxy(SeqRepo(seqrepo_dir))
translator = Translator(data_proxy=data_proxy)

In [64]:
from ga4gh.vrs import normalize
# Input for the first small duplication test: a zero-length location
# (start == end == 289464) with literal state 'CAGCAG'.
# Kept under its own name so the Allele model `a` built earlier in the
# notebook is not overwritten by a plain dict, which would break re-running
# the earlier cells that call methods on `a`.
small_dup_dict = {
    'location': {
        'end': 289464,
        'start': 289464,
        'sequenceReference': {
            'type': 'SequenceReference',
            'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'
        },
        'type': 'SequenceLocation'
    },
    'state': {
        'sequence': 'CAGCAG',
        'type': 'LiteralSequenceExpression'
    },
    'type': 'Allele'
}
small_dup = models.Allele(**small_dup_dict)

In [65]:
# Expecting an RLE with RSL=3
# Normalize small_dup using data_proxy, then show the result without
# None-valued fields. In the recorded output the state is a
# ReferenceLengthExpression with repeatSubunitLength 3.
a2 = normalize(small_dup, data_proxy=data_proxy)
a2.model_dump(exclude_none=True)

{'type': 'Allele',
 'location': {'type': 'SequenceLocation',
  'sequenceReference': {'type': 'SequenceReference',
   'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'},
  'start': 289464,
  'end': 289469},
 'state': {'type': 'ReferenceLengthExpression',
  'length': 11,
  'sequence': 'CAGCAGCAGCA',
  'repeatSubunitLength': 3}}

In [66]:
# Second small duplication input: same zero-length location (289464) on the
# same sequence, this time with literal state "CACA".
b = {
    "location": {
        "end": 289464,
        "start": 289464,
        "sequenceReference": {
            "type": "SequenceReference",
            "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
        },
        "type": "SequenceLocation",
    },
    "state": {
        "sequence": "CACA",
        "type": "LiteralSequenceExpression",
    },
    "type": "Allele",
}
small_dup2 = models.Allele(**b)

In [67]:
# Expecting an RLE with RSL=2
# Normalize small_dup2 using data_proxy and display the resulting model; the
# recorded output shows a ReferenceLengthExpression with repeatSubunitLength 2.
normalize(small_dup2, data_proxy=data_proxy)

Allele(id=None, label=None, description=None, extensions=None, type='Allele', digest=None, expressions=None, location=SequenceLocation(id=None, label=None, description=None, extensions=None, type='SequenceLocation', digest=None, sequenceReference=SequenceReference(id=None, label=None, description=None, extensions=None, type='SequenceReference', refgetAccession='SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', residueAlphabet=None), start=289464, end=289466), state=ReferenceLengthExpression(id=None, label=None, description=None, extensions=None, type='ReferenceLengthExpression', length=6, sequence=SequenceString(root='CACACA'), repeatSubunitLength=2))

In [68]:
# Allele that already carries its id/digest and whose state is a
# ReferenceLengthExpression (length 14, repeatSubunitLength 2) over the
# location 32331082-32331094.
c = {
    "digest": "swY2caCgv1kP6YqKyPlcEzJqTvou15vC",
    "id": "ga4gh:VA.swY2caCgv1kP6YqKyPlcEzJqTvou15vC",
    "location": {
        "digest": "ikECYncPpE1xh6f_LiComrFGevocjDHQ",
        "end": 32331094,
        "id": "ga4gh:SL.ikECYncPpE1xh6f_LiComrFGevocjDHQ",
        "sequenceReference": {
            "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
            "type": "SequenceReference",
        },
        "start": 32331082,
        "type": "SequenceLocation",
    },
    "state": {
        "length": 14,
        "repeatSubunitLength": 2,
        "sequence": "TTTTTTTTTTTTTT",
        "type": "ReferenceLengthExpression",
    },
    "type": "Allele",
}
multi_repeat_rle = models.Allele(**c)

In [69]:
# Same location as multi_repeat_rle (32331082-32331094), but with the state
# given as a LiteralSequenceExpression of fourteen T's.
d = {
    "location": {
        "end": 32331094,
        "start": 32331082,
        "sequenceReference": {
            "type": "SequenceReference",
            "refgetAccession": "SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT",
        },
        "type": "SequenceLocation",
    },
    "state": {
        "sequence": "TTTTTTTTTTTTTT",
        "type": "LiteralSequenceExpression",
    },
    "type": "Allele",
}
multi_repeat_lse = models.Allele(**d)

In [70]:
# Normalize the literal-sequence form using data_proxy. In the recorded output
# the state comes back as a ReferenceLengthExpression with length 14 and
# repeatSubunitLength 2, i.e. the same state values given for multi_repeat_rle.
normalize(multi_repeat_lse, data_proxy=data_proxy)

Allele(id=None, label=None, description=None, extensions=None, type='Allele', digest=None, expressions=None, location=SequenceLocation(id=None, label=None, description=None, extensions=None, type='SequenceLocation', digest=None, sequenceReference=SequenceReference(id=None, label=None, description=None, extensions=None, type='SequenceReference', refgetAccession='SQ._0wi-qoDrvram155UmcSC-zA5ZK4fpLT', residueAlphabet=None), start=32331082, end=32331094), state=ReferenceLengthExpression(id=None, label=None, description=None, extensions=None, type='ReferenceLengthExpression', length=14, sequence=SequenceString(root='TTTTTTTTTTTTTT'), repeatSubunitLength=2))

In [76]:
# Single-residue allele (128325834-128325835, state "T") that already carries
# an id and digest. Stored under its own name rather than rebinding `a`, so
# the name `a` is not reused for a plain dict.
snv_dict = {
    "type": "Allele",
    "digest": "SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d",
    "id": "ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d",
    "location": {
        "type": "SequenceLocation",
        "sequenceReference": {
            "type": "SequenceReference",
            "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"
        },
        "start": 128325834,
        "end": 128325835
    },
    "state": {
        "type": "LiteralSequenceExpression",
        "sequence": "T"
    }
}
# Normalize the allele using data_proxy and show the result without None-valued
# fields; in the recorded output the id, digest, location and state are the
# same as in the input.
a_norm = normalize(models.Allele(**snv_dict), data_proxy=data_proxy)
a_norm.model_dump(exclude_none=True)

{'id': 'ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d',
 'type': 'Allele',
 'digest': 'SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d',
 'location': {'type': 'SequenceLocation',
  'sequenceReference': {'type': 'SequenceReference',
   'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'},
  'start': 128325834,
  'end': 128325835},
 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'T'}}

In [74]:
# Recompute the allele's identifier; the recorded result equals the 'id' that
# was supplied in the input dict.
a_norm.compute_ga4gh_identifier(recompute=True)

'ga4gh:VA.SZIS2ua7AL-0YgUTAqyBsFPYK3vE8h_d'

In [75]:
# Recompute the identifier of the location on its own (a ga4gh:SL identifier in
# the recorded output).
a_norm.location.compute_ga4gh_identifier(recompute=True)

'ga4gh:SL.TaoXEhpHvA6SdilBUO-AX00YDARv9Uoe'