In [13]:
import json
from timeit import default_timer as timer

import ga4gh.vrs._internal.models
import ga4gh.core._internal.identifiers
import ga4gh.core._internal.pydantic 
import importlib
importlib.reload(ga4gh.vrs._internal.models)
importlib.reload(ga4gh.core._internal.identifiers)
importlib.reload(ga4gh.core._internal.pydantic)

from ga4gh.vrs._internal.models import (
    Allele,
    Haplotype,
    Genotype,
    GenotypeMember
)
from ga4gh.core._internal.identifiers import (
    ga4gh_serialize,
    ga4gh_identify,
    ga4gh_digest,
    sha512t24u,
    identify_all,
    collapse_identifiable_values,
    replace_with_digest
)
def pretty_print(d: dict):
    """Write ``d`` to stdout as JSON text indented by two spaces."""
    rendered = json.dumps(d, indent=2)
    print(rendered)

TODO

- Update pydantic.py helper functions to work for classes in models.py (if those functions are needed)
- Update Translator _from* functions to work for the simple Allele in test_vrs.py (also below in allele_dict)
- Update the ga4gh_serialize and ga4gh_digest functions to work with Pydantic classes and the ga4ghDigest field in VRS 2.0
- Update the tests in test_vrs.py to work with the new serialize and digest functions, and update the expected values for the VRS 2.0 model
- location start/end validation in Pydantic class
- IRI validation in Pydantic class (curie, compacted identifier, or fully qualified IRI according to RFCs)


VRS 2.0 merge issues with pydantic classes
ga4gh.vrs.extras.localizer: line 56 refers to _value; line 49 refers to _id.


In [14]:
# Two dict descriptions of the same single-base Allele (state 'T' on the interval
# 55181319-55181320). They differ only in how the location's 'sequence' is given.

# Form 1: 'sequence' is a nested SequenceReference object carrying a refgetAccession.
allele_dict = {
    'location': {
        'end': 55181320,
        'start': 55181319,
        'sequence': {
            'type': 'SequenceReference',
            'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'
        },
        # Disabled alternative key, kept for reference.
        # NOTE(review): presumably the older field that 'sequence' replaces — confirm.
        # 'sequence_id': 'ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
        'type': 'SequenceLocation'
    },
    'state': {
        'sequence': 'T',
        'type': 'LiteralSequenceExpression'
    },
    'type': 'Allele'
}
# Form 2: 'sequence' is a bare string; the recorded output of this cell shows the
# model holding it as IRI(root='SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul').
allele_dict2 = {
    'location': {
        'end': 55181320,
        'start': 55181319,
        'sequence': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
        'type': 'SequenceLocation'
    },
    'state': {
        'sequence': 'T',
        'type': 'LiteralSequenceExpression'
    },
    'type': 'Allele'
}
# Build the model objects; `allele` (form 1) is the one used by the following cells.
allele = Allele(**allele_dict)
allele2 = Allele(**allele_dict2)
# Disabled alternative view: dump the model as a dict without None-valued fields.
#print(allele2.model_dump(exclude_none=True))
print(allele2)

id=None label=None extensions=None type='Allele' digest=None location=SequenceLocation(id=None, label=None, extensions=None, type='SequenceLocation', digest=None, sequence=IRI(root='SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'), start=55181319, end=55181320) state=LiteralSequenceExpression(id=None, label=None, extensions=None, type='LiteralSequenceExpression', sequence=SequenceString(root='T'))


In [15]:
# Run identify_all on the Allele and print the result as JSON. In the recorded
# output the result carries a "digest" key on the Allele, on its
# SequenceLocation, and on the nested SequenceReference.
allele_identified = identify_all(allele)
pretty_print(allele_identified)

{
  "location": {
    "type": "SequenceLocation",
    "start": 55181319,
    "end": 55181320,
    "sequence": {
      "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
      "type": "SequenceReference",
      "digest": "OFEyBMeo55q3QRrxAY5FiDqnkdyf0GTV"
    },
    "digest": "X0qrF7RfZxGIVIOddTYooZ_23D9mw6p6"
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "T"
  },
  "type": "Allele",
  "digest": "oFiLzDh37SoecjP7dceRaUfVlh32NnCg"
}


In [16]:
# Location: serialize it, then print its digest obtained two ways — hashing the
# serialized bytes with sha512t24u, and calling ga4gh_digest on the object.
location_serialized = ga4gh_serialize(allele.location)
print(f"Location serialized: {location_serialized}")
print(f"Location digest: {sha512t24u(location_serialized)}")
print(f"Location digest: {ga4gh_digest(allele.location)}")

# Allele: the same comparison for the whole object.
allele_serialized = ga4gh_serialize(allele)
print(f"Allele serialized: {allele_serialized}")
print(f"Allele digest: {sha512t24u(allele_serialized)}")
print(f"Allele digest: {ga4gh_digest(allele)}")

Location serialized: b'{"end":55181320,"sequence":"OFEyBMeo55q3QRrxAY5FiDqnkdyf0GTV","start":55181319,"type":"SequenceLocation"}'
Location digest: X0qrF7RfZxGIVIOddTYooZ_23D9mw6p6
Location digest: X0qrF7RfZxGIVIOddTYooZ_23D9mw6p6
Allele serialized: b'{"location":"X0qrF7RfZxGIVIOddTYooZ_23D9mw6p6","state":{"sequence":"T","type":"LiteralSequenceExpression"},"type":"Allele"}'
Allele digest: oFiLzDh37SoecjP7dceRaUfVlh32NnCg
Allele digest: oFiLzDh37SoecjP7dceRaUfVlh32NnCg


In [17]:
# Benchmark: mean wall-clock seconds per ga4gh_digest call on the Allele.
# `ct` is the iteration count; the later timing loops read the same variable.
ct = 100
start = timer()
for i in range(ct):
    digest = ga4gh_digest(allele)
end = timer()
print(f"Average duration: {(end - start) / ct}")

Average duration: 0.00015168126003118232


In [18]:
# ClinVar records that the nested objects below are modelled on:
# genotype
# https://www.ncbi.nlm.nih.gov/clinvar/variation/431013/
#   haplotype
#   https://www.ncbi.nlm.nih.gov/clinvar/variation/431012/
#     allele
#     https://www.ncbi.nlm.nih.gov/clinvar/variation/383650/
#     https://www.ncbi.nlm.nih.gov/clinvar/variation/417816/
# TODO other simple allele in genotype
#   (presumably allele_280320 below — its ClinVar link is not listed; confirm)
#
# NOTE(review): every refgetAccession in this cell carries a 'ga4gh:' prefix,
# whereas the earlier allele_dict uses a bare 'SQ.' accession — confirm which
# form is intended.

# NC_000009.12:g.128325835C>T
allele_383650_dict = {
    "type": "Allele",
    "location": {
        "type": "SequenceLocation",
        "sequence": {
            "type": "SequenceReference",
            "refgetAccession": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"
        },
        "start": 128325834,
        "end": 128325835
    },
    "state": {
        "type": "LiteralSequenceExpression",
        "sequence": "T"
    }
}
# Second single-base allele on the same sequence, state 'T' at 128325809-128325810.
allele_417816_dict = {
    "type": "Allele",
    "location": {
        "type": "SequenceLocation",
        "sequence": {
            "type": "SequenceReference",
            "refgetAccession": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"
        },
        "start": 128325809,
        "end": 128325810
    },
    "state": {
        "type": "LiteralSequenceExpression",
        "sequence": "T"
    }
}
# Allele whose 12-base interval (128322879-128322891) has the literal state 'G'.
allele_280320_dict = {
      "type": "Allele",
      "location": {
        "type": "SequenceLocation",
        "sequence": {
            "type": "SequenceReference",
            "refgetAccession": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"
        },
        "start": 128322879,
        "end": 128322891
      },
      "state": {
        "type": "LiteralSequenceExpression",
        "sequence": "G"
      }
    }
# Build a model object from each dict.
allele_383650 = Allele(**allele_383650_dict)
allele_417816 = Allele(**allele_417816_dict)
allele_280320 = Allele(**allele_280320_dict)

# Haplotype whose members are the two allele dicts defined above.
haplotype_431012_dict = {"members": [allele_383650_dict, allele_417816_dict]}
haplotype_431012 = Haplotype(**haplotype_431012_dict)
#pretty_print(haplotype_431012.model_dump(exclude_none=True))
haplotype_431012_serialized = ga4gh_serialize(haplotype_431012)
#print(f"Haplotype serialized: {haplotype_431012_serialized}")
#print(f"Haplotype digest: {ga4gh_digest(haplotype_431012)}")


# Benchmark: mean wall-clock seconds per ga4gh_digest call on the Haplotype.
# `ct` is the iteration count set by the earlier Allele timing cell.
start = timer()
for i in range(ct):
    digest = ga4gh_digest(haplotype_431012)
end = timer()
print(f"Average duration: {(end - start) / ct}")

Average duration: 0.0003508283200790174


In [19]:
# Genotype with two members: the haplotype dict and the allele_280320 dict, one copy each.
genotype_431013_dict = {
    "type": "Genotype",
    "count": 1,
    "members": [
        {"type": "GenotypeMember", "variation": haplotype_431012_dict, "count": 1},
        {"type": "GenotypeMember", "variation": allele_280320_dict, "count": 1},
    ],
}
genotype_431013 = Genotype(**genotype_431013_dict)

# pretty_print(genotype_431013.model_dump(exclude_none=True))
genotype_431013_serialized = ga4gh_serialize(genotype_431013)
#print(f"Genotype serialized: {genotype_431013_serialized}")
#print(f"Genotype digest: {ga4gh_digest(genotype_431013)}")

# Benchmark: mean wall-clock seconds per ga4gh_digest call on the Genotype.
# `ct` is the iteration count set by the earlier Allele timing cell.
start = timer()
for i in range(ct):
    digest = ga4gh_digest(genotype_431013)
end = timer()
print(f"Average duration: {(end - start) / ct}")


Average duration: 0.0007112802198389546


In [20]:
import ga4gh.vrs.extras.translator
import ga4gh.vrs.dataproxy
import ga4gh.vrs
importlib.reload(ga4gh.vrs.extras.translator)
importlib.reload(ga4gh.vrs.dataproxy)
importlib.reload(ga4gh.vrs)

from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
from biocommons.seqrepo import SeqRepo


import os

# Directory of the local SeqRepo instance handed to SeqRepo(). The path used to
# be hard-coded to one developer's machine; it can now be overridden by setting
# the SEQREPO_INSTANCE_DIR environment variable before starting the kernel.
# The fallback is the original path, so behavior is unchanged when it is unset.
SEQREPO_INSTANCE_DIR = os.environ.get(
    "SEQREPO_INSTANCE_DIR",
    "/Users/kferrite/dev/biocommons.seqrepo/seqrepo/2021-01-29",
)

# SeqRepo-backed data proxy and a Translator that uses it.
data_proxy = SeqRepoDataProxy(SeqRepo(SEQREPO_INSTANCE_DIR))
translator = Translator(data_proxy=data_proxy)
# translator._from_beacon("13 : 32936732 G > C")
# data_proxy.get_metadata('GRCh38:13')

# SPDI expression with the same coordinates and state as allele_383650_dict.
spdi_383650 = 'NC_000009.12:128325834:C:T'
# Last expression of the cell, so the translated Allele is shown as the cell output.
translator._from_spdi(spdi_383650)

  Expected `Union[definition-ref, SequenceLocation]` but got `SequenceLocation` - serialized value may not be as expected
  Expected `Union[IRI, SequenceReference]` but got `str` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Allele(id='ga4gh:VA.UcWOtAQ-g2vFcpXWj1iS_vZJmRf8S2a2', label=None, extensions=None, type='Allele', digest=None, location=SequenceLocation(id='ga4gh:SL.50MiHlbmBi89q-SdWLoAzo4QvMeFhJSr', label=None, extensions=None, type='SequenceLocation', digest=None, sequence=IRI(root='ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'), start=128325834, end=128325835), state=LiteralSequenceExpression(id=None, label=None, extensions=None, type='LiteralSequenceExpression', sequence=SequenceString(root='T')))

In [21]:
# Print the Allele as a dict with None-valued fields omitted.
pretty_print(allele_280320.model_dump(exclude_none=True))
# Normalize it using the SeqRepo-backed data proxy. This is the last expression,
# so the returned Allele is shown as the cell output; in the recorded output its
# start, end and state are the same as the input's.
ga4gh.vrs.normalize(allele_280320, data_proxy=data_proxy)

{
  "type": "Allele",
  "location": {
    "type": "SequenceLocation",
    "sequence": {
      "type": "SequenceReference",
      "refgetAccession": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"
    },
    "start": 128322879,
    "end": 128322891
  },
  "state": {
    "type": "LiteralSequenceExpression",
    "sequence": "G"
  }
}


Allele(id=None, label=None, extensions=None, type='Allele', digest=None, location=SequenceLocation(id=None, label=None, extensions=None, type='SequenceLocation', digest=None, sequence=SequenceReference(id=None, label=None, extensions=None, type='SequenceReference', digest=None, refgetAccession='ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', residueAlphabet=None), start=128322879, end=128322891), state=LiteralSequenceExpression(id=None, label=None, extensions=None, type='LiteralSequenceExpression', sequence=SequenceString(root='G')))