# Modeling Genotypes and PGx Star Alleles with VRS

In [30]:
from ga4gh.vrs import models, vrs_deref, vrs_enref
from ga4gh.core import ga4gh_identify, ga4gh_serialize, ga4gh_digest, ga4gh_deref

import re
import yaml
def ppo(o, indent=3):
    """Pretty-print an object as YAML.

    The object is converted with its ``as_dict()`` method, dumped to YAML
    with keys sorted, and the resulting text is written to stdout.

    Args:
        o: any object exposing an ``as_dict()`` method.
        indent: number of spaces per YAML nesting level (default 3).

    Returns:
        None; the YAML text is printed, not returned.
    """
    as_yaml = yaml.dump(o.as_dict(), sort_keys=True, indent=indent)
    print(as_yaml)
    
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy
from biocommons.seqrepo import SeqRepo
from ga4gh.vrs.extras.translator import Translator

In [14]:
# Base URL of the SeqRepo REST service that the data proxy talks to.
seqrepo_rest_service_url = "https://services.genomicmedlab.org/seqrepo"
# Data proxy backed by that REST service.
dp = SeqRepoRESTDataProxy(base_url=seqrepo_rest_service_url)

# Translator used by the cells below to turn HGVS expressions into VRS objects.
tlr = Translator(data_proxy=dp)

## Representing PharmGKB CYP2C19 *1/*17


In [15]:
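# "Haplotype" 1 (CYP2C19 *1): https://www.pharmgkb.org/haplotype/PA165980634
# PharmGKB lists this as a haplotype, but it is represented below by a single
# allele rather than a VRS Haplotype -- hence the quotation marks.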

In [16]:
# Genomic HGVS expression of the one allele used below to represent "Haplotype" 1.
hgvs_expression = "NC_000010.11:g.94842866A>G"

In [17]:
# Translate the HGVS string into a VRS Allele; it becomes the variation of the
# first GenotypeMember further down.
h1_allele1 = tlr.translate_from(hgvs_expression,'hgvs')

In [18]:
# Haplotype 2: https://www.pharmgkb.org/haplotype/PA165816533
# It is built from two alleles, each written as a genomic HGVS expression:
#   H2 Allele 1: https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=rs12248560
#   H2 Allele 2: https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=rs3758581
hgvs_expression1 = "NC_000010.11:g.94761900C>T"
hgvs_expression2 = "NC_000010.11:g.94842866A>G"

# Translate each expression, in order, into the VRS object used as a Haplotype member.
h2_allele1 = tlr.translate_from(hgvs_expression1, 'hgvs')
h2_allele2 = tlr.translate_from(hgvs_expression2, 'hgvs')

In [19]:
# Combine the two translated alleles into a single VRS Haplotype.
h2 = models.Haplotype(members=[h2_allele1, h2_allele2])

In [20]:
# First genotype member: one copy of the "Haplotype" 1 allele, used directly
# as the variation (it is not wrapped in a Haplotype).
gt_mem1 = models.GenotypeMember(copies=models.Number(value=1), variation=h1_allele1)

In [21]:
# Second genotype member: one copy of Haplotype 2.
gt_mem2 = models.GenotypeMember(
    copies=models.Number(value=1),
    variation=h2,
)

# The Genotype holds both members; its own copies value is set to 2.
gt = models.Genotype(
    copies=models.Number(value=2),
    members=[gt_mem1, gt_mem2],
)

In [22]:
# Compute the GA4GH identifier for the genotype and store it on the object as _id.
gt._id = ga4gh_identify(gt)

In [23]:
# Print the identified Genotype (including its _id) as YAML using the ppo helper.
ppo(gt)

_id: ga4gh:GT.ln8ybjp_6bQLyjlfRLmoZpo4oGu-Srnn
copies:
   type: Number
   value: 2
members:
-  copies:
      type: Number
      value: 1
   type: GenotypeMember
   variation:
      _id: ga4gh:VA.geQCxa1Enel8UBUAQQ2-rbphDjIR-cq0
      location:
         interval:
            end:
               type: Number
               value: 94842866
            start:
               type: Number
               value: 94842865
            type: SequenceInterval
         sequence_id: ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB
         type: SequenceLocation
      state:
         sequence: G
         type: LiteralSequenceExpression
      type: Allele
-  copies:
      type: Number
      value: 1
   type: GenotypeMember
   variation:
      members:
      -  _id: ga4gh:VA.jWqv036CdZJs4YjwEYptDIBcoT7Uxv5I
         location:
            interval:
               end:
                  type: Number
                  value: 94761900
               start:
                  type: Number
                  value: 

## Developing a community schema for extending Star Allele Genotypes with definitive Sequence Locations

A list of definitive sites was extracted from the CYP2C19 Allele Definition Table:
https://files.cpicpgx.org/data/report/current/allele_definition/CYP2C19_allele_definition_table.xlsx

In [26]:
# The 34 definitive CYP2C19 sites, one "g.<position><ref>><alt>" string per site,
# kept in the same order as the original whitespace-separated listing.
variant_sites = [
    "g.94761900C>T", "g.94762706A>G", "g.94762712C>T", "g.94762715T>C",
    "g.94762755T>C", "g.94762760A>C", "g.94762788A>T", "g.94762856A>G",
    "g.94775106C>T", "g.94775121C>T", "g.94775160G>C", "g.94775185A>G",
    "g.94775367A>G", "g.94775416T>C", "g.94775423A>C", "g.94775453G>A",
    "g.94775489G>A", "g.94775507G>A", "g.94780574G>C", "g.94780579G>A",
    "g.94780653G>A", "g.94781858C>T", "g.94781859G>A", "g.94781944G>A",
    "g.94781999T>A", "g.94842861G>A", "g.94842866A>G", "g.94842879G>A",
    "g.94842995G>A", "g.94849995C>T", "g.94852738C>T", "g.94852765C>T",
    "g.94852785C>G", "g.94852914A>C",
]

In [31]:
# Pattern for one variant-site string such as "g.94761900C>T"; group 1 captures
# the numeric position.
# The dot after "g" is escaped so it only matches a literal ".", and the
# reference/alternate bases are restricted to A/C/G/T so that a trailing digit
# of the position can never be consumed as the reference base.
site_re = re.compile(r'g\.(\d+)[ACGT]>[ACGT]')

In [48]:
# Extract the numeric position from every site string (capture group 1 of
# site_re) as an int, preserving the order of variant_sites.
site_positions = [int(site_re.match(site).group(1)) for site in variant_sites]

These sites were used to build VRS [Sequence Location](https://vrs.ga4gh.org/en/stable/terms_and_model.html#sequencelocation) objects.

In [55]:
# One SequenceLocation dict per definitive site, in the order of site_positions.
# Each interval runs from position-1 to position, the same start/end shape the
# translator produced for g.94842866A>G in the Genotype output shown earlier
# (start 94842865, end 94842866). The sequence_id is the one that appears on
# that translated allele.
site_seqlocs = [
    models.SequenceLocation(
        sequence_id='ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB',
        interval=models.SequenceInterval(
            start=models.Number(value=pos - 1),
            end=models.Number(value=pos),
        ),
    ).as_dict()
    for pos in site_positions
]

These Sequence Locations, together with the `h1_allele1` Allele translated earlier, were used to build a domain-specific message structure for CYP2C19 *1:

In [61]:
# Domain-specific message as a plain dict: it pairs the allele representing *1
# (as a dict) with the list of definitive-site SequenceLocation dicts.
msg = {
    'type': 'StarAllele',
    'variation': h1_allele1.as_dict(),
    'definitive_sites': site_seqlocs
}

# msg is already a plain dict, so it is dumped directly instead of via ppo().
# To inspect it, run:
# print(yaml.dump(msg, sort_keys=True, indent=3))