In [1]:
# Worked example: RefSeq transcript NM_000314.6 (PTEN), chromosome 10.

# --- Transcript coordinates (on the NM_000314.6 sequence) -------------------
# The exon intervals and the CDS interval below, expressed on the
# NM_000314.6 sequence, together constitute the definition of this transcript.
t_exons = [
    (0, 1110),
    (1110, 1195),
    (1195, 1240),
    (1240, 1284),
    (1284, 1523),
    (1523, 1665),
    (1665, 1832),
    (1832, 2057),
    (2057, 8701),
]
t_cds = (1031, 2243)

# --- Genomic coordinates -----------------------------------------------------
# NM_000314.6 as aligned (by NCBI) to the NC_000010.11 sequence
# (GRCh38 chr 10), + strand.
g_exons = [
    (87863437, 87864548),
    (87894024, 87894109),
    (87925512, 87925557),
    (87931045, 87931089),
    (87933012, 87933251),
    (87952117, 87952259),
    (87957852, 87958019),
    (87960893, 87961118),
    (87965286, 87971930),
]

# Cigars of the alignment, expressed relative to the transcript.
tg_cigars = "666=1I39=1X404= 85= 45= 44= 239= 142= 167= 225= 6644="

# g_cds is t_cds projected onto the genome, accounting for the alignment:
#   start = first genomic exon start + 1032, where 1032 = 1031 (t_cds start)
#           + 1 for the 1I in the cigar
#   end   = last genomic exon start + (t_cds end - last transcript exon start)
g_cds = (
    g_exons[0][0] + t_cds[0] + 1,
    g_exons[-1][0] + t_cds[1] - t_exons[-1][0],
)


# The ga4gh sequence identifiers for NM_000314.6 (t_sequence_id) and
# NC_000010.11 (g_sequence_id) are hard-coded here.
# In real implementations, do something like this:
#   from ga4gh.vrs.dataproxy import SeqRepoDataProxy
#   from biocommons.seqrepo import SeqRepo
#   dp = SeqRepoDataProxy(SeqRepo(root_dir="/usr/local/share/seqrepo/latest"))
#   t_sequence_id = dp.translate_sequence_identifier("refseq:NM_000314.6", "ga4gh")[0]
#   g_sequence_id = dp.translate_sequence_identifier("refseq:NC_000010.11", "ga4gh")[0]
t_sequence_id = 'ga4gh:SQ.7YNhHjHLiBJwNd43xjLJA7jjnuJwPhxn'
g_sequence_id = 'ga4gh:SQ.ss8r_wB0-b9r44TQTMmVTI92884QvBiB'

In [2]:
from ga4gh.core import ga4gh_identify, ga4gh_serialize
from ga4gh.vrs import models

## Define Transcripts

In [3]:
# Build one Transcript object per coordinate system. Each takes the sequence
# identifier, the exon list converted to SimpleInterval objects, and the CDS
# as a single SimpleInterval.

# Transcript defined on the transcript sequence itself
t_transcript = models.Transcript(
    sequence_id=t_sequence_id,
    exons=[models.SimpleInterval(start=lo, end=hi) for lo, hi in t_exons],
    cds=models.SimpleInterval(start=t_cds[0], end=t_cds[1]),
)

# The same transcript defined on the genomic sequence
g_transcript = models.Transcript(
    sequence_id=g_sequence_id,
    exons=[models.SimpleInterval(start=lo, end=hi) for lo, hi in g_exons],
    cds=models.SimpleInterval(start=g_cds[0], end=g_cds[1]),
)

In [4]:
# Display the serialized form of t_transcript. In the recorded output this is
# a bytes JSON document in which sequence_id appears without the "ga4gh:SQ."
# prefix it was constructed with.
ga4gh_serialize(t_transcript)

b'{"cds":{"end":2243,"start":1031,"type":"SimpleInterval"},"exons":[{"end":1110,"start":0,"type":"SimpleInterval"},{"end":1195,"start":1110,"type":"SimpleInterval"},{"end":1240,"start":1195,"type":"SimpleInterval"},{"end":1284,"start":1240,"type":"SimpleInterval"},{"end":1523,"start":1284,"type":"SimpleInterval"},{"end":1665,"start":1523,"type":"SimpleInterval"},{"end":1832,"start":1665,"type":"SimpleInterval"},{"end":2057,"start":1832,"type":"SimpleInterval"},{"end":8701,"start":2057,"type":"SimpleInterval"}],"sequence_id":"7YNhHjHLiBJwNd43xjLJA7jjnuJwPhxn","type":"Transcript"}'

In [5]:
# Compute the identifier of each Transcript object (transcript-based first,
# then genome-based) and display the pair.
t_transcript_id, g_transcript_id = map(ga4gh_identify, (t_transcript, g_transcript))
t_transcript_id, g_transcript_id

('ga4gh:X_GT.nTRjcOgzR6_owupcO39owUADNZeFY0d3',
 'ga4gh:X_GT.pTvbXNiGxYhmHFvCn7FeBnAFb14DzAZr')

## Transcript Locations

In [6]:
# Interval whose two endpoints share one base position and differ only in
# offset (-5 and -4). The base is the start of the second genomic exon
# measured from the start of the first: 87894024 - 87863437 = 30587.
boi = models.BaseOffsetInterval(
    datum="txstart",
    start=models.BaseOffset(base=g_exons[1][0] - g_exons[0][0], offset=-5),
    end=models.BaseOffset(base=g_exons[1][0] - g_exons[0][0], offset=-4),
)

In [7]:
# Attach the base/offset interval to the genome-based transcript.
tl = models.TranscriptLocation(transcript_id=g_transcript_id, interval=boi)

In [8]:
# Display the TranscriptLocation as a plain dict.
tl.as_dict()

{'interval': {'datum': 'txstart',
  'end': {'base': 30587, 'offset': -4},
  'start': {'base': 30587, 'offset': -5},
  'type': 'BaseOffsetInterval'},
 'transcript_id': 'ga4gh:X_GT.pTvbXNiGxYhmHFvCn7FeBnAFb14DzAZr',
 'type': 'TranscriptLocation'}

In [9]:
# Display the identifier computed for the TranscriptLocation.
ga4gh_identify(tl)

'ga4gh:X_GTL.z1eEhvuD5p1dVvpXvLA5O1pW8BuzrRcK'

## Transcript Allele

In [10]:
# Allele at the transcript location above, with sequence state "C".
a = models.Allele(
    location=tl,
    state=models.SequenceState(sequence="C"),
)

In [11]:
# Display the Allele as a plain dict; the TranscriptLocation is nested under
# 'location' in the recorded output.
a.as_dict()

{'location': {'interval': {'datum': 'txstart',
   'end': {'base': 30587, 'offset': -4},
   'start': {'base': 30587, 'offset': -5},
   'type': 'BaseOffsetInterval'},
  'transcript_id': 'ga4gh:X_GT.pTvbXNiGxYhmHFvCn7FeBnAFb14DzAZr',
  'type': 'TranscriptLocation'},
 'state': {'sequence': 'C', 'type': 'SequenceState'},
 'type': 'Allele'}

In [12]:
# Display the identifier computed for the Allele.
ga4gh_identify(a)

'ga4gh:VA.u0FgwFlVBy2gf3PXSW2xWREdA7Tb8PKf'

## Transcript Feature Locations

In [13]:
# A TranscriptFeature is built from a feature type and an index; this one is
# the exon with index 0. Display it as a plain dict.
tf = models.TranscriptFeature(feature_type="exon", index=0)
tf.as_dict()

{'feature_type': 'exon', 'index': 0}

In [14]:
# Interval expressed in transcript features rather than coordinates:
# it starts at the exon with index 0 and ends at the exon with index 5.
# NOTE(review): whether the end feature is inclusive is not shown here — confirm.
tfi = models.TranscriptFeatureInterval(
    start=models.TranscriptFeature(feature_type="exon", index=0),
    end=models.TranscriptFeature(feature_type="exon", index=5),
)

In [15]:
# Attach the feature-based interval to the genome-based transcript, then
# display the resulting TranscriptLocation as a plain dict.
tfl = models.TranscriptLocation(transcript_id=g_transcript_id, interval=tfi)
tfl.as_dict()

{'interval': {'end': {'feature_type': 'exon', 'index': 5},
  'start': {'feature_type': 'exon', 'index': 0},
  'type': 'TranscriptFeatureInterval'},
 'transcript_id': 'ga4gh:X_GT.pTvbXNiGxYhmHFvCn7FeBnAFb14DzAZr',
 'type': 'TranscriptLocation'}

In [16]:
# Display the identifier computed for the feature-based TranscriptLocation.
ga4gh_identify(tfl)

'ga4gh:X_GTL.ZZJaNU94bUGrT0Z_y4xfWf2rWhvv7L0O'

## Scraps

In [17]:
# Scratch experiment with models.Transcript2: exons are passed as plain
# (start, end) tuples instead of SimpleInterval objects, and the sequence_id
# is a placeholder ("refseq:foo").
# NOTE(review): Transcript2 looks like an experimental alternative to
# models.Transcript — confirm before relying on it.
t2 = models.Transcript2(
    sequence_id="refseq:foo",
    exons=[(0,10),(20,30)])

In [18]:
# Display the Transcript2 object as a plain dict. In the recorded output the
# exon tuples come back as two-element lists and 'type' is 'Transcript'.
t2.as_dict()

{'exons': [[0, 10], [20, 30]],
 'sequence_id': 'refseq:foo',
 'type': 'Transcript'}