Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix wrong coordinate from genomic indels, Fix no ref/alt errors. Add genePred support to allow users to easily load their own transcripts #53

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 61 additions & 39 deletions pyhgvs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ def genomic_to_cdna_coord(transcript, genomic_coord):

def get_allele(hgvs, genome, transcript=None):
"""Get an allele from a HGVSName, a genome, and a transcript."""
chrom, start, end = hgvs.get_coords(transcript)
chrom, start, end = hgvs.get_ref_coords(transcript)
_, alt = hgvs.get_ref_alt(
transcript.tx_position.is_forward_strand if transcript else True)
ref = get_genomic_sequence(genome, chrom, start, end)
Expand All @@ -673,16 +673,31 @@ def get_vcf_allele(hgvs, genome, transcript=None):
ref = get_genomic_sequence(genome, chrom, start, end)

if hgvs.mutation_type in _indel_mutation_types:
if hgvs.mutation_type == 'dup':
# Sometimes we need to retrieve alt from reference, eg:
# No alt supplied: NM_000492.3:c.1155_1156dup
# Number used: NM_004119.2(FLT3):c.1794_1811dup18
# We *know* what the sequence is for "dup18", but not for "ins18"
if not hgvs.alt_allele or re.match("^N+$", hgvs.alt_allele):
alt = get_alt_from_sequence(hgvs, genome, transcript)

# Left-pad alternate allele.
alt = ref[0] + alt
return chrom, start, end, ref, alt


def get_alt_from_sequence(hgvs, genome, transcript):
""" returns ready for VCF """

chrom, start, end = hgvs.get_raw_coords(transcript)
return get_genomic_sequence(genome, chrom, start, end)


def matches_ref_allele(hgvs, genome, transcript=None):
"""Return True if reference allele matches genomic sequence."""
ref, alt = hgvs.get_ref_alt(
ref, _ = hgvs.get_ref_alt(
transcript.tx_position.is_forward_strand if transcript else True)
chrom, start, end = hgvs.get_coords(transcript)
chrom, start, end = hgvs.get_ref_coords(transcript)
genome_ref = get_genomic_sequence(genome, chrom, start, end)
return genome_ref == ref

Expand Down Expand Up @@ -1146,8 +1161,8 @@ def format_genome(self):
"""
return self.format_coords() + self.format_dna_allele()

def get_coords(self, transcript=None):
"""Return genomic coordinates of reference allele."""
def get_raw_coords(self, transcript=None):
""" return genomic coordinates """
if self.kind == 'c':
chrom = transcript.tx_position.chrom
start = cdna_to_genomic_coord(transcript, self.cdna_start)
Expand All @@ -1158,37 +1173,41 @@ def get_coords(self, transcript=None):
raise AssertionError(
"cdna_start cannot be greater than cdna_end")
start, end = end, start
else:
if start > end:
raise AssertionError(
"cdna_start cannot be greater than cdna_end")

if self.mutation_type == "ins":
# Inserts have empty interval.
if start < end:
start += 1
end -= 1
else:
end = start - 1

elif self.mutation_type == "dup":
end = start - 1

if start > end:
raise AssertionError(
"cdna_start cannot be greater than cdna_end")
elif self.kind == 'g':
chrom = self.chrom
start = self.start
end = self.end

else:
raise NotImplementedError(
'Coordinates are not available for this kind of HGVS name "%s"'
% self.kind)

return chrom, start, end

def get_ref_coords(self, transcript=None):
"""Return genomic coordinates of reference allele."""

chrom, start, end = self.get_raw_coords(transcript)

if self.mutation_type == "ins":
# Inserts have empty interval.
if start < end:
start += 1
end -= 1
else:
end = start - 1

elif self.mutation_type == "dup":
end = start - 1
return chrom, start, end

def get_vcf_coords(self, transcript=None):
"""Return genomic coordinates of reference allele in VCF-style."""
chrom, start, end = self.get_coords(transcript)
chrom, start, end = self.get_ref_coords(transcript)

# Inserts and deletes require left-padding by 1 base
if self.mutation_type in ("=", ">"):
Expand Down Expand Up @@ -1344,7 +1363,8 @@ def hgvs_normalize_variant(chrom, offset, ref, alt, genome, transcript=None):

def parse_hgvs_name(hgvs_name, genome, transcript=None,
get_transcript=lambda name: None,
flank_length=30, normalize=True, lazy=False):
flank_length=30, normalize=True, lazy=False,
indels_start_with_same_base=True):
"""
Parse an HGVS name into (chrom, start, end, ref, alt)

Expand All @@ -1353,15 +1373,16 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None,
transcript: Transcript corresponding to HGVS name.
normalize: If True, normalize allele according to VCF standard.
lazy: If True, discard version information from incoming transcript/gene.
indels_start_with_same_base: When normalizing, don't strip common prefix from indels
"""
hgvs = HGVSName(hgvs_name)

# Determine transcript.
if hgvs.kind == 'c' and not transcript:
if '.' in hgvs.transcript and lazy:
hgvs.transcript, version = hgvs.transcript.split('.')
hgvs.transcript, _ = hgvs.transcript.split('.')
elif '.' in hgvs.gene and lazy:
hgvs.gene, version = hgvs.gene.split('.')
hgvs.gene, _ = hgvs.gene.split('.')
if get_transcript:
if hgvs.transcript:
transcript = get_transcript(hgvs.transcript)
Expand All @@ -1377,11 +1398,12 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None,
transcript.tx_position.chrom_stop,
hgvs.transcript)

chrom, start, end, ref, alt = get_vcf_allele(hgvs, genome, transcript)
chrom, start, _, ref, alt = get_vcf_allele(hgvs, genome, transcript)
if normalize:
chrom, start, ref, [alt] = normalize_variant(
chrom, start, ref, [alt], genome,
flank_length=flank_length).variant
nv = normalize_variant(chrom, start, ref, [alt], genome,
flank_length=flank_length,
indels_start_with_same_base=indels_start_with_same_base)
chrom, start, ref, [alt] = nv.variant
return (chrom, start, ref, alt)


Expand All @@ -1408,11 +1430,19 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript,
hgvs = HGVSName()

# Populate coordinates.
if mutation_type == 'ins':
# Insert uses coordinates around the insert point.
offset_start = offset - 1
offset_end = offset
else:
offset_start = offset
offset_end = offset + len(ref) - 1

if not transcript:
# Use genomic coordinate when no transcript is available.
hgvs.kind = 'g'
hgvs.start = offset
hgvs.end = offset + len(ref) - 1
hgvs.start = offset_start
hgvs.end = offset_end
else:
# Use cDNA coordinates.
hgvs.kind = 'c'
Expand All @@ -1425,14 +1455,6 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript,
hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset)
hgvs.cdna_end = hgvs.cdna_start
else:
# Use a range of coordinates.
if mutation_type == 'ins':
# Insert uses coordinates around the insert point.
offset_start = offset - 1
offset_end = offset
else:
offset_start = offset
offset_end = offset + len(ref) - 1
if transcript.strand == '-':
offset_start, offset_end = offset_end, offset_start
hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start)
Expand Down
2 changes: 1 addition & 1 deletion pyhgvs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, name):


class Transcript(object):
"""RefGene Transcripts for hg19
"""

A gene may have multiple transcripts with different combinations of exons.
"""
Expand Down
11 changes: 11 additions & 0 deletions pyhgvs/tests/data/test_name_to_variant.genome
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ chr1 76205650 76205691 TTTATATATTCAAGGCTTATTGTGTAACAGAACCTGGAGCA
chr1 76205639 76205669 TTTCTCTTGTTTTTATATATTCAAGGCTTA
chr1 76205670 76205700 TGTGTAACAGAACCTGGAGCAGGCTCTGAT
chr1 76227048 76227049 C
chr1 76227049 76227050 T
chr1 76227029 76227070 TAATGAGGGATGCCAAAATCTATCAGGTAAGGTTAAAGATG
chr1 76227019 76227049 GTAGAAAAACTAATGAGGGATGCCAAAATC
chr1 76227049 76227079 TATCAGGTAAGGTTAAAGATGATTTTTTTG
Expand All @@ -84,7 +85,13 @@ chr7 117176661 117176664 TAT
chr7 117176642 117176684 CCTCAGAAATGATTGAAAATATCCAATCTGTTAAGGCATACT
chr7 117176630 117176660 GACTTGTGATTACCTCAGAAATGATTGAAA
chr7 117176662 117176692 ATCCAATCTGTTAAGGCATACTGCTGGGAA
chr7 117182084 117182126 CAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAA
chr7 117182103 117182104 A
chr7 117182103 117182106 AAT
chr7 117182106 117182109 ATA
chr7 117182106 117182107 A
chr7 117182107 117182109 TA
chr7 117182077 117182107 TACAAAAGCAAGAATATAAGACATTGGAATA
chr7 117182087 117182129 GAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTA
chr7 117182074 117182104 CTTACAAAAGCAAGAATATAAGACATTGGA
chr7 117182104 117182134 ATATAACTTAACGACTACAGAAGTAGTGAT
Expand All @@ -98,7 +105,11 @@ chr17 41245340 41245341 T
chr17 41245340 41245341 T
chr7 117307164 117307165 A
chr11 17496507 17496508 T
chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA
chr17 7126009 7126051 CATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGT
chr17 7126027 7126037 AGCAGAGGTG
chr17 7126028 7126029 A
chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG
chr1 76216233 76216235 AA
chr1 76199213 76199220 AGGTCTT
chr1 76199194 76199240 AACATTTTGATACTGTAGGAGGTCTTGGACTTGGAACTTTTGATGC
Expand Down
6 changes: 6 additions & 0 deletions pyhgvs/tests/data/test_variant_to_name.genome
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,9 @@ chr7 117188682 117188712 tTTTTTTAACAGGGATTTGGGGAATTATTT
chr7 117188582 117188783 CCATGTGCTTTTCAAACTAATTGTACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGAtgtgtgtgtgtgtgtgtgtgtgtTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGTAATTT
chr7 117188687 117188689 TT
chr7 117188689 117188691 AA
chr17 7125928 7126129 TCCCCCAGTGACAACCTGTTGAACACACCTCTGCTTTCCCACACTGCCCTGACACAGTGGGCCCCCTGAGAAGAAGATGGGCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGTACGGGTGCCATCGGAGAACGTGCTGGGTGAGGTTGGGAGTGGCTTCAAGGTTGCCATGCACATCCTCAACAATGGAAG
chr17 7126008 7126050 GCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAG
chr17 7126027 7126029 AG
chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG
chr17 7126029 7126031 CA
chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA
6 changes: 4 additions & 2 deletions pyhgvs/tests/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,11 @@ def __init__(self, lookup=None, filename=None, db_filename=None,
if SequenceFileDB is None:
raise ValueError('pygr is not available.')
self._genome = SequenceFileDB(db_filename)
self._source_filename = db_filename
elif filename:
# Read genome sequence from lookup table.
self.read(filename)
self._source_filename = filename

def __contains__(self, chrom):
"""Return True if genome contains chromosome."""
Expand Down Expand Up @@ -113,8 +115,8 @@ def get_seq(self, chrom, start, end):
None, end - start))
else:
raise MockGenomeError(
'Sequence not in test data: %s:%d-%d' %
(chrom, start, end))
'Sequence not in test data: %s:%d-%d source: %s' %
(chrom, start, end, self._source_filename))

def read(self, filename):
"""Read a sequence lookup table from a file.
Expand Down
63 changes: 29 additions & 34 deletions pyhgvs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,57 +11,52 @@


def read_refgene(infile):
"""
Iterate through a refGene file.
""" refGene = genePred with extra column at front (and ignored ones after) """
return read_genepred(infile, skip_first_column=True)


def read_genepred(infile, skip_first_column=False):
"""
GenePred extension format:
http://genome.ucsc.edu/FAQ/FAQformat.html#GenePredExt

Column definitions:
0. uint undocumented id
1. string name; "Name of gene (usually transcript_id from GTF)"
2. string chrom; "Chromosome name"
3. char[1] strand; "+ or - for strand"
4. uint txStart; "Transcription start position"
5. uint txEnd; "Transcription end position"
6. uint cdsStart; "Coding region start"
7. uint cdsEnd; "Coding region end"
8. uint exonCount; "Number of exons"
9. uint[exonCount] exonStarts; "Exon start positions"
10. uint[exonCount] exonEnds; "Exon end positions"
11. uint id; "Unique identifier"
12. string name2; "Alternate name (e.g. gene_id from GTF)"
13. string cdsStartStat; "enum('none','unk','incmpl','cmpl')"
14. string cdsEndStat; "enum('none','unk','incmpl','cmpl')"
15. lstring exonFrames; "Exon frame offsets {0,1,2}"
0. string name; "Name of gene (usually transcript_id from GTF)"
1. string chrom; "Chromosome name"
2. char[1] strand; "+ or - for strand"
3. uint txStart; "Transcription start position"
4. uint txEnd; "Transcription end position"
5. uint cdsStart; "Coding region start"
6. uint cdsEnd; "Coding region end"
7. uint exonCount; "Number of exons"
8. uint[exonCount] exonStarts; "Exon start positions"
9. uint[exonCount] exonEnds; "Exon end positions"
10. uint id; "Unique identifier"
11. string name2; "Alternate name (e.g. gene_id from GTF)"
"""
for line in infile:
# Skip comments.
if line.startswith('#'):
continue
row = line.rstrip('\n').split('\t')
if len(row) != 16:
raise ValueError(
'File has incorrect number of columns '
'in at least one line.')
if skip_first_column:
row = row[1:]

# Skip trailing ,
exon_starts = list(map(int, row[9].split(',')[:-1]))
exon_ends = list(map(int, row[10].split(',')[:-1]))
exon_frames = list(map(int, row[15].split(',')[:-1]))
exon_starts = list(map(int, row[8].split(',')[:-1]))
exon_ends = list(map(int, row[9].split(',')[:-1]))
exons = list(zip(exon_starts, exon_ends))

yield {
'chrom': row[2],
'start': int(row[4]),
'end': int(row[5]),
'id': row[1],
'strand': row[3],
'cds_start': int(row[6]),
'cds_end': int(row[7]),
'gene_name': row[12],
'chrom': row[1],
'start': int(row[3]),
'end': int(row[4]),
'id': row[0],
'strand': row[2],
'cds_start': int(row[5]),
'cds_end': int(row[6]),
'gene_name': row[11],
'exons': exons,
'exon_frames': exon_frames
}


Expand Down
Loading