diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index b4df00c..9b28e8f 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -655,7 +655,7 @@ def genomic_to_cdna_coord(transcript, genomic_coord): def get_allele(hgvs, genome, transcript=None): """Get an allele from a HGVSName, a genome, and a transcript.""" - chrom, start, end = hgvs.get_coords(transcript) + chrom, start, end = hgvs.get_ref_coords(transcript) _, alt = hgvs.get_ref_alt( transcript.tx_position.is_forward_strand if transcript else True) ref = get_genomic_sequence(genome, chrom, start, end) @@ -673,16 +673,31 @@ def get_vcf_allele(hgvs, genome, transcript=None): ref = get_genomic_sequence(genome, chrom, start, end) if hgvs.mutation_type in _indel_mutation_types: + if hgvs.mutation_type == 'dup': + # Sometimes we need to retrieve alt from reference, eg: + # No alt supplied: NM_000492.3:c.1155_1156dup + # Number used: NM_004119.2(FLT3):c.1794_1811dup18 + # We *know* what the sequence is for "dup18", but not for "ins18" + if not hgvs.alt_allele or re.match("^N+$", hgvs.alt_allele): + alt = get_alt_from_sequence(hgvs, genome, transcript) + # Left-pad alternate allele. alt = ref[0] + alt return chrom, start, end, ref, alt +def get_alt_from_sequence(hgvs, genome, transcript): + """ returns ready for VCF """ + + chrom, start, end = hgvs.get_raw_coords(transcript) + return get_genomic_sequence(genome, chrom, start, end) + + def matches_ref_allele(hgvs, genome, transcript=None): """Return True if reference allele matches genomic sequence.""" - ref, alt = hgvs.get_ref_alt( + ref, _ = hgvs.get_ref_alt( transcript.tx_position.is_forward_strand if transcript else True) - chrom, start, end = hgvs.get_coords(transcript) + chrom, start, end = hgvs.get_ref_coords(transcript) genome_ref = get_genomic_sequence(genome, chrom, start, end) return genome_ref == ref @@ -1146,8 +1161,8 @@ def format_genome(self): """ return self.format_coords() + self.format_dna_allele() - def get_coords(self, transcript=None): - """Return genomic coordinates of reference allele.""" + def get_raw_coords(self, transcript=None): + """ return genomic coordinates """ if self.kind == 'c': chrom = transcript.tx_position.chrom start = cdna_to_genomic_coord(transcript, self.cdna_start) @@ -1158,27 +1173,14 @@ def get_coords(self, transcript=None): raise AssertionError( "cdna_start cannot be greater than cdna_end") start, end = end, start - else: - if start > end: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") - - if self.mutation_type == "ins": - # Inserts have empty interval. - if start < end: - start += 1 - end -= 1 - else: - end = start - 1 - - elif self.mutation_type == "dup": - end = start - 1 + if start > end: + raise AssertionError( + "cdna_start cannot be greater than cdna_end") elif self.kind == 'g': chrom = self.chrom start = self.start end = self.end - else: raise NotImplementedError( 'Coordinates are not available for this kind of HGVS name "%s"' @@ -1186,9 +1188,26 @@ def get_coords(self, transcript=None): return chrom, start, end + def get_ref_coords(self, transcript=None): + """Return genomic coordinates of reference allele.""" + + chrom, start, end = self.get_raw_coords(transcript) + + if self.mutation_type == "ins": + # Inserts have empty interval. + if start < end: + start += 1 + end -= 1 + else: + end = start - 1 + + elif self.mutation_type == "dup": + end = start - 1 + return chrom, start, end + def get_vcf_coords(self, transcript=None): """Return genomic coordinates of reference allele in VCF-style.""" - chrom, start, end = self.get_coords(transcript) + chrom, start, end = self.get_ref_coords(transcript) # Inserts and deletes require left-padding by 1 base if self.mutation_type in ("=", ">"): @@ -1344,7 +1363,8 @@ def hgvs_normalize_variant(chrom, offset, ref, alt, genome, transcript=None): def parse_hgvs_name(hgvs_name, genome, transcript=None, get_transcript=lambda name: None, - flank_length=30, normalize=True, lazy=False): + flank_length=30, normalize=True, lazy=False, + indels_start_with_same_base=True): """ Parse an HGVS name into (chrom, start, end, ref, alt) @@ -1353,15 +1373,16 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, transcript: Transcript corresponding to HGVS name. normalize: If True, normalize allele according to VCF standard. lazy: If True, discard version information from incoming transcript/gene. + indels_start_with_same_base: When normalizing, don't strip common prefix from indels """ hgvs = HGVSName(hgvs_name) # Determine transcript. if hgvs.kind == 'c' and not transcript: if '.' in hgvs.transcript and lazy: - hgvs.transcript, version = hgvs.transcript.split('.') + hgvs.transcript, _ = hgvs.transcript.split('.') elif '.' in hgvs.gene and lazy: - hgvs.gene, version = hgvs.gene.split('.') + hgvs.gene, _ = hgvs.gene.split('.') if get_transcript: if hgvs.transcript: transcript = get_transcript(hgvs.transcript) @@ -1377,11 +1398,12 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, transcript.tx_position.chrom_stop, hgvs.transcript) - chrom, start, end, ref, alt = get_vcf_allele(hgvs, genome, transcript) + chrom, start, _, ref, alt = get_vcf_allele(hgvs, genome, transcript) if normalize: - chrom, start, ref, [alt] = normalize_variant( - chrom, start, ref, [alt], genome, - flank_length=flank_length).variant + nv = normalize_variant(chrom, start, ref, [alt], genome, + flank_length=flank_length, + indels_start_with_same_base=indels_start_with_same_base) + chrom, start, ref, [alt] = nv.variant return (chrom, start, ref, alt) @@ -1408,11 +1430,19 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, hgvs = HGVSName() # Populate coordinates. + if mutation_type == 'ins': + # Insert uses coordinates around the insert point. + offset_start = offset - 1 + offset_end = offset + else: + offset_start = offset + offset_end = offset + len(ref) - 1 + if not transcript: # Use genomic coordinate when no transcript is available. hgvs.kind = 'g' - hgvs.start = offset - hgvs.end = offset + len(ref) - 1 + hgvs.start = offset_start + hgvs.end = offset_end else: # Use cDNA coordinates. hgvs.kind = 'c' @@ -1425,14 +1455,6 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset) hgvs.cdna_end = hgvs.cdna_start else: - # Use a range of coordinates. - if mutation_type == 'ins': - # Insert uses coordinates around the insert point. - offset_start = offset - 1 - offset_end = offset - else: - offset_start = offset - offset_end = offset + len(ref) - 1 if transcript.strand == '-': offset_start, offset_end = offset_end, offset_start hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start) diff --git a/pyhgvs/models.py b/pyhgvs/models.py index b26b8c4..656e67b 100644 --- a/pyhgvs/models.py +++ b/pyhgvs/models.py @@ -27,7 +27,7 @@ def __init__(self, name): class Transcript(object): - """RefGene Transcripts for hg19 + """ A gene may have multiple transcripts with different combinations of exons. """ diff --git a/pyhgvs/tests/data/test_name_to_variant.genome b/pyhgvs/tests/data/test_name_to_variant.genome index 90eb59e..a36abfb 100644 --- a/pyhgvs/tests/data/test_name_to_variant.genome +++ b/pyhgvs/tests/data/test_name_to_variant.genome @@ -61,6 +61,7 @@ chr1 76205650 76205691 TTTATATATTCAAGGCTTATTGTGTAACAGAACCTGGAGCA chr1 76205639 76205669 TTTCTCTTGTTTTTATATATTCAAGGCTTA chr1 76205670 76205700 TGTGTAACAGAACCTGGAGCAGGCTCTGAT chr1 76227048 76227049 C +chr1 76227049 76227050 T chr1 76227029 76227070 TAATGAGGGATGCCAAAATCTATCAGGTAAGGTTAAAGATG chr1 76227019 76227049 GTAGAAAAACTAATGAGGGATGCCAAAATC chr1 76227049 76227079 TATCAGGTAAGGTTAAAGATGATTTTTTTG @@ -84,7 +85,13 @@ chr7 117176661 117176664 TAT chr7 117176642 117176684 CCTCAGAAATGATTGAAAATATCCAATCTGTTAAGGCATACT chr7 117176630 117176660 GACTTGTGATTACCTCAGAAATGATTGAAA chr7 117176662 117176692 ATCCAATCTGTTAAGGCATACTGCTGGGAA +chr7 117182084 117182126 CAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAA +chr7 117182103 117182104 A +chr7 117182103 117182106 AAT +chr7 117182106 117182109 ATA chr7 117182106 117182107 A +chr7 117182107 117182109 TA +chr7 117182077 117182107 TACAAAAGCAAGAATATAAGACATTGGAATA chr7 117182087 117182129 GAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTA chr7 117182074 117182104 CTTACAAAAGCAAGAATATAAGACATTGGA chr7 117182104 117182134 ATATAACTTAACGACTACAGAAGTAGTGAT @@ -98,7 +105,11 @@ chr17 41245340 41245341 T chr17 41245340 41245341 T chr7 117307164 117307165 A chr11 17496507 17496508 T +chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA +chr17 7126009 7126051 CATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGT chr17 7126027 7126037 AGCAGAGGTG +chr17 7126028 7126029 A +chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG chr1 76216233 76216235 AA chr1 76199213 76199220 AGGTCTT chr1 76199194 76199240 AACATTTTGATACTGTAGGAGGTCTTGGACTTGGAACTTTTGATGC diff --git a/pyhgvs/tests/data/test_variant_to_name.genome b/pyhgvs/tests/data/test_variant_to_name.genome index f310c9c..1f0b257 100644 --- a/pyhgvs/tests/data/test_variant_to_name.genome +++ b/pyhgvs/tests/data/test_variant_to_name.genome @@ -66,3 +66,9 @@ chr7 117188682 117188712 tTTTTTTAACAGGGATTTGGGGAATTATTT chr7 117188582 117188783 CCATGTGCTTTTCAAACTAATTGTACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGAtgtgtgtgtgtgtgtgtgtgtgtTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGTAATTT chr7 117188687 117188689 TT chr7 117188689 117188691 AA +chr17 7125928 7126129 TCCCCCAGTGACAACCTGTTGAACACACCTCTGCTTTCCCACACTGCCCTGACACAGTGGGCCCCCTGAGAAGAAGATGGGCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGTACGGGTGCCATCGGAGAACGTGCTGGGTGAGGTTGGGAGTGGCTTCAAGGTTGCCATGCACATCCTCAACAATGGAAG +chr17 7126008 7126050 GCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAG +chr17 7126027 7126029 AG +chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG +chr17 7126029 7126031 CA +chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA \ No newline at end of file diff --git a/pyhgvs/tests/genome.py b/pyhgvs/tests/genome.py index 5c53da6..0887dfd 100644 --- a/pyhgvs/tests/genome.py +++ b/pyhgvs/tests/genome.py @@ -77,9 +77,11 @@ def __init__(self, lookup=None, filename=None, db_filename=None, if SequenceFileDB is None: raise ValueError('pygr is not available.') self._genome = SequenceFileDB(db_filename) + self._source_filename = db_filename elif filename: # Read genome sequence from lookup table. self.read(filename) + self._source_filename = filename def __contains__(self, chrom): """Return True if genome contains chromosome.""" @@ -113,8 +115,8 @@ def get_seq(self, chrom, start, end): None, end - start)) else: raise MockGenomeError( - 'Sequence not in test data: %s:%d-%d' % - (chrom, start, end)) + 'Sequence not in test data: %s:%d-%d source: %s' % + (chrom, start, end, self._source_filename)) def read(self, filename): """Read a sequence lookup table from a file. diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 83e239e..55b20f4 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -11,57 +11,52 @@ def read_refgene(infile): - """ - Iterate through a refGene file. + """ refGene = genePred with extra column at front (and ignored ones after) """ + return read_genepred(infile, skip_first_column=True) + +def read_genepred(infile, skip_first_column=False): + """ GenePred extension format: http://genome.ucsc.edu/FAQ/FAQformat.html#GenePredExt Column definitions: - 0. uint undocumented id - 1. string name; "Name of gene (usually transcript_id from GTF)" - 2. string chrom; "Chromosome name" - 3. char[1] strand; "+ or - for strand" - 4. uint txStart; "Transcription start position" - 5. uint txEnd; "Transcription end position" - 6. uint cdsStart; "Coding region start" - 7. uint cdsEnd; "Coding region end" - 8. uint exonCount; "Number of exons" - 9. uint[exonCount] exonStarts; "Exon start positions" - 10. uint[exonCount] exonEnds; "Exon end positions" - 11. uint id; "Unique identifier" - 12. string name2; "Alternate name (e.g. gene_id from GTF)" - 13. string cdsStartStat; "enum('none','unk','incmpl','cmpl')" - 14. string cdsEndStat; "enum('none','unk','incmpl','cmpl')" - 15. lstring exonFrames; "Exon frame offsets {0,1,2}" + 0. string name; "Name of gene (usually transcript_id from GTF)" + 1. string chrom; "Chromosome name" + 2. char[1] strand; "+ or - for strand" + 3. uint txStart; "Transcription start position" + 4. uint txEnd; "Transcription end position" + 5. uint cdsStart; "Coding region start" + 6. uint cdsEnd; "Coding region end" + 7. uint exonCount; "Number of exons" + 8. uint[exonCount] exonStarts; "Exon start positions" + 9. uint[exonCount] exonEnds; "Exon end positions" + 10. uint id; "Unique identifier" + 11. string name2; "Alternate name (e.g. gene_id from GTF)" """ for line in infile: # Skip comments. if line.startswith('#'): continue row = line.rstrip('\n').split('\t') - if len(row) != 16: - raise ValueError( - 'File has incorrect number of columns ' - 'in at least one line.') + if skip_first_column: + row = row[1:] # Skip trailing , - exon_starts = list(map(int, row[9].split(',')[:-1])) - exon_ends = list(map(int, row[10].split(',')[:-1])) - exon_frames = list(map(int, row[15].split(',')[:-1])) + exon_starts = list(map(int, row[8].split(',')[:-1])) + exon_ends = list(map(int, row[9].split(',')[:-1])) exons = list(zip(exon_starts, exon_ends)) yield { - 'chrom': row[2], - 'start': int(row[4]), - 'end': int(row[5]), - 'id': row[1], - 'strand': row[3], - 'cds_start': int(row[6]), - 'cds_end': int(row[7]), - 'gene_name': row[12], + 'chrom': row[1], + 'start': int(row[3]), + 'end': int(row[4]), + 'id': row[0], + 'strand': row[2], + 'cds_start': int(row[5]), + 'cds_end': int(row[6]), + 'gene_name': row[11], 'exons': exons, - 'exon_frames': exon_frames } diff --git a/pyhgvs/variants.py b/pyhgvs/variants.py index fca0cdc..36c7383 100644 --- a/pyhgvs/variants.py +++ b/pyhgvs/variants.py @@ -115,7 +115,7 @@ def justify_genomic_indel(genome, chrom, start, end, indel, justify, def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome, - justify='left', flank_length=30): + justify='left', flank_length=30, indels_start_with_same_base=True): """ Normalize variant according to the GATK/VCF standard. @@ -133,7 +133,7 @@ def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome, chrom_stop=end, is_forward_strand=True) return NormalizedVariant(position, ref_sequence, alt_sequences, - genome=genome, justify=justify) + genome=genome, justify=justify, indels_start_with_same_base=indels_start_with_same_base) class NormalizedVariant(object): @@ -142,7 +142,8 @@ class NormalizedVariant(object): """ def __init__(self, position, ref_allele, alt_alleles, - seq_5p='', seq_3p='', genome=None, justify='left'): + seq_5p='', seq_3p='', genome=None, justify='left', + indels_start_with_same_base=True): """ position: a 0-index genomic Position. ref_allele: the reference allele sequence. @@ -150,6 +151,9 @@ def __init__(self, position, ref_allele, alt_alleles, seq_5p: 5 prime flanking sequence of variant. seq_3p: 3 prime flanking sequence of variant. genome: a pygr compatible genome object (optional). + + indels_start_with_same_base: DML - I have no idea why this is required + but am keeping for backwards compat """ self.position = position self.alleles = [ref_allele] + list(alt_alleles) @@ -157,6 +161,7 @@ def __init__(self, position, ref_allele, alt_alleles, self.seq_3p = seq_3p self.genome = genome self.log = [] + self.indels_start_with_same_base = indels_start_with_same_base self._on_forward_strand() self._trim_common_prefix() @@ -275,7 +280,7 @@ def _1bp_pad(self): # Pad sequences with one 5-prime base before the mutation event. empty_seq = any(not allele for allele in self.alleles) uniq_starts = set(allele[0] for allele in self.alleles if allele) - if empty_seq or len(uniq_starts) > 1: + if empty_seq or (self.indels_start_with_same_base and len(uniq_starts) > 1): # Fetch more 5p flanking sequence if needed. if self.genome and self.seq_5p == '': start = self.position.chrom_start @@ -299,9 +304,10 @@ def _1bp_pad(self): self.seq_3p = self.seq_3p[1:] self.position.chrom_stop += 1 - if len(set(a[0] for a in self.alleles)) != 1: - raise AssertionError( - "All INDEL alleles should start with same base.") + if self.indels_start_with_same_base: + if len(set(a[0] for a in self.alleles)) != 1: + raise AssertionError( + "All INDEL alleles should start with same base.") def _set_1based_position(self): """