# Highly Mutated Gene Tables

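Exports information on highly mutated genes: for each sequence, LaTeX table rows describing the 10 genes with the highest total mutation rates (written to `misc-text/{seq}-highest-mutated-genes.tex`), plus a summary sentence about the numbers of mutated genes (written to `misc-text/gene-mutation-stats.tex`).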

In [1]:
# Run the shared setup notebooks inside this kernel's namespace. Names that are used below but
# never defined in this notebook (e.g. SEQS, seq2name, parse_sco, get_pos_interval_from_gene)
# presumably come from these two notebooks -- TODO confirm which notebook defines which name.
%run "Header.ipynb"
%run "GeneUtils.ipynb"
import pileup
# Load the pileup data once, up front. Below it is indexed as seq2pos2pileup[seq][pos].
seq2pos2pileup = pileup.load()

## Define `Gene` class

In [2]:
import math
import skbio
from statistics import mean

class Gene:
    """A single predicted gene on one sequence, along with its mutation counts and rates.

    On construction the gene's coordinates, length and strand are validated, and then the
    numbers (and rates) of total and nonsynonymous mutations within the gene are computed
    from the module-level pileup data (seq2pos2pileup).

    Attributes:
        leftend, rightend: gene_data.LeftEnd / gene_data.RightEnd.
        positions: the result of get_pos_interval_from_gene(gene_data).
        length: len(positions); guaranteed to be divisible by 3.
        seq: name of the sequence this gene is on (e.g. "edge_6104").
        fasta: the sequence object handed to the constructor (loaded by skbio).
        strand: "+" or "-".
        num: number of this gene in the Prodigal .sco file (gene_data.Index).
        num_total_mutations, num_nonsyn_mutations: mutation counts within the gene.
        total_mutation_rate, nonsyn_mutation_rate: the above counts divided by length.
    """

    def __init__(self, gene_data, seq, fasta, minfreq=0.5):
        """Validates the gene described by gene_data and counts its mutations.

        Parameters:
            gene_data: object exposing LeftEnd, RightEnd, Strand and Index attributes
                (in this notebook, a row produced by df.itertuples() on parse_sco() output).
            seq: name of the sequence this gene is on; used to look up pileup data.
            fasta: the sequence the gene is located on. It must support slicing, and the
                slices must support replace() and reverse_complement() (e.g. skbio.DNA).
            minfreq: mutation-calling threshold, given as a percentage (see _count_mutations()).

        Raises:
            ValueError: if LeftEnd >= RightEnd, if the gene length is not divisible by 3,
                or if the strand is neither "+" nor "-".
        """
        self.leftend = gene_data.LeftEnd
        self.rightend = gene_data.RightEnd
        if self.leftend >= self.rightend:
            raise ValueError("Left end of gene must be < right end of gene")

        self.positions = get_pos_interval_from_gene(gene_data)

        # Code is from within-gene mutation spectrum notebook
        self.length = len(self.positions)
        if self.length % 3 != 0:
            raise ValueError("Gene length not divisible by 3")

        # Name of sequence this gene is on (e.g. "edge_6104")
        self.seq = seq

        # Actual sequence of the, well, sequence -- loaded by skbio
        self.fasta = fasta

        # Orientation of gene
        #
        # if the strand is +, then this gene should look something like
        # ATG...TGA
        # (start codon)...(stop codon)
        #
        # and if the strand is -, then this gene should look something like
        # TCA...CAT
        # (reverse complemented)
        #
        # (of course there are multiple possible start/stop codons; this is just an example)
        self.strand = gene_data.Strand
        if self.strand not in ["+", "-"]:
            # yeah I'm aware that technically you could have a "." here for GFF files
            # (https://en.wikipedia.org/wiki/General_feature_format#GFF_general_structure) but I don't
            # think Prodigal will generate genes like that -- so better to be conservative and avoid confusion
            raise ValueError("Unrecognized gene strand value")

        # Number of this gene in the Prodigal .sco file
        self.num = gene_data.Index

        # Numbers of mutations in this gene, and the corresponding rates. These are placeholders
        # until _count_mutations() fills them in just below.
        self.num_nonsyn_mutations = None
        self.num_total_mutations = None
        self.nonsyn_mutation_rate = None
        self.total_mutation_rate = None

        self._count_mutations(minfreq)

    def _count_mutations(self, minfreq):
        """Counts total and nonsynonymous mutations in this gene and derives their rates.

        "minfreq" is as described in the paper -- a percentage, where a given position has to
        have more than this percentage of mismatches in the alignment in order for this position
        to be considered "mutated". (We changed this value to "p" later on, but the definition is
        the same.) It is divided by 100 before being passed to pileup.naively_call_mutation(),
        which makes the actual call.

        Sets num_total_mutations, num_nonsyn_mutations, total_mutation_rate and
        nonsyn_mutation_rate on this object; returns nothing.
        """
        self.num_total_mutations = 0
        self.num_nonsyn_mutations = 0

        # Percentage -> fraction
        minfreq_frac = minfreq / 100

        # Look pileups up using this gene's own sequence name, rather than relying on whatever
        # notebook-level variable named "seq" happens to exist when this is called.
        pos2pileup = seq2pos2pileup[self.seq]

        for pos in self.positions:
            pos_pileup = pos2pileup[pos]
            if not pileup.naively_call_mutation(pos_pileup, minfreq_frac):
                continue

            self.num_total_mutations += 1

            # See if we should update the nonsyn mutation count.
            # NOTE: This considers each single position independently, so this might result in some
            # codons being counted 2 or 3 times if 2 or 3 positions in this codon are mutated. I guess
            # we might want to consider mutations together? Or consider the actual aligned codon data
            # using the mutation matrix JSON stuff...
            codon = self.get_codon(pos)
            max_freq_alt_nt = pileup.get_max_freq_alt_nt(pos_pileup)
            alt_codon = self.get_codon(pos, mutate_to=max_freq_alt_nt)

            # The mutation is nonsynonymous if it changes what the codon translates to
            if str(codon.translate()) != str(alt_codon.translate()):
                self.num_nonsyn_mutations += 1

        self.nonsyn_mutation_rate = self.num_nonsyn_mutations / self.length
        self.total_mutation_rate = self.num_total_mutations / self.length

    def get_codon(self, pos, mutate_to=None):
        """Given a (1-indexed) position on this gene's sequence, retrieves the codon of this position.

        pos uses the same coordinates as self.leftend and self.rightend (and as the positions
        that _count_mutations() passes in): it is a position on the sequence stored in
        self.fasta, and it must lie within [leftend, rightend]. Codons are counted starting
        from the gene's left end.

        The codon is returned as a slice of self.fasta (a skbio.DNA object when self.fasta is
        one), although it's easy to convert this into a str if preferred (just call str() on it).

        For example, say the sequence is GGATGCCCTGAGG and this is a + strand gene with
        leftend = 3 and rightend = 11 (so the gene sequence is ATGCCCTGA).

        If pos is 3, 4, or 5, then the codon returned will have a sequence of "ATG".
        If pos is 6, 7, or 8, then the codon returned will have a sequence of "CCC".
        If pos is 9, 10, or 11, then the codon returned will have a sequence of "TGA".

        The optional mutate_to parameter lets us mutate this codon at a single position to whatever
        character mutate_to is: for example, if pos is 4 and mutate_to is "C", then this will return
        "ACG" instead of "ATG" for the above example.

        For a - strand gene, the (possibly mutated) codon is reverse-complemented before being
        returned.

        Raises:
            ValueError: if pos is not within [leftend, rightend].
        """
        # int() so that the arithmetic and list indexing below work the same regardless of
        # whether the coordinates are plain ints or (e.g.) numpy integers from a DataFrame.
        gene_start = int(self.leftend)
        offset = int(pos) - gene_start
        if offset < 0 or pos > self.rightend:
            raise ValueError(
                "Position {} is not within the gene [{}, {}]".format(pos, self.leftend, self.rightend)
            )

        # Since the length of the gene is guaranteed to be divisible by 3, we can "order" codons however
        # we want: we could go 000111222333444555... or
        #                      ...555444333222111000.
        # For the sake of simplicity we just use the first option. In either case, a position associates
        # with the same two other positions in its triplet, since counting from the end or from the beginning
        # gives us the same thing!
        #
        # The frame has to be anchored at the gene's left end, NOT at the start of the whole sequence:
        # otherwise, any gene whose left end isn't at a position of the form 3k + 1 would have its
        # positions grouped into the wrong triplets.
        #
        # codon_num follows the pattern 000111222333... and
        # codon_pos follows the pattern 012012012012...
        # for pos =           leftend, leftend + 1, ...
        codon_num, codon_pos = divmod(offset, 3)

        # 0-indexed, half-open slice coordinates of this codon within self.fasta
        codon_left = (gene_start - 1) + (codon_num * 3)
        codon_right = codon_left + 3

        # Produces a skbio.DNA object (if self.fasta is one)
        codon = self.fasta[codon_left:codon_right]

        # Modify the codon, if requested
        if mutate_to is not None:
            # skbio.DNA objects are immutable, so -- to avoid converting from DNA to str and
            # then back again -- we use the skbio.Sequence.replace() method, with a boolean
            # mask selecting the single position to replace.
            bl = [False, False, False]
            bl[codon_pos] = True
            codon = codon.replace(bl, mutate_to)

        # Finally, return the reverse complement of this codon if the strand is "-"
        # It is important that we do this AFTER mutating the codon, if requested -- since this way
        # the effect of the mutation will be correct (in this predicted gene it'd be reverse complemented
        # into another nucleotide, and then the reverse-complement codon would code for something)
        if self.strand == "-":
            return codon.reverse_complement()
        return codon

## Actually go through and output highly mutated genes (in LaTeX code!)

Settled on having this code actually output the LaTeX code for the rows in a table. Kinda fancy!

Idea for this based on https://stackoverflow.com/questions/49223962#comment85460602_49223962, sort of -- I was thinking of just outputting raw data and having LaTeX mess around and load it somehow, but just straight-up outputting the LaTeX from Python is probably much less of a pain than whatever I would've wound up doing.

In [3]:
# How many of the highest-mutation-rate genes to write out per sequence
NUM_TOP_GENES = 10

# Per-sequence summary stats, saved for the report text written in a later cell
seq2gene_ct = {}
seq2nonsynonymous_mutated_gene_ct = {}
seq2mutated_gene_ct = {}
seq2avg_gene_len = {}

for seq in SEQS:
    fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
    df = parse_sco("../seqs/genes/{}.sco".format(seq))

    # Constructing a Gene also counts its mutations, so this is where the real work happens
    genes = [Gene(gene_data, seq, fasta) for gene_data in df.itertuples()]
    genes_by_mutrate = sorted(genes, key=lambda g: g.total_mutation_rate)

    with open("misc-text/{}-highest-mutated-genes.tex".format(seq), "w") as tblfile:
        # Select the top genes for each sequence, going by total mutation rate.
        # We reverse the order so that the highest-mutation-rate gene is written first, etc.
        # (like a "leaderboard")
        for g in genes_by_mutrate[-NUM_TOP_GENES:][::-1]:
            # One LaTeX table row per gene. Backslashes are doubled so that the literal "\%",
            # "\\" and "\hline" reach the file without relying on Python passing unrecognized
            # escape sequences (like "\%") through unchanged.
            tblfile.write("{:,} & {:,} & {:,} & {} & {:.2f}\\% & {:.2f}\\% \\\\ \\hline\n".format(
                g.leftend, g.rightend, g.length, g.num, g.nonsyn_mutation_rate * 100, g.total_mutation_rate * 100
            ))

    # Save some extra info for the report
    seq2gene_ct[seq] = len(genes)
    seq2nonsynonymous_mutated_gene_ct[seq] = sum(1 for g in genes if g.num_nonsyn_mutations > 0)
    seq2mutated_gene_ct[seq] = sum(1 for g in genes if g.num_total_mutations > 0)
    seq2avg_gene_len[seq] = round(mean([g.length for g in genes]))

Multiple max-freq alt nucleotides: {'A': 38, 'C': 38, 'G': 0} for pileup [[38, 38, 0, 1414], 3, 0]
	(Arbitrarily breaking tie: selecting max alt = A.)


## Also, output some stats on the numbers of mutated genes

In [4]:
with open("misc-text/gene-mutation-stats.tex", "w") as stats_file:
    # The \endinput is needed to prevent LaTeX from inserting a bunch of space after this text is
    # included using \input{}. See https://tex.stackexchange.com/a/18018.
    # (For some reason, using \unskip removed ALL whitespace, which looked ugly -- this is the best
    # solution I've found thus far.)
    #
    # Backslashes are doubled ("\\%", "\\endinput") so that the literal LaTeX commands are written
    # without relying on Python passing unrecognized escape sequences through unchanged.
    #
    # NOTE: the template has exactly three slots per group of values, so it assumes that SEQS
    # contains exactly three sequences.
    stats_file.write(
        (
            "Out of a total of {:,} / {:,} / {:,} genes in the {} / {} / {} genomes, respectively, "
            "{:,} / {:,} / {:,} genes have nonzero $p$-mutation rates (using the threshold $p=0.5\\%$). "
            "Of these genes, {:,} / {:,} / {:,} genes have nonzero nonsynonymous mutation rates. "
            "(The rounded average gene lengths for each genome are {:,} / {:,} / {:,} bp, respectively.)\\endinput"
        ).format(
            *[seq2gene_ct[seq] for seq in SEQS],
            *[seq2name[seq] for seq in SEQS],
            *[seq2mutated_gene_ct[seq] for seq in SEQS],
            *[seq2nonsynonymous_mutated_gene_ct[seq] for seq in SEQS],
            *[seq2avg_gene_len[seq] for seq in SEQS]
        )
    )