# Highly Mutated Gene Tables

- For every sequence:
  - Initialize a list that we'll use to store "gene" objects.
  - For every gene in this sequence:
    - Store "number of mutated positions", "number of nonsynonymous mutated positions"
    - For every position in this gene:
      - See if this position is mutated (> 0.5% mutation rate; same as elsewhere in the code)
      - If this position is mutated:
        - Increment number of mutated positions
        - Get the codon of this position (will depend on gene being on +/- strand). Call it "C1".
        - Look at `seq2pos2mismatches` JSON for this sequence and position. Get the maximum-frequency alternate nucleotide (__we could change this behavior if desired__).
        - Change the codon of this position, substituting the max-frequency alternate nucleotide we just found. Call it "C2".
        - If C1 and C2 don't code for the same amino acid / stop codon, this is a nonsynonymous mutation. Increment the number of nonsynonymous mutated positions.
    - Compute nonsynonymous mutation rate and total mutation rate by just dividing the numbers we figured out by the total number of positions in this gene.
    - Update the list of "gene" objects for this sequence with a dict containing some gene info (gene name / number, gene length, and the nonsynonymous and total mutation rates computed above).

## Define `Gene` class

In [1]:
import math
import skbio

class Gene:
    """A predicted gene on a sequence, together with its mutation statistics.

    On construction, every position in the gene is checked for being
    "mutated", and the counts / rates of total and nonsynonymous mutated
    positions are stored on the instance.

    Parameters
    ----------
    gene_data
        Record describing the gene. Must expose ``LeftEnd``, ``RightEnd``,
        ``Strand`` ("+" or "-") and ``Index`` attributes.
    seq
        Name of the sequence this gene is on (e.g. "edge_6104").
    fasta
        The sequence itself. Expected to behave like a ``skbio.DNA`` object
        (slicing, ``replace()``, ``reverse_complement()``, ``translate()``).
    minfreq
        Percentage threshold: a position is "mutated" if more than this
        percentage of the reads covering it are mismatches.

    Raises
    ------
    ValueError
        If the left end is not < the right end, if the gene length is not
        divisible by 3, or (when a mutated position is found) if the strand
        is neither "+" nor "-".
    """

    def __init__(self, gene_data, seq, fasta, minfreq=0.5):
        self.leftend = gene_data.LeftEnd
        self.rightend = gene_data.RightEnd
        if self.leftend >= self.rightend:
            raise ValueError("Left end of gene must be < right end of gene")

        self.positions = get_pos_interval_from_gene(gene_data)

        # Code is from within-gene mutation spectrum notebook
        self.length = len(self.positions)
        if self.length % 3 != 0:
            raise ValueError("Gene length not divisible by 3")

        # Name of sequence this gene is on (e.g. "edge_6104")
        self.seq = seq

        # Actual sequence of the, well, sequence -- loaded by skbio
        self.fasta = fasta

        # Orientation of gene
        #
        # if the strand is +, then on the forward strand this gene should look
        # something like
        # ATG...TGA
        # (start codon)...(stop codon)
        #
        # and if the strand is -, then on the forward strand this gene should
        # look like the REVERSE COMPLEMENT of that, i.e. something like
        # TCA...CAT
        # (reverse complement of stop codon)...(reverse complement of start codon)
        #
        # (of course there are multiple possible stop [and, more rarely, start]
        # codons; this is just an example)
        self.strand = gene_data.Strand

        # Number of this gene in the Prodigal .sco file
        self.num = gene_data.Index

        # Numbers of mutations in this gene - computed by _count_mutations()
        self.num_nonsyn_mutations = None
        self.num_total_mutations = None

        # Mutation rates - also computed by _count_mutations()
        self.nonsyn_mutation_rate = None
        self.total_mutation_rate = None

        self._count_mutations(minfreq)

    def _count_mutations(self, minfreq):
        """Count mutated / nonsynonymous-mutated positions and derive rates.

        "minfreq" is as described in the paper -- a percentage, where a given
        position has to have more than this percentage of mismatches in the
        alignment in order for this position to be considered "mutated".

        Sets num_total_mutations, num_nonsyn_mutations, total_mutation_rate
        and nonsyn_mutation_rate on this object.
        """
        self.num_total_mutations = 0
        self.num_nonsyn_mutations = 0
        # Convert the percentage to a fraction so it is comparable with
        # mismatches / coverage.
        minfreq_frac = minfreq / 100

        def is_mutated(cov, mismatches):
            # Guard against zero coverage: such a position has no evidence of
            # mutation, and dividing would raise ZeroDivisionError.
            return 1 if cov > 0 and (mismatches / cov) > minfreq_frac else 0

        for pos in self.positions:
            # Use this gene's own sequence name rather than relying on a
            # global variable that happens to be called "seq".
            if get_val(self.seq, pos, is_mutated) != 1:
                continue

            self.num_total_mutations += 1
            codon = self.get_codon(pos)
            alts = seq2pos2mismatches[self.seq][str(pos)]
            # Retrieve max-freq alternate nucleotide.
            # Based on https://stackoverflow.com/a/280156.
            # (Note that if there's a tie, the result is arbitrary. Shouldn't
            # be a big deal. Making note of in the paper.)
            max_freq_alt_nt = max(alts, key=alts.get)
            alt_codon = self.get_codon(pos, mutate_to=max_freq_alt_nt)

            # Different translation (amino acid or stop) => nonsynonymous.
            if str(codon.translate()) != str(alt_codon.translate()):
                self.num_nonsyn_mutations += 1

        self.nonsyn_mutation_rate = self.num_nonsyn_mutations / self.length
        self.total_mutation_rate = self.num_total_mutations / self.length

    def get_codon(self, pos, mutate_to=None):
        """Return the codon of this gene that contains position ``pos``.

        The codon is returned in the gene's own 5'->3' orientation, so that it
        can be translated directly: for "-" strand genes it is the reverse
        complement of the forward-strand slice.

        Parameters
        ----------
        pos
            1-indexed position on the sequence (not relative to the gene).
            Gene coordinates are assumed to be 1-indexed and inclusive
            -- TODO confirm against get_pos_interval_from_gene().
        mutate_to
            If not None, the nucleotide to substitute at ``pos`` before
            returning the codon. Assumed to be given relative to the forward
            strand (as in the alignment) -- TODO confirm against how
            seq2pos2mismatches was generated.

        Raises
        ------
        ValueError
            If the strand is not "+" or "-", or if ``pos`` lies outside
            [leftend, rightend].
        """
        if self.strand not in ("+", "-"):
            raise ValueError("Invalid gene strand: {}".format(self.strand))
        if not (self.leftend <= pos <= self.rightend):
            raise ValueError(
                "Position {} is outside of the gene's interval [{}, {}]".format(
                    pos, self.leftend, self.rightend
                )
            )

        # The reading frame is anchored at the gene's 5' end: leftend for "+"
        # genes, rightend for "-" genes. codon_left is the 0-indexed start of
        # the codon's forward-strand slice.
        if self.strand == "+":
            # Offsets from leftend follow the pattern 0, 1, 2, 3, ... so codon
            # numbers follow 0, 0, 0, 1, 1, 1, 2, 2, 2, ...
            codon_num = (pos - self.leftend) // 3
            codon_left = (self.leftend - 1) + (codon_num * 3)
        else:
            # Same idea, but counting leftwards from rightend.
            codon_num = (self.rightend - pos) // 3
            codon_left = self.rightend - (codon_num * 3) - 3

        # With a skbio.DNA sequence this produces a skbio.DNA object
        codon = self.fasta[codon_left: codon_left + 3]

        if mutate_to is not None:
            # skbio.DNA objects are immutable, so -- to avoid converting from
            # DNA to str and then back again -- we use the
            # skbio.Sequence.replace() method with a boolean mask.
            # The substitution is done on the forward-strand slice, *before*
            # any reverse-complementing, so the alternate nucleotide gets
            # complemented along with the rest of the codon for "-" genes.
            mask = [False, False, False]
            mask[(pos - 1) - codon_left] = True
            codon = codon.replace(mask, mutate_to)

        if self.strand == "-":
            codon = codon.reverse_complement()
        return codon

## Actually go through and output highly mutated genes (in LaTeX code!)

Settled on having this code actually output the LaTeX code for the rows in a table. Kinda fancy!

Idea for this based on https://stackoverflow.com/questions/49223962#comment85460602_49223962, sort of -- I was thinking of just outputting raw data and having LaTeX mess around and load it somehow, but just straight-up outputting the LaTeX from Python is probably much less of a pain than whatever I would've wound up doing.

In [None]:
# For each sequence: build a Gene for every entry in its .sco file, rank the
# genes by total mutation rate, and write LaTeX table rows for the top 10.
for seq in SEQS:
    fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
    df = parse_sco("../seqs/genes/{}.sco".format(seq))
    genes = [Gene(gene_data, seq, fasta) for gene_data in df.itertuples()]

    # Ascending sort, so the highest-mutation-rate genes end up at the tail.
    genes_by_mutrate = sorted(genes, key=lambda g: g.total_mutation_rate)

    # Take the (up to) 10 highest genes and flip them so that the
    # highest-mutation-rate gene is written first (like a "leaderboard").
    top_genes = reversed(genes_by_mutrate[-10:])

    # One LaTeX table row per gene: number, length, nonsyn rate, total rate.
    row_template = "{} & {} & {} & {} \\\\ \\hline\n"
    with open("gene-tables/{}-highest-mutated-genes.tex".format(seq), "w") as tblfile:
        tblfile.writelines(
            row_template.format(
                g.num, g.length, g.nonsyn_mutation_rate, g.total_mutation_rate
            )
            for g in top_genes
        )