# Highly Mutated Gene Tables

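Computes per-gene mutation counts and rates, then exports LaTeX table rows for the most highly mutated genes in each sequence, along with summary statistics on how many genes are mutated.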

In [1]:
# Shared notebook setup. Header.ipynb presumably defines the SEQS and seq2name globals
# that later cells use -- TODO confirm, since neither is defined in this notebook.
%run "Header.ipynb"
# GeneUtils.ipynb provides get_pos_interval_from_gene(), which the Gene class calls.
%run "GeneUtils.ipynb"
import pileup
from parse_sco import parse_sco
# Pileup data; indexed below as seq2pos2pileup[sequence name][position].
seq2pos2pileup = pileup.load()

## Define `Gene` class

In [47]:
import math
import skbio
from statistics import mean

class Gene:
    """A predicted gene on one sequence, together with its mutation tallies.

    Constructing a Gene reads the gene's coordinates, strand and number from
    ``gene_data`` and then immediately walks every position of the gene,
    using the module-level ``seq2pos2pileup`` data and the ``pileup`` helpers
    to count mutated positions.

    Parameters
    ----------
    gene_data
        Record exposing ``LeftEnd``, ``RightEnd``, ``Strand`` and ``Index``
        attributes (in this notebook: a row from ``df.itertuples()`` over the
        output of ``parse_sco``).
    seq : str
        Name of the sequence this gene is on (e.g. "edge_6104"). Also the key
        used to look up this gene's pileup data in ``seq2pos2pileup``.
    fasta
        The actual sequence the gene is on. It is sliced with 0-indexed
        bounds and passed through ``str()`` (in this notebook it is loaded
        with ``skbio.DNA.read``).
    minfreq : float, optional
        Mutation-calling threshold expressed as a percentage (default 0.5,
        i.e. 0.5%). See ``_count_mutations``.

    Attributes
    ----------
    leftend, rightend
        Gene coordinates, copied from ``gene_data``.
    positions
        Result of ``get_pos_interval_from_gene(gene_data)``: a range of
        1-indexed positions.
    length : int
        Number of positions in the gene (always a multiple of 3).
    seq, fasta
        As passed in.
    strand : str
        "+" or "-".
    num
        Number of this gene in the Prodigal .sco file (``gene_data.Index``).
    num_total_mutations, num_nonsyn_mutations : int
        Counts of mutated positions / nonsynonymous mutated positions.
    total_mutation_rate, nonsyn_mutation_rate : float
        The counts above divided by ``length``.

    Raises
    ------
    ValueError
        If the left end is not strictly less than the right end, if the gene
        length is not divisible by 3, or if the strand is not "+" or "-".
    """

    def __init__(self, gene_data, seq, fasta, minfreq=0.5):
        self.leftend = gene_data.LeftEnd
        self.rightend = gene_data.RightEnd
        if self.leftend >= self.rightend:
            raise ValueError("Left end of gene must be < right end of gene")

        # from GeneUtils.ipynb, for reference. This is a range of 1-indexed positions.
        self.positions = get_pos_interval_from_gene(gene_data)

        # Code is from within-gene mutation spectrum notebook
        self.length = len(self.positions)
        if self.length % 3 != 0:
            raise ValueError("Gene length not divisible by 3")

        # Name of sequence this gene is on (e.g. "edge_6104")
        self.seq = seq

        # Actual sequence of the, well, sequence -- loaded by skbio
        self.fasta = fasta

        # Orientation of gene
        #
        # if the strand is +, then this gene should look something like
        # ATG...TGA
        # (start codon)...(stop codon)
        #
        # and if the strand is -, then this gene should look something like
        # TCA...CAT
        # (reverse complemented)
        #
        # (of course there are multiple possible start/stop codons; this is just an example)
        self.strand = gene_data.Strand
        if self.strand not in ["+", "-"]:
            # yeah I'm aware that technically you could have a "." here for GFF files
            # (https://en.wikipedia.org/wiki/General_feature_format#GFF_general_structure) but I don't
            # think Prodigal will generate genes like that -- so better to be conservative and avoid confusion
            raise ValueError("Unrecognized gene strand value")

        # Number of this gene in the Prodigal .sco file
        self.num = gene_data.Index

        # Numbers of mutations in this gene - filled in by _count_mutations() below
        self.num_nonsyn_mutations = None
        self.num_total_mutations = None

        # Mutation rates - also filled in by _count_mutations(), after the counts
        self.nonsyn_mutation_rate = None
        self.total_mutation_rate = None

        self._count_mutations(minfreq)

    def _count_mutations(self, minfreq):
        """Count mutated positions in this gene and derive the mutation rates.

        Sets ``num_total_mutations``, ``num_nonsyn_mutations``,
        ``total_mutation_rate`` and ``nonsyn_mutation_rate`` on ``self``.

        A position is counted as a mutation when
        ``pileup.naively_call_mutation`` accepts it at the given threshold AND
        ``pileup.get_alt_nt_if_reasonable`` returns an alternate nucleotide
        (i.e. not None). A counted position is additionally counted as
        nonsynonymous when swapping that alternate nucleotide into its codon
        changes the translated amino acid.

        Parameters
        ----------
        minfreq : float
            "minfreq" is as described in the paper -- a percentage, where a
            given position has to have more than this percentage of
            mismatches in the alignment in order for this position to be
            considered "mutated" or not. (This value was later renamed "p",
            but the definition is the same.) It is divided by 100 before
            being handed to ``pileup.naively_call_mutation``.
        """
        self.num_total_mutations = 0
        self.num_nonsyn_mutations = 0
        # minfreq is a percentage; convert it to a fraction for the pileup helper.
        minfreq_frac = minfreq / 100

        # Look up the pileup data through self.seq -- NOT through whatever global variable
        # named "seq" happens to exist in the notebook -- so that the gene always reads the
        # data for the sequence it was constructed with.
        pos2pileup = seq2pos2pileup[self.seq]

        # NOTE: We count CPs as 1, 2, 3, 1, 2, 3, ... regardless of the gene strand (+ or -). This is ok,
        # because we only use this "cp" variable here to determine how to extract the codon from the genome.
        # We account for the gene strand by reverse-complementing the codons later.
        for offset, pos in enumerate(self.positions):
            cp = (offset % 3) + 1
            pos_pileup = pos2pileup[pos]

            if not pileup.naively_call_mutation(pos_pileup, minfreq_frac):
                continue

            # We only consider "reasonable" positions for this table -- that is, those where the reference
            # nucleotide is either the consensus or tied as the consensus. We do this in order to make it
            # possible to easily define nonsynonymous mutations at this position.
            max_freq_alt_nt = pileup.get_alt_nt_if_reasonable(pos_pileup)
            if max_freq_alt_nt is None:
                continue

            # OK, we know this position is reasonable. Therefore, call it a mutation.
            self.num_total_mutations += 1

            # See if we should update the nonsyn mutation count.
            # NOTE: This considers each single position independently, so this might result in some
            # codons being counted 2 or 3 times if 2 or 3 positions in this codon are mutated. I guess
            # we might want to consider mutations together? Or consider the actual aligned codon data
            # using the mutation matrix JSON stuff.

            # Since we know this position in the genome and its modulo-3 position within this gene, it's
            # relatively simple to extract the codon this position is located within. "pos" is 1-indexed
            # while the slices into self.fasta are 0-indexed, hence the offsets below.
            #
            # As noted above, "cp" here is a simplification -- this ignores the strand of this gene, which
            # is ok because all we care about right now is just extracting this codon and altering
            # it at the current position.
            #
            # NOTE that it's possible to write this if statement out in two lines of code, using cp
            # to alter the slicing. But... this way is easier to read.
            if cp == 1:
                codon = str(self.fasta[pos - 1: pos + 2])
                alt_codon = max_freq_alt_nt + codon[1] + codon[2]
            elif cp == 2:
                codon = str(self.fasta[pos - 2: pos + 1])
                alt_codon = codon[0] + max_freq_alt_nt + codon[2]
            elif cp == 3:
                codon = str(self.fasta[pos - 3: pos])
                alt_codon = codon[0] + codon[1] + max_freq_alt_nt
            else:
                raise ValueError("If we're here, Marcus REALLY can't do math")

            # It is important that we do this AFTER mutating the codon -- since this way
            # the effect of the mutation will be correct (in this predicted gene it'd be
            # reverse complemented into another nucleotide, and then this reverse-complemented
            # codon would code for something).
            #
            # Another thing worth noting: we are a bit inefficient here -- we always store codons,
            # aas, etc. here as strings, not skbio.DNA / etc. objects. This probably slows things
            # down a bit, but it makes it a lot easier to manipulate these objects.
            if self.strand == "-":
                codon = str(skbio.DNA(codon).reverse_complement())
                alt_codon = str(skbio.DNA(alt_codon).reverse_complement())

            aa1 = str(skbio.DNA(codon).translate())
            aa2 = str(skbio.DNA(alt_codon).translate())
            if aa1 != aa2:
                self.num_nonsyn_mutations += 1

            # Sanity check
            # if self.seq == "edge_6104" and self.num == 1217:
            #     print(
            #         f"pos = {pos:,}, alt nt = {max_freq_alt_nt}, codon = {codon}, "
            #         f"alt_codon = {alt_codon}, aa1 = {aa1}, aa2 = {aa2}, nonsyn = {aa1 != aa2}"
            #     )

        self.nonsyn_mutation_rate = self.num_nonsyn_mutations / self.length
        self.total_mutation_rate = self.num_total_mutations / self.length

## Actually go through and output highly mutated genes (in LaTeX code!)

Settled on having this code actually output the LaTeX code for the rows in a table. Kinda fancy!

Idea for this based on https://stackoverflow.com/questions/49223962#comment85460602_49223962, sort of -- I was thinking of just outputting raw data and having LaTeX mess around and load it somehow, but just straight-up outputting the LaTeX from Python is probably much less of a pain than whatever I would've wound up doing.

In [46]:
# # For testing this
# seq="edge_6104"
# fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
# df = parse_sco("../seqs/genes/{}.sco".format(seq))
# for ii, di in enumerate(df.itertuples(), 1):
#     if ii == 1217:
#         Gene(di, seq, fasta)

In [48]:
# Per-sequence summary values, keyed by sequence name; consumed by the stats cell below.
seq2gene_ct = {}
seq2nonsynonymous_mutated_gene_ct = {}
seq2mutated_gene_ct = {}
seq2avg_gene_len = {}

for seq in SEQS:
    fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
    df = parse_sco("../seqs/genes/{}.sco".format(seq))
    # Constructing a Gene also counts its mutations, so this is the expensive step.
    genes = [Gene(gene_data, seq, fasta) for gene_data in df.itertuples()]
    # Ascending order by total mutation rate; the highest-rate genes end up last.
    genes_by_mutrate = sorted(genes, key=lambda g: g.total_mutation_rate)

    with open(f"misc-text/{seq}-highest-mutated-genes.tex", "w") as tblfile:
        # Select top 10 genes for each sequence, going by total mutation rate.
        # We reverse the order so that the highest-mutation-rate gene is written first, etc.
        # (like a "leaderboard")
        for g in genes_by_mutrate[-10:][::-1]:
            # Each row is LaTeX: columns separated by "&", a literal "\%" after each rate,
            # and "\\ \hline" ending the row. Backslashes are escaped explicitly ("\\%")
            # because "\%" is not a valid Python escape sequence.
            tblfile.write(
                f"{g.leftend:,} & {g.rightend:,} & {g.length:,} & {g.num} & {g.nonsyn_mutation_rate * 100:.2f}\\% "
                f"& {g.total_mutation_rate * 100:.2f}\\% \\\\ \\hline\n"
            )

    # Save some extra info for the report
    seq2gene_ct[seq] = len(genes)
    seq2nonsynonymous_mutated_gene_ct[seq] = len([g for g in genes if g.num_nonsyn_mutations > 0])
    seq2mutated_gene_ct[seq] = len([g for g in genes if g.num_total_mutations > 0])
    seq2avg_gene_len[seq] = round(mean([g.length for g in genes]))

## Also, output some stats on the numbers of mutated genes

In [50]:
# Write a one-paragraph LaTeX summary of the per-sequence gene counts gathered above,
# and echo it to the notebook output.
with open("misc-text/gene-mutation-stats.tex", "w") as of:
    # The \endinput is needed to prevent LaTeX from inserting a bunch of space after this text is
    # included using \input{}. See https://tex.stackexchange.com/a/18018.
    # (For some reason, using \unskip removed ALL whitespace, which looked ugly -- this is the best
    # solution I've found thus far.)
    #
    # Backslashes meant for LaTeX are escaped explicitly ("\\%", "\\endinput") because "\%" and
    # "\e" are not valid Python escape sequences; the text written out is unchanged.
    out_text = (
        (
            "Out of a total of {:,} / {:,} / {:,} genes in the {} / {} / {} genomes, respectively, "
            "{:,} / {:,} / {:,} genes have nonzero $p$-mutation rates (using the threshold $p=0.5\\%$, "
            "and only considering ``reasonable'' $p$-mutations where the reference nucleotide is also "
            "the most frequent, or one of the most frequent, aligned nucleotides). "
            "Of these genes, {:,} / {:,} / {:,} genes have nonzero nonsynonymous mutation rates. "
            "(The rounded average gene lengths for each genome are {:,} / {:,} / {:,} bp, respectively.)\\endinput"
        ).format(
            # Argument groups follow the order of the placeholders in the template above.
            *[seq2gene_ct[seq] for seq in SEQS],
            *[seq2name[seq] for seq in SEQS],
            *[seq2mutated_gene_ct[seq] for seq in SEQS],
            *[seq2nonsynonymous_mutated_gene_ct[seq] for seq in SEQS],
            *[seq2avg_gene_len[seq] for seq in SEQS]
        )
    )
    print(out_text)
    of.write(out_text)

Out of a total of 1,297 / 1,761 / 2,567 genes in the CAMP / BACT1 / BACT2 genomes, respectively, 110 / 1,522 / 677 genes have nonzero $p$-mutation rates (using the threshold $p=0.5\%$, and only considering ``reasonable'' $p$-mutations where the reference nucleotide is also the most frequent, or one of the most frequent, aligned nucleotides). Of these genes, 101 / 1,331 / 572 genes have nonzero nonsynonymous mutation rates. (The rounded average gene lengths for each genome are 919 / 1,106 / 897 bp, respectively.)\endinput
