# Syn vs Nonsyn and Non-nonsense vs Nonsense mutation rates

## Based on positions, instead of codons!

This makes it easier to use these for FDR estimation.

In [3]:
%run "Header.ipynb"
%run "GeneUtils.ipynb"

In [4]:
import json
import pickle
import skbio
import pileup
from collections import defaultdict
from parse_sco import parse_sco

In [5]:
seq2pos2pileup = pileup.load()

## Percentages (values of $p$) for naive mutation calling

### Values we use for plotting
Currently the same as in the codon mutation notebook. I guess these should be a shared variable, ideally? But that might be too much work for its own good.

In [6]:
percentages = [2, 1, 0.5, 0.25, 0.15]

### Full values, used for plotting a FDR curve elsewhere

Copied from the target/decoy notebook. may be nice to generalize to shared code too i guess

In [7]:
# Percentages go from 4.99%, 4.98%, ..., 0.16%, 0.15%
full_percentages = [p / 100 for p in range(15, 500, 1)][::-1]
print(f"First two percentages: {full_percentages[:2]}")
print(f"Last two percentages: {full_percentages[-2:]}")
print(f"Number of percentages: {len(full_percentages):,}")

First two percentages: [4.99, 4.98]
Last two percentages: [0.16, 0.15]
Number of percentages: 485


In [8]:
# Sanity check -- we can just compute this data for all of the stuff in full_percentages
# and then for the stuff we wanna plot in this notebook just focus on the values in percentages
for p in percentages:
    assert p in full_percentages, f"{p}% not in full_percentages"

## Compute possible (non)synonymous and (non)nonsense mutations, in general

Not specific to a given MAG -- just in the context of all possible codons, using the standard genetic code.

In [9]:
codons = []
dna = "ACGT"
for x in dna:
    for y in dna:
        for z in dna:
            codons.append(x+y+z)
            
# Maps the 64 codons --> [1, 2, 3] --> an integer 0, 1, 2, or 3, indicating how many of the 3 mutations
# from this CP in this codon into another nucleotide are synonymous (si) or nonsynonymous (ni)
codon2cp2si = defaultdict(dict)
codon2cp2ni = defaultdict(dict)

# Like above, but for non-nonsense (nnsi) or nonsense (nsi) mutations and only for the 61 sense codons
codon2cp2nnsi = defaultdict(dict)
codon2cp2nsi = defaultdict(dict)

stop_codons = []

for c in codons:
    is_not_stop = not (str(skbio.DNA(c).translate()) == "*")
    if not is_not_stop:
        stop_codons.append(c)
    for pos in [0, 1, 2]:
        si = 0
        ni = 0
        nnsi = 0
        nsi = 0
        for alt_nt in set(dna) - set(c[pos]):
            alt_codon = c[:pos] + alt_nt + c[pos + 1:]
            aa1 = str(skbio.DNA(c).translate())
            aa2 = str(skbio.DNA(alt_codon).translate())
            if aa1 == aa2:
                si += 1
                if is_not_stop:
                    nnsi += 1
            else:
                ni += 1
                if is_not_stop:
                    if aa2 == "*":
                        nsi += 1
                    else:
                        nnsi += 1
                    
        assert si + ni == 3
        codon2cp2si[c][pos + 1] = si
        codon2cp2ni[c][pos + 1] = ni
        
        if is_not_stop:
            assert nsi + nnsi == 3
            codon2cp2nnsi[c][pos + 1] = nnsi
            codon2cp2nsi[c][pos + 1] = nsi
            
assert len(codon2cp2si) == len(codon2cp2ni) == 64
assert len(codon2cp2nnsi) == len(codon2cp2nsi) == 61
assert len(stop_codons) == 3

## Use these numbers to compute $\sum_i S_i$ (and $N_i$, $NNS_i$, $NS_i$) for all MAGs

In [18]:
# seq --> list of 1-indexed positions where a mutation here can be nonsyn or nonsense
seq2poss_nonsyn_positions = {}
seq2poss_nonsense_positions = {}

# Total poss numbers of single-nucleotide (non)synonymous and (non)nonsense mutations throughout the genomes
seq2poss_si = defaultdict(int)
seq2poss_ni = defaultdict(int)
seq2poss_nnsi = defaultdict(int)
seq2poss_nsi = defaultdict(int)

# Defined the same as in the earlier codon-based S/N notebook.
p2seq2obs_si = {p: defaultdict(int) for p in full_percentages}
p2seq2obs_ni = {p: defaultdict(int) for p in full_percentages}
p2seq2obs_nnsi = {p: defaultdict(int) for p in full_percentages}
p2seq2obs_nsi = {p: defaultdict(int) for p in full_percentages}

for seq in SEQS:
    df = parse_sco(f"../seqs/genes/{seq}.sco")
    fasta = skbio.DNA.read(f"../seqs/{seq}.fasta")
    # Figure out quickly which positions are in multiple genes, so we can ignore them
    pos_to_genes = get_parent_gene_info_of_many_positions(df)
    
    poss_nonsyn_pos = []
    poss_nonsense_pos = []
    
    for gene in df.itertuples():
        print(
            f"{seq2name[seq]}: On gene {gene.Index:,} / {len(df.index):,}. "
            f"So far {len(poss_nonsyn_pos):,} poss nonsyn and {len(poss_nonsense_pos):,} "
            f"poss nonsense positions."
        )
        # The order of positions we go through is dependent on gene orientation:
        # left --> right if this is a "+" strand gene,
        # right --> left if this is a "-" strand gene.
        pos_interval = get_pos_interval_from_gene(gene)
        
        # We'll update these values as we walk through the gene.
        # curr_codon_cp1_pos indicates CP1 (the left end for + genes, the right end for - genes)
        # of the current gene. It's 1-indexed, along with the gene coordinates.
        curr_codon_cp1_pos = pos_interval[0]
        # Always either 1, 2, or 3
        cp = 1
        
        for pos in pos_interval:
            print(f"On position {pos:,} (CP {cp})")
            if cp == 1:
                # It's easier to sync this at the top rather than bottom of the loop because the positions
                # in pos_interval could be increasing or decreasing depending on the gene's strand
                curr_codon_cp1_pos = pos
            
            # We don't attempt mutation calling at multi-gene positions, or at
            # unreasonable (ref != consensus) positions.
            
            # ... And now we actually naively try to call a mutation at this position (trying lots of
            # values of p at once so we can create an FDR curve, barplots, etc).
            pos_pileup = seq2pos2pileup[seq][pos]
            
            if len(pos_to_genes[pos]) == 1 and pileup.is_reasonable(pos_pileup):
                
                print("\tPosition is reasonable and in just 1 gene")
                if gene.Strand == "+":
                    # The skbio sequence is 0-indexed, so we gotta subtract by 1
                    parent_codon_fasta = fasta[curr_codon_cp1_pos - 1: curr_codon_cp1_pos + 2]
                    parent_codon_str = str(parent_codon_fasta)
                    
                elif gene.Strand == "-":
                    parent_codon_fasta = fasta[curr_codon_cp1_pos - 3: curr_codon_cp1_pos]
                    parent_codon_fwd_str = str(parent_codon_fasta)
                    parent_codon_str = str(parent_codon_fasta.reverse_complement())
                
                else:
                    raise ValueError(f"Invalid strand: {gene.Strand}")
                
                # Update possible Si|Ni and NNSi|NSi values based on this position and its parent codon.
                seq2poss_si[seq] += codon2cp2si[parent_codon_str][cp]
                seq2poss_ni[seq] += codon2cp2ni[parent_codon_str][cp]
                if codon2cp2ni[parent_codon_str][cp] > 0:
                    poss_nonsyn_pos.append(pos)
                    
                in_sense_codon = (parent_codon_str not in stop_codons)
                if in_sense_codon:
                    seq2poss_nnsi[seq] += codon2cp2nnsi[parent_codon_str][cp]
                    seq2poss_nsi[seq] += codon2cp2nsi[parent_codon_str][cp]
                    if codon2cp2nsi[parent_codon_str][cp] > 0:
                        poss_nonsense_pos.append(pos)
                
                for p in full_percentages:
                    
                    # Since we currently define mutations as a binary thing (either a position is mutated
                    # or it isn't), we can precompute the step of figuring out the mutated codon resulting from
                    # a position being called a mutation -- and we can thus figure out in advance if this position
                    # will be a (non)synonymous or (non)nonsense mutation, before actually calling it a mutation
                    # (...should save some time).
                    
                    if pileup.any_mismatches(pos_pileup):
                        alt_nt = pileup.get_alt_nt_if_reasonable(pos_pileup)
                        
                        # if alt_nt is None, it means this position was unreasonable -- but we've already
                        # verified above that this position is reasonable!
                        assert alt_nt is not None
                        
                        # Construct a "mutated" version of this position's parent codon, with just this
                        # alternate nucleotide changed. This doesn't take into account the possibility
                        # of other positions within this codon being mutated, which is a notable limitation
                        # of this approach.
                        if gene.Strand == "+":
                            mutated_parent_codon_str = parent_codon_str[:cp - 1] + alt_nt + parent_codon_str[cp:]
                        else:
                            if cp == 1:
                                mutated_parent_codon_fwd_str = parent_codon_fwd_str[:2] + alt_nt
                            elif cp == 2:
                                mutated_parent_codon_fwd_str = parent_codon_fwd_str[:1] + alt_nt + parent_codon_fwd_str[2:]
                            else:
                                mutated_parent_codon_fwd_str = alt_nt + parent_codon_fwd_str[1:]
                                
                            mutated_parent_codon_str = str(skbio.DNA(mutated_parent_codon_fwd_str).reverse_complement())
                        
                        aa1 = str(skbio.DNA(parent_codon_str).translate())
                        aa2 = str(skbio.DNA(mutated_parent_codon_str).translate())
                        
                        if pileup.naively_call_mutation(pos_pileup, p, only_call_if_rare=True):
                            if aa1 == aa2:
                                p2seq2obs_si[p][seq] += 1
                            else:
                                p2seq2obs_ni[p][seq] += 1
                                
                            if in_sense_codon:
                                if aa2 == "*":
                                    p2seq2obs_nsi[p][seq] += 1
                                else:
                                    p2seq2obs_nnsi[p][seq] += 1
                        
            cp = next_cp(cp)
        raise ValueError("yeetus")
    seq2poss_nonsyn_positions[seq] = poss_nonsyn_pos
    seq2poss_nonsense_positions[seq] = poss_nonsense_pos

CAMP: On gene 1 / 1,297. So far 0 poss nonsyn and 0 poss nonsense positions.
On position 712 (CP 1)
	Position is reasonable and in just 1 gene
On position 711 (CP 2)
	Position is reasonable and in just 1 gene
On position 710 (CP 3)
	Position is reasonable and in just 1 gene
On position 709 (CP 1)
	Position is reasonable and in just 1 gene
On position 708 (CP 2)
	Position is reasonable and in just 1 gene
On position 707 (CP 3)
	Position is reasonable and in just 1 gene
On position 706 (CP 1)
	Position is reasonable and in just 1 gene
On position 705 (CP 2)
	Position is reasonable and in just 1 gene
On position 704 (CP 3)
	Position is reasonable and in just 1 gene
On position 703 (CP 1)
	Position is reasonable and in just 1 gene
On position 702 (CP 2)
	Position is reasonable and in just 1 gene
On position 701 (CP 3)
	Position is reasonable and in just 1 gene
On position 700 (CP 1)
	Position is reasonable and in just 1 gene
On position 699 (CP 2)
	Position is reasonable and in just 1 gene

KeyboardInterrupt: 

In [13]:
p2seq2obs_si[0.15]

defaultdict(int, {})

In [None]:
for seq in SEQS:
    n = len(seq2poss_nonsyn_positions[seq])
    ns = len(seq2poss_nonsense_positions[seq])
    ratio = n / ns
    print(
        f"{seq}: {n:,} possible nonsyn and "
        f"{ns:,} possible nonsense mutations. Ratio of #nonsyn to #nonsense: {ratio:.2f}"
    )