# Demonstrate the "target-decoy" approach, as applied to metagenomic variant calling

In [1]:
%run "Header.ipynb"

In [2]:
import skbio
import pileup
from parse_sco import parse_sco

In [3]:
# Load the pileup data. Below, this is indexed as seq2pos2pileup[seq][pos], where seq is a sequence
# name and pos is a 1-indexed position in that sequence.
seq2pos2pileup = pileup.load()

## First: naive variant calling (call a position as "$p$-mutated" if $freq(pos) > p$)

We don't limit to "sufficiently-covered" positions here -- so we consider all regions throughout a genome.

In [4]:
# Thresholds of p, in descending order: 50%, 49.95%, 49.90%, ..., 0.10%, 0.05%.
# We count down in hundredths of a percent (5000, 4995, ..., 5) and then scale to percentages.
percentages = [hundredths / 100 for hundredths in range(5000, 0, -5)]
print(f"First two percentages: {percentages[:2]}")
print(f"Last two percentages: {percentages[-2:]}")
print(f"Number of percentages: {len(percentages):,}")
# Maps each value in percentages to the threshold that naive_calling() compares frequencies against
p2pct = get_p2pct(percentages)

First two percentages: [50.0, 49.95]
Last two percentages: [0.1, 0.05]
Number of percentages: 1,000


In [5]:
def naive_calling(seq, positions_to_consider, verbose=True, superverbose=False):
    """Naively calls p-mutations in a sequence, for every value of p in "percentages" (defined above).

    A position is called as a p-mutation if pileup.get_max_freq_alt_nt_pct() of its pileup is strictly
    greater than p2pct[p].

    seq should be in SEQS.

    positions_to_consider should be a non-empty collection (it must support len()) of 1-indexed positions
    in the sequence to consider when calling mutations. This makes it possible to, for example, just
    consider the CP 2 positions in a sequence.

    If verbose is True, prints progress information (including one line per 100,000 positions processed).
    If superverbose is True, also prints two lines for every value of p (off by default, because that
    gets ridiculous when there are 1,000 values of p).

    Returns a tuple with three elements:

    1. p2called_mutations: A dict mapping values in "percentages" to a list of 1-indexed "called"
       p-mutations in the positions to consider in the sequence, using this percentage for p. Each list
       is sorted in ascending order of position.

    2. p2numpermb: A dict with the same keys as p2called_mutations, but the values are the number of called
       p-mutations per megabase (1,000,000 bp = 1 Mbp) in the positions to consider in this sequence.
       IT'S DEFINITELY WORTH NOTING that we scale this by the number of positions to consider, not the full
       sequence length (although it's possible that those values could be equal if positions_to_consider
       is equal to range(1, seq2len[seq] + 1)). So, if you select a subset of positions where most of them
       are mutations, this'll result in a really high number of mutations per megabase!

    3. poslen: Length of positions_to_consider, for reference.

    Raises a ValueError if positions_to_consider is empty (the number of mutations per megabase is
    undefined in that case; previously this surfaced as a ZeroDivisionError).
    """
    poslen = len(positions_to_consider)
    if poslen == 0:
        raise ValueError(
            f"positions_to_consider is empty for sequence {seq!r}: the number of called p-mutations "
            "per megabase is undefined when there are no positions to consider."
        )
    seqlen = seq2len[seq]
    positions_to_consider_pct = 100 * (poslen / seqlen)
    if verbose:
        print(f"Naively calling mutations in {seq2name[seq]}.")
        print(f"\tConsidering {poslen:,} / {seqlen:,} ({positions_to_consider_pct:.2f}%) positions.")
    p2called_mutations = {p: [] for p in percentages}
    p2numpermb = {}

    # Everything in the innermost loop below runs (# positions) x (# percentages) times, so look up the
    # loop-invariant stuff (this sequence's pileups, each p's threshold, each p's output list) just once.
    pos2pileup = seq2pos2pileup[seq]
    thresholds_and_calls = [(p2pct[p], p2called_mutations[p]) for p in percentages]

    for pi, pos in enumerate(sorted(positions_to_consider), 1):
        if verbose and (pi == 1 or pi % 100000 == 0):
            print(f"\tOn the {pi:,}-th position ({pos:,}) of the specified {poslen:,} positions ({100 * (pi / poslen):.2f}%).")
        mfa_pct = pileup.get_max_freq_alt_nt_pct(pos2pileup[pos])
        for threshold, called_mutations in thresholds_and_calls:
            if mfa_pct > threshold:
                called_mutations.append(pos)

    # We have the equation
    #
    #      # called mutations              f
    # ---------------------------- = --------------
    #  # of positions to consider     1,000,000 bp
    #
    # We know everything except for f. We can solve for f by multiplying the left side of the
    # equation by 1,000,000 bp. Since the only variable across different thresholds of p is the number
    # of called mutations, we can pre-compute this "constant length factor" (aka 1,000,000 / poslen).
    constant_length_factor = 1000000 / poslen
    for p, called_mutations in p2called_mutations.items():
        num_called_mutations = len(called_mutations)
        # solve the equation above
        f = num_called_mutations * constant_length_factor
        p2numpermb[p] = f
        # We add an extra layer of verbosity here because printing out 2 lines per value of p gets
        # ridiculous when there are 1,000 values of p .____.
        if superverbose:
            print(f"\tp = {p}%: {num_called_mutations:,} called p-mutations in {seq2name[seq]}.")
            print(f"\t\tNumber of called p-mutations per megabase: f = {f:,.2f}.")

    return (p2called_mutations, p2numpermb, poslen)


def naive_calling_fullseq(seq):
    """Runs naive_calling() on every position (1 through seq2len[seq]) of a sequence (should be in SEQS).

    Returns whatever naive_calling() returns.
    """
    last_position = seq2len[seq]
    every_position = range(1, last_position + 1)
    return naive_calling(seq, every_position)


def naive_calling_cp2seq(seq):
    """Does naive variant calling across just the CP 2 positions in a sequence (should be in SEQS).

    Genes are read from ../seqs/genes/{seq}.sco using parse_sco(); the resulting DataFrame's LeftEnd and
    RightEnd columns are treated as inclusive, 1-indexed gene coordinates.

    NOTE that this will filter only to positions that meet the exact criteria:

    - In a single gene (not in a position that is covered by overlapping genes).
    - In CP 2 within this single gene.

    Even if a position is in CP 2 of all the multiple genes it's covered by, we'll still ignore it.
    I'm pretty sure there should be very few positions that get tossed out as a result; my take is that
    it isn't worth the trouble to try to handle these positions.

    Returns the output of naive_calling(), called on just these single-gene CP 2 positions.
    """
    print(f"Identifying CP 2 positions in {seq2name[seq]} so we can use them as a decoy genome...")
    genes_df = parse_sco(f"../seqs/genes/{seq}.sco")

    cp2_positions = set()
    # Positions covered by at least one / at least two of the genes we've seen so far, respectively
    covered_positions = set()
    multi_gene_positions = set()

    for gene in genes_df.itertuples():
        gene_positions = range(int(gene.LeftEnd), int(gene.RightEnd) + 1)

        # Any position in this gene that was already covered by an earlier gene is in multiple genes
        multi_gene_positions.update(covered_positions.intersection(gene_positions))
        covered_positions.update(gene_positions)

        # Going from the left end to the right end of a gene, CPs go 123123123... for + strand genes
        # and 321321321... for - strand genes. Either way, CP 2 is at the second position of the gene,
        # and then at every third position after that -- so we don't need to look at the gene's strand.
        # NOTE(review): the 321321... pattern (also used by the earlier version of this code) is only
        # right for - strand genes whose lengths are divisible by 3 -- TODO confirm this always holds.
        cp2_positions.update(gene_positions[1::3])

    single_gene_cp2_positions = cp2_positions - multi_gene_positions
    print(f"In {seq2name[seq]}:")
    print(f"\tThere were {len(cp2_positions):,} CP 2 positions.")
    print(f"\tThere were {len(multi_gene_positions):,} positions in multiple genes.")
    print(f"\tThere were {len(single_gene_cp2_positions):,} CP 2 positions in only a single gene.")
    return naive_calling(seq, single_gene_cp2_positions)

### Naively call mutations in CAMP and compute $\mathrm{frac}_{\mathrm{decoy}}$

(We're treating CAMP as a "decoy" genome, where we assume that all called mutations within it will be incorrect.)

In [None]:
# Naively call mutations at every position in edge_6104. We discard the third returned value (the
# number of positions considered), since here that's just the length of the sequence.
camp_naive_p2called_mutations, camp_naive_p2numpermb, _ = naive_calling_fullseq("edge_6104")

Naively calling mutations in CAMP.
	Considering 1,289,244 / 1,289,244 (100.00%) positions.
	On the 1-th position (1) of the specified 1,289,244 positions (0.00%).
	On the 100,000-th position (100,000) of the specified 1,289,244 positions (7.76%).
	On the 200,000-th position (200,000) of the specified 1,289,244 positions (15.51%).
	On the 300,000-th position (300,000) of the specified 1,289,244 positions (23.27%).


In [None]:
# Naively call mutations at just the single-gene CP 2 positions in edge_6104. We hold on to the number
# of positions considered (num_camp_cp2_pos) so that we can mention it in the plot title later on.
camp_cp2_naive_p2called_mutations, camp_cp2_naive_p2numpermb, num_camp_cp2_pos = naive_calling_cp2seq("edge_6104")

### For comparison, naively call mutations in BACT1 and compute $\mathrm{frac}_{\mathrm{BACT1}}$

In [None]:
# Naively call mutations at every position in edge_1671 (the number of positions considered is discarded).
# bact1_naive_p2numpermb is read as a global by plot_bact1_fdr().
bact1_naive_p2called_mutations, bact1_naive_p2numpermb, _ = naive_calling_fullseq("edge_1671")

### Just so we can update the `misc-text/` file, also do this for BACT2

It's probably possible to get this information from another notebook, but this is the easiest way to handle it, in my opinion.

In [None]:
# Naively call mutations at every position in edge_2358 (the number of positions considered is discarded).
bact2_naive_p2called_mutations, bact2_naive_p2numpermb, _ = naive_calling_fullseq("edge_2358")

### Output info about FDR estimation for $p=0.5\%$ to `misc-text/`

In [None]:
# Total numbers of identified p-mutations at p = 0.5%
camp_nump = len(camp_naive_p2called_mutations[0.5])
bact1_nump = len(bact1_naive_p2called_mutations[0.5])
bact2_nump = len(bact2_naive_p2called_mutations[0.5])
# Scaled numbers of identified p-mutations per megabase (comparable across different-length genomes
# [at least, if you assume that genome length is the only confounding factor here, which it isn't -- we
# should mention this in the paper ofc])
camp_numpermb = camp_naive_p2numpermb[0.5]
bact1_numpermb = bact1_naive_p2numpermb[0.5]
bact2_numpermb = bact2_naive_p2numpermb[0.5]

# Estimated FDR for edge_1671 at p = 0.5%, using edge_6104 as the decoy: the ratio of the decoy's
# mutations per megabase to the target's mutations per megabase.
bact1_fdr = camp_numpermb / bact1_numpermb

# This is LaTeX that gets \input into the paper, so backslashes need to be escaped as "\\" in these
# (non-raw) strings. The \frac{...}{...} braces are added using plain (non-f-) strings so that we don't
# have to double them up.
naiveinfo = (
    f"At $p=0.5$\\%, we identified {camp_nump:,}, {bact1_nump:,}, and {bact2_nump:,} $p$-mutations "
    f"in the {seq2name['edge_6104']}, {seq2name['edge_1671']}, and {seq2name['edge_2358']} genomes, "
    f"respectively. This illustrates that there exists a difference of nearly two orders of magnitude "
    f"in the number of $p$-mutations per megabase across these genomes "
    f"({camp_numpermb:,.2f}, {bact1_numpermb:,.2f}, and {bact2_numpermb:,.2f} for "
    f"{seq2name['edge_6104']}, {seq2name['edge_1671']}, and {seq2name['edge_2358']}, respectively). "
    f"If the {seq2name['edge_6104']} genome, which has a relatively low mutation rate, is "
    f"selected as a decoy, then we estimate the FDR for the {seq2name['edge_1671']} genome at $p=0.5\\%$ as "
    "$\\frac{" + f"{camp_numpermb:,.2f}" + "}" + "{" + f"{bact1_numpermb:,.2f}" + "}" + f" \\approx {bact1_fdr:.4f}$."
)
with open("misc-text/naive-calling-target-decoy.tex", "w") as out_file:
    # Ending the file with \endinput avoids extra whitespace after the \input'd text;
    # see https://tex.stackexchange.com/a/18018
    out_file.write(f"{naiveinfo}\\endinput")

## Plot estimated BACT1 FDR vs. scaled number of identified mutations

In [None]:
def plot_bact1_fdr(decoy_p2numpermb, decoy_info, fig_basename):
    """Plots the estimated FDR of naive calling in edge_1671 against its number of called p-mutations.

    For each p in "percentages", the estimated FDR (x-axis) is decoy_p2numpermb[p] divided by
    bact1_naive_p2numpermb[p], and the y-axis value is bact1_naive_p2numpermb[p] (shown on a symlog scale).
    A handful of values of p are highlighted in red and labelled with text.

    decoy_p2numpermb should be a dict mapping every p in "percentages" to the number of called p-mutations
    per megabase in the decoy (e.g. the second element of the tuple returned by naive_calling()).

    decoy_info should be a string describing the decoy; it's shown in the second line of the plot's title.

    fig_basename should be the name (without a file extension) of the figure to write: the figure will
    be saved to figs/{fig_basename}.png.

    Raises a ZeroDivisionError if bact1_naive_p2numpermb[p] is zero for any p in "percentages".
    """
    # Manually-positioned labels (yeah, i know i know). Maps the values of p that we'll label to the
    # (dx, dy) offset, in data coordinates, from that p's point to the label's text.
    p2label_offset = {
        50: (-0.01, -15),
        25: (-0.01, 20),
        13: (0.005, -5),
        10: (0.0, 80),
        5: (0.005, 0),
        2: (0.005, -10**3),
        1: (-0.015, 10**3.3),
        0.5: (-0.005, 10**3.5),
        0.25: (-0.01, -10**3.6),
        0.2: (-0.01, 10**3.65),
        0.15: (-0.01, -10**3.8),
        0.1: (-0.02, -10**4.25),
        0.05: (-0.03, 0),
    }

    # x-axis: estimated FDR for each p
    x = [decoy_p2numpermb[p] / bact1_naive_p2numpermb[p] for p in percentages]
    # y-axis: number of mutations per megabase for each p
    y = [bact1_naive_p2numpermb[p] for p in percentages]

    fig, ax = pyplot.subplots(1)

    # Coordinates of the labelled points, so we can highlight them on top of the curve
    special_x = []
    special_y = []
    for p, cx, cy in zip(percentages, x, y):
        offset = p2label_offset.get(p)
        if offset is None:
            continue
        dx, dy = offset
        # Use just enough decimal places to show each labelled p exactly
        if p >= 1:
            p_text = f"{p:.0f}"
        elif p >= 0.5:
            p_text = f"{p:.1f}"
        else:
            p_text = f"{p:.2f}"
        # "\\%" is a literal backslash followed by a percent sign ("\%" is an invalid escape sequence
        # in a non-raw string, and newer versions of Python complain about it)
        ax.text(cx + dx, cy + dy, f"$p = {p_text}\\%$")
        special_x.append(cx)
        special_y.append(cy)

    ax.plot(x, y, marker="o", color="#666666")
    # The high zorder draws the highlighted points on top of the curve's markers
    ax.scatter(special_x, special_y, color="#ff0000", zorder=2000, s=20)
    ax.set_xlabel(f"Estimated FDR for called $p$-mutations in {seq2name['edge_1671']}")
    ax.set_ylabel("Number of called $p$-mutations per megabase")
    # Report the actual number of values of p, so the title stays correct if "percentages" changes
    ax.set_title(
        f"{seq2name['edge_1671']} FDR curve, using {len(percentages):,} values of $p$ from "
        f"{max(percentages):.0f}% to {min(percentages):.2f}%\nDecoy genome: {decoy_info}",
        fontsize=16,
    )
    use_thousands_sep(ax.yaxis)
    ax.set_yscale("symlog")
    fig.set_size_inches(15, 8)
    fig.savefig(f"figs/{fig_basename}.png", bbox_inches="tight")

### Plot BACT1 FDR with all of CAMP as a decoy

In [None]:
# Decoy: naive calls across every position of edge_6104
plot_bact1_fdr(
    decoy_p2numpermb=camp_naive_p2numpermb,
    decoy_info=f"all {seq2len['edge_6104']:,} positions within {seq2name['edge_6104']}",
    fig_basename="BACT1_FDR_CAMP_decoy",
)

### Plot BACT1 FDR with just CP 2 positions in CAMP as a decoy

In [None]:
# Decoy: naive calls across only the single-gene CP 2 positions of edge_6104
plot_bact1_fdr(
    decoy_p2numpermb=camp_cp2_naive_p2numpermb,
    decoy_info=f"only the {num_camp_cp2_pos:,} CP 2 positions in {seq2name['edge_6104']}",
    fig_basename="BACT1_FDR_CAMP_CP2_decoy",
)