# Demonstrate the "target-decoy" approach, as applied to metagenomic variant calling

In [1]:
%run "Header.ipynb"

In [2]:
import skbio
import pileup
from parse_sco import parse_sco

In [3]:
# Load the pileup data. It is indexed below as seq2pos2pileup[seq][pos], where pos is a 1-indexed
# position within the sequence.
seq2pos2pileup = pileup.load()

## First: naive variant calling (call a position as "$p$-mutated" if $freq(pos) > p$)

We don't limit ourselves to "sufficiently-covered" positions here -- we consider all positions throughout a genome.

In [6]:
# Thresholds p, expressed as percentages (they are printed as "p = {p}%" below), at which
# p-mutations are called.
percentages = [50, 25, 10, 5, 2, 1, 0.5, 0.25, 0.1, 0.05]
# get_p2pct() is not defined in the visible cells (presumably it comes from Header.ipynb -- confirm).
# Its result is indexed by the values in "percentages", and each looked-up value is passed to
# pileup.naively_call_mutation().
p2pct = get_p2pct(percentages)

In [7]:
# Maps seq name -> list of 1-indexed positions where we "call" a mutation
# NOTE(review): nothing in the visible cells reads or writes this dict -- confirm it is still needed.
seq2naively_called_mutations = {}

In [34]:
def naive_calling(seq, verbose=True):
    """Naively calls p-mutations at every position of a sequence, for every p in "percentages".

    seq should be a key of seq2len, seq2name, and seq2pos2pileup.

    If verbose is True, prints a progress message every 100,000 positions and, for each value of p,
    a summary of the number of called p-mutations.

    A position is "called" for a given p if pileup.naively_call_mutation(), given the position's
    pileup and p2pct[p], returns a truthy value.

    Returns a tuple with two elements:

    1. p2called_mutations: A dict mapping values in "percentages" (defined above) to a list of 1-indexed
       "called" p-mutations in the sequence, using this percentage for p.

    2. p2numpermb: A dict with the same keys as p2called_mutations, but the values are the number of called
       p-mutations per megabase (1,000,000 bp = 1 Mbp) in this sequence.
    """
    p2called_mutations = {p: [] for p in percentages}
    p2numpermb = {}
    seqlen = seq2len[seq]
    # This lookup doesn't depend on the position, so do it once rather than once per position.
    pos2pileup = seq2pos2pileup[seq]
    for pos in range(1, seqlen + 1):
        if verbose and pos % 100000 == 0:
            print(f"On {seq2name[seq]} position {pos:,} / {seqlen:,} ({100 * (pos / seqlen):.2f}%).")
        pos_pileup = pos2pileup[pos]
        for p in percentages:
            if pileup.naively_call_mutation(pos_pileup, p2pct[p]):
                p2called_mutations[p].append(pos)

    # We have the equation
    #
    #   # called muts              f
    # ------------------  = ----------------
    #  seq length (bp)        1,000,000 bp
    #
    # We know everything except for f. We can solve for f by multiplying the left side of the
    # equation by 1,000,000 bp. Since the only variable across different thresholds of p is the number
    # of called mutations, we can pre-compute this "constant length factor" (aka 1,000,000 / seq length).
    constant_length_factor = 1000000 / seqlen
    for p in p2called_mutations:
        num_called_mutations = len(p2called_mutations[p])
        # solve the equation above
        f = num_called_mutations * constant_length_factor
        p2numpermb[p] = f
        if verbose:
            print(f"p = {p}%: {num_called_mutations:,} called p-mutations in {seq2name[seq]}.")
            print(f"\tNumber of called p-mutations per megabase: f = {f:,.2f}.")

    return (p2called_mutations, p2numpermb)

### Naively call mutations in CAMP and compute $\mathrm{frac}_{\mathrm{decoy}}$

(We're treating CAMP as a "decoy" genome, where we assume that all called mutations within it will be incorrect.)

In [35]:
# "edge_6104" is the sequence that seq2name labels "CAMP" (see the progress messages printed below).
camp_naive_p2called_mutations, camp_naive_p2numpermb = naive_calling("edge_6104")

On CAMP position 100,000 / 1,289,244 (7.76%).
On CAMP position 200,000 / 1,289,244 (15.51%).
On CAMP position 300,000 / 1,289,244 (23.27%).
On CAMP position 400,000 / 1,289,244 (31.03%).
On CAMP position 500,000 / 1,289,244 (38.78%).
On CAMP position 600,000 / 1,289,244 (46.54%).
On CAMP position 700,000 / 1,289,244 (54.30%).
On CAMP position 800,000 / 1,289,244 (62.05%).
On CAMP position 900,000 / 1,289,244 (69.81%).
On CAMP position 1,000,000 / 1,289,244 (77.56%).
On CAMP position 1,100,000 / 1,289,244 (85.32%).
On CAMP position 1,200,000 / 1,289,244 (93.08%).
p = 50%: 0 called p-mutations in CAMP.
	Number of called p-mutations per megabase: f = 0.00.
p = 25%: 0 called p-mutations in CAMP.
	Number of called p-mutations per megabase: f = 0.00.
p = 10%: 35 called p-mutations in CAMP.
	Number of called p-mutations per megabase: f = 27.15.
p = 5%: 35 called p-mutations in CAMP.
	Number of called p-mutations per megabase: f = 27.15.
p = 2%: 52 called p-mutations in CAMP.
	Number of called

### For comparison, naively call mutations in BACTERIA and compute $\mathrm{frac}_{\mathrm{BACTERIA}}$

In [36]:
# "edge_1671" is the sequence that seq2name labels "BACTERIA" (see the progress messages printed below).
bacteria_naive_p2called_mutations, bacteria_naive_p2numpermb = naive_calling("edge_1671")

On BACTERIA position 100,000 / 2,153,394 (4.64%).
On BACTERIA position 200,000 / 2,153,394 (9.29%).
On BACTERIA position 300,000 / 2,153,394 (13.93%).
On BACTERIA position 400,000 / 2,153,394 (18.58%).
On BACTERIA position 500,000 / 2,153,394 (23.22%).
On BACTERIA position 600,000 / 2,153,394 (27.86%).
On BACTERIA position 700,000 / 2,153,394 (32.51%).
On BACTERIA position 800,000 / 2,153,394 (37.15%).
On BACTERIA position 900,000 / 2,153,394 (41.79%).
On BACTERIA position 1,000,000 / 2,153,394 (46.44%).
On BACTERIA position 1,100,000 / 2,153,394 (51.08%).
On BACTERIA position 1,200,000 / 2,153,394 (55.73%).
On BACTERIA position 1,300,000 / 2,153,394 (60.37%).
On BACTERIA position 1,400,000 / 2,153,394 (65.01%).
On BACTERIA position 1,500,000 / 2,153,394 (69.66%).
On BACTERIA position 1,600,000 / 2,153,394 (74.30%).
On BACTERIA position 1,700,000 / 2,153,394 (78.95%).
On BACTERIA position 1,800,000 / 2,153,394 (83.59%).
On BACTERIA position 1,900,000 / 2,153,394 (88.23%).
On BACTERIA p

### Estimate FDR of BACTERIA mutations, using CAMP as a decoy

In [40]:
# For each p, estimate the FDR of the target genome's (BACTERIA's) called p-mutations as the ratio of
# the decoy genome's (CAMP's) called p-mutations per megabase to the target's called p-mutations
# per megabase.
#
# If no p-mutations were called in the target genome for some p, then this ratio is undefined: we
# record NaN for that p rather than failing with a ZeroDivisionError.
p2bacteriafdr = {
    p: (
        camp_naive_p2numpermb[p] / bacteria_naive_p2numpermb[p]
        if bacteria_naive_p2numpermb[p]
        else float("nan")
    )
    for p in percentages
}
for p in percentages:
    print(
        f"Estimated FDR for p = {p}% = "
        f"{camp_naive_p2numpermb[p]:,.2f} / {bacteria_naive_p2numpermb[p]:,.2f} = "
        f"{p2bacteriafdr[p]:.2f}"
    )

Estimated FDR for p = 50% = 0.00 / 71.98 = 0.00
Estimated FDR for p = 25% = 0.00 / 79.87 = 0.00
Estimated FDR for p = 10% = 27.15 / 322.28 = 0.08
Estimated FDR for p = 5% = 27.15 / 3,455.94 = 0.01
Estimated FDR for p = 2% = 40.33 / 8,345.43 = 0.00
Estimated FDR for p = 1% = 64.38 / 10,512.24 = 0.01
Estimated FDR for p = 0.5% = 219.51 / 11,397.82 = 0.02
Estimated FDR for p = 0.25% = 1,310.85 / 14,342.01 = 0.09
Estimated FDR for p = 0.1% = 17,977.20 / 51,605.98 = 0.35
Estimated FDR for p = 0.05% = 101,366.38 / 280,698.28 = 0.36


### Output info about FDR estimation for $p=0.5\%$ to `misc-text/`

In [48]:
# Total numbers of identified p-mutations (for p = 0.5%)
camp1 = len(camp_naive_p2called_mutations[0.5])
bact1 = len(bacteria_naive_p2called_mutations[0.5])
# Scaled numbers of identified p-mutations per megabase (comparable across different-length genomes
# [at least, if you assume that genome length is the only confounding factor here, which it isn't -- we
# should mention this in the paper ofc])
camp2 = camp_naive_p2numpermb[0.5]
bact2 = bacteria_naive_p2numpermb[0.5]
# LaTeX sentence summarizing the FDR estimate. Within an f-string, "{{" and "}}" produce the literal
# braces that \frac needs, so "{{{x}}}" renders as "{" + the value of x + "}".
naiveinfo = (
    f"For example, at $p=0.5$\\%, we identified {camp1:,} $p$-mutations in the CAMP genome and {bact1:,} "
    f"$p$-mutations in the BACTERIA genome, corresponding respectively to {camp2:,.2f} and {bact2:,.2f} "
    "$p$-mutations per megabase. If the CAMP genome, which has a relatively low mutation rate, is "
    "selected as a decoy, then we estimate the FDR for the BACTERIA genome as "
    f"$\\frac{{{camp2:,.2f}}}{{{bact2:,.2f}}} \\approx {p2bacteriafdr[0.5]:.2f}$."
)
with open("misc-text/naive-calling-target-decoy.tex", "w", encoding="utf-8") as of:
    # see https://tex.stackexchange.com/a/18018
    # (The backslash is escaped explicitly: "\e" is not a valid escape sequence in a non-raw string.)
    of.write("{}\\endinput".format(naiveinfo))