# Generate misc. text files that can be loaded directly into LaTeX

The objective of this notebook is to reduce the amount of text I have to keep updating in the report manually -- this should both reduce errors and save time when re-running things.

This file does not necessarily generate all of the stuff in `misc-text/` -- I will probably split this up if it gets unwieldy.

In [1]:
# Run the shared setup notebooks inside this kernel's namespace. Names used further down that
# are not defined in this notebook (e.g. SEQS, seq2len, seq2name) are presumably defined by
# one of these two notebooks -- TODO confirm.
%run "Header.ipynb"
%run "GeneUtils.ipynb"
import pileup
# Load the pileup data once; the cells below index it as seq2pos2pileup[seq][pos].
seq2pos2pileup = pileup.load()

## Describe the number of mutated positions with "ties" in the most-common mutation

As might be expected, there are not a lot of these positions.

In [10]:
# Threshold handed to pileup.naively_call_mutation() when deciding whether a position is mutated.
# NOTE(review): presumably a percentage (i.e. p = 0.5%), matching the section headers -- confirm.
p = 0.5

# Number of mutated positions at which the highest mismatch count is shared by 2+ nucleotides.
numties = 0
for seq in SEQS:
    pos2pileup = seq2pos2pileup[seq]
    for pos in range(1, seq2len[seq] + 1):
        # Look the pileup up once per position instead of once per use.
        pospileup = pos2pileup[pos]
        if not pileup.naively_call_mutation(pospileup, p):
            continue
        alts = pileup.get_mismatch_cts(pospileup)
        # A "tie" means the maximum mismatch count occurs more than once.
        if alts.count(max(alts)) > 1:
            numties += 1
            # Printing this is just for my own sanity
            # ALSO: If you're wondering why we only got one warning in the highly mutated gene tables
            # ntbk about this (as of writing), it's because just one of these two positions is in
            # a gene (in edge 1671); the other of these two positions is in an intergenic region in edge 2358
            print(f"Max-freq-alt-nt tie at seq {seq}, pos {pos:,}, pileup {pospileup}.")

with open("misc-text/num-alt-nt-ties.tex", "w") as of:
    # see https://tex.stackexchange.com/a/18018
    # The backslash is escaped explicitly: "\e" is not a valid Python escape sequence, and relying
    # on it being passed through unchanged triggers a warning on modern Python versions.
    of.write("{}\\endinput".format(numties))

Max-freq-alt-nt tie at seq edge_1671, pos 1,381,119, pileup [[38, 38, 0, 1414], 3, 0].
Max-freq-alt-nt tie at seq edge_2358, pos 47,221, pileup [[14, 2637, 6, 14], 1, 4].


## Count the number of reads with supplementary alignments, and with _overlapping_ supplementary alignments

In [4]:
import pysam
from itertools import combinations
from collections import defaultdict
from statistics import mean

# Alignment before (bf) and after (bf2) filtering out overlapping supplementary alignments (OSAs).
# NOTE(review): both files are left open after this cell, exactly as before; close them once it
# is confirmed that no other cell still needs them.
bf = pysam.AlignmentFile("../main-workflow/output/aln-sorted.bam", "rb")
bf2 = pysam.AlignmentFile("../main-workflow/output/overlap-supp-aln-filtered-and-sorted-aln.bam", "rb")


def _range_overlap_len(rng_a, rng_b):
    """Return how many integers two step-1 ranges have in common (0 if they are disjoint).

    Equivalent to len(set(rng_a) & set(rng_b)), but computed from the endpoints so that we do
    not have to materialize a set holding every reference position covered by each alignment.
    """
    return max(0, min(rng_a.stop, rng_b.stop) - max(rng_a.start, rng_b.start))


def _mean_or_nan(values):
    """Return mean(values), or NaN (rather than raising StatisticsError) if values is empty."""
    return mean(values) if values else float("nan")


def _pct_or_nan(numerator, denominator):
    """Return 100 * (numerator / denominator), or NaN if the denominator is zero."""
    return 100 * (numerator / denominator) if denominator else float("nan")


outputtext = (
    "Before filtering overlapping supplementary alignments and before filtering\n"
    "partially-mapped reads, the following supplementary alignment statistics held.\n"
)

for seq in SEQS:
    seqname = seq2name[seq]
    print(f"On seq {seqname}.")

    # Pass 1: for every read name, collect the reference range of each of its alignments to this
    # sequence, and remember whether any of those alignments was flagged as supplementary.
    read2refranges = defaultdict(list)
    read2atleast_one_supp_seen = defaultdict(bool)
    for read in bf.fetch(seq):
        rn = read.query_name
        read2refranges[rn].append(range(read.reference_start, read.reference_end))
        if read.is_supplementary:
            read2atleast_one_supp_seen[rn] = True

    # Pass 2: a read with more than one alignment to this sequence is counted as a read with
    # supplementary alignments; for those reads, measure the overlap of every pair of alignments.
    reads_with_supp_ct = 0
    reads_with_overlap_ct = 0
    overlap_lens = []
    num_alns_of_reads_with_supp = []
    for refranges in read2refranges.values():
        if len(refranges) < 2:
            continue
        reads_with_supp_ct += 1
        num_alns_of_reads_with_supp.append(len(refranges))
        pair_overlaps = (_range_overlap_len(a, b) for a, b in combinations(refranges, 2))
        aln_overlaps = [olen for olen in pair_overlaps if olen > 0]
        if aln_overlaps:
            reads_with_overlap_ct += 1
            overlap_lens.extend(aln_overlaps)

    numreads = len(read2refranges)
    # These all fall back to NaN (instead of crashing) if a sequence has no reads, no reads with
    # multiple alignments, or no overlapping alignments; NaN then shows up visibly in the text.
    pctreadswithsupp = _pct_or_nan(reads_with_supp_ct, numreads)
    pctreadswithoverlaps = _pct_or_nan(reads_with_overlap_ct, numreads)
    avgoverlap = _mean_or_nan(overlap_lens)
    avg_num_alignments = _mean_or_nan(num_alns_of_reads_with_supp)

    print(f"Average number of alignments of reads with supp alignments: {avg_num_alignments:,.5f}")

    print(f"Unique reads in unfiltered aln: {numreads:,}")

    # Count the unique reads in the OSA-filtered alignment, so that the number can be compared by
    # eye with the unfiltered count printed above. Note that this only prints the count -- nothing
    # is asserted here. Eventually this code should be removed from here and stored in a
    # dedicated test module.
    unique_reads_in_filtered_aln = {read2.query_name for read2 in bf2.fetch(seq)}
    numreads2 = len(unique_reads_in_filtered_aln)
    print(f"Unique reads in filtered aln: {numreads2:,}")

    # Add extra line break btwn. adjacent sentences
    outputtext += "\n"

    # NOTE: the "\\emph{and}" piece is deliberately a plain (non-f) string, since its braces are
    # LaTeX syntax rather than a replacement field.
    outputtext += (
        f"In the {seqname} genome, {reads_with_supp_ct:,} / {numreads:,}"
        f" ({pctreadswithsupp:.2f}\\%) unique reads aligned to within the genome had supplementary alignments"
        f" within {seqname}.\nOn average, these {reads_with_supp_ct:,} reads had {avg_num_alignments:.2f}"
        f" alignments within {seqname}.\n"
        f"Furthermore, {reads_with_overlap_ct:,} / {numreads:,} ({pctreadswithoverlaps:.2f}\\%)"
        f" unique reads aligned to within {seqname} had supplementary alignments within {seqname}"
        " \\emph{and} had reference overlap between at least one pair of their alignments within"
        f" {seqname}.\nThe"
        f" average length of these overlaps (considering all pairs of overlapping alignments"
        f" within {seqname} from the same read) was {avgoverlap:,.2f} bp.\n"
    )

    # Also really quick, validate that none of the reads with multiple alignments lack supp alignments.
    # This could happen if we forgot to filter secondary reads or something.
    for r, refranges in read2refranges.items():
        if len(refranges) > 1 and not read2atleast_one_supp_seen[r]:
            print(f"Read {r} had no supplementary alignments but still has multiple alignments???")

with open("misc-text/overlapping-supp-aln-stats.tex", "w") as of:
    # "\\endinput" is escaped explicitly; "\e" is not a valid Python escape sequence.
    of.write("{}\\endinput".format(outputtext))

On seq CAMP.
Average number of alignments of reads with supp alignments: 2.07484
Unique reads in unfiltered aln: 503,385
Unique reads in filtered aln: 498,584
On seq BACT1.
Average number of alignments of reads with supp alignments: 2.04318
Unique reads in unfiltered aln: 268,075
Unique reads in filtered aln: 267,556
On seq BACT2.
Average number of alignments of reads with supp alignments: 2.04184
Unique reads in unfiltered aln: 745,461
Unique reads in filtered aln: 744,512


## Count the codon positions of the $p = 0.5\%$ mutations in the highest-mutation-rate gene in CAMP

In [12]:
import parse_sco
# Gene predictions for CAMP (edge_6104), parsed from its .sco file.
camp_genes = parse_sco.parse_sco("../seqs/genes/edge_6104.sco")

# Sanity check that the gene of interest is still where we expect it to be. Its coordinates
# might shift slightly if we change some prodigal parameters -- in that case, the gene number
# and the coordinates hard-coded here will need to be updated.
g = camp_genes.loc[1217]
assert g.LeftEnd == 1208927
assert g.RightEnd == 1210075

In [14]:
# Thresholds passed to pileup.naively_call_mutation(). Every position called at pLo is asserted
# below to also be called at pHi but NOT at pCeil -- this is what backs up the "close to 16%"
# wording in the generated text.
pCeil = 20
pHi = 15
pLo = 0.5

# Largest value of pileup.get_alt_nt_pct() seen at any position that is NOT called at pLo.
# NOTE(review): this is multiplied by 100 when formatted below, so get_alt_nt_pct() presumably
# returns a fraction rather than a percentage -- confirm.
max_non_mutated_alt_pct = float("-inf")

mutation_count = 0
# Records number of mutations at CP 1, 2, 3 within this gene
mutated_cp_count = [0, 0, 0]
# NOTE(review): codon positions are numbered starting from g.LeftEnd, which presumably assumes
# that this gene is on the forward strand -- confirm against the gene's strand.
for offset, pos in enumerate(range(g.LeftEnd, g.RightEnd + 1)):
    # Codon position cycles 1, 2, 3, 1, 2, 3, ... along the gene.
    cp = (offset % 3) + 1
    pospileup = seq2pos2pileup["edge_6104"][pos]
    if pileup.naively_call_mutation(pospileup, pLo):
        mutated_cp_count[cp - 1] += 1
        mutation_count += 1
        print(f"Found CP {cp} mutation at position {pos:,}. Pileup: {pospileup}")

        # ensure that all p = 0.5% mutations are also the close-to-p=16% mutations. Not a big deal if not,
        # but the text does describe this so we should make sure we're correct.
        assert pileup.naively_call_mutation(pospileup, pHi)
        assert not pileup.naively_call_mutation(pospileup, pCeil)
    else:
        max_non_mutated_alt_pct = max(max_non_mutated_alt_pct, pileup.get_alt_nt_pct(pospileup))

# If this isn't true, something went very wrong!
assert mutation_count == sum(mutated_cp_count)

cp1, cp2, cp3 = mutated_cp_count
# The LaTeX percent signs are written as "\\%": "\%" is not a valid Python escape sequence.
text = (
    f"{mutation_count:,} positions ({cp1:,} in CP1, {cp2:,} in CP2, and {cp3:,} in CP3) have mutation "
    "frequencies close to 16\\%, while all remaining positions in this gene have mutation rates of at "
    f"most {max_non_mutated_alt_pct * 100:.2f}\\%."
)
print("-" * 79)
print(text)

with open("misc-text/camp-g1217-cpstats.tex", "w") as of:
    # see https://tex.stackexchange.com/a/18018
    of.write("{}\\endinput".format(text))

Found CP 3 mutation at position 1,209,001. Pileup: [[3757, 1, 709, 3], 0, 0]
Found CP 3 mutation at position 1,209,010. Pileup: [[3744, 703, 0, 0], 0, 26]
Found CP 3 mutation at position 1,209,022. Pileup: [[3758, 0, 711, 1], 0, 0]
Found CP 3 mutation at position 1,209,058. Pileup: [[0, 3773, 0, 718], 1, 0]
Found CP 1 mutation at position 1,209,104. Pileup: [[1, 3777, 0, 713], 1, 4]
Found CP 3 mutation at position 1,209,115. Pileup: [[706, 3773, 0, 1], 1, 10]
Found CP 3 mutation at position 1,209,121. Pileup: [[713, 1, 3778, 0], 2, 2]
Found CP 2 mutation at position 1,209,126. Pileup: [[3777, 1, 712, 1], 0, 3]
Found CP 3 mutation at position 1,209,133. Pileup: [[710, 0, 3775, 2], 2, 6]
Found CP 3 mutation at position 1,209,136. Pileup: [[1, 717, 1, 3772], 3, 1]
Found CP 3 mutation at position 1,209,142. Pileup: [[717, 0, 3777, 0], 2, 0]
Found CP 3 mutation at position 1,209,145. Pileup: [[1, 3776, 0, 715], 1, 1]
Found CP 3 mutation at position 1,209,148. Pileup: [[3778, 0, 0, 714], 0, 

## How many $p = 1\%$ mutations across each of the three MAGs have at least one deletion aligned?

This is likely a reason why the BACT1 smooth phasing graph looks so simple, in comparison -- a lot of stuff has been filtered out.

In [22]:
# Threshold handed to pileup.naively_call_mutation(); the printed messages describe it as "p = 1%".
p = 1

# Per-sequence tallies: the total number of called mutations, and the number of those whose
# pileup has at least 1 / 5 / 15 deletions (as reported by pileup.get_deletions()).
num_muts_total = defaultdict(int)
num_muts_with_at_least_1_deletion = defaultdict(int)
num_muts_with_at_least_5_deletions = defaultdict(int)
num_muts_with_at_least_15_deletions = defaultdict(int)

# Minimum deletion count -> the tally it feeds. Looping over this table replaces three nested
# ifs and three copy-pasted print() calls; the individual dicts keep their names so that they
# can still be inspected directly.
min_deletions2tally = {
    1: num_muts_with_at_least_1_deletion,
    5: num_muts_with_at_least_5_deletions,
    15: num_muts_with_at_least_15_deletions,
}

for seq in SEQS:
    pos2pileup = seq2pos2pileup[seq]
    for pos in range(1, seq2len[seq] + 1):
        pospileup = pos2pileup[pos]
        if not pileup.naively_call_mutation(pospileup, p):
            continue
        num_muts_total[seq] += 1
        num_del = pileup.get_deletions(pospileup)
        for min_deletions, tally in min_deletions2tally.items():
            if num_del >= min_deletions:
                tally[seq] += 1

    total = num_muts_total[seq]
    print(f"{seq2name[seq]}:")
    for min_deletions, tally in min_deletions2tally.items():
        # Reading tally[seq] also creates a zero entry for sequences without any such mutations.
        num = tally[seq]
        # Avoid a ZeroDivisionError for a sequence with no called mutations at all.
        pct = (num / total) * 100 if total else float("nan")
        noun = "deletion" if min_deletions == 1 else "deletions"
        print(
            f"  {num:,} / {total:,} ({pct:.2f}%) "
            f"p = {p}% muts have ≥ {min_deletions} {noun} in the pileup."
        )

# NOTE: these statistics are only printed; this cell does not write anything to misc-text/.

CAMP:
  57 / 83 (68.67%) p = 1% muts have ≥ 1 deletion in the pileup.
  23 / 83 (27.71%) p = 1% muts have ≥ 5 deletions in the pileup.
  12 / 83 (14.46%) p = 1% muts have ≥ 15 deletions in the pileup.
BACT1:
  8,269 / 22,415 (36.89%) p = 1% muts have ≥ 1 deletion in the pileup.
  2,159 / 22,415 (9.63%) p = 1% muts have ≥ 5 deletions in the pileup.
  872 / 22,415 (3.89%) p = 1% muts have ≥ 15 deletions in the pileup.
BACT2:
  274 / 380 (72.11%) p = 1% muts have ≥ 1 deletion in the pileup.
  189 / 380 (49.74%) p = 1% muts have ≥ 5 deletions in the pileup.
  151 / 380 (39.74%) p = 1% muts have ≥ 15 deletions in the pileup.


In [17]:
# Display the per-sequence tally (keyed by sequence name) of called mutations with >= 1 deletion.
num_muts_with_at_least_1_deletion

defaultdict(int, {'edge_6104': 57, 'edge_1671': 8269, 'edge_2358': 274})

In [18]:
# Display the per-sequence tally (keyed by sequence name) of called mutations with >= 5 deletions.
num_muts_with_at_least_5_deletions

defaultdict(int, {'edge_6104': 23, 'edge_1671': 2159, 'edge_2358': 189})