# Generate misc. text files that can be loaded directly into LaTeX

The objective of this notebook is to reduce the number of values I have to keep updating manually in the report -- this should both reduce errors and save time when re-running things.

This file does not necessarily generate all of the stuff in `misc-text/` -- I will probably split this up if it gets unwieldy.

In [1]:
# Execute the shared notebooks first. Names that later cells use without
# defining them here (e.g. SEQS, seq2len, seq2name) presumably come from
# these two notebooks -- TODO confirm which notebook defines which name.
%run "Header.ipynb"
%run "GeneUtils.ipynb"
import pileup
# Load the pileup data once; later cells index it as seq2pos2pileup[seq][pos].
seq2pos2pileup = pileup.load()

## Describe the number of mutated positions with "ties" in the most-common mutation

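As might be expected, there are not a lot of these positions. (Here, a "tie" means that the largest mismatch count at a mutated position is shared by two or more alternate nucleotides.)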

In [2]:
# Threshold passed to pileup.naively_call_mutation(): 0.5 / 100 = 0.005
# (presumably a 0.5% mutation-frequency cutoff -- confirm in pileup).
p = 0.5 / 100

# Count the positions that are called as mutated and whose largest mismatch
# count occurs more than once in the counts from pileup.get_mismatch_cts().
numties = 0
for seq in SEQS:
    pos2pileup = seq2pos2pileup[seq]
    # Positions are 1-indexed, so iterate over [1, sequence length].
    for pos in range(1, seq2len[seq] + 1):
        # Look the pileup up once and reuse it for both pileup calls.
        pos_pileup = pos2pileup[pos]
        if pileup.naively_call_mutation(pos_pileup, p):
            alts = pileup.get_mismatch_cts(pos_pileup)
            if alts.count(max(alts)) > 1:
                numties += 1

with open("misc-text/num-alt-nt-ties.tex", "w") as of:
    # see https://tex.stackexchange.com/a/18018
    # The backslash is escaped explicitly: "\e" is not a valid Python escape
    # sequence, and relying on it triggers a warning on current Python versions.
    # The text written to the file is unchanged.
    of.write("{}\\endinput".format(numties))

## Count the number of reads with supplementary alignments, and with _overlapping_ supplementary alignments

In [3]:
import pysam
from itertools import combinations
from collections import defaultdict
from statistics import mean

def _overlap_len(rng_a, rng_b):
    """Return how many integers two step-1 ``range`` objects have in common.

    This equals ``len(set(rng_a) & set(rng_b))`` but only looks at the range
    endpoints, so it does not materialize every position of both ranges.
    Returns 0 if the ranges do not intersect (or if either one is empty).
    """
    return max(0, min(rng_a.stop, rng_b.stop) - max(rng_a.start, rng_b.start))


# NOTE(review): bf and bf2 are never closed in this cell. They are left open in
# case later cells read from them; close them (or use `with`) once confirmed.
bf = pysam.AlignmentFile("../main-workflow/output/aln-sorted.bam", "rb")
bf2 = pysam.AlignmentFile("../main-workflow/output/overlap-supp-aln-filtered-and-sorted-aln.bam", "rb")

# LaTeX text accumulated across all sequences and written out at the end.
outputtext = (
    "Before filtering overlapping supplementary alignments and before filtering\n"
    "partially-mapped reads, the following supplementary alignment statistics held.\n"
)

for seq in SEQS:
    print(f"On seq {seq2name[seq]}.")
    # Read name -> list of reference ranges, one per alignment of that read to this sequence.
    read2refranges = defaultdict(list)
    # Read name -> True once at least one alignment flagged as supplementary has been seen.
    read2atleast_one_supp_seen = defaultdict(bool)
    for read in bf.fetch(seq):
        rn = read.query_name
        read2refranges[rn].append(range(read.reference_start, read.reference_end))
        if read.is_supplementary:
            read2atleast_one_supp_seen[rn] = True

    reads_with_supp_ct = 0
    overlap_lens = []
    num_alns_of_reads_with_supp = []
    reads_with_overlap_ct = 0
    for refranges in read2refranges.values():
        # A read with more than one alignment to this sequence is counted as
        # having supplementary alignments (this is sanity-checked further below).
        if len(refranges) > 1:
            reads_with_supp_ct += 1
            num_alns_of_reads_with_supp.append(len(refranges))
            # Record the overlap length of every pair of this read's alignments that overlap.
            aln_overlaps = []
            for rng_a, rng_b in combinations(refranges, 2):
                overlap_len = _overlap_len(rng_a, rng_b)
                if overlap_len > 0:
                    aln_overlaps.append(overlap_len)
            if aln_overlaps:
                reads_with_overlap_ct += 1
                overlap_lens += aln_overlaps

    # NOTE: these statistics fail loudly on empty data -- the divisions raise
    # ZeroDivisionError if no reads were fetched, and statistics.mean() raises
    # StatisticsError if no read had multiple / overlapping alignments.
    numreads = len(read2refranges)
    pctreadswithsupp = 100 * (reads_with_supp_ct / numreads)
    pctreadswithoverlaps = 100 * (reads_with_overlap_ct / numreads)
    avgoverlap = mean(overlap_lens)
    avg_num_alignments = mean(num_alns_of_reads_with_supp)

    print(f"Average number of alignments of reads with supp alignments: {avg_num_alignments:,.5f}")

    print(f"Unique reads in unfiltered aln: {numreads:,}")

    # This small block of code verifies that the OSA-filtered alignment doesn't actually remove any UNIQUE
    # reads. Eventually this code should be removed from here and stored in a dedicated test module.
    unique_reads_in_filtered_aln = {filtered_read.query_name for filtered_read in bf2.fetch(seq)}
    numreads2 = len(unique_reads_in_filtered_aln)
    print(f"Unique reads in filtered aln: {numreads2:,}")
    if numreads2 != numreads:
        raise ValueError(f"Looks like the OSA-filtered alignment removed unique reads? {numreads:,} -> {numreads2:,}")

    # Add extra line break btwn. adjacent sentences
    outputtext += "\n"

    # Backslashes meant for LaTeX are written as "\\" so that no invalid Python
    # escape sequence (e.g. "\e") is relied upon; the emitted text is unchanged.
    outputtext += (
        f"In the {seq2name[seq]} genome, {reads_with_supp_ct:,} / {numreads:,}"
        f" ({pctreadswithsupp:.2f}\\%) unique reads aligned to within the genome had supplementary alignments"
        f" within the genome.\nOn average, these {reads_with_supp_ct:,} reads had {avg_num_alignments:.2f} alignments.\n"
        f"Furthermore, {reads_with_overlap_ct:,} / {numreads:,} ({pctreadswithoverlaps:.2f}\\%)"
        " unique reads aligned to within the genome had supplementary alignments within the genome"
        " \\emph{and} had overlap between at least one pair of their alignments on the reference genome.\nThe"
        " average length of these overlaps (considering all pairs of overlapping alignments"
        f" from the same read) was {avgoverlap:,.2f} bp.\n"
    )

    # Also really quick, validate that none of the reads with multiple alignments lack supp alignments.
    # This could happen if we forgot to filter secondary reads or something.
    for r in read2refranges:
        if len(read2refranges[r]) > 1:
            if not read2atleast_one_supp_seen[r]:
                print(f"Read {r} had no supplementary alignments but still has multiple alignments???")

with open("misc-text/overlapping-supp-aln-stats.tex", "w") as of:
    # "\\endinput" writes a literal \endinput, as in the other generated .tex files.
    of.write("{}\\endinput".format(outputtext))

On seq CAMP.
Average number of alignments of reads with supp alignments: 2.07484
Unique reads in unfiltered aln: 503,385
Unique reads in filtered aln: 503,385
On seq BACTERIA.
Average number of alignments of reads with supp alignments: 2.04318
Unique reads in unfiltered aln: 268,075
Unique reads in filtered aln: 268,075
On seq BACTEROIDALES.
Average number of alignments of reads with supp alignments: 2.04184
Unique reads in unfiltered aln: 745,461
Unique reads in filtered aln: 745,461
