# Generate misc. text files that can be loaded directly into LaTeX

The objective of this notebook is to reduce the amount of content in the report that I have to keep updating manually -- this should both reduce errors and save time when re-running things.

This file does not necessarily generate all of the stuff in `misc-text/` -- I will probably split this up if it gets unwieldy.

In [1]:
%run "Header.ipynb"
%run "LoadMutationJSONData.ipynb"
%run "GeneUtils.ipynb"

## Describe the number of mutated positions with "ties" in the most-common mutation

As might be expected, there are not a lot of these positions.

In [2]:
# Minimum mismatch frequency (0.5%) that a position must exceed to be
# "called" as mutated by pvc() below.
minfreq_pct = 0.5 / 100


def pvc(cov, mismatches):
    """"Pseudo variant caller": flag a position whose mismatch rate is high enough.

    Parameters
    ----------
    cov:
        Denominator of the mismatch rate (presumably the position's total
        coverage -- confirm against get_val() in the %run notebooks).
    mismatches:
        Numerator of the mismatch rate.

    Returns
    -------
    int
        1 if mismatches / cov is strictly greater than minfreq_pct, else 0.
        A cov of 0 yields 0 rather than raising ZeroDivisionError.
    """
    if cov == 0:
        return 0
    return 1 if (mismatches / cov) > minfreq_pct else 0


# Count the called positions where two or more entries of the per-position
# mismatch dict share the highest count (a "tie" for the most-common mutation).
numties = 0
for seq in SEQS:
    for pos in seq2pos2totalcov[seq]:
        # get_val() comes from one of the %run notebooks; it is handed pvc and
        # its result is used here as a truthy "is this position called?" flag.
        if not get_val(seq, pos, pvc):
            continue
        alts = seq2pos2mismatches[seq][pos]
        top_count = max(alts.values())
        num_top = sum(1 for ct in alts.values() if ct == top_count)
        if num_top > 1:
            numties += 1

with open("misc-text/num-alt-nt-ties.tex", "w") as of:
    # see https://tex.stackexchange.com/a/18018
    # The backslash is escaped explicitly: "\e" is not a valid Python escape
    # sequence and only produced a literal backslash by accident.
    of.write(f"{numties}\\endinput")

## Count the number of reads with supplementary alignments, and with _overlapping_ supplementary alignments

In [3]:
import pysam
from itertools import combinations
from collections import defaultdict

def _ranges_overlap(a, b):
    """Return True if two step-1 range objects share at least one position.

    Equivalent to bool(set(a) & set(b)), but constant-time: it compares the
    ranges' endpoints instead of materializing every position.
    """
    return max(a.start, b.start) < min(a.stop, b.stop)


def _pct(numerator, denominator):
    """Return 100 * numerator / denominator, or 0.0 if denominator is 0."""
    return 100 * (numerator / denominator) if denominator else 0.0


# NOTE(review): bf is never closed in this cell. It is left open on purpose in
# case later cells reuse it; close it if nothing else needs it.
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

# One sentence of LaTeX-ready text per sequence in SEQS.
sentences = []

for seq in SEQS:
    # Read name -> list of reference ranges, one per alignment of that read
    # fetched for this sequence.
    read2refranges = defaultdict(list)
    # Read name -> True once any alignment flagged as supplementary was seen.
    read2atleast_one_supp_seen = defaultdict(bool)
    for read in bf.fetch(seq):
        rn = read.query_name
        read2refranges[rn].append(range(read.reference_start, read.reference_end))
        if read.is_supplementary:
            read2atleast_one_supp_seen[rn] = True

    reads_with_supp_ct = 0
    overlaps = set()
    for r, refranges in read2refranges.items():
        if len(refranges) < 2:
            continue
        # More than one alignment for this read name is counted as "has
        # supplementary alignments".
        reads_with_supp_ct += 1
        # any() stops at the first overlapping pair of alignments.
        if any(_ranges_overlap(a, b) for a, b in combinations(refranges, 2)):
            overlaps.add(r)
        # Also really quick, validate that none of the reads with multiple alignments lack supp alignments.
        # This could happen if we forgot to filter secondary reads or something.
        if not read2atleast_one_supp_seen[r]:
            print(f"Read {r} had no supplementary alignments but still has multiple alignments???")

    numreads = len(read2refranges)
    # _pct() reports 0 instead of raising ZeroDivisionError when no reads were
    # fetched for this sequence.
    pctreadswithsupp = _pct(reads_with_supp_ct, numreads)
    pctreadswithoverlaps = _pct(len(overlaps), numreads)

    sentences.append(
        f"In the {seq2name[seq]} genome, {reads_with_supp_ct:,} / {numreads:,}"
        f" ({pctreadswithsupp:.2f}\\%) unique reads have supplementary alignments."
        f" {len(overlaps):,} / {numreads:,} ({pctreadswithoverlaps:.2f}\\%) unique"
        f" reads with supplementary alignments have overlap between at least one pair of alignments.\n"
    )

# Each sentence already ends with a newline; joining with another newline adds
# the extra line break btwn. adjacent sentences.
outputtext = "\n".join(sentences)

with open("misc-text/overlapping-supp-aln-stats.tex", "w") as of:
    # The backslash is escaped explicitly: "\e" is not a valid Python escape
    # sequence and only produced a literal backslash by accident.
    of.write(outputtext + "\\endinput")