# Plot various distributions of mutated position information

In [2]:
%run "Header.ipynb"
import pickle
from linked_mutations_utils import find_mutated_positions

## Compute info

In [None]:
# Each of the three dicts below maps a seq to a list holding one value per pair of
# consecutive mutated positions. Every visited pair appends exactly one value to each
# list, so for a given seq the three lists are index-aligned.

# Describes distances between positions.
seq2dists = {}

# Describes number of reads supporting the most common nucleotide pair at these positions
# (0 if the two positions were never seen together on a read).
seq2reads = {}

# Describes link value of this most common nucleotide pair at these positions
# (0 if the two positions were never seen together on a read).
seq2links = {}

# Seqs treated as circular: for these, the last mutated position is also paired with the
# first one by looping around the genome. Edge 6104 is excluded from this because it's
# just a linear sequence -- its full "genome" likely has not been completely assembled.
CIRCULAR_SEQS = ("edge_2358", "edge_1671")

for seq in SEQS:
    print(f"Computing distribution info for seq {seq2name[seq]}...")

    print(f"\tFinding mutated positions for seq {seq2name[seq]}...", end=" ", flush=True)
    mutated_positions = find_mutated_positions(seq)
    print("Done.")

    # NOTE(review): these pickles are presumably written by the earlier phasing notebook;
    # pickle.load() must only ever be pointed at trusted files.
    with open(f"phasing-data/{seq}_pospair2ntpair2freq.pickle", "rb") as loadster:
        pospair2ntpair2freq = pickle.load(loadster)

    with open(f"phasing-data/{seq}_pos2nt2freq.pickle", "rb") as loadster:
        pos2nt2freq = pickle.load(loadster)

    seq2dists[seq] = []
    seq2reads[seq] = []
    seq2links[seq] = []

    print(f"\tLooking through pairs of consecutive mutated positions in seq {seq2name[seq]}...")

    last_mpi = len(mutated_positions) - 1
    for mpi, left_mutpos in enumerate(mutated_positions):
        if mpi < last_mpi:
            # Ordinary case: pair this mutated position with the next one.
            right_mutpos = mutated_positions[mpi + 1]
            pospair = (left_mutpos, right_mutpos)
            dist = right_mutpos - left_mutpos
        elif seq in CIRCULAR_SEQS:
            # Last mutated position of a circular seq: loop around the genome.
            #
            # Note that although the reads here probably go from
            # left_mutpos -> mutated_positions[0], we sort these positions the other way
            # in this pair because the pairs are stored in sorted order.
            pospair = (mutated_positions[0], left_mutpos)

            # NOTE: mutated_positions, as with pospair2ntpair2freq, uses 0-indexed
            # positions. That shouldn't matter for a distance, since any indexing offset
            # cancels out in the subtraction.
            #
            # Sanity check: if left_mutpos is the very last position in the genome
            # (length - 1) and mutated_positions[0] is the very first (0), the distance
            # is (length + 0 - (length - 1)) = 1.
            dist = seq2len[seq] + mutated_positions[0] - left_mutpos
        else:
            # Last mutated position of a linear seq: there is no next position and no
            # wrap-around, so there is no pair to record. (Indexing
            # mutated_positions[mpi + 1] here would raise an IndexError.)
            break

        seq2dists[seq].append(dist)

        ntpair2freq = pospair2ntpair2freq.get(pospair)
        if ntpair2freq is not None:
            # Most frequently observed nucleotide pair at these two positions.
            max_freq_nt_pair = max(ntpair2freq, key=ntpair2freq.get)
            pair_reads = ntpair2freq[max_freq_nt_pair]
            seq2reads[seq].append(pair_reads)

            allele_1_freq = pos2nt2freq[pospair[0]][max_freq_nt_pair[0]]
            allele_2_freq = pos2nt2freq[pospair[1]][max_freq_nt_pair[1]]
            # Computation is done the same way as when adding link data to the graph in
            # the previous phasing notebook. The denominator should never be zero: that
            # would imply neither of these alleles was observed in a read, yet this
            # nucleotide pair has an entry in pospair2ntpair2freq.
            link = pair_reads / max(allele_1_freq, allele_2_freq)
            seq2links[seq].append(link)
        else:
            # This pair was never seen together on the same read :(
            seq2reads[seq].append(0)
            seq2links[seq].append(0)

# Save each of the three per-seq summaries to its own pickle file under phasing-data/,
# named after the dict it holds.
for out_name, seq2vals in (
    ("seq2dists", seq2dists),
    ("seq2reads", seq2reads),
    ("seq2links", seq2links),
):
    with open(f"phasing-data/{out_name}.pickle", "wb") as dumpster:
        pickle.dump(seq2vals, dumpster)

Computing distribution info for seq CAMP...
	Finding mutated positions for seq CAMP... 