# Compute "link graph" for phasing

This'll be continued in the next notebook, `Phasing-02-VizGraph.ipynb`.

In [1]:
%run "Header.ipynb"

In [2]:
import time
import pickle
import pysam
import skbio
import networkx as nx
from collections import defaultdict
from itertools import combinations
from linked_mutations_utils import (
    find_mutated_positions, gen_ddi, MINSPAN, MINLINK_EXCLUSIVE,
    MIN_ALLELE_FREQ_EXCLUSIVE
)

In [3]:
# Nucleotides are stored as small integers (indices into i2n) rather than as one-character strings.
# The memory savings are probably negligible, but humor me.
i2n = "ACGT"
n2i = {nucleotide: index for index, nucleotide in enumerate(i2n)}

## 1. For each read, identify all nucleotides aligned to mutated positions spanned by this read

This takes about 1.8 hours for the three selected genomes. (That said, these genomes have super high coverage, so for less-well-covered genomes this will probably go faster.)

In [4]:
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

t1 = time.time()
for seq in SEQS:
    # Identify all mutated positions in this genome up front to save time.
    print(f"Identifying mutated positions in genome {seq2name[seq]}...")
    mutated_positions = find_mutated_positions(seq)
    print(f"Found {len(mutated_positions):,} mutated positions in {seq2name[seq]}.")
    print("Going through these positions...")

    # The single-pass scan below relies on mutated_positions being in ascending order. It is probably
    # already sorted, but we sort explicitly so that the scan never depends on that assumption.
    mutated_positions = sorted(mutated_positions)
    num_mutated_positions = len(mutated_positions)

    # Maps read name -> {mutated position -> aligned nucleotide (as an integer index into i2n)}.
    # We build this up across all alignments of the genome so that supplementary alignments of the
    # same read contribute to the same inner dict.
    readname2mutpos2nt = defaultdict(dict)

    ts1 = time.time()

    # If this genome has no mutated positions there is nothing to record, so we skip the (expensive)
    # pass over its alignments entirely. We still write out an empty dict below so that the next
    # step finds the file it expects.
    alns = bf.fetch(seq) if num_mutated_positions > 0 else []

    # Go through all linear alignments of each read to this genome...
    for ai, aln in enumerate(alns, 1):
        if ai % 1000 == 0:
            print(
                f"\tOn aln {ai:,} in seq {seq2name[seq]}. "
                f"Time spent on {seq2name[seq]} so far: {time.time() - ts1:,.2f} sec."
            )

        # With matches_only=True, every pair is (read position, reference position) and neither
        # entry is None.
        ap = aln.get_aligned_pairs(matches_only=True)
        readname = aln.query_name
        read_seq = aln.query_sequence

        # Iterating through the aligned pairs is expensive, so we only do it once per alignment.
        # mpi is a "pointer" (an index into mutated_positions) that only ever moves rightwards as
        # the reference positions of the aligned pairs increase.
        mpi = 0

        for readpos, refpos in ap:
            # Advance mpi to the first mutated position at or after refpos (or off the end of the
            # list, if there is no such position).
            while mpi < num_mutated_positions and mutated_positions[mpi] < refpos:
                mpi += 1

            # No mutated positions remain at or to the right of refpos, so nothing later in this
            # alignment can match either. I expect this should happen only for reads aligned near
            # the right end of the genome.
            if mpi == num_mutated_positions:
                break

            mutpos = mutated_positions[mpi]

            # The next mutated position lies after this aligned pair: move on to a later pair.
            if refpos < mutpos:
                continue

            # If we've made it here, refpos == mutpos. This check is just a paranoid guard against
            # a mistake in the logic above.
            if refpos != mutpos:
                raise ValueError("This should never happen!")

            # Record the nucleotide this read has at this mutated position, converted to an integer
            # in the range [0, 3] using n2i. We can use this to link alleles that co-occur on the
            # same read. (A read nucleotide outside of A/C/G/T raises a KeyError here.)
            readname2mutpos2nt[readname][mutpos] = n2i[read_seq[readpos]]

    with open(f"phasing-data/{seq}_readname2mutpos2nt.pickle", "wb") as dumpster:
        pickle.dump(readname2mutpos2nt, dumpster)

print(f"Time taken: {time.time() - t1:,} sec.")

Identifying mutated positions in genome CAMP...
Found 284 mutated positions in CAMP.
Going through these positions...
	On aln 1,000 in seq CAMP. Time spent on CAMP so far: 1.78 sec.
	On aln 2,000 in seq CAMP. Time spent on CAMP so far: 3.41 sec.
	On aln 3,000 in seq CAMP. Time spent on CAMP so far: 5.23 sec.
	On aln 4,000 in seq CAMP. Time spent on CAMP so far: 6.89 sec.
	On aln 5,000 in seq CAMP. Time spent on CAMP so far: 8.96 sec.
	On aln 6,000 in seq CAMP. Time spent on CAMP so far: 12.06 sec.
	On aln 7,000 in seq CAMP. Time spent on CAMP so far: 15.06 sec.
	On aln 8,000 in seq CAMP. Time spent on CAMP so far: 18.08 sec.
	On aln 9,000 in seq CAMP. Time spent on CAMP so far: 21.14 sec.
	On aln 10,000 in seq CAMP. Time spent on CAMP so far: 24.28 sec.
	On aln 11,000 in seq CAMP. Time spent on CAMP so far: 27.33 sec.
	On aln 12,000 in seq CAMP. Time spent on CAMP so far: 30.40 sec.
	On aln 13,000 in seq CAMP. Time spent on CAMP so far: 33.41 sec.
	On aln 14,000 in seq CAMP. Time spent

	On aln 122,000 in seq CAMP. Time spent on CAMP so far: 379.23 sec.
	On aln 123,000 in seq CAMP. Time spent on CAMP so far: 382.45 sec.
	On aln 124,000 in seq CAMP. Time spent on CAMP so far: 385.71 sec.
	On aln 125,000 in seq CAMP. Time spent on CAMP so far: 388.91 sec.
	On aln 126,000 in seq CAMP. Time spent on CAMP so far: 392.13 sec.
	On aln 127,000 in seq CAMP. Time spent on CAMP so far: 395.34 sec.
	On aln 128,000 in seq CAMP. Time spent on CAMP so far: 398.54 sec.
	On aln 129,000 in seq CAMP. Time spent on CAMP so far: 401.76 sec.
	On aln 130,000 in seq CAMP. Time spent on CAMP so far: 404.99 sec.
	On aln 131,000 in seq CAMP. Time spent on CAMP so far: 408.30 sec.
	On aln 132,000 in seq CAMP. Time spent on CAMP so far: 411.56 sec.
	On aln 133,000 in seq CAMP. Time spent on CAMP so far: 414.80 sec.
	On aln 134,000 in seq CAMP. Time spent on CAMP so far: 418.03 sec.
	On aln 135,000 in seq CAMP. Time spent on CAMP so far: 421.29 sec.
	On aln 136,000 in seq CAMP. Time spent on CAMP 

	On aln 243,000 in seq CAMP. Time spent on CAMP so far: 770.91 sec.
	On aln 244,000 in seq CAMP. Time spent on CAMP so far: 774.19 sec.
	On aln 245,000 in seq CAMP. Time spent on CAMP so far: 777.44 sec.
	On aln 246,000 in seq CAMP. Time spent on CAMP so far: 780.75 sec.
	On aln 247,000 in seq CAMP. Time spent on CAMP so far: 783.99 sec.
	On aln 248,000 in seq CAMP. Time spent on CAMP so far: 787.21 sec.
	On aln 249,000 in seq CAMP. Time spent on CAMP so far: 790.47 sec.
	On aln 250,000 in seq CAMP. Time spent on CAMP so far: 793.77 sec.
	On aln 251,000 in seq CAMP. Time spent on CAMP so far: 797.08 sec.
	On aln 252,000 in seq CAMP. Time spent on CAMP so far: 800.28 sec.
	On aln 253,000 in seq CAMP. Time spent on CAMP so far: 803.52 sec.
	On aln 254,000 in seq CAMP. Time spent on CAMP so far: 806.77 sec.
	On aln 255,000 in seq CAMP. Time spent on CAMP so far: 810.06 sec.
	On aln 256,000 in seq CAMP. Time spent on CAMP so far: 813.40 sec.
	On aln 257,000 in seq CAMP. Time spent on CAMP 

	On aln 363,000 in seq CAMP. Time spent on CAMP so far: 1,164.23 sec.
	On aln 364,000 in seq CAMP. Time spent on CAMP so far: 1,167.45 sec.
	On aln 365,000 in seq CAMP. Time spent on CAMP so far: 1,170.80 sec.
	On aln 366,000 in seq CAMP. Time spent on CAMP so far: 1,174.16 sec.
	On aln 367,000 in seq CAMP. Time spent on CAMP so far: 1,177.52 sec.
	On aln 368,000 in seq CAMP. Time spent on CAMP so far: 1,180.87 sec.
	On aln 369,000 in seq CAMP. Time spent on CAMP so far: 1,184.15 sec.
	On aln 370,000 in seq CAMP. Time spent on CAMP so far: 1,187.41 sec.
	On aln 371,000 in seq CAMP. Time spent on CAMP so far: 1,190.79 sec.
	On aln 372,000 in seq CAMP. Time spent on CAMP so far: 1,194.02 sec.
	On aln 373,000 in seq CAMP. Time spent on CAMP so far: 1,197.38 sec.
	On aln 374,000 in seq CAMP. Time spent on CAMP so far: 1,200.63 sec.
	On aln 375,000 in seq CAMP. Time spent on CAMP so far: 1,203.91 sec.
	On aln 376,000 in seq CAMP. Time spent on CAMP so far: 1,207.17 sec.
	On aln 377,000 in s

	On aln 3,000 in seq BACT1. Time spent on BACT1 so far: 7.62 sec.
	On aln 4,000 in seq BACT1. Time spent on BACT1 so far: 10.94 sec.
	On aln 5,000 in seq BACT1. Time spent on BACT1 so far: 14.26 sec.
	On aln 6,000 in seq BACT1. Time spent on BACT1 so far: 17.48 sec.
	On aln 7,000 in seq BACT1. Time spent on BACT1 so far: 20.91 sec.
	On aln 8,000 in seq BACT1. Time spent on BACT1 so far: 24.44 sec.
	On aln 9,000 in seq BACT1. Time spent on BACT1 so far: 28.04 sec.
	On aln 10,000 in seq BACT1. Time spent on BACT1 so far: 31.56 sec.
	On aln 11,000 in seq BACT1. Time spent on BACT1 so far: 35.05 sec.
	On aln 12,000 in seq BACT1. Time spent on BACT1 so far: 38.48 sec.
	On aln 13,000 in seq BACT1. Time spent on BACT1 so far: 41.76 sec.
	On aln 14,000 in seq BACT1. Time spent on BACT1 so far: 45.30 sec.
	On aln 15,000 in seq BACT1. Time spent on BACT1 so far: 48.88 sec.
	On aln 16,000 in seq BACT1. Time spent on BACT1 so far: 52.46 sec.
	On aln 17,000 in seq BACT1. Time spent on BACT1 so far:

	On aln 122,000 in seq BACT1. Time spent on BACT1 so far: 551.86 sec.
	On aln 123,000 in seq BACT1. Time spent on BACT1 so far: 557.55 sec.
	On aln 124,000 in seq BACT1. Time spent on BACT1 so far: 563.36 sec.
	On aln 125,000 in seq BACT1. Time spent on BACT1 so far: 569.11 sec.
	On aln 126,000 in seq BACT1. Time spent on BACT1 so far: 574.83 sec.
	On aln 127,000 in seq BACT1. Time spent on BACT1 so far: 580.64 sec.
	On aln 128,000 in seq BACT1. Time spent on BACT1 so far: 586.63 sec.
	On aln 129,000 in seq BACT1. Time spent on BACT1 so far: 592.34 sec.
	On aln 130,000 in seq BACT1. Time spent on BACT1 so far: 598.08 sec.
	On aln 131,000 in seq BACT1. Time spent on BACT1 so far: 603.96 sec.
	On aln 132,000 in seq BACT1. Time spent on BACT1 so far: 610.07 sec.
	On aln 133,000 in seq BACT1. Time spent on BACT1 so far: 615.90 sec.
	On aln 134,000 in seq BACT1. Time spent on BACT1 so far: 621.46 sec.
	On aln 135,000 in seq BACT1. Time spent on BACT1 so far: 627.41 sec.
	On aln 136,000 in s

	On aln 238,000 in seq BACT1. Time spent on BACT1 so far: 1,334.05 sec.
	On aln 239,000 in seq BACT1. Time spent on BACT1 so far: 1,341.78 sec.
	On aln 240,000 in seq BACT1. Time spent on BACT1 so far: 1,349.59 sec.
	On aln 241,000 in seq BACT1. Time spent on BACT1 so far: 1,357.44 sec.
	On aln 242,000 in seq BACT1. Time spent on BACT1 so far: 1,365.39 sec.
	On aln 243,000 in seq BACT1. Time spent on BACT1 so far: 1,373.48 sec.
	On aln 244,000 in seq BACT1. Time spent on BACT1 so far: 1,381.36 sec.
	On aln 245,000 in seq BACT1. Time spent on BACT1 so far: 1,389.18 sec.
	On aln 246,000 in seq BACT1. Time spent on BACT1 so far: 1,397.10 sec.
	On aln 247,000 in seq BACT1. Time spent on BACT1 so far: 1,405.05 sec.
	On aln 248,000 in seq BACT1. Time spent on BACT1 so far: 1,412.96 sec.
	On aln 249,000 in seq BACT1. Time spent on BACT1 so far: 1,420.95 sec.
	On aln 250,000 in seq BACT1. Time spent on BACT1 so far: 1,428.90 sec.
	On aln 251,000 in seq BACT1. Time spent on BACT1 so far: 1,436.

	On aln 92,000 in seq BACT2. Time spent on BACT2 so far: 289.74 sec.
	On aln 93,000 in seq BACT2. Time spent on BACT2 so far: 292.93 sec.
	On aln 94,000 in seq BACT2. Time spent on BACT2 so far: 296.18 sec.
	On aln 95,000 in seq BACT2. Time spent on BACT2 so far: 299.43 sec.
	On aln 96,000 in seq BACT2. Time spent on BACT2 so far: 302.68 sec.
	On aln 97,000 in seq BACT2. Time spent on BACT2 so far: 305.93 sec.
	On aln 98,000 in seq BACT2. Time spent on BACT2 so far: 309.12 sec.
	On aln 99,000 in seq BACT2. Time spent on BACT2 so far: 312.37 sec.
	On aln 100,000 in seq BACT2. Time spent on BACT2 so far: 315.65 sec.
	On aln 101,000 in seq BACT2. Time spent on BACT2 so far: 318.91 sec.
	On aln 102,000 in seq BACT2. Time spent on BACT2 so far: 322.22 sec.
	On aln 103,000 in seq BACT2. Time spent on BACT2 so far: 325.45 sec.
	On aln 104,000 in seq BACT2. Time spent on BACT2 so far: 328.68 sec.
	On aln 105,000 in seq BACT2. Time spent on BACT2 so far: 331.93 sec.
	On aln 106,000 in seq BACT2

	On aln 210,000 in seq BACT2. Time spent on BACT2 so far: 678.87 sec.
	On aln 211,000 in seq BACT2. Time spent on BACT2 so far: 682.18 sec.
	On aln 212,000 in seq BACT2. Time spent on BACT2 so far: 685.49 sec.
	On aln 213,000 in seq BACT2. Time spent on BACT2 so far: 688.74 sec.
	On aln 214,000 in seq BACT2. Time spent on BACT2 so far: 692.10 sec.
	On aln 215,000 in seq BACT2. Time spent on BACT2 so far: 695.42 sec.
	On aln 216,000 in seq BACT2. Time spent on BACT2 so far: 698.73 sec.
	On aln 217,000 in seq BACT2. Time spent on BACT2 so far: 702.03 sec.
	On aln 218,000 in seq BACT2. Time spent on BACT2 so far: 705.38 sec.
	On aln 219,000 in seq BACT2. Time spent on BACT2 so far: 708.68 sec.
	On aln 220,000 in seq BACT2. Time spent on BACT2 so far: 711.98 sec.
	On aln 221,000 in seq BACT2. Time spent on BACT2 so far: 715.31 sec.
	On aln 222,000 in seq BACT2. Time spent on BACT2 so far: 718.68 sec.
	On aln 223,000 in seq BACT2. Time spent on BACT2 so far: 722.06 sec.
	On aln 224,000 in s

	On aln 327,000 in seq BACT2. Time spent on BACT2 so far: 1,072.75 sec.
	On aln 328,000 in seq BACT2. Time spent on BACT2 so far: 1,076.13 sec.
	On aln 329,000 in seq BACT2. Time spent on BACT2 so far: 1,079.55 sec.
	On aln 330,000 in seq BACT2. Time spent on BACT2 so far: 1,082.90 sec.
	On aln 331,000 in seq BACT2. Time spent on BACT2 so far: 1,086.34 sec.
	On aln 332,000 in seq BACT2. Time spent on BACT2 so far: 1,089.76 sec.
	On aln 333,000 in seq BACT2. Time spent on BACT2 so far: 1,093.12 sec.
	On aln 334,000 in seq BACT2. Time spent on BACT2 so far: 1,096.56 sec.
	On aln 335,000 in seq BACT2. Time spent on BACT2 so far: 1,099.96 sec.
	On aln 336,000 in seq BACT2. Time spent on BACT2 so far: 1,103.37 sec.
	On aln 337,000 in seq BACT2. Time spent on BACT2 so far: 1,106.80 sec.
	On aln 338,000 in seq BACT2. Time spent on BACT2 so far: 1,110.23 sec.
	On aln 339,000 in seq BACT2. Time spent on BACT2 so far: 1,113.63 sec.
	On aln 340,000 in seq BACT2. Time spent on BACT2 so far: 1,117.

	On aln 441,000 in seq BACT2. Time spent on BACT2 so far: 1,465.30 sec.
	On aln 442,000 in seq BACT2. Time spent on BACT2 so far: 1,468.79 sec.
	On aln 443,000 in seq BACT2. Time spent on BACT2 so far: 1,472.21 sec.
	On aln 444,000 in seq BACT2. Time spent on BACT2 so far: 1,475.71 sec.
	On aln 445,000 in seq BACT2. Time spent on BACT2 so far: 1,479.10 sec.
	On aln 446,000 in seq BACT2. Time spent on BACT2 so far: 1,482.51 sec.
	On aln 447,000 in seq BACT2. Time spent on BACT2 so far: 1,485.98 sec.
	On aln 448,000 in seq BACT2. Time spent on BACT2 so far: 1,489.56 sec.
	On aln 449,000 in seq BACT2. Time spent on BACT2 so far: 1,493.02 sec.
	On aln 450,000 in seq BACT2. Time spent on BACT2 so far: 1,496.48 sec.
	On aln 451,000 in seq BACT2. Time spent on BACT2 so far: 1,499.87 sec.
	On aln 452,000 in seq BACT2. Time spent on BACT2 so far: 1,503.36 sec.
	On aln 453,000 in seq BACT2. Time spent on BACT2 so far: 1,506.82 sec.
	On aln 454,000 in seq BACT2. Time spent on BACT2 so far: 1,510.

	On aln 555,000 in seq BACT2. Time spent on BACT2 so far: 1,860.42 sec.
	On aln 556,000 in seq BACT2. Time spent on BACT2 so far: 1,863.90 sec.
	On aln 557,000 in seq BACT2. Time spent on BACT2 so far: 1,867.37 sec.
	On aln 558,000 in seq BACT2. Time spent on BACT2 so far: 1,870.79 sec.
	On aln 559,000 in seq BACT2. Time spent on BACT2 so far: 1,874.23 sec.
	On aln 560,000 in seq BACT2. Time spent on BACT2 so far: 1,877.74 sec.
	On aln 561,000 in seq BACT2. Time spent on BACT2 so far: 1,881.07 sec.
	On aln 562,000 in seq BACT2. Time spent on BACT2 so far: 1,884.55 sec.
	On aln 563,000 in seq BACT2. Time spent on BACT2 so far: 1,888.03 sec.
	On aln 564,000 in seq BACT2. Time spent on BACT2 so far: 1,891.48 sec.
	On aln 565,000 in seq BACT2. Time spent on BACT2 so far: 1,894.90 sec.
	On aln 566,000 in seq BACT2. Time spent on BACT2 so far: 1,898.33 sec.
	On aln 567,000 in seq BACT2. Time spent on BACT2 so far: 1,901.81 sec.
	On aln 568,000 in seq BACT2. Time spent on BACT2 so far: 1,905.

	On aln 669,000 in seq BACT2. Time spent on BACT2 so far: 2,258.87 sec.
	On aln 670,000 in seq BACT2. Time spent on BACT2 so far: 2,262.42 sec.
	On aln 671,000 in seq BACT2. Time spent on BACT2 so far: 2,265.88 sec.
	On aln 672,000 in seq BACT2. Time spent on BACT2 so far: 2,269.47 sec.
	On aln 673,000 in seq BACT2. Time spent on BACT2 so far: 2,272.94 sec.
	On aln 674,000 in seq BACT2. Time spent on BACT2 so far: 2,276.29 sec.
	On aln 675,000 in seq BACT2. Time spent on BACT2 so far: 2,279.54 sec.
	On aln 676,000 in seq BACT2. Time spent on BACT2 so far: 2,283.23 sec.
	On aln 677,000 in seq BACT2. Time spent on BACT2 so far: 2,286.73 sec.
	On aln 678,000 in seq BACT2. Time spent on BACT2 so far: 2,290.22 sec.
	On aln 679,000 in seq BACT2. Time spent on BACT2 so far: 2,293.41 sec.
	On aln 680,000 in seq BACT2. Time spent on BACT2 so far: 2,296.88 sec.
	On aln 681,000 in seq BACT2. Time spent on BACT2 so far: 2,300.44 sec.
	On aln 682,000 in seq BACT2. Time spent on BACT2 so far: 2,303.

## 2. Compute frequency information for individual nucleotides and pairs of nucleotides at mutated positions

We could use Hansel to store the pairs-of-nucleotides data as a matrix, but I opted to use a custom solution (for now, at least) for a few reasons:

1. Don't need anything fancy -- just need to store this, not use the probabilistic weighting stuff
2. I don't have time right now to learn Hansel's API (I've read through the docs and am still a bit confused)
3. I think we could probably use less storage (e.g. we only need to store one "triangle" of the matrix; as far as I can tell, Hansel treats H\[a, b, i, j\] as independent of H\[b, a, j, i\], which isn't necessary for haplotyping IMO)

In [5]:
t1 = time.time()
for seq in SEQS:
    with open(f"phasing-data/{seq}_readname2mutpos2nt.pickle", "rb") as loadster:
        # NOTE: reads that don't cover any mutated position never get an entry in this dict, so it
        # does not necessarily describe ALL reads aligned to this sequence. That's fine -- such reads
        # can't help us link mutated positions together.
        readname2mutpos2nt = pickle.load(loadster)
        print(f"{len(readname2mutpos2nt):,} unique reads described in the data for seq {seq2name[seq]}.")

    ts1 = time.time()

    # pos2nt2freq: mutated position -> nucleotide index -> number of reads with that nucleotide at
    # that position. This corresponds to Reads(i, N) as described in the paper.
    pos2nt2freq = defaultdict(gen_ddi)

    # pospair2ntpair2freq has two levels:
    #
    # OUTER keys are (i, j) tuples of 0-indexed mutated positions with i < j. A pair is only present
    #       if at least one read covers both positions.
    #
    # INNER keys are (i_nt, j_nt) tuples of integers in the range [0, 3] (0 -> A, 1 -> C, 2 -> G,
    #       3 -> T): the nucleotide a read had at position i and at position j, respectively. The
    #       value is the number of reads on which exactly this pair of nucleotides was observed.
    #       There are 4^2 = 16 possible nucleotide pairs (ignoring deletions, degenerate nucleotides,
    #       etc.), although in practice only a handful are expected for any given position pair.
    #       (Many bacterial genomes are circular, so which position counts as "first" is somewhat
    #       arbitrary -- we just go by coordinate.)
    #
    # Example: a genome with two mutated positions (0-indexed 100 and 500), where we saw
    #
    # - 30    reads with an A at both positions
    # - 1,000 reads with an A at position 100 and a T at position 500
    # - 5     reads with a T at position 100 and an A at position 500
    # - 100   reads with a T at both positions
    # - 3     reads with a C at position 100 and a T at position 500
    # - 1     read  with a G at position 100 and a T at position 500
    #
    # would give
    # {
    #     (100, 500): {
    #         (0, 0): 30,
    #         (0, 3): 1000,
    #         (3, 0): 5,
    #         (3, 3): 100,
    #         (1, 3): 3,
    #         (2, 3): 1
    #     }
    # }
    pospair2ntpair2freq = defaultdict(gen_ddi)

    # By this point every alignment of every read has been folded into readname2mutpos2nt, so each
    # read can be processed independently.
    for ri, (readname, mutpos2nt) in enumerate(readname2mutpos2nt.items(), 1):
        if ri % 100000 == 0:
            print(
                f"\tOn read {ri:,} in seq {seq2name[seq]}. "
                f"Time spent on {seq2name[seq]} so far: {time.time() - ts1:,.2f} sec."
            )

        # TODO: sorting once per read may be avoidable.
        covered_positions = sorted(mutpos2nt)

        # Single-position counts. This is kept as its own loop (rather than being folded into the
        # pair loop below) so that each position is counted exactly once per read.
        for mutpos in covered_positions:
            pos2nt2freq[mutpos][mutpos2nt[mutpos]] += 1

        # Pair counts. combinations() preserves the order of its input, so every pair comes out as
        # (smaller position, larger position); see
        # https://docs.python.org/3.10/library/itertools.html#itertools.combinations.
        for (i, j) in combinations(covered_positions, 2):
            # Guaranteed by the above, but let's be paranoid just in case:
            if not (i < j):
                raise ValueError("Something went horribly wrong with combinations()")

            # Both positions were observed on this one read, and we know which nucleotide (as an
            # integer in the range [0, 3]) the read had at each of them.
            ntpair = (mutpos2nt[i], mutpos2nt[j])
            pospair2ntpair2freq[(i, j)][ntpair] += 1

    print(f"Finished going through reads in {seq2name[seq]}.")

    # The ".pickle" suffix and binary mode follow the conventions described in
    # https://stackoverflow.com/a/40433504 (which in turn just reference the python docs).
    with open(f"phasing-data/{seq}_pospair2ntpair2freq.pickle", "wb") as dumpster:
        pickle.dump(pospair2ntpair2freq, dumpster)

    with open(f"phasing-data/{seq}_pos2nt2freq.pickle", "wb") as dumpster:
        pickle.dump(pos2nt2freq, dumpster)

print(f"Time taken: {time.time() - t1:,} sec.")

348,613 unique reads described in the data for seq CAMP.
	On read 100,000 in seq CAMP. Time spent on CAMP so far: 0.32 sec.
	On read 200,000 in seq CAMP. Time spent on CAMP so far: 0.66 sec.
	On read 300,000 in seq CAMP. Time spent on CAMP so far: 2.57 sec.
Finished going through reads in CAMP.
257,428 unique reads described in the data for seq BACT1.
	On read 100,000 in seq BACT1. Time spent on BACT1 so far: 743.90 sec.
	On read 200,000 in seq BACT1. Time spent on BACT1 so far: 1,831.70 sec.
Finished going through reads in BACT1.
700,066 unique reads described in the data for seq BACT2.
	On read 100,000 in seq BACT2. Time spent on BACT2 so far: 5.73 sec.
	On read 200,000 in seq BACT2. Time spent on BACT2 so far: 7.59 sec.
	On read 300,000 in seq BACT2. Time spent on BACT2 so far: 8.72 sec.
	On read 400,000 in seq BACT2. Time spent on BACT2 so far: 9.75 sec.
	On read 500,000 in seq BACT2. Time spent on BACT2 so far: 10.70 sec.
	On read 600,000 in seq BACT2. Time spent on BACT2 so far: 

## 3. Convert position pair + nucleotide pair information to a graph structure

We now know, for every pair of positions spanned by at least one read, the frequencies of nucleotide pairs seen together at these positions.

We can now construct a graph where nodes represent _alleles_ (position + nucleotide seen at this position), and edges connect alleles seen together.

We only connect two allele nodes if (ignoring exact nucleotides) at least _minSpan_ reads cover both positions, and the __link__ between two allele nodes (defined in the paper) is strictly greater than _minLink_. These parameters' values are given in `linked_mutations_utils.py`.

In [6]:
t1 = time.time()
for seq in SEQS:
    print(f"Generating link graph for seq {seq}...")
    with open(f"phasing-data/{seq}_pos2nt2freq.pickle", "rb") as loadster:
        pos2nt2freq = pickle.load(loadster)

    g = nx.Graph()

    # Add nodes to the graph -- one per sufficiently-frequent nucleotide seen at each mutated position.
    for pos, nt2freq in pos2nt2freq.items():

        # Total number of reads with any recorded nucleotide at this position.
        pos_cov = sum(nt2freq.values())

        # Only nucleotides that were actually seen at this position have entries here
        # (nucleotides are integers in the range [0, 3]).
        for nt, freq in nt2freq.items():
            # freq is the number of times this nucleotide was seen at this position in the reads.
            # This corresponds to Reads(i, N) for position i and nt N.
            if freq > MIN_ALLELE_FREQ_EXCLUSIVE:
                # "relfreq" is freq as a fraction of all reads recorded at this position, so we can
                # see what percentage of reads at this position had a given nucleotide.
                g.add_node((pos, nt), freq=freq, relfreq=(freq / pos_cov))

    # Next step: add edges to the graph based on co-occurrence information
    with open(f"phasing-data/{seq}_pospair2ntpair2freq.pickle", "rb") as loadster:
        pospair2ntpair2freq = pickle.load(loadster)

    for (i, j), ntpair2freq in pospair2ntpair2freq.items():

        # Number of reads for which a nucleotide was recorded at both positions. (Reads with
        # skips/indels at either position, etc. are not included in this data.)
        num_spanning_reads = sum(ntpair2freq.values())

        # Require AT LEAST MINSPAN spanning reads. Unlike the two *_EXCLUSIVE thresholds used in
        # this cell, MINSPAN is an inclusive lower bound, so a pair of positions spanned by exactly
        # MINSPAN reads is kept.
        # NOTE(review): this was previously a strict ">" comparison -- confirm against the
        # definition of MINSPAN in linked_mutations_utils.py.
        if num_spanning_reads < MINSPAN:
            continue

        # i_nt and j_nt are still ints in the range [0, 3]
        for (i_nt, j_nt), ntpair_freq in ntpair2freq.items():
            # If one or both of the nodes failed the Reads(i, N) check above due to
            # MIN_ALLELE_FREQ_EXCLUSIVE, definitely don't create an edge adjacent to them!
            if g.has_node((i, i_nt)) and g.has_node((j, j_nt)):
                # link = (# reads with both alleles) / (frequency of the more common allele)
                link = ntpair_freq / max(pos2nt2freq[i][i_nt], pos2nt2freq[j][j_nt])
                if link > MINLINK_EXCLUSIVE:
                    # Yay, add an edge between these alleles!
                    g.add_edge((i, i_nt), (j, j_nt), link=link)

    with open(f"phasing-data/{seq}_linkgraph.pickle", "wb") as dumpster:
        pickle.dump(g, dumpster)

print(f"Time taken: {time.time() - t1:,} sec.")

Generating link graph for seq edge_6104...
Generating link graph for seq edge_1671...
Generating link graph for seq edge_2358...
Time taken: 75.77433681488037 sec.
