# Perform read smoothing then assemble with LJA

In [1]:
%run "Header.ipynb"

In [2]:
import bisect
import os
import pickle
import time
from collections import defaultdict

import pysam
import skbio

from linked_mutations_utils import find_mutated_positions

## 1. Smooth reads

Lots of this code is duplicated from the `Phasing-01-MakeGraph.ipynb` notebook in this folder.

In [3]:
# Open the BAM file of read alignments that the smoothing loop in this cell iterates over
# (one bf.fetch() call per sequence).
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")
# Directory that the per-sequence "<seq>_smoothed_reads.fasta" files are written to.
output_dir = "phasing-data/smoothed-reads/"

# Verbose mode: when True, the smoothing loop prints a message for every mutated position covered
# by every alignment (plus a per-alignment replacement count). Very noisy; meant for debugging.
no_indoor_voice = False

def write_out_reads(filepath, readname2seq):
    """Append every read in readname2seq to the FASTA file at filepath.

    Each entry is written as a two-line record: a ">" header line holding the
    read name, followed by the sequence (converted with str()) on one line.
    The file is opened in append ("a") mode, so existing content is kept and
    the file is created if it does not exist yet.
    """
    with open(filepath, "a") as out_fasta:
        out_fasta.writelines(
            f">{name}\n{str(sequence)}\n" for name, sequence in readname2seq.items()
        )
            
# Print a progress message every ALN_UPDATE_FREQ alignments, and flush the buffer of smoothed reads
# to disk every ALN_BUFFER_FREQ alignments.
ALN_UPDATE_FREQ = 1000
ALN_BUFFER_FREQ = 1000

# Value of p (reported as a percentage in the log messages below) passed to find_mutated_positions().
P = 1

# write_out_reads() opens files inside this directory, so make sure it exists before we start.
os.makedirs(output_dir, exist_ok=True)

t1 = time.time()
for seq in SEQS:
    fasta = skbio.DNA.read(f"../seqs/{seq}.fasta")

    output_smoothed_reads_file = os.path.join(output_dir, f"{seq}_smoothed_reads.fasta")

    # write_out_reads() *appends* to this file. Start from an empty file so that re-running this cell
    # regenerates the smoothed reads instead of silently duplicating every read from the previous run.
    with open(output_smoothed_reads_file, "w"):
        pass

    # Identify all (0-indexed, so compatible with skbio / pysam!)
    # mutated positions in this genome up front to save time.
    #
    # Equivalently, we could also just take in an arbitrary VCF as input
    # (e.g. one produced from another variant calling tool), although we'd
    # need to be careful to only include SNVs and not indels/etc...

    print(f"Identifying mutated positions (p = {P}%) in genome {seq2name[seq]}...")
    mutated_positions = find_mutated_positions(seq, p_to_use=P)
    print(f"Found {len(mutated_positions):,} mutated positions (p = {P}%) in {seq2name[seq]}.")

    print("Going through these positions...")

    # This should already be implicitly sorted, I think, but the code below (both the binary search and
    # the single-pass "pointer" walk) relies on mutated_positions being in ascending order. So we may as
    # well be paranoid.
    mutated_positions = sorted(mutated_positions)
    num_mutated_positions = len(mutated_positions)

    # Instead of just writing out every smoothed alignment as soon as we generate it, we build up a "buffer"
    # of these alignments and then write a bunch out at once. This way we limit slowdown due to constantly
    # having to open/close files.
    #
    # This maps read name to smoothed alignment sequence. The read name is useful to preserve in FASTA
    # files so we have some idea of provenance (where smoothed reads came from).
    smoothed_aln_buffer = {}

    # The first time we see an alignment of a read, it's 1; if we see a supp aln of this read, it's 2; etc.
    # Lets us give distinct names to different alignments of the same read.
    readname2freq_so_far = defaultdict(int)

    # Go through all linear alignments of each read to this genome.
    ts1 = time.time()
    for ai, aln in enumerate(bf.fetch(seq), 1):

        if ai % ALN_UPDATE_FREQ == 0:
            print(
                f"\tOn aln {ai:,} in seq {seq2name[seq]}. "
                f"Time spent on {seq2name[seq]} so far: {time.time() - ts1:,.2f} sec."
            )

        if aln.is_secondary:
            raise ValueError(
                "Not to get political or anything, but you should've already filtered secondary alns out"
            )

        # Note that supplementary alignments are ok, though! We implicitly handle these here.
        #
        # Different alignments of the same read will have different new_readnames, because we're gonna
        # be treating them as distinct "reads". We should have already filtered reference-overlapping
        # supp alns so this shouldn't be a problem.

        readname = aln.query_name
        readname2freq_so_far[readname] += 1
        new_readname = f"{readname}_{readname2freq_so_far[readname]}"

        # Should never happen, since the per-read counter above makes every new_readname unique.
        if new_readname in smoothed_aln_buffer:
            raise ValueError("This exact read alignment has already been smoothed? Weird.")

        # Figure out where on the MAG this alignment "hits." These are 0-indexed positions from Pysam.
        # (reference_end points to the position after the actual final position, since these are designed to
        # be interoperable with Python's half-open intervals -- so we subtract 1 to get an inclusive end.)
        #
        # Of course, there likely will be indels within this range: we're purposefully ignoring those here.
        ref_start = aln.reference_start
        ref_end = aln.reference_end - 1

        # This should never happen (TM); bail out loudly if the alignment's coordinates look broken.
        if ref_start >= ref_end:
            raise ValueError(
                f"Ref start {ref_start:,} >= ref end {ref_end:,} for read {new_readname}?"
            )

        # Smoothed sequence: starts out as the reference over [ref_start, ref_end]. We'll edit this so that
        # if this read mismatches the reference at any called mutated positions, these positions are updated
        # with the read's aligned nucleotides.
        smoothed_aln_seq = fasta[ref_start: ref_end + 1]

        # Just for debugging: track the exact edits made to smoothed_aln_seq
        # (maps position relative to ref_start --> the read's nucleotide).
        replacements_made = {}

        # mpi is a poor man's "pointer" to an index in mutated_positions. Jump straight to the first
        # mutated position at or after this alignment's start with a binary search, rather than walking
        # there from index 0 for every single alignment.
        mpi = bisect.bisect_left(mutated_positions, ref_start)

        # Iterating through the aligned pairs is expensive, so only bother if at least one mutated position
        # actually falls within [ref_start, ref_end]. (This also covers genomes with no mutated positions.)
        if mpi < num_mutated_positions and mutated_positions[mpi] <= ref_end:

            read_seq = aln.query_sequence

            # Go through this aln's aligned pairs exactly once. As we see each pair, compare the pair's
            # reference position (refpos) to the mpi-th mutated position ("mutpos").
            #
            # If refpos >  mutpos, increment mpi until refpos <= mutpos (stopping as early as possible).
            # If refpos <  mutpos, continue to the next pair.
            # If refpos == mutpos, we have a match! Compare the read to the reference at this position,
            #                      and update smoothed_aln_seq if they differ.
            for readpos, refpos in aln.get_aligned_pairs(matches_only=True):

                # Advance mpi to the next mutated position at or after this pair's reference position
                # (or until we run out of mutated positions).
                while mpi < num_mutated_positions and mutated_positions[mpi] < refpos:
                    mpi += 1

                # No mutated positions at or to the right of this pair: nothing left to do for this aln.
                # I expect this should happen only for reads aligned near the right end of the genome.
                if mpi >= num_mutated_positions:
                    break

                mutpos = mutated_positions[mpi]

                # The next mutated position is past the end of this alignment, so no later pair can hit it.
                if mutpos > ref_end:
                    break

                # If the next mutation occurs after this aligned pair, continue on to a later pair.
                if refpos < mutpos:
                    continue

                # If we've made it here, refpos == mutpos: the loop above guarantees mutpos >= refpos, and
                # we just ruled out mutpos > refpos. Get the nucleotide this read aligns to this position.
                read_nt = read_seq[readpos]
                ref_nt = str(fasta[mutpos])

                # We don't need to do anything if this read already matches the reference MAG here
                if read_nt == ref_nt:
                    if no_indoor_voice:
                        print(f"Read {new_readname} matches ref at mutpos {mutpos + 1:,}: both {read_nt}")
                else:
                    # Record this specific "allele" for this read.
                    relative_pos_on_aln = mutpos - ref_start
                    smoothed_aln_seq = smoothed_aln_seq.replace([relative_pos_on_aln], read_nt)
                    replacements_made[relative_pos_on_aln] = read_nt
                    if no_indoor_voice:
                        print(
                            f"Read {new_readname} mismatches ref at mutpos {mutpos + 1:,}: "
                            f"ref = {ref_nt}, read = {read_nt}"
                        )

        if no_indoor_voice:
            print(f"Read {new_readname} required {len(replacements_made):,} replacements!")

        # Now that we've finished processing all called mutations that this alignment spans, prepare it
        # to be written out to a FASTA file. See comments above on smoothed_aln_buffer, and why we don't
        # just write everything out as soon as it's ready.
        #
        # (Also, we've already guaranteed new_readname isn't already in smoothed_aln_buffer, so no need to
        # worry about accidentally overwriting something from earlier.)
        smoothed_aln_buffer[new_readname] = smoothed_aln_seq

        # Every alignment we get this far with adds exactly one entry to the buffer (nothing is skipped;
        # secondary alignments raise an error above), so each flush writes out ALN_BUFFER_FREQ reads.
        if ai % ALN_BUFFER_FREQ == 0:
            write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)
            # Clear the buffer
            smoothed_aln_buffer = {}

    # We're probably going to have left over smoothed reads that we still haven't written out, unless things
    # worked out so that the final alignment's ai was exactly divisible by ALN_BUFFER_FREQ (and that's
    # pretty unlikely unless you set the buffer freq to a low number). So make one last dump of the buffer.
    if len(smoothed_aln_buffer) > 0:
        write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)

    print(f"Done with {seq}! Took {time.time() - ts1:,.2f} sec.")

print(f"Time taken: {time.time() - t1:,} sec.")

Identifying mutated positions (p = 1%) in genome CAMP...
Found 83 mutated positions (p = 1%) in CAMP.
Going through these positions...
	On aln 1,000 in seq CAMP. Time spent on CAMP so far: 2.18 sec.
	On aln 2,000 in seq CAMP. Time spent on CAMP so far: 4.17 sec.
	On aln 3,000 in seq CAMP. Time spent on CAMP so far: 6.38 sec.
	On aln 4,000 in seq CAMP. Time spent on CAMP so far: 8.37 sec.
	On aln 5,000 in seq CAMP. Time spent on CAMP so far: 10.87 sec.
	On aln 6,000 in seq CAMP. Time spent on CAMP so far: 14.61 sec.
	On aln 7,000 in seq CAMP. Time spent on CAMP so far: 18.28 sec.
	On aln 8,000 in seq CAMP. Time spent on CAMP so far: 21.89 sec.
	On aln 9,000 in seq CAMP. Time spent on CAMP so far: 25.56 sec.
	On aln 10,000 in seq CAMP. Time spent on CAMP so far: 29.30 sec.
	On aln 11,000 in seq CAMP. Time spent on CAMP so far: 32.98 sec.
	On aln 12,000 in seq CAMP. Time spent on CAMP so far: 36.67 sec.
	On aln 13,000 in seq CAMP. Time spent on CAMP so far: 40.30 sec.
	On aln 14,000 in se

	On aln 122,000 in seq CAMP. Time spent on CAMP so far: 464.96 sec.
	On aln 123,000 in seq CAMP. Time spent on CAMP so far: 469.08 sec.
	On aln 124,000 in seq CAMP. Time spent on CAMP so far: 473.22 sec.
	On aln 125,000 in seq CAMP. Time spent on CAMP so far: 477.33 sec.
	On aln 126,000 in seq CAMP. Time spent on CAMP so far: 481.52 sec.
	On aln 127,000 in seq CAMP. Time spent on CAMP so far: 485.72 sec.
	On aln 128,000 in seq CAMP. Time spent on CAMP so far: 489.93 sec.
	On aln 129,000 in seq CAMP. Time spent on CAMP so far: 494.13 sec.
	On aln 130,000 in seq CAMP. Time spent on CAMP so far: 498.30 sec.
	On aln 131,000 in seq CAMP. Time spent on CAMP so far: 502.42 sec.
	On aln 132,000 in seq CAMP. Time spent on CAMP so far: 506.60 sec.
	On aln 133,000 in seq CAMP. Time spent on CAMP so far: 510.75 sec.
	On aln 134,000 in seq CAMP. Time spent on CAMP so far: 514.83 sec.
	On aln 135,000 in seq CAMP. Time spent on CAMP so far: 518.59 sec.
	On aln 136,000 in seq CAMP. Time spent on CAMP 

	On aln 243,000 in seq CAMP. Time spent on CAMP so far: 958.95 sec.
	On aln 244,000 in seq CAMP. Time spent on CAMP so far: 963.02 sec.
	On aln 245,000 in seq CAMP. Time spent on CAMP so far: 967.07 sec.
	On aln 246,000 in seq CAMP. Time spent on CAMP so far: 971.07 sec.
	On aln 247,000 in seq CAMP. Time spent on CAMP so far: 975.02 sec.
	On aln 248,000 in seq CAMP. Time spent on CAMP so far: 979.01 sec.
	On aln 249,000 in seq CAMP. Time spent on CAMP so far: 983.02 sec.
	On aln 250,000 in seq CAMP. Time spent on CAMP so far: 987.08 sec.
	On aln 251,000 in seq CAMP. Time spent on CAMP so far: 991.11 sec.
	On aln 252,000 in seq CAMP. Time spent on CAMP so far: 994.96 sec.
	On aln 253,000 in seq CAMP. Time spent on CAMP so far: 998.79 sec.
	On aln 254,000 in seq CAMP. Time spent on CAMP so far: 1,002.65 sec.
	On aln 255,000 in seq CAMP. Time spent on CAMP so far: 1,006.60 sec.
	On aln 256,000 in seq CAMP. Time spent on CAMP so far: 1,010.59 sec.
	On aln 257,000 in seq CAMP. Time spent on

	On aln 361,000 in seq CAMP. Time spent on CAMP so far: 1,403.90 sec.
	On aln 362,000 in seq CAMP. Time spent on CAMP so far: 1,407.72 sec.
	On aln 363,000 in seq CAMP. Time spent on CAMP so far: 1,411.56 sec.
	On aln 364,000 in seq CAMP. Time spent on CAMP so far: 1,415.36 sec.
	On aln 365,000 in seq CAMP. Time spent on CAMP so far: 1,419.30 sec.
	On aln 366,000 in seq CAMP. Time spent on CAMP so far: 1,423.22 sec.
	On aln 367,000 in seq CAMP. Time spent on CAMP so far: 1,427.17 sec.
	On aln 368,000 in seq CAMP. Time spent on CAMP so far: 1,431.08 sec.
	On aln 369,000 in seq CAMP. Time spent on CAMP so far: 1,434.87 sec.
	On aln 370,000 in seq CAMP. Time spent on CAMP so far: 1,438.68 sec.
	On aln 371,000 in seq CAMP. Time spent on CAMP so far: 1,442.60 sec.
	On aln 372,000 in seq CAMP. Time spent on CAMP so far: 1,446.39 sec.
	On aln 373,000 in seq CAMP. Time spent on CAMP so far: 1,450.26 sec.
	On aln 374,000 in seq CAMP. Time spent on CAMP so far: 1,454.07 sec.
	On aln 375,000 in s

Found 22,144 mutated positions (p = 1%) in BACT1.
Going through these positions...
	On aln 1,000 in seq BACT1. Time spent on BACT1 so far: 2.76 sec.
	On aln 2,000 in seq BACT1. Time spent on BACT1 so far: 7.33 sec.
	On aln 3,000 in seq BACT1. Time spent on BACT1 so far: 12.74 sec.
	On aln 4,000 in seq BACT1. Time spent on BACT1 so far: 17.39 sec.
	On aln 5,000 in seq BACT1. Time spent on BACT1 so far: 22.44 sec.
	On aln 6,000 in seq BACT1. Time spent on BACT1 so far: 27.88 sec.
	On aln 7,000 in seq BACT1. Time spent on BACT1 so far: 33.68 sec.
	On aln 8,000 in seq BACT1. Time spent on BACT1 so far: 39.57 sec.
	On aln 9,000 in seq BACT1. Time spent on BACT1 so far: 45.40 sec.
	On aln 10,000 in seq BACT1. Time spent on BACT1 so far: 52.47 sec.
	On aln 11,000 in seq BACT1. Time spent on BACT1 so far: 59.56 sec.
	On aln 12,000 in seq BACT1. Time spent on BACT1 so far: 66.06 sec.
	On aln 13,000 in seq BACT1. Time spent on BACT1 so far: 73.32 sec.
	On aln 14,000 in seq BACT1. Time spent on B

	On aln 119,000 in seq BACT1. Time spent on BACT1 so far: 866.70 sec.
	On aln 120,000 in seq BACT1. Time spent on BACT1 so far: 875.87 sec.
	On aln 121,000 in seq BACT1. Time spent on BACT1 so far: 883.73 sec.
	On aln 122,000 in seq BACT1. Time spent on BACT1 so far: 895.13 sec.
	On aln 123,000 in seq BACT1. Time spent on BACT1 so far: 904.71 sec.
	On aln 124,000 in seq BACT1. Time spent on BACT1 so far: 913.78 sec.
	On aln 125,000 in seq BACT1. Time spent on BACT1 so far: 923.85 sec.
	On aln 126,000 in seq BACT1. Time spent on BACT1 so far: 932.22 sec.
	On aln 127,000 in seq BACT1. Time spent on BACT1 so far: 942.22 sec.
	On aln 128,000 in seq BACT1. Time spent on BACT1 so far: 956.54 sec.
	On aln 129,000 in seq BACT1. Time spent on BACT1 so far: 966.20 sec.
	On aln 130,000 in seq BACT1. Time spent on BACT1 so far: 974.77 sec.
	On aln 131,000 in seq BACT1. Time spent on BACT1 so far: 983.18 sec.
	On aln 132,000 in seq BACT1. Time spent on BACT1 so far: 990.80 sec.
	On aln 133,000 in s

	On aln 234,000 in seq BACT1. Time spent on BACT1 so far: 2,008.37 sec.
	On aln 235,000 in seq BACT1. Time spent on BACT1 so far: 2,018.80 sec.
	On aln 236,000 in seq BACT1. Time spent on BACT1 so far: 2,029.37 sec.
	On aln 237,000 in seq BACT1. Time spent on BACT1 so far: 2,040.61 sec.
	On aln 238,000 in seq BACT1. Time spent on BACT1 so far: 2,052.01 sec.
	On aln 239,000 in seq BACT1. Time spent on BACT1 so far: 2,064.07 sec.
	On aln 240,000 in seq BACT1. Time spent on BACT1 so far: 2,074.69 sec.
	On aln 241,000 in seq BACT1. Time spent on BACT1 so far: 2,088.37 sec.
	On aln 242,000 in seq BACT1. Time spent on BACT1 so far: 2,104.22 sec.
	On aln 243,000 in seq BACT1. Time spent on BACT1 so far: 2,115.39 sec.
	On aln 244,000 in seq BACT1. Time spent on BACT1 so far: 2,125.68 sec.
	On aln 245,000 in seq BACT1. Time spent on BACT1 so far: 2,136.05 sec.
	On aln 246,000 in seq BACT1. Time spent on BACT1 so far: 2,146.84 sec.
	On aln 247,000 in seq BACT1. Time spent on BACT1 so far: 2,159.

	On aln 87,000 in seq BACT2. Time spent on BACT2 so far: 326.28 sec.
	On aln 88,000 in seq BACT2. Time spent on BACT2 so far: 330.15 sec.
	On aln 89,000 in seq BACT2. Time spent on BACT2 so far: 333.92 sec.
	On aln 90,000 in seq BACT2. Time spent on BACT2 so far: 337.96 sec.
	On aln 91,000 in seq BACT2. Time spent on BACT2 so far: 342.31 sec.
	On aln 92,000 in seq BACT2. Time spent on BACT2 so far: 346.83 sec.
	On aln 93,000 in seq BACT2. Time spent on BACT2 so far: 351.25 sec.
	On aln 94,000 in seq BACT2. Time spent on BACT2 so far: 355.07 sec.
	On aln 95,000 in seq BACT2. Time spent on BACT2 so far: 358.93 sec.
	On aln 96,000 in seq BACT2. Time spent on BACT2 so far: 362.75 sec.
	On aln 97,000 in seq BACT2. Time spent on BACT2 so far: 366.57 sec.
	On aln 98,000 in seq BACT2. Time spent on BACT2 so far: 370.34 sec.
	On aln 99,000 in seq BACT2. Time spent on BACT2 so far: 374.13 sec.
	On aln 100,000 in seq BACT2. Time spent on BACT2 so far: 377.94 sec.
	On aln 101,000 in seq BACT2. Tim

	On aln 205,000 in seq BACT2. Time spent on BACT2 so far: 788.68 sec.
	On aln 206,000 in seq BACT2. Time spent on BACT2 so far: 792.49 sec.
	On aln 207,000 in seq BACT2. Time spent on BACT2 so far: 796.39 sec.
	On aln 208,000 in seq BACT2. Time spent on BACT2 so far: 800.29 sec.
	On aln 209,000 in seq BACT2. Time spent on BACT2 so far: 804.12 sec.
	On aln 210,000 in seq BACT2. Time spent on BACT2 so far: 808.00 sec.
	On aln 211,000 in seq BACT2. Time spent on BACT2 so far: 811.83 sec.
	On aln 212,000 in seq BACT2. Time spent on BACT2 so far: 815.69 sec.
	On aln 213,000 in seq BACT2. Time spent on BACT2 so far: 819.55 sec.
	On aln 214,000 in seq BACT2. Time spent on BACT2 so far: 823.50 sec.
	On aln 215,000 in seq BACT2. Time spent on BACT2 so far: 827.42 sec.
	On aln 216,000 in seq BACT2. Time spent on BACT2 so far: 831.30 sec.
	On aln 217,000 in seq BACT2. Time spent on BACT2 so far: 835.19 sec.
	On aln 218,000 in seq BACT2. Time spent on BACT2 so far: 839.11 sec.
	On aln 219,000 in s

	On aln 321,000 in seq BACT2. Time spent on BACT2 so far: 1,252.53 sec.
	On aln 322,000 in seq BACT2. Time spent on BACT2 so far: 1,256.36 sec.
	On aln 323,000 in seq BACT2. Time spent on BACT2 so far: 1,260.27 sec.
	On aln 324,000 in seq BACT2. Time spent on BACT2 so far: 1,264.27 sec.
	On aln 325,000 in seq BACT2. Time spent on BACT2 so far: 1,268.26 sec.
	On aln 326,000 in seq BACT2. Time spent on BACT2 so far: 1,272.19 sec.
	On aln 327,000 in seq BACT2. Time spent on BACT2 so far: 1,276.13 sec.
	On aln 328,000 in seq BACT2. Time spent on BACT2 so far: 1,280.03 sec.
	On aln 329,000 in seq BACT2. Time spent on BACT2 so far: 1,283.94 sec.
	On aln 330,000 in seq BACT2. Time spent on BACT2 so far: 1,287.76 sec.
	On aln 331,000 in seq BACT2. Time spent on BACT2 so far: 1,291.69 sec.
	On aln 332,000 in seq BACT2. Time spent on BACT2 so far: 1,295.61 sec.
	On aln 333,000 in seq BACT2. Time spent on BACT2 so far: 1,299.47 sec.
	On aln 334,000 in seq BACT2. Time spent on BACT2 so far: 1,303.

	On aln 435,000 in seq BACT2. Time spent on BACT2 so far: 1,704.60 sec.
	On aln 436,000 in seq BACT2. Time spent on BACT2 so far: 1,708.28 sec.
	On aln 437,000 in seq BACT2. Time spent on BACT2 so far: 1,712.06 sec.
	On aln 438,000 in seq BACT2. Time spent on BACT2 so far: 1,715.86 sec.
	On aln 439,000 in seq BACT2. Time spent on BACT2 so far: 1,721.46 sec.
	On aln 440,000 in seq BACT2. Time spent on BACT2 so far: 1,725.12 sec.
	On aln 441,000 in seq BACT2. Time spent on BACT2 so far: 1,728.85 sec.
	On aln 442,000 in seq BACT2. Time spent on BACT2 so far: 1,732.61 sec.
	On aln 443,000 in seq BACT2. Time spent on BACT2 so far: 1,736.38 sec.
	On aln 444,000 in seq BACT2. Time spent on BACT2 so far: 1,740.24 sec.
	On aln 445,000 in seq BACT2. Time spent on BACT2 so far: 1,744.08 sec.
	On aln 446,000 in seq BACT2. Time spent on BACT2 so far: 1,747.90 sec.
	On aln 447,000 in seq BACT2. Time spent on BACT2 so far: 1,751.97 sec.
	On aln 448,000 in seq BACT2. Time spent on BACT2 so far: 1,756.

	On aln 549,000 in seq BACT2. Time spent on BACT2 so far: 2,162.75 sec.
	On aln 550,000 in seq BACT2. Time spent on BACT2 so far: 2,166.86 sec.
	On aln 551,000 in seq BACT2. Time spent on BACT2 so far: 2,170.90 sec.
	On aln 552,000 in seq BACT2. Time spent on BACT2 so far: 2,174.94 sec.
	On aln 553,000 in seq BACT2. Time spent on BACT2 so far: 2,178.82 sec.
	On aln 554,000 in seq BACT2. Time spent on BACT2 so far: 2,182.66 sec.
	On aln 555,000 in seq BACT2. Time spent on BACT2 so far: 2,186.59 sec.
	On aln 556,000 in seq BACT2. Time spent on BACT2 so far: 2,190.48 sec.
	On aln 557,000 in seq BACT2. Time spent on BACT2 so far: 2,194.41 sec.
	On aln 558,000 in seq BACT2. Time spent on BACT2 so far: 2,198.23 sec.
	On aln 559,000 in seq BACT2. Time spent on BACT2 so far: 2,202.22 sec.
	On aln 560,000 in seq BACT2. Time spent on BACT2 so far: 2,206.19 sec.
	On aln 561,000 in seq BACT2. Time spent on BACT2 so far: 2,210.01 sec.
	On aln 562,000 in seq BACT2. Time spent on BACT2 so far: 2,213.

	On aln 663,000 in seq BACT2. Time spent on BACT2 so far: 2,616.97 sec.
	On aln 664,000 in seq BACT2. Time spent on BACT2 so far: 2,621.09 sec.
	On aln 665,000 in seq BACT2. Time spent on BACT2 so far: 2,625.04 sec.
	On aln 666,000 in seq BACT2. Time spent on BACT2 so far: 2,628.94 sec.
	On aln 667,000 in seq BACT2. Time spent on BACT2 so far: 2,632.58 sec.
	On aln 668,000 in seq BACT2. Time spent on BACT2 so far: 2,635.69 sec.
	On aln 669,000 in seq BACT2. Time spent on BACT2 so far: 2,639.00 sec.
	On aln 670,000 in seq BACT2. Time spent on BACT2 so far: 2,642.95 sec.
	On aln 671,000 in seq BACT2. Time spent on BACT2 so far: 2,646.91 sec.
	On aln 672,000 in seq BACT2. Time spent on BACT2 so far: 2,651.00 sec.
	On aln 673,000 in seq BACT2. Time spent on BACT2 so far: 2,654.94 sec.
	On aln 674,000 in seq BACT2. Time spent on BACT2 so far: 2,658.76 sec.
	On aln 675,000 in seq BACT2. Time spent on BACT2 so far: 2,662.48 sec.
	On aln 676,000 in seq BACT2. Time spent on BACT2 so far: 2,666.

## 1.5. Stats about smoothed read lengths

We could have computed these statistics while smoothing the reads above, but I didn't have the foresight to think of this earlier, and I don't want to rerun that step (it takes more than an hour). So, instead, we just quickly loop through the FASTA files we generated above.

In [7]:
# Reads with length less than w + k = (threshold) will be ignored by jumboDBG when constructing
# the graph, so we output stats about this to verify that we're not dropping a TON of reads
# (ofc ideally we wouldn't drop any tho...)
threshold = 7001

for seq in SEQS:
    smoothed_reads_path = f"phasing-data/smoothed-reads/{seq}_smoothed_reads.fasta"
    read_lengths = []

    # Parse a FASTA file -- I stole this code from myself in the Diversity Indices notebook
    with open(smoothed_reads_path, "r") as fastafile:

        # Assumes that sequences are not split up over multiple lines, so a FASTA file with N sequences
        # has exactly 2N lines: headers on even (0-indexed) lines, sequences on odd lines. Anything else
        # (including blank lines) triggers an error.
        for linenum, line in enumerate(fastafile):

            if line.startswith(">"):
                if linenum % 2 != 0:
                    raise ValueError(
                        f"something weird with > location in {smoothed_reads_path} "
                        f"(line {linenum + 1:,}). Go yell at Marcus."
                    )
            else:
                if linenum % 2 != 1:
                    raise ValueError(
                        f"something weird with non > location in {smoothed_reads_path} "
                        f"(line {linenum + 1:,}). Go yell at Marcus."
                    )

                read_lengths.append(len(line.strip()))

    num_reads = len(read_lengths)

    # min() / max() and the percentage below are undefined for an empty file, so report that explicitly
    # instead of crashing with an opaque error.
    if num_reads == 0:
        print(f"{seq2name[seq]}: no smoothed reads found in {smoothed_reads_path}")
        continue

    minlen = min(read_lengths)
    maxlen = max(read_lengths)
    # mean() and median() are not defined in this cell -- presumably they come from Header.ipynb.
    avglen = mean(read_lengths)
    medlen = median(read_lengths)

    geq_threshold = sum(1 for rl in read_lengths if rl >= threshold)
    pct = 100 * (geq_threshold / num_reads)

    print(f"{seq2name[seq]}: {num_reads:,} smoothed reads")
    print(f"\tmin / mean / median / max length = {minlen:,} / {avglen:,.1f} / {medlen:,} / {maxlen:,}")
    print(f"\tNum of reads with length \u2265 {threshold:,}: {geq_threshold:,} / {num_reads:,} ({pct:.1f}%)")

CAMP: 1,429,068 smoothed reads
	min / mean / median / max length = 200 / 11,264.7 / 11,154.0 / 35,099
	Num of reads with length ≥ 7,001: 1,315,002 / 1,429,068 (92.0%)
BACT1: 668,290 smoothed reads
	min / mean / median / max length = 202 / 11,598.7 / 11,517.0 / 39,723
	Num of reads with length ≥ 7,001: 618,287 / 668,290 (92.5%)
BACT2: 1,485,852 smoothed reads
	min / mean / median / max length = 200 / 11,318.0 / 11,259.0 / 40,346
	Num of reads with length ≥ 7,001: 1,346,440 / 1,485,852 (90.6%)


## 2. Run LJA on these smoothed reads

More specifically, we just use the jumboDBG module of LJA, since error-correcting reads should (hopefully) not be required after the smoothing process we just did (remember that this was already based on the metaFlye-assembled MAGs).

We use $k = 5{,}001$ and $w = 2{,}000$ (this is the default window size of jumboDBG as of writing), and also use `--coverage` (since this information could be useful when identifying low-coverage strains).

In [5]:
# Run jumboDBG once per smoothed-reads FASTA file (three files in total), with k = 5,001 and
# --coverage. No window size is passed explicitly; per jumboDBG's own log output below, it then
# ignores all reads shorter than k + w = 7001.
# Each run writes to its own "<edge name>_jumbodbg_k5001" directory next to the smoothed reads.
#
# NOTE(review): the jumboDBG binary is referenced by an absolute, machine-specific path.
!/home/mfedarko/software/LJA/bin/jumboDBG \
    --reads phasing-data/smoothed-reads/edge_6104_smoothed_reads.fasta \
    -k 5001 \
    --coverage \
    --output-dir phasing-data/smoothed-reads/edge_6104_jumbodbg_k5001

!/home/mfedarko/software/LJA/bin/jumboDBG \
    --reads phasing-data/smoothed-reads/edge_1671_smoothed_reads.fasta \
    -k 5001 \
    --coverage \
    --output-dir phasing-data/smoothed-reads/edge_1671_jumbodbg_k5001

!/home/mfedarko/software/LJA/bin/jumboDBG \
    --reads phasing-data/smoothed-reads/edge_2358_smoothed_reads.fasta \
    -k 5001 \
    --coverage \
    --output-dir phasing-data/smoothed-reads/edge_2358_jumbodbg_k5001

00:00:00 395Mb  INFO: Hello! You are running jumboDBG, a tool for construction of de Bruijn graphs for arbitrarily large values of k
00:00:00 395Mb  INFO: Note that jumboDBG does not perform any error correction and ignores all reads shorter than k + w = 7001
00:00:00 0Mb  INFO: Reading reads
00:00:00 0Mb  INFO: Extracting minimizers
00:01:33 4.7Gb  INFO: Finished read processing
00:01:33 4.7Gb  INFO: 7623980 hashs collected. Starting sorting.
00:01:34 4.7Gb  INFO: Finished sorting. Total distinct minimizers: 3563
00:01:34 4.7Gb  INFO: Starting construction of sparse de Bruijn graph
00:01:34 4.7Gb  INFO: Vertex map constructed.
00:01:34 4.7Gb  INFO: Filling edge sequences.
00:03:07 5.6Gb  INFO: Finished sparse de Bruijn graph construction.
00:03:07 5.6Gb  INFO:  Collecting tips 
00:03:07 5.6Gb  INFO: Added 247 artificial minimizers from tips.
00:03:07 5.6Gb  INFO: Collected 7842 old edges.
00:03:07 5.6Gb  INFO: New minimizers added to sparse graph.
00:03:07 5.6Gb  INFO: Refilling graph

In [6]:
# Old approach: use all of LJA:
# 1. jumboDBG     [make the de Bruijn graph]
# 2. mowerDBG     [error correct reads]
# 3. multiplexDBG [create multiplex de Bruijn graph]
# !/home/mfedarko/software/LJA/bin/lja \
#     --reads phasing-data/smoothed-reads/edge_6104_smoothed_reads.fasta \
#     --output-dir phasing-data/smoothed-reads/edge_6104_lja

# !/home/mfedarko/software/LJA/bin/lja \
#     --reads phasing-data/smoothed-reads/edge_1671_smoothed_reads.fasta \
#     --output-dir phasing-data/smoothed-reads/edge_1671_lja

# !/home/mfedarko/software/LJA/bin/lja \
#     --reads phasing-data/smoothed-reads/edge_2358_smoothed_reads.fasta \
#     --output-dir phasing-data/smoothed-reads/edge_2358_lja