# Perform read smoothing then assemble with LJA

In [3]:
# Run the shared setup notebook in this kernel's namespace. This notebook uses several names it never
# defines itself (SEQS, seq2name, mean, median) -- presumably they come from Header.ipynb; TODO confirm.
%run "Header.ipynb"

In [4]:
import os
import time
import pickle
import pysam
import skbio
from collections import defaultdict
from linked_mutations_utils import find_mutated_positions

## 1. Smooth reads

Lots of this code is duplicated from the `Phasing-01-MakeGraph.ipynb` notebook in this folder.

In [3]:
# Alignment file that the smoothing loop below iterates over (opened read-only, in binary/BAM mode).
# NOTE(review): the loop calls bf.fetch(seq), which presumably requires the BAM to be indexed -- confirm.
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

# Directory that the per-genome "{seq}_smoothed_reads.fasta" files are written into.
output_dir = "phasing-data/smoothed-reads/"
# Make sure the directory exists: the FASTA files are opened in append mode, which creates a missing
# file but not a missing parent directory (that would raise FileNotFoundError on a fresh checkout).
os.makedirs(output_dir, exist_ok=True)

# verbose? If True, the smoothing loop prints a message for every mutated position of every alignment.
no_indoor_voice = False

def write_out_reads(filepath, readname2seq):
    """Append a batch of reads to a FASTA file.

    Parameters
    ----------
    filepath: str
        Path of the FASTA file to add to. It is opened in "a" (append) mode, so it is created if it
        does not exist yet and any existing contents are kept.

    readname2seq: dict
        Maps each read name to its sequence (anything str() can convert). Every entry becomes two
        lines in the file: a ">{read name}" header line, then the sequence on a single line.
    """
    with open(filepath, "a") as fasta_handle:
        fasta_handle.writelines(
            f">{name}\n{str(sequence)}\n" for name, sequence in readname2seq.items()
        )
            
# How often (in number of alignments seen) to print a progress message
ALN_UPDATE_FREQ = 1000
# How often (in number of alignments seen) to flush the buffer of smoothed reads to the output FASTA file
ALN_BUFFER_FREQ = 1000

t1 = time.time()
for seq in SEQS:
    fasta = skbio.DNA.read(f"../seqs/{seq}.fasta")

    output_smoothed_reads_file = os.path.join(output_dir, f"{seq}_smoothed_reads.fasta")

    # write_out_reads() only ever appends. If a FASTA file from an earlier run of this cell is still around,
    # then appending to it would silently duplicate every smoothed read -- so start from a clean slate.
    if os.path.exists(output_smoothed_reads_file):
        os.remove(output_smoothed_reads_file)

    # Identify all (0-indexed, so compatible with skbio / pysam!)
    # mutated positions in this genome up front to save time.
    #
    # Equivalently, we could also just take in an arbitrary VCF as input
    # (e.g. one produced from another variant calling tool), although we'd
    # need to be careful to only include SNVs and not indels/etc...

    print(f"Identifying mutated positions in genome {seq2name[seq]}...")
    mutated_positions = find_mutated_positions(seq)
    print(f"Found {len(mutated_positions):,} mutated positions in {seq2name[seq]}.")

    print("Going through these positions...")

    # This should already be implicitly sorted, I think, but the code below relies on mutated_positions being
    # in ascending order. So we may as well be paranoid.
    mutated_positions = sorted(mutated_positions)
    num_mutated_positions = len(mutated_positions)

    # Instead of just writing out every smoothed alignment as soon as we generate it, we build up a "buffer"
    # of these alignments and then write a bunch out at once. This way we limit slowdown due to constantly
    # having to open/close files. I don't really have a good source for this as best practice, but I remembered
    # to do it while writing this code, so somewhere in College Park the CS faculty at Maryland are smiling
    #
    # Also fyi this maps read name to smoothed alignment (well, at this point, just read) sequence. The read name
    # is useful to preserve in fasta files so we have some idea of provenance (where smoothed reads came from)
    smoothed_aln_buffer = {}

    # The first time we see an alignment of a read, it's 1; if we see a supp aln of this read, it's 2; etc.
    # Lets us distinguish alignments with different names
    readname2freq_so_far = defaultdict(int)

    # Go through all linear alignments of each read to this genome. Primary and supplementary alignments
    # are both processed (each one becomes its own smoothed "read"); secondary alignments are an error.
    ts1 = time.time()
    for ai, aln in enumerate(bf.fetch(seq), 1):

        if ai % ALN_UPDATE_FREQ == 0:
            print(
                f"\tOn aln {ai:,} in seq {seq2name[seq]}. "
                f"Time spent on {seq2name[seq]} so far: {time.time() - ts1:,.2f} sec."
            )

        if aln.is_secondary:
            raise ValueError(
                "Not to get political or anything, but you should've already filtered secondary alns out"
            )

        # Note that supplementary alignments are ok, though! We implicitly handle these here.
        #
        # Different alignments of the same read will have different new_readnames, because we're gonna
        # be treating them as distinct "reads". We should have already filtered reference-overlapping
        # supp alns so this shouldn't be a problem

        readname = aln.query_name
        readname2freq_so_far[readname] += 1
        new_readname = f"{readname}_{readname2freq_so_far[readname]}"

        # should never happen
        if new_readname in smoothed_aln_buffer:
            raise ValueError("This exact read alignment has already been smoothed? Weird.")

        # Figure out where on the MAG this alignment "hits." These are 0-indexed positions from Pysam.
        # (reference_end points to the position after the actual final position, since these are designed to
        # be interoperable with Python's half-open intervals.)
        #
        # Of course, there likely will be indels within this range: we're purposefully ignoring those here.
        ref_start = aln.reference_start
        ref_end = aln.reference_end - 1

        # This should never happen (TM)
        if ref_start >= ref_end:
            raise ValueError(
                f"Ref start {ref_start:,} >= ref end {ref_end:,} for read {new_readname}?"
            )

        # Smoothed sequence; we'll edit this so that if this read has (mis)matches to any called mutated
        # positions, these positions are updated with the read's aligned nucleotides at these positions.
        smoothed_aln_seq = fasta[ref_start: ref_end + 1]

        # just for debugging: track the exact edits made to smoothed_aln_seq
        replacements_made = {}

        ap = aln.get_aligned_pairs(matches_only=True)

        # Iterating through the aligned pairs is expensive. Since read lengths are generally in the thousands
        # to tens of thousands of bp (which is much less than the > 1 million bp length of any bacterial genome),
        # we set things up so that we only iterate through the aligned pairs once. We maintain an integer, mpi,
        # that is a poor man's "pointer" to an index in mutated_positions.

        mpi = 0

        # Go through this aln's aligned pairs. As we see each pair, compare the pair's reference position
        # (refpos) to the mpi-th mutated position (herein referred to as "mutpos").
        #
        # If refpos >  mutpos, increment mpi until refpos <= mutpos (stopping as early as possible).
        # If refpos == mutpos, we have a match! Update smoothed_aln_seq at this position if the read's
        #                      nucleotide differs from the reference's.
        # If refpos <  mutpos, continue to the next pair.

        for pair in ap:

            refpos = pair[1]

            # Increment mpi until we get to the next mutated position at or after the reference pos for this
            # aligned pair (or until we run out of mutated positions). Checking the bound *before* indexing
            # also covers genomes with no mutated positions at all (where mutated_positions[0] would fail).
            while mpi < num_mutated_positions and mutated_positions[mpi] < refpos:
                mpi += 1

            # No mutated positions at or to the right of here, so we're done with this alignment.
            # I expect this should happen only for reads aligned near the right end of the genome
            # (or for genomes without any mutated positions).
            if mpi >= num_mutated_positions:
                break

            mutpos = mutated_positions[mpi]

            # If the next mutation occurs after this aligned pair, continue on to a later pair.
            if refpos < mutpos:
                continue

            # If we've made it here, refpos == mutpos!
            # (...unless I messed something up in how I designed this code.)
            if refpos != mutpos:
                raise ValueError("This should never happen!")

            # Finally, get the nucleotide aligned to this mutated position from this read.
            readpos = pair[0]
            read_nt = aln.query_sequence[readpos]

            ref_nt = str(fasta[mutpos])
            # We don't need to do anything if this read already matches the reference MAG at this position
            if read_nt == ref_nt:
                if no_indoor_voice:
                    print(f"Read {new_readname} matches ref at mutpos {mutpos + 1:,}: both {read_nt}")
            else:
                # Record this specific "allele" for this read. smoothed_aln_seq starts at ref_start, so
                # positions within it are offset by that much from positions on the full reference.
                relative_pos_on_aln = mutpos - ref_start
                smoothed_aln_seq = smoothed_aln_seq.replace([relative_pos_on_aln], read_nt)
                replacements_made[relative_pos_on_aln] = read_nt
                if no_indoor_voice:
                    print(
                        f"Read {new_readname} mismatches ref at mutpos {mutpos + 1:,}: "
                        f"ref = {ref_nt}, read = {read_nt}"
                    )

        if no_indoor_voice:
            print(f"Read {new_readname} required {len(replacements_made):,} replacements!")

        # Now that we've finished processing all called mutations that this alignment spans, prepare it
        # to be written out to a FASTA file. See comments above on smoothed_aln_buffer, and why we don't
        # just write everything out as soon as it's ready.
        #
        # (Also, we've already guaranteed new_readname isn't already in smoothed_aln_buffer, so no need to
        # worry about accidentally overwriting something from earlier.)
        smoothed_aln_buffer[new_readname] = smoothed_aln_seq

        # Every alignment we see either raises an error or gets added to the buffer, so each of these
        # flushes writes out ALN_BUFFER_FREQ smoothed reads.
        if ai % ALN_BUFFER_FREQ == 0:
            write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)
            # Clear the buffer
            smoothed_aln_buffer = {}

    # We're probably going to have left over smoothed reads that we still haven't written out, unless things
    # worked out so that on the final alignment we saw ai was exactly divisible by ALN_BUFFER_FREQ (and that's
    # pretty unlikely unless you set the buffer freq to a low number). So make one last dump of the buffer.
    if len(smoothed_aln_buffer) > 0:
        write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)

    print(f"Done with {seq}! Took {time.time() - ts1:,.2f} sec.")

print(f"Time taken: {time.time() - t1:,} sec.")

Identifying mutated positions in genome CAMP...
Found 284 mutated positions in CAMP.
Going through these positions...
	On aln 1,000 in seq CAMP. Time spent on CAMP so far: 1.73 sec.
	On aln 2,000 in seq CAMP. Time spent on CAMP so far: 3.31 sec.
	On aln 3,000 in seq CAMP. Time spent on CAMP so far: 5.09 sec.
	On aln 4,000 in seq CAMP. Time spent on CAMP so far: 6.67 sec.
	On aln 5,000 in seq CAMP. Time spent on CAMP so far: 8.66 sec.
	On aln 6,000 in seq CAMP. Time spent on CAMP so far: 11.59 sec.
	On aln 7,000 in seq CAMP. Time spent on CAMP so far: 14.43 sec.
	On aln 8,000 in seq CAMP. Time spent on CAMP so far: 17.31 sec.
	On aln 9,000 in seq CAMP. Time spent on CAMP so far: 20.22 sec.
	On aln 10,000 in seq CAMP. Time spent on CAMP so far: 23.20 sec.
	On aln 11,000 in seq CAMP. Time spent on CAMP so far: 26.11 sec.
	On aln 12,000 in seq CAMP. Time spent on CAMP so far: 29.03 sec.
	On aln 13,000 in seq CAMP. Time spent on CAMP so far: 31.90 sec.
	On aln 14,000 in seq CAMP. Time spent

## 1.5. Stats about smoothed read lengths

We could have just computed these stats during the smoothing step above, but I didn't have the foresight to think of this earlier, and I don't want to rerun that step for another >1 hour. So, instead, we quickly loop through the FASTA files we just generated.

In [12]:
# For each genome: read back the smoothed-reads FASTA file, collect the length of every read, and print
# summary stats (including how many reads are long enough for jumboDBG to use).
for seq in SEQS:
    read_lengths = []
    fasta_path = f"phasing-data/smoothed-reads/{seq}_smoothed_reads.fasta"

    # Parse a FASTA file -- I stole this code from myself in the Diversity Indices notebook
    with open(fasta_path, "r") as fastafile:

        # Assumes that sequences are not split up over multiple lines (so a FASTA file with N sequences
        # should have only 2N lines, maybe 2N + 1 if there's an extra empty newline at the bottom of the file)
        for linenum, line in enumerate(fastafile):

            if linenum % 2 == 0:
                # 0-indexed even lines should be headers. The one exception we allow is a blank line here,
                # to tolerate an extra empty line at the bottom of the file. (If anything comes after a
                # blank line, it'll be off by one from where we expect it and trigger an error below.)
                if not line.strip():
                    continue
                if not line.startswith(">"):
                    raise ValueError(
                        f"something weird with non > location in {fasta_path} "
                        f"(line {linenum + 1:,}). Go yell at Marcus."
                    )
            else:
                # 0-indexed odd lines should be sequences
                if line.startswith(">"):
                    raise ValueError(
                        f"something weird with > location in {fasta_path} "
                        f"(line {linenum + 1:,}). Go yell at Marcus."
                    )

                read_lengths.append(len(line.strip()))

    # Fail with a clear message, rather than with an opaque error from min() or a division by zero below
    if len(read_lengths) == 0:
        raise ValueError(f"No reads found in {fasta_path}?")

    num_reads = len(read_lengths)
    minlen = min(read_lengths)
    maxlen = max(read_lengths)
    avglen = mean(read_lengths)
    medlen = median(read_lengths)

    # We'll use a k-mer size of 1,001 for jumboDBG, and the default window size (w) is 2,000. Reads with length
    # less than w + k = 3,001 will be ignored by jumboDBG when constructing the graph, so we output stats about
    # this to verify that we're not dropping a TON of reads (ideally we wouldn't drop any tho...)
    threshold = 3001
    geq_threshold = len([rl for rl in read_lengths if rl >= threshold])
    pct = 100 * (geq_threshold / num_reads)

    print(f"{seq2name[seq]}: {num_reads:,} smoothed reads")
    print(f"\tmin / mean / median / max length = {minlen:,} / {avglen:,.1f} / {medlen:,} / {maxlen:,}")
    print(f"\tNum of reads with length \u2265 {threshold:,}: {geq_threshold:,} / {num_reads:,} ({pct:.1f}%)")

CAMP: 476,356 smoothed reads
	min / mean / median / max length = 200 / 11,264.7 / 11,154.0 / 35,099
	Num of reads with length ≥ 3,001: 472,064 / 476,356 (99.1%)
BACT1: 263,145 smoothed reads
	min / mean / median / max length = 202 / 11,597.6 / 11,515 / 39,723
	Num of reads with length ≥ 3,001: 260,087 / 263,145 (98.8%)
BACT2: 742,926 smoothed reads
	min / mean / median / max length = 200 / 11,318.0 / 11,259.0 / 40,346
	Num of reads with length ≥ 3,001: 732,240 / 742,926 (98.6%)


## 2. Run LJA on these smoothed reads

More specifically, we just use the jumboDBG module of LJA, since error-correcting reads should (hopefully) not be required after the smoothing process we just did (remember that this was already based on the metaFlye-assembled MAGs).

We use $k = 1{,}001$ and $w = 2{,}000$ (this is the default window size of jumboDBG as of writing), and also use `--coverage` (since this information could be useful when identifying low-coverage strains).

In [14]:
# Build a de Bruijn graph from each genome's smoothed reads using jumboDBG. The three commands below are
# identical except for the genome (edge) ID in the input FASTA path and in the output directory.
#
# Options used:
#   --reads       the smoothed-reads FASTA file produced in section 1
#   -k 1001       k-mer size (the window size, w, is left at its default; jumboDBG's log reports that
#                 reads shorter than k + w = 3001 are ignored)
#   --coverage    see the markdown above for why we use this
#   --output-dir  where jumboDBG should write its results
#
# NOTE(review): the path to the jumboDBG binary is a hardcoded absolute path on one machine; this will
# need to be adjusted in order to run this cell elsewhere.
!/home/mfedarko/software/LJA/bin/jumboDBG \
    --reads phasing-data/smoothed-reads/edge_6104_smoothed_reads.fasta \
    -k 1001 \
    --coverage \
    --output-dir phasing-data/smoothed-reads/edge_6104_jumbodbg

!/home/mfedarko/software/LJA/bin/jumboDBG \
    --reads phasing-data/smoothed-reads/edge_1671_smoothed_reads.fasta \
    -k 1001 \
    --coverage \
    --output-dir phasing-data/smoothed-reads/edge_1671_jumbodbg

!/home/mfedarko/software/LJA/bin/jumboDBG \
    --reads phasing-data/smoothed-reads/edge_2358_smoothed_reads.fasta \
    -k 1001 \
    --coverage \
    --output-dir phasing-data/smoothed-reads/edge_2358_jumbodbg

00:00:00 162Mb  INFO: Hello! You are running jumboDBG, a tool for construction of de Bruijn graphs for arbitrarily large values of k
00:00:00 162Mb  INFO: Note that jumboDBG does not perform any error correction and ignores all reads shorter than k + w = 3001
00:00:00 0Mb  INFO: Reading reads
00:00:00 0Mb  INFO: Extracting minimizers
00:00:41 2.3Gb  INFO: Finished read processing
00:00:41 2.3Gb  INFO: 4346161 hashs collected. Starting sorting.
00:00:42 2.3Gb  INFO: Finished sorting. Total distinct minimizers: 1834
00:00:42 2.3Gb  INFO: Starting construction of sparse de Bruijn graph
00:00:42 2.3Gb  INFO: Vertex map constructed.
00:00:42 2.3Gb  INFO: Filling edge sequences.
00:01:20 3.5Gb  INFO: Finished sparse de Bruijn graph construction.
00:01:20 3.5Gb  INFO:  Collecting tips 
00:01:20 3.5Gb  INFO: Added 53 artificial minimizers from tips.
00:01:20 3.5Gb  INFO: Collected 4628 old edges.
00:01:20 3.5Gb  INFO: New minimizers added to sparse graph.
00:01:20 3.5Gb  INFO: Refilling graph 

In [4]:
# Old approach (superseded by the jumboDBG-only commands above; kept for reference): use all of LJA:
# 1. jumboDBG     [make the de Bruijn graph]
# 2. mowerDBG     [error correct reads]
# 3. multiplexDBG [create multiplex de Bruijn graph]
# !/home/mfedarko/software/LJA/bin/lja \
#     --reads phasing-data/smoothed-reads/edge_6104_smoothed_reads.fasta \
#     --output-dir phasing-data/smoothed-reads/edge_6104_lja

# !/home/mfedarko/software/LJA/bin/lja \
#     --reads phasing-data/smoothed-reads/edge_1671_smoothed_reads.fasta \
#     --output-dir phasing-data/smoothed-reads/edge_1671_lja

# !/home/mfedarko/software/LJA/bin/lja \
#     --reads phasing-data/smoothed-reads/edge_2358_smoothed_reads.fasta \
#     --output-dir phasing-data/smoothed-reads/edge_2358_lja

00:00:00 351Mb  INFO: Hello! You are running La Jolla Assembler (LJA), a tool for genome assembly from PacBio HiFi reads
00:00:00 351Mb  INFO: 0a4619b61b5f12d9bed1a84784942003687a8bda
00:00:00 351Mb  INFO: LJA pipeline started
00:00:00 351Mb  INFO: Performing initial correction with k = 501
00:00:00 0Mb  INFO: Reading reads
00:00:00 0Mb  INFO: Extracting minimizers
00:00:33 3.3Gb  INFO: Finished read processing
00:00:33 3.3Gb  INFO: 3056952 hashs collected. Starting sorting.
00:00:33 3.3Gb  INFO: Finished sorting. Total distinct minimizers: 1092
00:00:33 3.3Gb  INFO: Starting construction of sparse de Bruijn graph
00:00:33 3.3Gb  INFO: Vertex map constructed.
00:00:33 3.3Gb  INFO: Filling edge sequences.
00:01:04 4.5Gb  INFO: Finished sparse de Bruijn graph construction.
00:01:04 4.5Gb  INFO:  Collecting tips 
00:01:04 4.5Gb  INFO: Added 37 artificial minimizers from tips.
00:01:04 4.5Gb  INFO: Collected 2716 old edges.
00:01:04 4.5Gb  INFO: New minimizers added to spar