# Perform read smoothing then assemble with LJA

In [1]:
%run "Header.ipynb"

In [5]:
import os
import time
import pickle
import pysam
import skbio
from collections import defaultdict
from linked_mutations_utils import find_mutated_positions

## 1. Smooth reads

Lots of this code is duplicated from the `Phasing-01-MakeGraph.ipynb` notebook in this folder.

In [6]:
# Alignments of reads to the genomes we will smooth reads for. (Going by the filename, these alignments
# have already been filtered and sorted upstream -- TODO confirm.)
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

# Directory that the FASTA files of smoothed reads are written to. We create it up front if it does
# not exist yet: opening a file in append mode does not create missing parent directories, so without
# this the first write would fail with a FileNotFoundError on a fresh checkout.
output_dir = "phasing-data/smoothed-reads/"
os.makedirs(output_dir, exist_ok=True)

# If True, print extra per-read debugging messages while smoothing. This is very verbose.
no_indoor_voice = True

# Maps a sequence name to the number of supplementary alignments to it that we have skipped.
seq2num_supp_alns_ignored = defaultdict(int)

def write_out_reads(filepath, readname2seq):
    """Appends reads to a FASTA file.

    Parameters
    ----------
    filepath: str
        Path to the FASTA file to write to. The file is opened in "a" (append) mode, so it is
        created if it does not exist and any reads already in it are kept.

    readname2seq: dict
        Maps read names to sequences. Each sequence is converted to text using str(), so any
        object whose str() is the sequence itself works here.

    Returns
    -------
    None
    """
    # Notably, this uses the "a" (append) method in order to add to the end of a file
    with open(filepath, "a") as of:
        # Look sequences up in the mapping we were actually given. (An earlier version of this code
        # indexed an undefined global, which raised a NameError on the very first write.)
        for readname, seq in readname2seq.items():
            # Write out both the header and the sequence for each read
            of.write(f">{readname}\n{str(seq)}\n")
            
# How often (measured in number of alignments seen so far for a genome) we print a progress message
# and flush the buffer of smoothed reads to disk, respectively.
ALN_UPDATE_FREQ = 1000
ALN_BUFFER_FREQ = 1000

t1 = time.time()
for seq in SEQS:
    fasta = skbio.DNA.read(f"../seqs/{seq}.fasta")

    output_smoothed_reads_file = os.path.join(output_dir, f"{seq}_smoothed_reads.fasta")

    # write_out_reads() appends to this file, so make sure we start from an empty file. Otherwise,
    # re-running this cell (e.g. after interrupting an earlier run) would leave stale and/or duplicate
    # reads in the FASTA file that later gets passed to the assembler.
    with open(output_smoothed_reads_file, "w"):
        pass

    # Identify all (0-indexed, so compatible with skbio / pysam!)
    # mutated positions in this genome up front to save time.
    #
    # Equivalently, we could also just take in an arbitrary VCF as input
    # (e.g. one produced from another variant calling tool), although we'd
    # need to be careful to only include SNVs and not indels/etc...

    print(f"Identifying mutated positions in genome {seq2name[seq]}...")
    mutated_positions = find_mutated_positions(seq)
    print(f"Found {len(mutated_positions):,} mutated positions in {seq2name[seq]}.")

    print("Going through these positions...")

    # This should already be implicitly sorted, I think, but the code below relies on mutated_positions being
    # in ascending order. So we may as well be paranoid.
    mutated_positions = sorted(mutated_positions)
    num_mutated_positions = len(mutated_positions)

    # Instead of just writing out every smoothed alignment as soon as we generate it, we build up a "buffer"
    # of these alignments and then write a bunch out at once. This way we limit slowdown due to constantly
    # having to open/close files.
    #
    # This maps read name to smoothed alignment (well, at this point, just read) sequence. The read name
    # is useful to preserve in fasta files so we have some idea of provenance (where smoothed reads came from)
    smoothed_aln_buffer = {}

    # Go through all linear alignments of each read to this genome, focusing (for now) on just the primary
    # alignments...
    ts1 = time.time()
    for ai, aln in enumerate(bf.fetch(seq), 1):

        if ai % ALN_UPDATE_FREQ == 0:
            print(
                f"\tOn aln {ai:,} in seq {seq2name[seq]}. "
                f"Time spent on {seq2name[seq]} so far: {time.time() - ts1:,.2f} sec."
            )

        if aln.is_supplementary:
            seq2num_supp_alns_ignored[seq] += 1
            continue

        if aln.is_secondary:
            raise ValueError(
                "Not to get political or anything, but you should've already filtered secondary alns out"
            )

        readname = aln.query_name

        # NOTE: the buffer is cleared every time it is flushed, so this only detects a repeated read name
        # among the reads that have not been written out yet.
        if readname in smoothed_aln_buffer:
            raise ValueError("Read has already been smoothed? Du sollst jetzt mit Gott sprechen.")

        # Figure out where on the MAG this alignment "hits." These are 0-indexed positions from Pysam.
        # (reference_end points to the position after the actual final position, since these are designed to
        # be interoperable with Python's half-open intervals.)
        #
        # Of course, there likely will be indels within this range: we're purposefully ignoring those here.
        ref_start = aln.reference_start
        ref_end = aln.reference_end - 1

        # This should never happen (TM)
        if ref_start >= ref_end:
            raise ValueError(
                f"Ref start {ref_start:,} >= ref end {ref_end:,} for primary aln of read {readname}?"
            )

        # Smoothed sequence; we'll edit this so that if this read has (mis)matches to any called mutated
        # positions, these positions are updated with the read's aligned nucleotides at these positions.
        smoothed_aln_seq = fasta[ref_start: ref_end + 1]

        # just for debugging: track the exact edits made to smoothed_aln_seq
        replacements_made = {}

        ap = aln.get_aligned_pairs(matches_only=True)

        # Iterating through the aligned pairs is expensive. Since read lengths are generally in the thousands
        # to tens of thousands of bp (which is much less than the > 1 million bp length of any bacterial genome),
        # we set things up so that we only iterate through the aligned pairs once. We maintain an integer, mpi,
        # that is a poor man's "pointer" to an index in mutated_positions.

        mpi = 0

        # Go through this aln's aligned pairs. As we see each pair, compare the pair's reference position
        # (refpos) to the mpi-th mutated position (herein referred to as "mutpos").
        #
        # If refpos >  mutpos, increment mpi until refpos <= mutpos (stopping as early as possible).
        # If refpos == mutpos, we have a match! Update smoothed_aln_seq at this position if the read
        #                      disagrees with the reference here.
        # If refpos <  mutpos, continue to the next pair.

        for pair in ap:

            refpos = pair[1]

            # Increment mpi until we get to the next mutated position at or after the reference pos for this
            # aligned pair (or until we run out of mutated positions). Checking the bounds *before* indexing
            # into mutated_positions means that a genome with no mutated positions at all is handled
            # gracefully, rather than triggering an IndexError.
            while mpi < num_mutated_positions and mutated_positions[mpi] < refpos:
                mpi += 1

            # No mutated positions at or to the right of this pair, so we're done with this alignment.
            # I expect this should happen only for reads aligned near the right end of the genome (or for
            # genomes without any mutated positions).
            if mpi >= num_mutated_positions:
                break

            mutpos = mutated_positions[mpi]

            # If the next mutation occurs after this aligned pair, continue on to a later pair.
            if refpos < mutpos:
                continue

            # If we've made it here, refpos == mutpos!
            # (...unless I messed something up in how I designed this code.)
            if refpos != mutpos:
                raise ValueError("This should never happen!")

            # Finally, get the nucleotide aligned to this mutated position from this read.
            readpos = pair[0]
            read_nt = aln.query_sequence[readpos]

            # We don't need to do anything if this read already matches the reference MAG at this position
            if read_nt == str(fasta[mutpos]):
                if no_indoor_voice:
                    print(f"Primary aln of read {readname} matches ref at mut pos {mutpos + 1:,}: both {read_nt}")
            else:
                # Record this specific "allele" for this read. smoothed_aln_seq starts at ref_start, so we
                # need to convert the genome-wide position into a position relative to the alignment.
                relative_pos_on_aln = mutpos - ref_start
                smoothed_aln_seq = smoothed_aln_seq.replace([relative_pos_on_aln], read_nt)
                replacements_made[relative_pos_on_aln] = read_nt

        if no_indoor_voice:
            print(f"Primary aln of read {readname} required {len(replacements_made):,} replacements!")

        # Now that we've finished processing all called mutations that this alignment spans, prepare it
        # to be written out to a FASTA file. See comments above on smoothed_aln_buffer, and why we don't
        # just write everything out as soon as it's ready.
        #
        # (Also, we've already guaranteed readname isn't already in smoothed_aln_buffer, so no need to worry
        # about accidentally overwriting something from earlier.)
        smoothed_aln_buffer[readname] = smoothed_aln_seq

        # Notably, we don't necessarily write out *exactly* ALN_BUFFER_FREQ reads at once -- skipping
        # supplementary alignments doesn't stop ai from going up. Shouldn't make a difference
        # unless we have a zillion supplementary alignments.
        if ai % ALN_BUFFER_FREQ == 0:
            write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)
            # Clear the buffer
            smoothed_aln_buffer = {}

    # We're probably going to have left over smoothed reads that we still haven't written out, unless things
    # worked out so that on the final alignment we saw ai was exactly divisible by ALN_BUFFER_FREQ (and that's
    # pretty unlikely unless you set the buffer freq to a low number). So make one last dump of the buffer.
    if len(smoothed_aln_buffer) > 0:
        write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)

    print(f"Done with {seq}! Took {time.time() - ts1:,.2f} sec.")
    print(f"\t(FYI, we ignored {seq2num_supp_alns_ignored[seq]:,} supplementary alignments.)")

print(f"Time taken: {time.time() - t1:,} sec.")

Identifying mutated positions in genome CAMP...


KeyboardInterrupt: 

## 2. Run LJA on these smoothed reads

In [None]:
# Run the LJA assembler three times, once per FASTA file of smoothed reads. Each run is given one
# input FASTA file (--reads) and its own output directory (--output-dir).
# NOTE(review): the input paths match the naming scheme of the smoothed-reads files written in
# section 1 -- presumably these are those files; confirm that the three names here match SEQS.
# NOTE(review): the LJA binary is referenced using an absolute, machine-specific path, so this cell
# will need to be adjusted in order to run on another system.
!/home/mfedarko/software/LJA/bin/lja \
    --reads phasing-data/smoothed-reads/edge_6104_smoothed_reads.fasta \
    --output-dir phasing-data/smoothed-reads/edge_6104_lja

!/home/mfedarko/software/LJA/bin/lja \
    --reads phasing-data/smoothed-reads/edge_1671_smoothed_reads.fasta \
    --output-dir phasing-data/smoothed-reads/edge_1671_lja

!/home/mfedarko/software/LJA/bin/lja \
    --reads phasing-data/smoothed-reads/edge_2358_smoothed_reads.fasta \
    --output-dir phasing-data/smoothed-reads/edge_2358_lja