# Perform read smoothing then assemble with LJA

In [27]:
%run "Header.ipynb"
%run "../main-workflow/utils.py"

In [28]:
import os
import time
import pickle
import pysam
import skbio
from collections import defaultdict, Counter
from linked_mutations_utils import find_mutated_positions

## 0. Quick sanity check: ensure that all $k$-mers ($k$ = 5,001) are unique in each MAG

In [29]:
# Sanity check: count every k-mer in each sequence and report how often the most common one occurs.
# (If that number is 1, then all k-mers in the sequence are unique.)
#
# I know there are actual k-mer counting tools you can use but no reason to overcomplicate things for now.
# NOTE: every k-mer is stored as its own string in the Counter, so memory use grows roughly as
# k * (number of k-mers). That's tolerable for a handful of MAGs but won't scale; a rolling hash or a
# dedicated k-mer counter would be the way to go if this ever gets generalized.

k = 5001

for seq in SEQS:
    fasta = skbio.DNA.read(f"../seqs/{seq}.fasta")

    # Convert to a plain Python string once, up front. Slicing a str yields the same k-mers as slicing the
    # skbio.DNA object and then calling str() on each slice, but avoids constructing a new DNA object per
    # start position.
    seq_str = str(fasta)
    bargain_bin_kmer_counter = Counter()

    # Positions are 0-indexed, so 0 is the leftmost k-mer start position and ((seq length) - k) is the
    # rightmost k-mer start position. The + 1 is because python ranges don't include the right endpoint.
    # (If the sequence is shorter than k, this range is empty.)
    for start_pos in range(0, seq2len[seq] - k + 1):
        bargain_bin_kmer_counter[seq_str[start_pos : start_pos + k]] += 1
        if start_pos % 1000000 == 0: print(f"On start pos {start_pos:,} in {seq2name[seq]}.")

    # Guard against sequences shorter than k: most_common(1) would be empty, and indexing it would fail.
    if not bargain_bin_kmer_counter:
        print(f"{seq2name[seq]} is shorter than k = {k:,}, so it contains no k-mers.")
        continue

    mckc = bargain_bin_kmer_counter.most_common(1)[0][1]
    print(f"The most common k = {k:,}-mer in {seq2name[seq]} occurred {mckc:,} time(s).")

On start pos 0 in CAMP.
On start pos 1,000,000 in CAMP.
The most common k = 5,001-mer in CAMP occurred 1 time(s).
On start pos 0 in BACT1.
On start pos 1,000,000 in BACT1.
On start pos 2,000,000 in BACT1.
The most common k = 5,001-mer in BACT1 occurred 1 time(s).
On start pos 0 in BACT2.
On start pos 1,000,000 in BACT2.
On start pos 2,000,000 in BACT2.
The most common k = 5,001-mer in BACT2 occurred 1 time(s).


## 1. Smooth reads

Lots of this code is duplicated from the `Phasing-01-MakeGraph.ipynb` notebook in this folder.

In [30]:
# Set this to True to actually generate ordinary smoothed reads that include called mutations;
# set this to False to generate "sanity check" perfect smoothed reads, where no mutations are included
# and the read entirely matches the reference
actually_include_mutations_in_the_smoothed_reads = True

# Set this to True to also write out "virtual reads" (copies of the reference sequence) spanning any
# positions that end up not being covered by any smoothed read; set this to False to skip that step
add_virtual_reads = True

In [31]:
# We'll need to know the mean coverage of each sequence when computing virtual reads.
# (get_meancovs() isn't defined in this notebook -- presumably it comes from Header.ipynb or utils.py, which
# are %run at the top. Going by the output shown below, it returns a dict mapping sequence name -> mean
# coverage.)
seq2meancov = get_meancovs()
seq2meancov

Sequence edge_6104 has average coverage 4,158.57 and median coverage 4,122.00.
Sequence edge_1671 has average coverage 1,415.07 and median coverage 1,436.00.
Sequence edge_2358 has average coverage 2,993.46 and median coverage 2,936.00.


{'edge_6104': 4158.572468826692,
 'edge_1671': 1415.072755380576,
 'edge_2358': 2993.461913625056}

In [6]:
# Open the BAM file of alignments that the smoothing loop below fetches from ("rb" = read, binary).
# NOTE(review): this handle is never closed in the code visible here -- consider calling bf.close() once
# smoothing is done.
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")
# Directory that the per-sequence smoothed-reads FASTA files get written into
output_dir = "phasing-data/smoothed-reads/"

# verbose? If True, the smoothing loop prints a debugging message for each edit it makes to a read (and for
# each alignment it decides to ignore). Expect a *lot* of output if you turn this on.
no_indoor_voice = False

def write_out_reads(filepath, readname2seq):
    """Appends reads to a FASTA file.

    Parameters
    ----------
    filepath: str
        Path of the FASTA file to add to. The file is opened in "a" (append) mode, so anything already
        in the file is kept and the new records are added after it.

    readname2seq: dict
        Maps read name -> read sequence (anything str() can be called on). One two-line record
        (">name" header line, then a sequence line) is written per entry, in the dict's iteration order.
    """
    with open(filepath, "a") as fasta_file:
        fasta_file.writelines(
            f">{name}\n{str(sequence)}\n" for name, sequence in readname2seq.items()
        )
            
# Print a progress message every time this many alignments have been seen for a sequence
ALN_UPDATE_FREQ = 5000
# Controls how often (in alignments) the in-memory buffer of smoothed reads is flushed to the output file
ALN_BUFFER_FREQ = 1000
# Number of extra reference positions a virtual read extends beyond each end of a run of uncovered positions
VR_EXTRA_SPAN = 100

# Passed as p_to_use to find_mutated_positions(); the log messages display it as a percentage
P = 10
            
# Main read-smoothing loop. For each sequence:
#   1. Call mutated positions.
#   2. For each linear alignment to the sequence, build a "smoothed read": the reference sequence over the
#      alignment's span, with each called mutated position replaced by the nucleotide this read has there.
#   3. Optionally add "virtual reads" (plain copies of the reference) spanning positions no smoothed read covers.
# All reads are appended to {output_dir}/{seq}_smoothed_reads.fasta.

# Make sure the output directory exists before we try to append to files inside it.
os.makedirs(output_dir, exist_ok=True)

t1 = time.time()
# NOTE(review): only the first sequence in SEQS is smoothed here, although the assembly cell further down
# expects smoothed reads for three sequences -- confirm whether this should be "for seq in SEQS".
for seq in [SEQS[0]]:

    # Record which positions (0-indexed) aren't covered by any smoothed reads in this MAG.
    # We'll add "virtual reads" that span these positions.
    uncovered_positions = set(range(0, seq2len[seq]))

    fasta = skbio.DNA.read(f"../seqs/{seq}.fasta")

    output_smoothed_reads_file = os.path.join(output_dir, f"{seq}_smoothed_reads.fasta")

    # Reads are always *appended* to the output file, so re-running this cell without deleting the old file
    # would silently duplicate every read (and thus inflate coverage in the assembly). We don't delete
    # anything automatically, but we do make noise about it.
    if os.path.exists(output_smoothed_reads_file):
        print(
            f"WARNING: {output_smoothed_reads_file} already exists; reads will be appended to it. "
            "Delete it first to avoid duplicated reads."
        )

    # Identify all (0-indexed, so compatible with skbio / pysam!)
    # mutated positions in this genome up front to save time.
    #
    # Equivalently, we could also just take in an arbitrary VCF as input
    # (e.g. one produced from another variant calling tool), although we'd
    # need to be careful to only include SNVs and not indels/etc...

    print("=" * 70)
    print(f"Identifying mutated positions (p = {P}%) in genome {seq2name[seq]}...")
    mutpos2pileup = find_mutated_positions(seq, p_to_use=P, incl_pileup=True)
    # We sort because the code below relies on these being in ascending order
    mutated_positions = sorted(mutpos2pileup.keys())
    print(f"Found {len(mutated_positions):,} mutated positions (p = {P}%) in {seq2name[seq]}.")
    print(
        f"Note that this tally is higher than you'd see in e.g. the CP1/2/3 plots, because now we're including "
        "both 'rare' and non-rare mutations. Just so you don't waste five minutes sanity-checking this like I did."
    )

    print("Going through these positions...")

    num_ignored_alns = 0

    # Instead of just writing out every smoothed alignment as soon as we generate it, we build up a "buffer"
    # of these alignments and then write a bunch out at once. This way we limit slowdown due to constantly
    # having to open/close files.
    #
    # This maps read name to smoothed alignment sequence. The read name is useful to preserve in fasta files
    # so we have some idea of provenance (where smoothed reads came from).
    smoothed_aln_buffer = {}

    # The first time we see an alignment of a read, it's 1; if we see a supp aln of this read, it's 2; etc.
    # Lets us distinguish alignments with different names
    readname2freq_so_far = defaultdict(int)

    # Go through all linear alignments of each read to this genome.
    ts1 = time.time()
    for ai, aln in enumerate(bf.fetch(seq), 1):

        if ai % ALN_UPDATE_FREQ == 0:
            print(
                f"\tOn aln {ai:,} in seq {seq2name[seq]}. "
                f"Time spent on {seq2name[seq]} so far: {time.time() - ts1:,.2f} sec."
            )

        if aln.is_secondary:
            raise ValueError(
                "Not to get political or anything, but you should've already filtered secondary alns out"
            )

        # Note that supplementary alignments are ok, though! We implicitly handle these here.
        #
        # Different alignments of the same read will have different new_readnames, because we're gonna
        # be treating them as distinct "reads". We should have already filtered reference-overlapping
        # supp alns so this shouldn't be a problem
        readname = aln.query_name
        readname2freq_so_far[readname] += 1
        new_readname = f"{readname}_{readname2freq_so_far[readname]}"

        # should never happen
        if new_readname in smoothed_aln_buffer:
            raise ValueError("This exact read alignment has already been smoothed? Weird.")

        # Figure out where on the MAG this alignment "hits." These are 0-indexed positions from Pysam.
        # (reference_end points to the position after the actual final position, since these are designed to
        # be interoperable with Python's half-open intervals -- hence the - 1, which makes ref_end inclusive.)
        #
        # Of course, there likely will be indels within this range: we're purposefully ignoring those here.
        ref_start = aln.reference_start
        ref_end = aln.reference_end - 1

        # This should never happen (TM). If it does, something is badly wrong with the alignment.
        if ref_start >= ref_end:
            raise ValueError(
                f"Ref start {ref_start:,} >= ref end {ref_end:,} for read {new_readname}?"
            )

        # Smoothed sequence; we'll edit this so that if this read has (mis)matches to any called mutated
        # positions, these positions are updated with the read's aligned nucleotides at these positions.
        smoothed_aln_seq = fasta[ref_start: ref_end + 1]

        # We may choose to ignore this linear alignment, if we think it is error-prone or otherwise not
        # useful. This is initialized here, *outside* of the branch below, so that it is always defined
        # (and reset for every alignment) even when we are generating "perfect" reads without mutations.
        ignoring_this_aln = False

        if actually_include_mutations_in_the_smoothed_reads:
            # just for debugging: track the exact edits made to smoothed_aln_seq
            replacements_made = {}

            # If there are no called mutations at all then there is nothing to edit (and indexing
            # mutated_positions below would fail), so don't bother computing the aligned pairs.
            ap = aln.get_aligned_pairs(matches_only=True) if mutated_positions else []

            # Iterating through the aligned pairs is expensive. Since read lengths are generally in the thousands
            # to tens of thousands of bp (which is much less than the > 1 million bp length of any bacterial genome),
            # we set things up so that we only iterate through the aligned pairs once. We maintain an integer, mpi,
            # that is a poor man's "pointer" to an index in mutated_positions.
            mpi = 0

            # Go through this aln's aligned pairs. As we see each pair, compare the pair's reference position
            # (refpos) to the mpi-th mutated position (herein referred to as "mutpos").
            #
            # If refpos >  mutpos, increment mpi until refpos <= mutpos (stopping as early as possible).
            # If refpos == mutpos, we have a match! Update smoothed_aln_seq at this position with the
            #                      nucleotide this read has aligned here.
            # If refpos <  mutpos, continue to the next pair.
            for pair in ap:

                refpos = pair[1]
                mutpos = mutated_positions[mpi]

                no_mutations_to_right_of_here = False

                # Increment mpi until we get to the next mutated position at or after the reference pos for this
                # aligned pair (or until we run out of mutated positions).
                while refpos > mutpos:
                    mpi += 1
                    if mpi < len(mutated_positions):
                        mutpos = mutated_positions[mpi]
                    else:
                        no_mutations_to_right_of_here = True
                        break

                # I expect this should happen only for reads aligned near the right end of the genome.
                if no_mutations_to_right_of_here:
                    break

                # If the next mutation occurs after this aligned pair, continue on to a later pair.
                if refpos < mutpos:
                    continue

                # If we've made it here, refpos == mutpos!
                # (...unless I messed something up in how I designed this code.)
                if refpos != mutpos:
                    raise ValueError("This should never happen!")

                # Finally, get the nucleotide aligned to this mutated position from this read.
                readpos = pair[0]
                read_nt = aln.query_sequence[readpos]

                # If this read doesn't match the first or second most common nucleotide at this position,
                # ignore this read. In the future, when we perform read
                # smoothing based on an arbitrary set of SNV calls, we can be more careful about this; but for now
                # we make the simplifying assumption that a mutation likely only has one alternate nucleotide,
                # and that the 3rd and 4th most common nucleotides indicate errors.
                # (Also, note that we break ties here arbitrarily.)
                #
                # (This assumes mutpos2pileup[mutpos][0] holds the counts of A, C, G, T in that order --
                # TODO confirm against find_mutated_positions().)
                nt2ct = dict(zip("ACGT", mutpos2pileup[mutpos][0]))
                nt1 = max(nt2ct, key=nt2ct.get)
                del nt2ct[nt1]
                nt2 = max(nt2ct, key=nt2ct.get)

                if read_nt != nt1 and read_nt != nt2:
                    if no_indoor_voice:
                        print(
                            f"Read {new_readname} has 3rd or 4th most common nt at mutpos {mutpos + 1:,}: "
                            f"pileup = {mutpos2pileup[mutpos]}, read = {read_nt}"
                        )
                    ignoring_this_aln = True
                    break

                # Notably, the nucleotide at a mutated position in a smoothed read will always be the first
                # or second most common nucleotide at this position. So "unreasonable" positions, in which
                # the ref nt != the consensus nt, will not be treated as you might expect -- we ignore the
                # reference in this particular case. Shouldn't make a big difference, since in most cases
                # the ref and consensus nt agree.
                relative_pos_on_aln = mutpos - ref_start
                smoothed_aln_seq = smoothed_aln_seq.replace([relative_pos_on_aln], read_nt)
                replacements_made[relative_pos_on_aln] = read_nt
                if no_indoor_voice:
                    # Look up the reference nucleotide only when we actually need to print it
                    ref_nt = str(fasta[mutpos])
                    print(
                        f"Read {new_readname} mismatches ref at mutpos {mutpos + 1:,}: "
                        f"ref = {ref_nt}, read = {read_nt}"
                    )

            if no_indoor_voice:
                print(f"Read {new_readname} required {len(replacements_made):,} replacements!")

        if ignoring_this_aln:
            num_ignored_alns += 1
        else:
            # Now that we've finished processing all called mutations that this alignment spans, prepare it
            # to be written out to a FASTA file. See comments above on smoothed_aln_buffer, and why we don't
            # just write everything out as soon as it's ready.
            #
            # (Also, we've already guaranteed new_readname isn't already in smoothed_aln_buffer, so no need
            # to worry about accidentally overwriting something from earlier.)
            smoothed_aln_buffer[new_readname] = smoothed_aln_seq

            # Record which positions this read covers (of course, it may not exactly "cover" these positions
            # originally due to indels, but the smoothed version will cover them).
            # We don't update uncovered_positions until *after* we process all aligned pairs of this read, to allow
            # us to ignore reads if desired.
            uncovered_positions.difference_update(range(ref_start, ref_end + 1))

            if ai % ALN_BUFFER_FREQ == 0:
                write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)
                # Clear the buffer
                smoothed_aln_buffer = {}

    # We're probably going to have left over smoothed reads that we still haven't written out, unless things
    # worked out so that on the final alignment we saw ai was exactly divisible by ALN_BUFFER_FREQ (and that's
    # pretty unlikely unless you set the buffer freq to a low number). So make one last dump of the buffer.
    if len(smoothed_aln_buffer) > 0:
        write_out_reads(output_smoothed_reads_file, smoothed_aln_buffer)

    print(f"We ignored {num_ignored_alns:,} linear alignments, fyi.")

    if add_virtual_reads and len(uncovered_positions) > 0:
        print(f"For reference, there are {len(uncovered_positions):,} uncovered positions in {seq2name[seq]}.")

        sup = sorted(uncovered_positions)
        # convert_to_runs() isn't defined in this cell; from how its output is used below, each run is
        # presumably an inclusive (start, end) pair of positions -- TODO confirm.
        uc_runs = convert_to_runs(sup)
        print(f'And there are {len(uc_runs)} "runs" of uncovered positions.')

        rounded_meancov = round(seq2meancov[seq])
        print(
            f'Adding "virtual reads" spanning each of these runs, at rounded mean coverage '
            f'of {rounded_meancov:,}x, to account for this...'
        )

        num_vr = 0
        vr_buffer = {}
        for run in uc_runs:
            # Construct a virtual read that includes this entire run of uncovered positions as well
            # as VR_EXTRA_SPAN positions before and after (clamping to the start/end of the seq if needed).
            #
            # Notably, we could try to make this loop around from end -> start if this is a cyclic MAG, but
            # to remain consistent with how we handle supplementary alignments above we ignore this for now.
            #
            # Also, note that run_start can equal run_end, if only a single isolated position is uncovered.
            # This is fine -- the code handles this case automatically. (I guess the only potential problem is
            # if the length of the MAG is less than VR_EXTRA_SPAN, but... that should never happen. TODO, make
            # note of this when generalizing this code.)
            run_start = max(run[0] - VR_EXTRA_SPAN, 0)
            run_end = min(run[1] + VR_EXTRA_SPAN, seq2len[seq] - 1)

            # Generate a sequence matching the "reference" MAG at these positions. We of course don't have
            # any info about mutations here, because these positions are uncovered by the real reads!
            vr_seq = fasta[run_start: run_end + 1]

            # We need to assign reads unique names, and including the run coordinates here is a nice way
            # to preserve uniqueness across runs and also make our smoothed reads files easier to interpret
            vr_name_prefix = f"vr_{run[0]}_{run[1]}"

            # Add M copies of this virtual read, where M = (rounded mean coverage of this MAG)
            for vr_num in range(1, rounded_meancov + 1):
                vr_name = f"{vr_name_prefix}_{vr_num}"
                vr_buffer[vr_name] = vr_seq
                num_vr += 1

        write_out_reads(output_smoothed_reads_file, vr_buffer)
        print(f"Wrote out {num_vr:,} virtual reads.")

    print(f"Done with {seq}! Took {time.time() - ts1:,.2f} sec.")

print(f"Time taken: {time.time() - t1:,} sec.")

Identifying mutated positions (p = 10%) in genome CAMP...
Found 35 mutated positions (p = 10%) in CAMP.
Note that this tally is higher than you'd see in e.g. the CP1/2/3 plots, because now we're including both 'rare' and non-rare mutations. Just so you don't waste five minutes sanity-checking this like I did.
Going through these positions...
	On aln 5,000 in seq CAMP. Time spent on CAMP so far: 5.83 sec.
	On aln 10,000 in seq CAMP. Time spent on CAMP so far: 12.69 sec.
	On aln 15,000 in seq CAMP. Time spent on CAMP so far: 20.07 sec.
	On aln 20,000 in seq CAMP. Time spent on CAMP so far: 28.32 sec.
	On aln 25,000 in seq CAMP. Time spent on CAMP so far: 36.72 sec.
	On aln 30,000 in seq CAMP. Time spent on CAMP so far: 45.45 sec.
	On aln 35,000 in seq CAMP. Time spent on CAMP so far: 54.02 sec.
	On aln 40,000 in seq CAMP. Time spent on CAMP so far: 62.61 sec.
	On aln 45,000 in seq CAMP. Time spent on CAMP so far: 71.21 sec.
	On aln 50,000 in seq CAMP. Time spent on CAMP so far: 79.62 sec

## 2. Assemble these smoothed reads

In [8]:
%%bash

# LJA with no error correction, but with a filter for low-coverage edges
#
# The same LJA command is run once per sequence; only the input reads file and the output directory
# (both derived from the sequence name) differ between runs.

OUTDIR=phasing-data/smoothed-reads
LJA=/home/mfedarko/software/LJA-branch/bin/lja

for SEQ in edge_6104 edge_1671 edge_2358; do
    $LJA \
        --reads $OUTDIR/${SEQ}_smoothed_reads.fasta \
        --simpleec \
        --Cov-threshold 10 \
        --output-dir $OUTDIR/${SEQ}_lja_cf_10x
done

00:00:00 130Mb  INFO: Hello! You are running La Jolla Assembler (LJA), a tool for genome assembly from PacBio HiFi reads
00:00:16 130Mb  INFO: 048bd920e01d28e0a190c090a55e14e64e85bfa4
00:00:16 130Mb  INFO: LJA pipeline started
00:00:16 130Mb  INFO: Performing initial correction with k = 5001
00:00:16 0Mb  INFO: Reading reads
00:00:16 0Mb  INFO: Extracting minimizers
00:00:17 322Mb  INFO: Finished read processing
00:00:17 322Mb  INFO: 304565 hashs collected. Starting sorting.
00:00:17 327Mb  INFO: Finished sorting. Total distinct minimizers: 523
00:00:17 327Mb  INFO: Starting construction of sparse de Bruijn graph
00:00:17 327Mb  INFO: Vertex map constructed.
00:00:17 327Mb  INFO: Filling edge sequences.
00:00:18 327Mb  INFO: Finished sparse de Bruijn graph construction.
00:00:18 327Mb  INFO:  Collecting tips 
00:00:18 327Mb  INFO: Added 28 artificial minimizers from tips.
00:00:18 327Mb  INFO: Collected 1052 old edges.
00:00:18 327Mb  INFO: New minimizers added to sparse graph.
00:00:1

00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 48152215373621980893782674546466953572

00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 48152215373621980893782674546466953572

00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 2206348963468229682596777438204389152230G has cov 230.286
00:00:20 326Mb  INFO: Edge 1492659625065749616142641995359247264130C has cov 225.73
00:00:20 326Mb  INFO: Edge 1949494835146332603265699854634356654

00:00:20 326Mb  INFO: Edge 2206348963468229682596777438204389152230T has cov 1226.5
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has cov 1418.42
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 2206348963468229682596777438204389152230T has cov 1226.5
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has cov 1418.42
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 2206348963468229682596777438204389152230T has cov 1226.5
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has cov 1418.42
00:00:20 326Mb  INFO: Edge 481522153736219808937826745464669535720A has cov 1229.33
00:00:20 326Mb  INFO: Edge 2206348963468229682596777438204389152230G has cov 230.286
00:00:20 326Mb  INFO: Edge 1492659625065749616142641995359247264130C has cov 225.73
00:00:20 326Mb  INFO: Edge 1949494835146332603265699854634356654

00:00:20 326Mb  INFO: Edge 2206348963468229682596777438204389152230G has cov 230.286
00:00:20 326Mb  INFO: Edge 1492659625065749616142641995359247264130C has cov 225.73
00:00:20 326Mb  INFO: Edge 194949483514633260326569985463435665460T has cov 221
00:00:20 326Mb  INFO: Edge 585359648412615768887665096781583281700A has cov 218.667
00:00:20 326Mb  INFO: Edge 897200217641944872401514446142094988730C has cov 218
00:00:20 326Mb  INFO: Edge 544678115718715386035081048053569809971T has cov 217
00:00:20 326Mb  INFO: Edge 238647703763112687651598635819759101280T has cov 215
00:00:20 326Mb  INFO: Edge 480535502920129320521110307409829909761A has cov 215.049
00:00:20 326Mb  INFO: Edge 288371185134940549868297663591375979891T has cov 213.208
00:00:20 326Mb  INFO: Edge 351541577041978906955611868237741918140G has cov 211
00:00:20 326Mb  INFO: Edge 1500462907428884793235200884780893128780A has cov 210
00:00:20 326Mb  INFO: Edge 257836911354694559212426121058581849550T has cov 251.723
00

00:00:20 326Mb  INFO: Edge 2626117996103857924887961775893612227560C has cov 296
00:00:20 326Mb  INFO: Edge 2735342579840499030495683770774879533310G has cov 297.125
00:00:20 326Mb  INFO: Edge 3120857032225869994475821216879878629061T has cov 299.182
00:00:20 326Mb  INFO: Edge 991418790839405057485967121486937548871A has cov 299.2
00:00:20 326Mb  INFO: Edge 712734501886343628978631502556581613500A has cov 300.789
00:00:20 326Mb  INFO: Edge 438763435427860413182556279907166328870G has cov 299.333
00:00:20 326Mb  INFO: Edge 730312428622792932099741257163676189170C has cov 302.209
00:00:20 326Mb  INFO: Edge 677746214193171225565545284232959472821A has cov 307.086
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 257836911354694559212426121058581849550T has cov 251.723
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has cov 1418.42
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A 

00:00:20 326Mb  INFO: Edge 2626117996103857924887961775893612227560C has cov 296
00:00:20 326Mb  INFO: Edge 2735342579840499030495683770774879533310G has cov 297.125
00:00:20 326Mb  INFO: Edge 3120857032225869994475821216879878629061T has cov 299.182
00:00:20 326Mb  INFO: Edge 991418790839405057485967121486937548871A has cov 299.2
00:00:20 326Mb  INFO: Edge 712734501886343628978631502556581613500A has cov 300.789
00:00:20 326Mb  INFO: Edge 438763435427860413182556279907166328870G has cov 299.333
00:00:20 326Mb  INFO: Edge 730312428622792932099741257163676189170C has cov 302.209
00:00:20 326Mb  INFO: Edge 677746214193171225565545284232959472821A has cov 307.086
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 257836911354694559212426121058581849550T has cov 251.723
00:00:20 326Mb  INFO: Edge 305836606628012928356671851756566237130C has cov 285.857
00:00:20 326Mb  INFO: Edge 1727254688512886510508949638602194849300A

00:00:20 326Mb  INFO: Edge 2079694405788719572443897415728017089771T has cov 290.25
00:00:20 326Mb  INFO: Edge 569519361132729930382596407134592407941T has cov 294
00:00:20 326Mb  INFO: Edge 2626117996103857924887961775893612227560C has cov 296
00:00:20 326Mb  INFO: Edge 2735342579840499030495683770774879533310G has cov 297.125
00:00:20 326Mb  INFO: Edge 3120857032225869994475821216879878629061T has cov 299.182
00:00:20 326Mb  INFO: Edge 991418790839405057485967121486937548871A has cov 299.2
00:00:20 326Mb  INFO: Edge 712734501886343628978631502556581613500A has cov 300.789
00:00:20 326Mb  INFO: Edge 438763435427860413182556279907166328870G has cov 299.333
00:00:20 326Mb  INFO: Edge 730312428622792932099741257163676189170C has cov 302.209
00:00:20 326Mb  INFO: Edge 677746214193171225565545284232959472821A has cov 307.086
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has 

00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has cov 1418.42
00:00:20 326Mb  INFO: Edge 167862607672826003559324516037857823910A has cov 1586.24
00:00:20 326Mb  INFO: Edge 908520387286924633160749725845037912100A has cov 1608.65
00:00:20 326Mb  INFO: Edge 401193712357590502395025965843908086141A has cov 1602.8
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A has cov 1418.42
00:00:20 326Mb  INFO: Edge 167862607672826003559324516037857823910A has cov 1586.24
00:00:20 326Mb  INFO: Edge 908520387286924633160749725845037912100A has cov 1608.65
00:00:20 326Mb  INFO: Edge 401193712357590502395025965843908086141A has cov 1602.8
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 251563719030184320818527398878914037701A

00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 484876691430220596449522142845378284841T has cov 1246.98
00:00:20 326Mb  INFO: Edge 48487669143022059644952214284537828484

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



00:00:23 22Mb  INFO: Finished increasing k
00:00:23 22Mb  INFO: Exporting remaining active transitions
00:00:23 22Mb  INFO: Export to Dot
00:00:23 22Mb  INFO: Export to GFA and compressed contigs
00:00:23 24Mb  INFO: Finished repeat resolution
00:00:23 130Mb  INFO: Performing polishing and homopolymer uncompression
00:00:23 3Mb  INFO: Aligning reads back to assembly
00:00:24 241Mb  INFO: Finished alignment.
00:00:24 241Mb  INFO: Printing alignments to "phasing-data/smoothed-reads/20220306_g1217_lja_covfilt_2x/uncompressing/alignments.txt"
00:00:24 241Mb  INFO: Reading and processing initial reads from ["phasing-data/smoothed-reads/edge_6104_smoothed_reads_g1217_again_2.fasta"]
00:00:34 1.2Gb  INFO: Uncompressing homopolymers in contigs
00:00:34 1.2Gb  INFO: Total zero covered nucleotides 0
00:00:34 1.2Gb  INFO: Calculating overlaps between adjacent uncompressed edges
00:00:34 1.2Gb  INFO: Printing final gfa file to "phasing-data/smoothed-reads/20220306_g1217_lja_covfilt_2x/mdbg.gfa"
00