# Try to classify contigs as prokaryotic or eukaryotic just from the alignment

General idea: compared with prokaryotic contigs, eukaryotic contigs should have a relatively high number of positions with variants at around 50% frequency (representing diploidy).

In [1]:
%run "Header.ipynb"

In [2]:
import math
import json
import time
import skbio
import pysam
import pysamstats

In [3]:
# Edges shorter than this (in bp) are not considered in this notebook.
MIN_SEQ_LEN = 5_000

# A position is only checked for a ~50% mutation rate if it has at least this many aligned
# (mis)matches. Deliberately low, so as to cast a wide net.
MIN_COV = 5

# Inclusive bounds for calling a position "putatively diploid". The quantity being tested is the
# count of the position's most frequent alternate nucleotide, divided by the position's total number
# of aligned (mis)matches; the position must also have at least MIN_COV aligned (mis)matches.
#
# Example: at exactly 5x (mis)match coverage, a most-frequent alternate nucleotide seen 2 or 3 times
# is accepted (2/5 = 0.4 and 3/5 = 0.6), while one seen 1, 4, or 5 times is not.
MIN_DIPLOID_MAXALT_FREQ, MAX_DIPLOID_MAXALT_FREQ = 0.4, 0.6

# Inclusive bounds for a second, broader range. Positions with at least MIN_COV coverage whose
# most-frequent-alternate-nucleotide rate falls in this range are counted in a denominator.
MIN_DENOMINATOR_MAXALT_FREQ, MAX_DENOMINATOR_MAXALT_FREQ = 0.05, 0.6

In [4]:
# Open the alignment (BAM) file for reading ("rb"). This handle is what the pileup code further down
# in this notebook passes to pysamstats.
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

## Get a mapping of sequence (edge) name to sequence length

stolen from the diversity index notebook; should merge these into a single function

Also, the diversity index notebook's version of this accidentally overwrites the `seq2len` variable defined in the header notebook... that should be fixed, but it isn't super important.

In [5]:
# Build edge2len: edge (sequence) name -> sequence length in bp, read from the edges FASTA file.
edge2len = {}
edgename = None
with open("../main-workflow/output/all_edges.fasta", "r") as fastafile:

    # Assumes that sequences are not split up over multiple lines, so a FASTA file with N sequences
    # has 2N lines (headers on even 0-indexed line numbers, sequences on odd ones), possibly
    # followed by blank line(s) at the bottom of the file.
    for linenum, line in enumerate(fastafile):
        stripped = line.strip()

        # Blank lines are only skipped when no header is waiting for its sequence (so a blank line
        # right after a header still counts as a zero-length sequence). A blank line in the middle of
        # the file shifts the parity of every later line, so the checks below still reject it; only
        # blank lines at the bottom of the file get through.
        if not stripped and edgename is None:
            continue

        if line.startswith(">"):
            if linenum % 2 != 0:
                raise ValueError("something weird with > location in all_edges.fasta. Go yell at Marcus.")
            edgename = stripped[1:]
        else:
            if linenum % 2 != 1:
                raise ValueError("something weird with non > location in all_edges.fasta. Go yell at Marcus.")
            if edgename is None:
                raise ValueError("No sequence name set yet?")
            edge2len[edgename] = len(stripped)
            edgename = None

# A header on the very last line would otherwise be dropped without any warning.
if edgename is not None:
    raise ValueError(f"The last header in all_edges.fasta ({edgename}) has no sequence line.")

print(f"edge2len contains {len(edge2len):,} entries. The length of edge_1 is {edge2len['edge_1']:,} bp.")

edge2len contains 78,793 entries. The length of edge_1 is 39,565 bp.


In [6]:
# Names of the edges that meet the length cutoff, in the same order as they appear in edge2len.
long_enough_seqs = [name for name, length in edge2len.items() if length >= MIN_SEQ_LEN]

num_long_enough = len(long_enough_seqs)
num_all_edges = len(edge2len)
pct = 100 * (num_long_enough / num_all_edges)
print(f"{num_long_enough:,} / {num_all_edges:,} ({pct:.2f}%) edges have a length of at least {MIN_SEQ_LEN:,.0f} bp.")

43,562 / 78,793 (55.29%) edges have a length of at least 5,000 bp.


## Go through the alignment; count the number of positions with ~50% max-frequency alternate nucleotide mismatches

Parts of this (e.g. processing each record in the pileup) could be abstracted to code that's shared between here, the diversity index ntbk, and the bam2pileup script I wrote for the three "selected" MAGs.

**NOTE: This takes quite a while to run (as of writing, it took about 18.4 hours to complete).**

In [None]:
# Per-edge tallies, keyed by edge name. "mincov" positions are those with at least MIN_COV aligned
# (mis)matches.
edge2num_mincov_positions = {}
edge2num_mincov_positions_with_at_least_one_mismatch = {}
edge2num_mincov_positions_with_mutrate_in_denominator_range = {}
edge2num_mincov_putatively_diploid_positions = {}
edge2meancov = {}

num_seqs = len(long_enough_seqs)

# This'll print out a ton of output for even a single contig, so I don't recommend using this in practice.
verbose = False

# Loop invariants, hoisted so they aren't rebuilt for every position / sequence.
all_nts = frozenset("ACGT")
edges_fasta = "../main-workflow/output/all_edges.fasta"

t1 = time.time()

for si, seq in enumerate(long_enough_seqs, 1):
    seq_len = edge2len[seq]
    pct = 100 * (si / num_seqs)
    print(f"On seq {seq} ({si:,} / {num_seqs:,}) ({pct:.2f}%).", end=" ", flush=True)
    print(f"{seq_len:,} bp long.", end=" ", flush=True)
    print(f"Runtime: ~{time.time() - t1:,.0f} sec.", end=" ", flush=True)

    # Step size for the progress bar. Clamped to 1 so that a sequence shorter than 10 bp (only
    # possible if MIN_SEQ_LEN is lowered) can't cause a modulo-by-zero below.
    one_tenth_len = max(1, seq_len // 10)

    num_mincov_positions = 0
    num_mincov_positions_with_at_least_one_mismatch = 0
    num_mincov_positions_with_mutrate_in_denominator_range = 0
    num_mincov_putatively_diploid_positions = 0

    total_cov = 0

    # Number of pileup records seen for *this* sequence. Tracked separately from the loop index so
    # that a pileup yielding no records can't leave us looking at the previous sequence's index.
    num_positions = 0

    # Go through each position in the sequence's pileup (even including relatively low-coverage positions).
    # See bam-to-jsons.py for a description of why these params are useful.
    # Also, note that pysam coords are 0-indexed!
    for ri, rec in enumerate(
        pysamstats.stat_variation(
            bf, chrom=seq, fafile=edges_fasta, start=0, end=seq_len,
            truncate=True, pad=True, max_depth=1e6
        ),
        1
    ):
        num_positions = ri

        # very simple progress bar for my own sake. might miss a few dots here and there
        if ri % one_tenth_len == 0:
            print("=", end="", flush=True)

        matches = rec["matches"]
        mismatches = rec["mismatches"]
        # NOTE: As with "coverage" in the coverage plots, this doesn't take into account deletions at a
        # given position. That info is available through pysamstats so we could use it here if desired.
        cov = matches + mismatches

        total_cov += cov

        if verbose:
            print(f"Pos {ri}... Matches: {matches} / Mismatches: {mismatches} / cov: {cov}")

        if cov < MIN_COV:
            continue

        num_mincov_positions += 1
        # Rather than looking at (mismatches / cov), we instead look at each alternate non-matching
        # nucleotide individually.
        # NOTE(review): if rec["ref"] is not one of the uppercase letters A/C/G/T, nothing is removed
        # here and all four nucleotides are treated as alternates -- confirm that this is intended.
        nonmatches = all_nts - set(rec["ref"])
        max_alt_nt_freq = max(rec[alt_nt] for alt_nt in nonmatches)
        max_alt_nt_relfreq = max_alt_nt_freq / cov

        # Keep track of # (min-cov) positions that have at least one mismatch; could be useful to know
        if max_alt_nt_freq >= 1:
            num_mincov_positions_with_at_least_one_mismatch += 1
            if verbose:
                print(f"\tHas at least 1 mismatch! Ref = {rec['ref']}; A = {rec['A']}; C = {rec['C']}; G = {rec['G']}; T = {rec['T']}")

        # The meat of this: see if this position "counts" as putatively diploid due to having a ~50%
        # mutation rate
        if MIN_DIPLOID_MAXALT_FREQ <= max_alt_nt_relfreq <= MAX_DIPLOID_MAXALT_FREQ:
            num_mincov_putatively_diploid_positions += 1
            if verbose:
                print(f"\tPutatively diploid! rel freq = {max_alt_nt_relfreq}")

        # Count, less generally, positions in a broader range -- we'll use this as the denominator
        if MIN_DENOMINATOR_MAXALT_FREQ <= max_alt_nt_relfreq <= MAX_DENOMINATOR_MAXALT_FREQ:
            num_mincov_positions_with_mutrate_in_denominator_range += 1
            if verbose:
                print(f"\tPosition {ri} has a mut rate in [5%, 60%]: {max_alt_nt_freq} / {cov} = {max_alt_nt_relfreq}")

    # The padded pileup should contain exactly one record per position in the sequence. This is
    # raised explicitly (rather than asserted) so the check isn't skipped when Python runs with -O.
    if num_positions != seq_len:
        raise ValueError(
            f"Expected {seq_len:,} pileup positions for {seq}, but saw {num_positions:,}."
        )
    meancov = total_cov / num_positions

    # Save info for each edge
    edge2num_mincov_positions[seq] = num_mincov_positions
    edge2num_mincov_positions_with_at_least_one_mismatch[seq] = num_mincov_positions_with_at_least_one_mismatch
    edge2num_mincov_putatively_diploid_positions[seq] = num_mincov_putatively_diploid_positions
    edge2num_mincov_positions_with_mutrate_in_denominator_range[seq] = num_mincov_positions_with_mutrate_in_denominator_range
    edge2meancov[seq] = meancov

    # Move print info to a new line
    print(f"\n\t{num_mincov_positions:,} MCP;", end=" ")
    print(f"{num_mincov_positions_with_at_least_one_mismatch:,} MCP w/ >= 1 mismatch;", end=" ")
    print(f"{num_mincov_positions_with_mutrate_in_denominator_range:,} 'denominator' MCP;", end=" ")
    print(f"{num_mincov_putatively_diploid_positions:,} 'diploid' MCP;", end=" ")
    print(f"mean cov = {meancov:,.2f}x")

	39,560 MCP; 6,312 MCP w/ >= 1 mismatch; 302 'denominator' MCP; 13 'diploid' MCP; mean cov = 460.40x
	36,963 MCP; 3,805 MCP w/ >= 1 mismatch; 277 'denominator' MCP; 11 'diploid' MCP; mean cov = 258.76x
	10,893 MCP; 140 MCP w/ >= 1 mismatch; 139 'denominator' MCP; 4 'diploid' MCP; mean cov = 3.36x
	0 MCP; 0 MCP w/ >= 1 mismatch; 0 'denominator' MCP; 0 'diploid' MCP; mean cov = 1.40x
	1,710,962 MCP; 53,743 MCP w/ >= 1 mismatch; 1,921 'denominator' MCP; 0 'diploid' MCP; mean cov = 111.64x
	39,724 MCP; 1,745 MCP w/ >= 1 mismatch; 990 'denominator' MCP; 199 'diploid' MCP; mean cov = 5.06x
	1,534,901 MCP; 10,728 MCP w/ >= 1 mismatch; 7,914 'denominator' MCP; 331 'diploid' MCP; mean cov = 16.04x
	350,937 MCP; 10,230 MCP w/ >= 1 mismatch; 4,643 'denominator' MCP; 2,786 'diploid' MCP; mean cov = 39.48x
	64,760 MCP; 207 MCP w/ >= 1 mismatch; 146 'denominator' MCP; 0 'diploid' MCP; mean cov = 4.54x
	331,685 MCP; 4,229 MCP w/ >= 1 mismatch; 2,458 'denominator' MCP; 832 'diploid' MCP; mean cov = 24

### Save info to a file

Since this can take a long time to compute (about 18.4 hours as of writing), this helps if we have to restart the notebook halfway through / etc.

In [None]:
# Write the five per-edge dicts as JSON, one dict per line. The order of the lines is the file
# format: the loading cell assigns each line back to a variable based on its line number.
with open("misc-output/prok-euk-info.txt", "w") as pef:
    pef.writelines(
        json.dumps(per_edge_dict) + "\n"
        for per_edge_dict in (
            edge2num_mincov_positions,
            edge2num_mincov_positions_with_at_least_one_mismatch,
            edge2num_mincov_positions_with_mutrate_in_denominator_range,
            edge2num_mincov_putatively_diploid_positions,
            edge2meancov,
        )
    )

### Load info from that file (...) just in case we need to restart this notebook, or adjust the plotting without rerunning the above stuff

This is obviously a very _ad hoc_ way of defining a file format. If this ever becomes a tool people use, it'd be a lot faster / more efficient to store this in a dedicated table structure!

In [None]:
# Read back the five per-edge dicts, which are stored as one JSON object per line. Only the first
# five lines are read; anything after them is ignored.
with open("misc-output/prok-euk-info.txt", "r") as pef:
    # zip() with range(5) stops after five lines, so the file is still streamed line by line.
    loaded_dicts = [json.loads(line.strip()) for _, line in zip(range(5), pef)]

# Fail loudly on a truncated file (e.g. from an interrupted save) before assigning anything, rather
# than silently leaving some of the dicts undefined or holding stale values from an earlier run.
if len(loaded_dicts) < 5:
    raise ValueError(
        f"misc-output/prok-euk-info.txt should contain 5 lines, but only {len(loaded_dicts)} could be read."
    )

# The order here must match the order in which the dicts were written.
(
    edge2num_mincov_positions,
    edge2num_mincov_positions_with_at_least_one_mismatch,
    edge2num_mincov_positions_with_mutrate_in_denominator_range,
    edge2num_mincov_putatively_diploid_positions,
    edge2meancov,
) = loaded_dicts

## Plot histograms of fractions of "diploid" positions across contigs

We're interested in the "diploid" positions, but we want to normalize this somehow (so that, e.g., 2,000 "diploid" positions carry more weight for a contig of length 5,000 than for a contig of length 500,000).

It isn't immediately clear to me what the best way to normalize this is, so we try a few different denominators:

### Option 1: normalize by total # of min-cov positions

$$\frac{\text{Number of "diploid" positions (with at least some minimum coverage)}}{\text{Total number of positions with at least some minimum coverage}}$$

### Option 2: normalize by total # of min-cov positions with at least one mismatch

(all positions in the numerator should by definition be represented in the denominator, since we've set min coverage to 5x and thus in order for a position to be "diploid" it must have more than one aligned mismatch)

$$\frac{\text{Number of "diploid" positions (with at least some minimum coverage)}}{\text{Number of positions with at least one mismatch (with at least some minimum coverage)}}$$

### Option 3: normalize by total # of min-cov positions with a max-alt nucleotide rate in the range $[5\%, 60\%]$

As with Option 2, all positions in the numerator should by definition be represented in the denominator, since
$[40\%, 60\%] \subset [5\%, 60\%]$.

In [None]:
# For each edge, divide its number of "diploid" positions by each candidate denominator. An edge
# only gets an entry in a given dict if the corresponding denominator is nonzero (or, for the last
# dict, at least 5).
edge2frac_diploid_to_mincov_positions = {}
edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch = {}
edge2frac_diploid_to_mincov_positions_in_denominator_range = {}
edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch_only_if_den_geq5 = {}
for e in edge2meancov:
    # Denominator 1: all min-cov positions
    den_mincov = edge2num_mincov_positions[e]
    if den_mincov > 0:
        edge2frac_diploid_to_mincov_positions[e] = (
            edge2num_mincov_putatively_diploid_positions[e] / den_mincov
        )

    # Denominator 2: min-cov positions with at least one mismatch
    den_mismatch = edge2num_mincov_positions_with_at_least_one_mismatch[e]
    if den_mismatch > 0:
        frac_mismatch = edge2num_mincov_putatively_diploid_positions[e] / den_mismatch
        edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch[e] = frac_mismatch
        if den_mismatch >= 5:
            edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch_only_if_den_geq5[e] = frac_mismatch

    # Denominator 3: min-cov positions with a max-alt rate in the broader "denominator" range
    den_range = edge2num_mincov_positions_with_mutrate_in_denominator_range[e]
    if den_range > 0:
        edge2frac_diploid_to_mincov_positions_in_denominator_range[e] = (
            edge2num_mincov_putatively_diploid_positions[e] / den_range
        )

# Report how many edges ended up in each dict.
frac_dict_summaries = (
    (edge2meancov, f"unique edges that we considered above (with min len of >= {MIN_SEQ_LEN:,} bp)."),
    (edge2frac_diploid_to_mincov_positions, "unique edges with > 0 min-cov positions."),
    (
        edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch,
        "unique edges with > 0 min-cov positions with at least one mismatch.",
    ),
    (
        edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch_only_if_den_geq5,
        "unique edges with  >= 5 min-cov positions with at least one mismatch.",
    ),
    (
        edge2frac_diploid_to_mincov_positions_in_denominator_range,
        "unique edges with > 0 denominator range (for new method) positions.",
    ),
)
for frac_dict, description in frac_dict_summaries:
    print(f"{len(frac_dict):,} {description}")

In [None]:
def fracplot(edge2frac, denominator_desc, denclause=" and a nonzero denominator", use_symlog=True, figfilename=None):
    r"""Utility function to make it easy to draw these histograms of fractions.

    The values of edge2frac (a dict mapping edge name -> fraction) are multiplied by 100 and drawn
    as a histogram with one bin per percentage point. Some information about the edges with the
    largest and smallest fractions is printed as well, to help with sanity-checking.

    denominator_desc should be a string that'll be placed within a \mathrm{} within a \frac{}.
    So, it should probably be a raw string that uses backslashes or something to add extra space between
    words -- see the cells that use this function for some examples.

    denclause is inserted into the plot title right after the description of the minimum edge length.

    If use_symlog is True, the y-axis uses a "symlog" scale.

    If figfilename is not None, the figure is saved to figs/{figfilename}.

    Raises a ValueError if edge2frac is empty.

    A lot of the code here is yanked from the diversity index notebook I wrote.
    """
    # Check this before creating a figure: there's nothing to plot, and the max / min lookups below
    # would otherwise fail with a much less helpful error.
    if not edge2frac:
        raise ValueError("edge2frac is empty, so there is nothing to plot.")

    fig, ax = pyplot.subplots(1)
    num_edges = len(edge2frac)
    print(f"{num_edges:,} edges represented in this edge2frac object.")
    fracs = [100 * v for v in edge2frac.values()]

    # For reference, print info about extreme edge values, for help with sanity-checking
    # The key=edge2frac.get thing lets us essentially just do argmax: https://stackoverflow.com/a/280156
    sorted_edges = sorted(edge2frac.keys(), key=edge2frac.get)
    max_edge = sorted_edges[-1]
    min_edge = sorted_edges[0]
    print(f"Max frac: {max(fracs)} (edge {max_edge}); min frac: {min(fracs)} (edge {min_edge})")
    N = 10
    print(f"Top {N} edges: {sorted_edges[-N:]}")
    print(f"Bottom {N} edges: {sorted_edges[:N]}")

    # One bin per percentage point. Clamped to at least 1 so that, when every fraction is 0, we still
    # get one real bin ([0, 1]) and a non-degenerate x-axis range instead of a single bin edge.
    xlim_max = max(1, math.ceil(max(fracs)))
    bins = range(xlim_max + 1)
    ax.hist(
        fracs,
        color="#660066",
        linewidth=1,
        edgecolor="#220022",
        bins=bins
    )
    # Label every bin edge when there are only a few of them; otherwise, label every 10th one
    if xlim_max < 20:
        ax.set_xticks(bins)
    else:
        ax.set_xticks(range(0, xlim_max + 1, 10))
    if use_symlog:
        ax.set_yscale("symlog")

    ax.set_xlim(0, xlim_max)

    # Don't write the axis numbers as 10^0, 10^1, etc. -- just write as 1, 10, etc.
    # https://stackoverflow.com/a/49751075
    ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter())

    # use_thousands_sep() is not defined in this notebook's own cells (presumably it comes from the
    # header notebook -- TODO confirm).
    use_thousands_sep(ax.yaxis)

    ax.set_ylabel("Number of edges with fractions in a range")
    ax.set_xlabel("Fraction of 'diploid' positions (%)")
    # The backslash in "\\geq" is escaped because this piece of the title is an f-string, not a raw
    # string ("\g" is not a valid escape sequence).
    ax.set_title(
        f"Histogram of {num_edges:,} edges\nwith length $\\geq$ 5,000{denclause}:\n" +
        r"$\frac{\mathrm{Number\ of\ 'diploid'\ positions\ with\ \geq\ 5x\ coverage}}{\mathrm{" +
        denominator_desc + "}}$",
        fontsize=18
    )
    fig.set_size_inches(10, 5)
    if figfilename is not None:
        fig.savefig(f"figs/{figfilename}", bbox_inches="tight")

In [None]:
# Option 1: "diploid" positions as a fraction of all min-cov positions.
fracplot(
    edge2frac=edge2frac_diploid_to_mincov_positions,
    denominator_desc=r"Number\ of\ positions\ with\ \geq\ 5x\ coverage",
    figfilename="prokeukfrac_den_mincov.png",
)

In [None]:
# Option 2: "diploid" positions as a fraction of the min-cov positions with at least one mismatch.
fracplot(
    edge2frac=edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch,
    denominator_desc=r"Number\ of\ positions\ with\ at\ least\ one\ mismatch\ and\ \geq\ 5x\ coverage",
    figfilename="prokeukfrac_den_mincov_and_atleast_one_mm.png",
)

In [None]:
# Same as Option 2, but limited to edges with at least 5 positions in the denominator.
fracplot(
    edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch_only_if_den_geq5,
    r"Number\ of\ positions\ with\ at\ least\ one\ mismatch\ and\ \geq\ 5x\ coverage",
    # This needs to be a raw string: "\g" is not a valid escape sequence in a normal string literal.
    denclause=r" and $\geq$ 5 positions in the denominator",
    figfilename="prokeukfrac_den_mincov_and_atleast_one_mm_den_geq_5_pos.png"
)

In [None]:
# Option 3: "diploid" positions as a fraction of the min-cov positions whose max-alt nucleotide rate
# is in the broader "denominator" range.
fracplot(
    edge2frac=edge2frac_diploid_to_mincov_positions_in_denominator_range,
    denominator_desc=r"Number\ of\ positions\ with\ alternate\ frequency\ rate\ in\ [5\%,\ 60\%]\ and\ \geq\ 5x\ coverage",
    figfilename="prokeukfrac_newden.png",
)

In [None]:
def estats(e):
    """Print the position counts, and whichever "diploid" fractions are defined, for edge e.

    e should be an edge name that is a key in the per-edge count dicts (and in edge2len); a
    KeyError is raised otherwise. Fractions are printed in order until one of them turns out not to
    be defined for this edge, at which point a note is printed instead.
    """
    num_dip = edge2num_mincov_putatively_diploid_positions[e]
    num_1mm = edge2num_mincov_positions_with_at_least_one_mismatch[e]
    num_mcp = edge2num_mincov_positions[e]

    print("=" * 79)
    print(f"Edge {e} ({edge2len[e]:,} bp long) has ...")
    print(f"{num_mcp:,} min-cov'd positions.")
    print(f"{num_1mm:,} min-cov'd positions with at least one mismatch.")
    print(f"{num_dip:,} 'diploid' min-cov'd positions.")

    # Not every edge has every fraction (an edge is left out of a fraction dict if its denominator
    # was too small), so stop at the first one that's missing.
    try:
        frac_vs_mcp = edge2frac_diploid_to_mincov_positions[e]
        print(f"Frac of mincov diploid to total mincov: {frac_vs_mcp}")

        frac_vs_1mm = edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch[e]
        print(f"Frac of mincov diploid to mincov'd with at least one mismatch: {frac_vs_1mm}")

        frac_vs_1mm_geq5 = edge2frac_diploid_to_mincov_positions_with_at_least_one_mismatch_only_if_den_geq5[e]
        print(f"Frac of mincov diploid to mincov'd with at least one mismatch,\n    limited to if there are >= 5 such denominator positions: {frac_vs_1mm_geq5}")
    except KeyError:
        print("(Looks like this edge doesn't have all fractions defined for it.)")

In [None]:
# Edges to print stats for, in the order they should be printed.
edges_to_inspect = [
    "edge_17771",
    "edge_71904",
    "edge_66025",
    "edge_78689",
    "edge_78501",
    # example of an edge where we probs shouldn't care much about it, since it only has 1 min-cov'd
    # position with at least one mismatch (so although its fraction for the second plot is 100%,
    # it's probs not super interesting)
    "edge_2987",
    # from BLASTing this, seems like it might be a bacteria?
    "edge_4916",
]
for edge_to_inspect in edges_to_inspect:
    estats(edge_to_inspect)

## Go through some of the highest-fraction edges and retrieve their sequences in a FASTA file

So we can BLAST them, to get a sense for if they might actually correspond to sequences from eukaryotic organisms.

In [None]:
# I just copied these names from the "top N" output printed from the fracplot() function, but I could
# also totally set things up so that these are automatically computed.
edges_to_fetch = {
    'edge_44228', 'edge_61346', 'edge_56298', 'edge_28715', 'edge_38218',
    'edge_72103', 'edge_71715', 'edge_4916', 'edge_71904', 'edge_66025',
    'edge_64859', 'edge_65510', 'edge_67407', 'edge_69012', 'edge_69441',
    'edge_71967', 'edge_73691', 'edge_78462', 'edge_78501', 'edge_78689',
}
print(f"Trying to fetch {len(edges_to_fetch)} edge sequences.")

# FASTA records for the requested edges, collected in file order and joined once at the end (this
# avoids repeatedly re-building one ever-growing string).
fetched_records = []
fetched_names = set()

# Set explicitly, so that this cell doesn't depend on a value left over from an earlier cell.
edgename = None
with open("../main-workflow/output/all_edges.fasta", "r") as fastafile:

    # Assumes that sequences are not split up over multiple lines, so a FASTA file with N sequences
    # has 2N lines (headers on even 0-indexed line numbers, sequences on odd ones), possibly
    # followed by blank line(s) at the bottom of the file.
    for linenum, line in enumerate(fastafile):
        stripped = line.strip()

        # Blank lines are only skipped when no header is waiting for its sequence. A blank line in
        # the middle of the file shifts the parity of every later line, so the checks below still
        # reject it; only blank lines at the bottom of the file get through.
        if not stripped and edgename is None:
            continue

        if line.startswith(">"):
            if linenum % 2 != 0:
                raise ValueError("something weird with > location in all_edges.fasta. Go yell at Marcus.")
            edgename = stripped[1:]
        else:
            if linenum % 2 != 1:
                raise ValueError("something weird with non > location in all_edges.fasta. Go yell at Marcus.")
            if edgename is None:
                raise ValueError("No sequence name set yet?")
            if edgename in edges_to_fetch:
                fetched_records.append(f">{edgename}\n{stripped}\n")
                fetched_names.add(edgename)
            edgename = None

# A header on the very last line would otherwise be dropped without any warning.
if edgename is not None:
    raise ValueError(f"The last header in all_edges.fasta ({edgename}) has no sequence line.")

with open("misc-output/potentially_euk_edges.fasta", "w") as eukfile:
    eukfile.write("".join(fetched_records))

# Point out any requested edges that weren't in the FASTA file, since the output file would
# otherwise just be silently missing them.
missing_edges = edges_to_fetch - fetched_names
if missing_edges:
    print(f"Warning: {len(missing_edges)} requested edge(s) not found in all_edges.fasta: {sorted(missing_edges)}")

print("Done!")