# Compute diversity indices

In [1]:
%run "Header.ipynb"

In [31]:
import math
import pysam
import pysamstats
import skbio

## Parameters of the diversity index: $p$ and $m$

See paper for description.

In [26]:
m = 5

# The possible values of p we'll try. These are NOT p-values so i'm not gonna call them that lmao
# We exclude p = 0 since computing m/0 is obviously uh not gonna work out
percentages = [2, 1, 0.5, 0.25, 0.1, 0.05]

# No edges with length < this will be included in the diversity index
MIN_SEQ_LEN = 1e6

## Compute diversity index for varying values of $p$

Note that not all genomes that pass the filter for one value of $p$ will pass the filter for other values of $p$. This is expected.

In [4]:
# This should include all edges' alignments now, not just the three selected genomes!
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

### Get a mapping of sequence (edge) name to sequence length

In [24]:
# Compute mapping of edge name to length -- need to know this for some of the computations below.
#
# We don't use skbio.DNA.read() for this since that seems to only return a single sequence from the
# FASTA file at once (it's configurable with the seq_num parameter, but I don't want to accidentally
# make this a "Schlemiel the Painter's" algorithm by first iterating to seq 1, then starting over and
# iterating to seq 2, and so on. Easiest to just handle this parsing from scratch.

seq2len = {}
seqname = None
with open("../main-workflow/output/all_edges.fasta", "r") as fastafile:
    
    # Assumes that sequences are not split up over multiple lines (so a FASTA file with N sequences
    # should have only 2N lines, maybe 2N + 1 if there's an extra empty newline at the bottom of the file)
    for linenum, line in enumerate(fastafile):
    
        if line.startswith(">"):
            if linenum % 2 != 0:
                raise ValueError("something weird with > location in all_edges.fasta. Go yell at Marcus.")
            seqname = line.strip()[1:]
        else:
            if linenum % 2 != 1:
                raise ValueError("something weird with non > location in all_edges.fasta. Go yell at Marcus.")
            if seqname is None:
                raise ValueError("No sequence name set yet?")
                
            seqlen = len(line.strip())
            seq2len[seqname] = seqlen
            seqname = None

print(f"seq2len contains {len(seq2len):,} entries. The length of edge_1 is {seq2len['edge_1']:,} bp.")

seq2len contains 78,793 entries. The length of edge_1 is 39,565 bp.


In [38]:
num_long_enough_seqs = 0
for seq in seq2len:
    if seq2len[seq] >= MIN_SEQ_LEN:
        num_long_enough_seqs += 1
print(f"{num_long_enough_seqs:,} / {len(seq2len):,} seqs have a length of at least {MIN_SEQ_LEN:,.0f} bp.")

468 / 78,793 seqs have a length of at least 1,000,000 bp.


### Actually compute the diversity indices

In [None]:
# Some stuff we precompute to avoid doing this a zillion times unnecessarily
p2mincov = {p: m / p for p in percentages}
p2pct = {p: p / 100 for p in percentages}

# The main results of this work will be saved here.
p2seq2dividx = {p: {} for p in percentages}

num_seqs = len(seq2len)

for si, seq in enumerate(seq2len.keys(), 1):
    pct = 100 * (si / num_seqs)
    print(f"On seq {seq} ({si:,} / {num_seqs:,}) ({pct:.2f}%)...", end=" ", flush=True)
    
    # Immediately ignore sequences with length < 1 Mbp.
    if seq2len[seq] < MIN_SEQ_LEN:
        print(f"Skipping since sequence length < {MIN_SEQ_LEN:,.0f} bp.")
        continue
    else:
        print(f"Length = {seq2len[seq] / 1e6:,.2f} Mbp. Progress:", end=" ", flush=True)
    
    # Keep track of how many positions in this sequence are sufficiently covered (the classification of a
    # position as sufficiently covered or not will depend on the value of p set).
    # We'll then use this to determine whether or not we can compute the diversity index for a sequence for
    # a given value of p.
    p2sufficientlycoveredpositionct = {p: 0 for p in percentages}
    
    # Numbers of identified p-mutations within just the sufficiently-covered positions for a given p for this
    # sequence.
    p2mutationct = {p: 0 for p in percentages}
    
    one_tenth_len = math.floor(seq2len[seq] / 10)
    
    # Go through each position in the sequence's pileup and look for sufficiently-covered positions and
    # p-mutations within those sufficiently-covered positions.
    #
    # See bam-to-jsons.py for a description of why these params are useful.
    for ri, rec in enumerate(
        pysamstats.stat_variation(
            bf, chrom=seq, fafile="../main-workflow/output/all_edges.fasta", start=0, end=seq2len[seq],
            truncate=True, pad=True, max_depth=1e6
        ),
        1
    ):
        # very simple progress bar for my own sake. might miss a few dots here and there
        if ri % one_tenth_len == 0:
            print("*", end="", flush=True)
            
        matches = rec["matches"]
        mismatches = rec["mismatches"]
        # NOTE: As with "coverage" in the coverage plots, this doesn't take into account deletions at a
        # given position. That info is available through pysamstats so we could use it here if desired.
        cov = matches + mismatches
        
        # Rather than looking at (mismatches / cov), we instead look at each alternate non-matching nucleotide
        # individually.
        nonmatches = set("ACGT") - set(rec["ref"])
        
        for p in percentages:
            if cov >= p2mincov[p]:
                p2sufficientlycoveredpositionct[p] += 1
                
                for nm in nonmatches:
                    if (rec[nm] / cov) > p2pct[p]:
                        p2mutationct[p] += 1
                        # Only count a given position once -- even if it contains multiple distinct p-mutations
                        # (e.g. if we set p = 1% and a given position has 95% reference [A], 2% C, 3% G,
                        # then that one position would only count as a single p-mutation).
                        #
                        # I guess this behavior could be changed if desired, although it would alter the
                        # interpretation of the diversity index since it'd no longer be constrained to [0, 1].
                        # (I guess the range would then be [0, 3]? Where a div index of 3 means that there are
                        # 3 p-mutations per reference position...? eesh.)
                        break
    
    # Now that we've looked through all positions, see which value(s) of p we can compute the div index for
    # for this sequence. Do so, and save the resulting info.
    valid_p = []
    half_seq_len = 0.5 * seq2len[seq]
    for p in percentages:
        if p2sufficientlycoveredpositionct[p] >= half_seq_len:
            valid_p.append(p)
            p2seq2dividx[p][seq] = p2mutationct[p] / p2sufficientlycoveredpositionct[p]
            
    print(f"\n\tDiversity index is defined for {len(valid_p)} / {len(percentages)} values of p: {valid_p}.")

On seq edge_1 (1 / 78,793) (0.00%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_2 (2 / 78,793) (0.00%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_3 (3 / 78,793) (0.00%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_4 (4 / 78,793) (0.01%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_5 (5 / 78,793) (0.01%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_6 (6 / 78,793) (0.01%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_7 (7 / 78,793) (0.01%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_8 (8 / 78,793) (0.01%)... Length = 1.71 Mbp. Progress: **********
	Diversity index is defined for 6 / 6 values of p: [2, 1, 0.5, 0.25, 0.1, 0.05].
On seq edge_9 (9 / 78,793) (0.01%)... Skipping since sequence length < 1,000,000 bp.
On seq edge_10 (10 / 78,793) (0.01%)... Length = 1.56 Mbp. Progress: **********
	Diversity index is defined for 3 / 6 values of p: [2, 1, 0.5].
On seq edge_11 (1

## Visualize the diversity index

Histograms sharing the x-axis (should go from 0 to 1 I guess -- maybe convert to percentages idk)

Highlight CAMP, BACTERIA, and BACTEROIDALES as vertical lines colored using same colors as logratio / covlen plots on each histogram!