# Identify and cluster "linked" mutated positions

In [1]:
%run "Header.ipynb"

In [2]:
%run "LoadMutationJSONData.ipynb"

In [3]:
import pysam
import skbio
from itertools import combinations
from collections import defaultdict

In [None]:
# unless (pos j) - (pos i) < this, we do not consider i and j linked.
MAX_DIST_BTWN_LINKED_POSITIONS_NONINCLUSIVE = 3000

# unless at least this many reads have mutations at both pos i and pos j, we do not consider i and j linked.
MIN_COV_OF_MUTATIONS_AT_LINKED_POSITIONS = 1000

# unless |Reads(i, -)| + |Reads(-, j)| < this fraction * |Reads(i, j)|, we do not consider i and j linked.
MAX_NONLINKED_MUTATED_FRACTION_NONINCLUSIVE = 0.2

# How we call a mutation: only if (# mismatches) / (# mismatches + # matches) > MINFREQ.
# Defaults to 0.5%.
MINFREQ = 0.005

In [4]:
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

t1 = time.time()
for seq in SEQS:
    fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
    # Maps tuple of (left integer pos, right integer pos) to a list of [0, 0, 0, 0].
    # (Since a pair (i, j) is equal to a pair (j, i), we just index this so that the leftmost position is the
    # first element in the tuple and the rightmost position is the second element. This seems like a more intuitive
    # way of structuring this then as a nested dict of leftpos2rightpos2groupcts.)
    # https://stackoverflow.com/a/13065439
    #
    # Each entry in the list indicates counts of types of reads connecting these two positions we've seen thus
    # far. In 0-indexed coordinates:
    #
    # 0. Reads(i, j): reads that support mutations at both positions
    # 1. Reads(i, -): reads that only support mutations at i
    # 2. Reads(-, j): reads that only support mutations at j
    # 3. Reads(-, -): reads that do not support mutations at either position
    #
    # This matches the definitions in the paper (currently that is section 3.6.2, but that number may change as
    # the paper is edited and restructured).
    
    pospair2groupcts = defaultdict(lambda: [0, 0, 0, 0])
    
    # Identify all mutated positions up front to save time.
    mutated_positions = []
    for pos in seq2pos2matchct[seq].keys():
        
        matchct = seq2pos2matchct[seq][pos]
        mismatchct = seq2pos2mismatchct[seq][pos]
        cov = mismatchct + matchct
        
        # We can be strict and filter out positions that don't pass the coverage filter for linked reads -- no
        # sense including these.
        if cov >= MIN_COV_OF_MUTATIONS_AT_LINKED_POSITIONS:
            
            # Actually "call" mutations, the same way we do elsewhere in these analyses (albeit maybe with
            # different values of MINFREQ). Of course, this isn't the only way to do this.
            if (mismatchct / cov) > MINFREQ:
                mutated_positions.append(int(pos))
    
    for ri, read in enumerate(bf.fetch(seq), 1):
        ap = read.get_aligned_pairs(matches_only=True)
        
        # Maps mutated positions seen in this read to 0 (match to the reference) or 1 (mismatch to the reference).
        # The absence of a mutated position from this dict implies that this position is not seen in this read
        # (either due to indels/skips or this read just not being aligned to cover it).
        #
        # After we compute this we can increment pospair2groupcts accordingly for every pair of mutated positions
        # present in this dict.
        mutpos2seenval = {}
        
        # TODO: use this pair to "slice" mutated_positions to just the relevant ones
#         leftmostpair = ap[0]
#         rightmostpair = ap[-1]
        
#         leftidx = 0
#         rightidx = len(mutated_positions) - 1
#         for mpi, mpos in enumerate(mutated_positions):
#             if mpos >= leftidx:
#                 pass
            
        # After slicing the mutated positions, we can go through them...?
        # HM. Maybe stuff below is best idea after all.
        
        # Iterating through the aligned pairs is expensive. Since read lengths are generally in the thousands
        # to tens of thousands of bp (which is much less than the > 1 million bp length of any bacterial genome),
        # we set things up so that we only iterate through the aligned pairs once. We maintain an integer that
        # is a poor man's "pointer" to an index in mutated_positions; at every iteration we check this pointer.
        
        mpi = 0
        
        # Go through this read's aligned pairs. As we see each pair, compare the pair's reference position
        # (refpos) to the mpi-th mutated position (mutpos).
        #
        # If refpos >  mutpos, increment mpi until refpos <= mutpos (stopping as early as possible).
        # If refpos == mutpos, we have a match! Update mutpos2seenval[mutpos] based on comparing this read's
        #                      aligned value at this position to the reference at this position.
        #                      Should be either a 0 or a 1.
        # If refpos <  mutpos, continue to the next pair.
        
        # After doing all that, we can just use mutpos2seenval to update the group counts for each pair. Should
        # be feasible, I think (!!!)
        
        for pair in ap:

            # For each mutated position...
            for pi, p1 in enumerate(mutated_positions, 0):

                # For each "rightward" mutated position:
                for p2 in mutated_positions[pi + 1:]:

                    if p2 - p1 < MAX_DIST_BTWN_LINKED_POSITIONS_NONINCLUSIVE:
                        # Figure out what this read says at p1 and at p2.
                        # TODO: Look through aligned pairs.

                    else:
                        # Since mutated_positions is monotonically increasing (...it's a sorted list of positions),
                        # we can break as soon as we get to the max distance away from p1.
                        # Doing this lets us avoid iterating over all possible pairs of mutated positions
                        # for every read (if there are 40k mutated positions, then there would be almost 800 million
                        # such pairs...)
                        break
                
        # For every pair of mutated positions:
        # (Per https://docs.python.org/3/library/itertools.html#itertools.combinations, each combination tuple
        # is sorted in the same order as mutated_positions -- so we can be sure that p1 is to the left of p2.)
        for (p1, p2) in combinations(mutated_positions, 2):
            if p2 - p1 < MAX_DIST_BTWN_LINKED_POSITIONS_NONINCLUSIVE:
                

SyntaxError: unexpected EOF while parsing (<ipython-input-4-f103b9078c4b>, line 58)