# Generate "mutation matrices"

In [1]:
%run "Header.ipynb"

In [None]:
import copy
import time
import pysam
import skbio
from collections import defaultdict, Counter
from parse_sco import parse_sco

## Initialize data structures that we'll store frequency data in

In [None]:
# Alphabet that every triplet below is built from.
nts = "ACGT"

# All 64 triplets, in lexicographic order (AAA, AAC, ..., TTT). Spelled out with a plain nested
# comprehension; there are fancier ways, but this is fine.
codons = [first + second + third for first in nts for second in nts for third in nts]

# Reverse complement of each of the 64 3-mers, computed once up front. This avoids calling
# str(skbio.DNA(c).reverse_complement()) every time we see a codon, and saves a tiny amount of time
# per read (the skbio call took ~9e-5 seconds each time; a dict lookup takes ~9e-7 seconds).
# Considering we go through well over a million reads, the savings come out to roughly 131 seconds
# (about 2 minutes 10 seconds). Not much, but it's something!
codon2revcomp = {codon: str(skbio.DNA(codon).reverse_complement()) for codon in codons}

# Every distinct symbol the 64 codons translate to: the proteinogenic amino acids of the standard
# genetic code (i.e. ignoring selenocysteine and pyrrolysine), plus "*" for stop codons.
aas = {str(skbio.DNA(codon).translate()) for codon in codons}

# 64x63 dict: each key is a triplet of {A, C, G, T}, and each value is another dict keyed by all the
# *other* codons. Every count starts at 0.
codon2codon2freq = {
    src: {dst: 0 for dst in codons if dst != src} for src in codons
}

# 21x20 dict: same idea, but keyed by amino acid (A, C, D, E, F, ...) or "*". Every count starts at 0.
aa2aa2freq = {src: {dst: 0 for dst in aas if dst != src} for src in aas}

# 64-key dict: maps each triplet to an integer indicating how frequently this triplet occurs in all
# genes in the genomes (i.e. not counting mutations into this triplet).
codon2freq = dict.fromkeys(codons, 0)

# 21-key dict: maps amino acid/stop codon to an integer indicating frequency across all genes.
aa2freq = dict.fromkeys(aas, 0)

## Go through all reads aligned to each genome

Define a dict which we'll use to keep track of aligned codon frequencies for each codon, for each gene, for each genome.

- For each read, see which predicted genes (if any) this read intersects within the genome. Note that "intersects" doesn't mean "fully covers".

- For each of these genes, see which codons (if any) this read fully covers within the gene.

- Increment aligned codon frequencies for all codons accordingly.

The reason we do things this way, as opposed to iterating over just the reads overlapping each codon in each gene, is that doing things that way is really slow! I'm pretty sure it's because "find out which reads overlap this region" is a pretty slow operation when working with large datasets -- and also since these are long reads, doing this on the level of each codon means we're effectively doing a lot of redundant work (you can imagine that, for a given codon, the odds are pretty good that most reads overlapping it will also overlap adjacent codon(s)).

In [1]:
# Nested tally of what the reads spell at every codon. The levels are:
#
#   sequence ID -> gene (its Index in the .sco file) -> codon (left end, i.e. the lower of the codon's
#   two positional boundaries, regardless of whether the gene is on the + or - strand) -> observed
#   aligned triplet (e.g. "AAA") -> count
#
# Example:
# {"edge_6104":                                Sequence
#     {1:                                      Gene index in the .sco file
#         {265:                                Left codon position
#             {"TTA": 1000, "TTT": 1, ... }    Aligned codon frequencies for this particular codon
#         }
#     }
# }
#
# NOTE(review): this comment used to describe the left codon position as 0-indexed, but the cell that
# fills this structure keys codons by gene_data.LeftEnd-based coordinates, which that cell treats as
# 1-indexed. Confirm which convention is intended before consuming these keys.
seq2gene2codon2alignedcodons = dict()

# Same shape as the mapping above for the first three levels, but the innermost value is the
# collection of read names that have already been counted at that codon.
#
# This exists so we can detect a read covering the same codon more than once -- if that happens we
# could ignore the additional alignments or include them, but either way it should be mentioned in
# the report because it is a weird thing.
#
# (Since secondary alignments have been filtered out, the only way a read could hit a codon more than
# once -- as far as I can tell -- is through overlapping supplementary alignments. That is possible
# but probably uncommon, since the SAM specification expects supplementary alignments to have only
# small overlaps.)
seq2gene2codon2seenreads = dict()

In [2]:
# Walk every aligned segment of each genome exactly once and record, for each codon of each predicted
# gene that the segment fully covers, which triplet the read spells there. Results go into
# seq2gene2codon2alignedcodons / seq2gene2codon2seenreads.
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

for seq in SEQS:
    fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
    df = parse_sco("../seqs/genes/{}.sco".format(seq))

    # Materialize the gene rows once per genome: the per-read overlap scan below would otherwise
    # rebuild every row tuple for every single read.
    genes = list(df.itertuples())

    # Initialize some of the data structures
    # NOTE: this is kind of slow. However, it still finishes within a few seconds, so not the most
    # important thing to optimize
    seq2gene2codon2alignedcodons[seq] = {}
    seq2gene2codon2seenreads[seq] = {}
    for gene_data in genes:

        # Should never happen, but check this so that we can compute overlap easily and with peace of mind later
        if gene_data.LeftEnd >= gene_data.RightEnd:
            raise ValueError("Gene {}'s coordinates seem messed up: left = {}, right = {}".format(
                gene_data.Index, gene_data.LeftEnd, gene_data.RightEnd
            ))

        seq2gene2codon2alignedcodons[seq][gene_data.Index] = {}
        seq2gene2codon2seenreads[seq][gene_data.Index] = {}

        # Left end of every codon in this gene, in the same coordinates as the .sco file.
        # NOTE(review): if a gene's length is not a multiple of 3, the last entry here is the left
        # end of a "codon" that runs past RightEnd -- confirm that cannot happen in the input.
        codon_positions = list(range(gene_data.LeftEnd, gene_data.RightEnd + 1, 3))

        # For each codon in this gene...
        for cpleft in codon_positions:
            seq2gene2codon2alignedcodons[seq][gene_data.Index][cpleft] = defaultdict(int)
            # NOTE(review): membership tests on this list are linear in the codon's coverage; a set
            # would be faster, but the list type is kept in case other cells rely on it.
            seq2gene2codon2seenreads[seq][gene_data.Index][cpleft] = []

    print("Finished initialization for seq = {}".format(seq))
    readtimes = []

    # Note that this isn't really a "read" so much as it is an aligned linear segment (and a read
    # can have multiple such segments derived from it, as discussed above).
    for ri, read in enumerate(bf.fetch(seq)):

        t1 = time.time()

        # Find all genes that this read intersects in this genome

        # These are 0-indexed coordinates (and segright is offset to the right by one; see
        # https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.reference_end)
        segleft = read.reference_start
        segright = read.reference_end

        if segleft is None or segright is None:
            raise ValueError("Read {} is unmapped? This shouldn't happen!".format(read.query_name))

        if segleft >= segright:
            raise ValueError("Read {}'s coordinates in pysam seem messed up: left = {}, right = {}".format(
                read.query_name, segleft, segright
            ))

        # Convert aligned segment boundaries to 1-indexed coordinates to make comparing with gene
        # coordinates from the .sco file easier.
        # Since segright was already offset to the right by 1, we don't need to do anything for it
        # (the gene coordinates are exact: a gene from [266, 712] starts at base 266 and ends at base 712,
        # using 1-indexing. So in order to make the read boundaries match, we'd add 1 for segright and then
        # subtract 1 since segright was already 1 base off -- and n + 1 - 1 = n.)
        segleft += 1

        # Quick test for overlap between two ranges where left is guaranteed to be < right.
        # See https://stackoverflow.com/a/325964.
        # (This could probably be vectorized to make it faster.)
        genes_overlapping_read = [
            gene_data for gene_data in genes
            if segleft <= gene_data.RightEnd and gene_data.LeftEnd <= segright
        ]

        # If no genes overlap this read, we are free to move on to the next read
        if genes_overlapping_read:

            # Computing this is relatively slow, which is why we jump through so many hoops before we do this.
            # Each entry in get_aligned_pairs() is a tuple with 2 elements:
            # the first is the read pos and the second is the reference pos.
            ap = read.get_aligned_pairs(matches_only=True)

            # We only consider the leftmost position of each codon, so we don't need to bother checking the last
            # two pairs of positions (since neither could be the leftmost position of a codon that this read
            # fully covers).
            for api, pair1 in enumerate(ap[:-2]):

                # Convert to 1-indexed position for ease of comparison with gene coordinates
                pair1_refpos = pair1[1] + 1
                for gene_data in genes_overlapping_read:

                    gi = gene_data.Index
                    gl = gene_data.LeftEnd
                    gr = gene_data.RightEnd

                    # Skip pairs that fall outside this gene.
                    if pair1_refpos < gl or pair1_refpos > gr:
                        continue

                    # Skip pairs that are not at the leftmost position of a codon in this gene.
                    # (Note that this check works for both + and - strand genes. Whether the leftmost
                    # position is the "start" [i.e. CP 1] or "end" [i.e. CP 3] of the codon changes with
                    # the strand of the gene, but the underlying math is the same.)
                    if (pair1_refpos - gl) % 3 != 0:
                        continue

                    # Check that the pairs are all consecutive (i.e. no "jumps" in the read,
                    # and no "jumps" in the reference).
                    # Since we don't consider the last two pairs in ap, pair2 and pair3 are
                    # always available.
                    pair2 = ap[api + 1]
                    pair3 = ap[api + 2]
                    readpos_consecutive = pair2[0] == pair1[0] + 1 and pair3[0] == pair2[0] + 1
                    refpos_consecutive = pair2[1] == pair1[1] + 1 and pair3[1] == pair2[1] + 1

                    if not (refpos_consecutive and readpos_consecutive):
                        continue

                    # Nice! This read fully covers this codon.
                    # Figure out what the read actually *says* in the alignment to the codon.
                    # (It'll probably be a complete match most of the time, but there will
                    # be some occasional mismatches -- and seeing those is ... the whole point
                    # of this notebook.)

                    # We make sure to index the read by read coords, not reference coords!
                    aligned_codon = read.query_sequence[pair1[0]: pair1[0] + 3]

                    # Validate the slice *before* the reverse-complement lookup: a truncated slice
                    # is not a key of codon2revcomp, so checking afterwards would surface as a bare
                    # KeyError on minus-strand genes instead of this diagnostic.
                    if len(aligned_codon) != 3:
                        # This actually happened when I indexed the reads incorrectly and Python
                        # just silently returned an empty string lol
                        print(read, read.query_sequence, gene_data, pair1, aligned_codon)
                        raise ValueError("^^^Something went very wrong with read coordinates!")

                    if gene_data.Strand == "-":
                        aligned_codon = codon2revcomp[aligned_codon]

                    # Finally, update information about codon frequencies.
                    seq2gene2codon2alignedcodons[seq][gi][pair1_refpos][aligned_codon] += 1
                    if read.query_name not in seq2gene2codon2seenreads[seq][gi][pair1_refpos]:
                        seq2gene2codon2seenreads[seq][gi][pair1_refpos].append(read.query_name)
                    else:
                        raise ValueError(
                            "Oh wow, read {} covers codon {} in {} more than once???".format(
                                read.query_name, pair1_refpos, seq
                            )
                        )
        t2 = time.time()
        readtimes.append(t2 - t1)
        if ri % 100 == 0:
            print("Seen {} reads so far in {}.".format(ri, seq))
    # At this point, we've seen all the reads aligned to all the codons in this genome.
    # We can now "call" mutations based on the frequencies we've counted.
    # TODO: do that!
    # Only the first sequence in SEQS is processed for now.
    break

SyntaxError: invalid syntax (<ipython-input-2-38182d14e5c3>, line 35)

In [None]:
# Older, per-codon approach: for every codon of every gene, fetch the reads overlapping it, tally the
# triplets they spell, and update the codon / amino acid mutation tables when the codon looks mutated.
bf = pysam.AlignmentFile("../main-workflow/output/fully-filtered-and-sorted-aln.bam", "rb")

for seq in SEQS:
    fasta = skbio.DNA.read("../seqs/{}.fasta".format(seq))
    df = parse_sco("../seqs/genes/{}.sco".format(seq))
    # Note that we don't explicitly account for gene overlap, so the same position could be considered multiple
    # times if it's present within multiple genes.
    for gene_data in df.itertuples():

        # 1-indexed reference positions of each codon in this gene, as (first, second, third) triples.
        # NOTE(review): this cell previously used codon_positions without defining it; the triples
        # are reconstructed from how min(cps) / max(cps) are used below -- confirm this matches the
        # original intent.
        codon_positions = [
            (i, i + 1, i + 2) for i in range(gene_data.LeftEnd, gene_data.RightEnd + 1, 3)
        ]

        # For each codon in this gene...
        for cps in codon_positions:

            # The codon positions we computed are 1-indexed, so in order to work with pysam/other python libs
            # (which usually use 0-indexing) we reduce the start position by 1 and don't add 1 to the stop
            # position.
            start = min(cps) - 1
            stop = max(cps)

            # Make note of the codon sequence and amino acid encoded by this codon in the "reference" genome.
            codon_dna = fasta[start: stop]
            if gene_data.Strand == "-":
                codon_dna = codon_dna.reverse_complement()

            codon_seq = str(codon_dna)
            aa = str(codon_dna.translate())

            # Update frequencies accordingly.
            codon2freq[codon_seq] += 1
            aa2freq[aa] += 1

            # Find all reads that span this codon without indels/skips,
            # and save their sequence here.
            # Idea to use defaultdict(int) for this from https://stackoverflow.com/a/722702
            aligned_codons = defaultdict(int)

            num_good_reads = 0
            seen_read_names = []
            for read in bf.fetch(seq, start, stop):

                # Check that the read has no indels intersecting with this codon.
                # We do this by going through all aligned pairs. Once we find the position aligned
                # to the start of this codon, check that the next two codon positions follow immediately
                # after in the aligned pairs (and that there aren't any Nones in these pairs indicating
                # indels/skips/etc).
                #
                # get_aligned_pairs() is kinda slow -- we can speed it up a bit by passing matches_only=True
                # since we don't care about skips/indels. For the sake of safety, and because I originally
                # wrote this without using matches_only=True, we still check for Nones anyway.
                ap = read.get_aligned_pairs(matches_only=True)
                for i, pair in enumerate(ap):
                    if pair[1] == start:
                        try:
                            pair2 = ap[i + 1]
                            pair3 = ap[i + 2]
                        except IndexError:
                            # If there just straight up aren't any pairs after this one, then that's fine:
                            # it means this read doesn't fully overlap the codon. Just move on to the next
                            # read.
                            break

                        # Check that none of the pairs have skips/indels.
                        # For an aligned pair, [0] is the read pos and [1] is the reference pos
                        if pair[0] is not None and pair2[0] is not None and pair3[0] is not None:

                            # Check that the pairs are all consecutive
                            refpos_consecutive = pair2[1] == start + 1 and pair3[1] == stop - 1
                            readpos_consecutive = pair2[0] == pair[0] + 1 and pair3[0] == pair2[0] + 1
                            if refpos_consecutive and readpos_consecutive:
                                # Make sure to index the read by read coordinates, not reference coords!
                                aligned_seq = read.query_sequence[pair[0]: pair3[0] + 1]

                                if gene_data.Strand == "-":
                                    aligned_seq = str(skbio.DNA(aligned_seq).reverse_complement())

                                if len(aligned_seq) != 3:
                                    # This actually happened when I indexed the reads incorrectly and Python
                                    # just silently returned an empty string lol
                                    print(read, read.query_sequence, start, stop, aligned_seq)
                                    raise ValueError("Something went very wrong with read coordinates!")

                                aligned_codons[aligned_seq] += 1
                                num_good_reads += 1
                                seen_read_names.append(read.query_name)

                        # Regardless of if this was a "good" read or not, this was our one chance at
                        # aligning this read to this codon -- so no need to continue going through the aligned
                        # pairs. Break out of that loop.
                        break

            if len(set(seen_read_names)) < len(seen_read_names):
                print(Counter(seen_read_names).most_common())
                raise ValueError(
                    "Duplicate reads covering a single codon: figure out how to handle this situation."
                )
            print("{} (good) reads covered codon {}".format(num_good_reads, codon_seq))
            print("sum of vals of ac is {}".format(sum(aligned_codons.values())))
            print("Codon {} from {} to {} in gene {} in seq {} has mutations: {}".format(
                codon_seq, start, stop, gene_data.Index, seq, aligned_codons
            ))
            # Now, we can finally compute stats re: number of mismatching and matching codons
            num_aligned_codons = sum(aligned_codons.values())
            if num_aligned_codons == 0:
                # No read spans this codon cleanly, so there is nothing to call a mutation from
                # (and the fraction below would divide by zero).
                continue
            alt_codon_frac = (num_aligned_codons - aligned_codons[codon_seq]) / num_aligned_codons

            # Using minfreq = 0.5%
            if alt_codon_frac > 0.005:

                alt_codons = {c: aligned_codons[c] for c in aligned_codons if c != codon_seq}
                # Retrieve max-freq alternate codon.
                # Based on https://stackoverflow.com/a/280156.
                # (Note that if there's a tie, the result is arbitrary. Shouldn't be a big deal. Making note
                # of in the paper.)
                max_freq_alt_codon = max(alt_codons, key=alt_codons.get)
                codon2codon2freq[codon_seq][max_freq_alt_codon] += 1

                print("Is mutation! And max freq alt codon is {}".format(max_freq_alt_codon))

                # NOTE: I guess you could argue that we should do this another way, where we actually compute
                # the translations of all the alt codons and then pick the most common AA/stop codon from there?
                #
                # You could argue this either way: doing it based on just the mutated codon keeps the matrices
                # consistent and lessens the impact of small errors, while taking into account all alt codon
                # translations could help show weird things where multiple mutations have similar consequences.
                # Hmm.
                #
                # TODO: think about!
                alt_codon_aa = str(skbio.DNA(max_freq_alt_codon).translate())
                if alt_codon_aa != aa:
                    aa2aa2freq[aa][alt_codon_aa] += 1
                    print("Is nonsyn mutation! Alt {} codes for {}; orig coded for {}".format(
                        max_freq_alt_codon, alt_codon_aa, aa
                    ))

## Basic pseudocode

- Set up a 64x64 __codon mutation matrix__ of all zeroes (numpy?). Notably, there is no guarantee of symmetry here, although we may expect this to be roughly symmetric.
  - Define a mapping codon2index for this matrix. This should be in alphabetical order, so AAA -> 0, AAC -> 1, ..., TTT -> 63.


- Set up a 21x21 __amino acid mutation matrix__ of all zeroes. Similarly, there is no guarantee of symmetry.
  - Define a mapping aa2index for this matrix. Also in alphabetical order: so (see [here](https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry#Amino_acid_and_nucleotide_base_codes)), it'll look like A, C, ..., Y, \*. We add an extra row/column for \*, a stop codon -- for consistency, since mutations into or from stop codons can happen.
  

- For every genome
  - For every gene
    - For every codon (i.e. first look at [0, 1, 2] in this gene, then [3, 4, 5], etc.)
      - Filter to reads that completely cover this codon's coordinates using pysam.
      - Determine what each of these reads spells at this 3-mer.
      - Choose the most common codon out of all mutated 3-mers. If its frequency out of all reads covering this codon is at least 0.5%, call it a mutated codon.
      - Update the codon mutation matrix and amino acid matrix based on the 3-mer and corresponding amino acid (or stop codon) spelled out by the original and mutated 3-mer sequence.

- Download the matrices to my laptop!

-----

- ALTERNATELY: We could go through just the mutation rates, assuming that all reads intersecting a codon cover the thing entirely (which will probs usually but not always be true). We can find mutated codons by looking at individual positions' mutation rates. However, the method above is closer to what was requested, and at this point makes more sense to me.