## Overview

This workbook contains code and notes used to answer week 3's programming assignment.

In [1]:
# Need to load the genome
from Bio.Seq import Seq
import Bio.SeqIO

genome = list(Bio.SeqIO.parse('chr1.GRCh38.excerpt.fasta', 'fasta')).pop().seq

In [50]:
# Prefer a numpy array over a list of lists
import numpy as np

# We also need an approximate matching algorithm.
# As mentioned in the programming notes, this is similar to the edit distance algorithm,
# but critically, the first row is initialized to zeros.
def approximate_match(P, T):
    """
    Find the smallest edit distance between P and any substring of T.

    This is the edit distance dynamic program with one change: the first
    row stays at zero, so an alignment of P may start at any offset of T
    without cost. The first column still counts up, because leaving
    characters of P unmatched always costs an edit.

    Args:
        P (str): pattern to match to T
        T (str): reference sequence (e.g., genome)

    Returns:
        edit_distance (integer): fewest edits needed to align P somewhere in T
    """

    n_rows = len(P) + 1
    n_cols = len(T) + 1

    # The extra row and column hold the initialization terms
    table = np.zeros((n_rows, n_cols), dtype=int)
    table[:, 0] = range(n_rows)

    for i, p_char in enumerate(P, start=1):
        for j, t_char in enumerate(T, start=1):
            # Diagonal move: free when the letters agree, one edit otherwise
            substitution = table[i - 1, j - 1] + (p_char != t_char)

            # Vertical and horizontal moves always cost one edit
            from_above = table[i - 1, j] + 1
            from_left = table[i, j - 1] + 1

            table[i, j] = min(from_above, from_left, substitution)

    # The alignment may end at any column, so take the best of the last row
    return table[-1].min()

In [68]:
list(Bio.SeqIO.parse('chr1.GRCh38.excerpt.fasta', 'fasta'))

'CM000663.2_excerpt'

## Example 01

This is the example provided in the programming reading section. Test the function above against it to make sure it works as expected.

In [52]:
P = 'GCGTATGC'
T = 'TATTGGCTATACGGTT'

approximate_match(P, T)

2

## Question 01

In [58]:
P = 'GCTGATCGATCGTACG'
approximate_match(P, genome)

3

## Question 02

In [60]:
P = 'GATTTACCAGATTGAG'
approximate_match(P, genome)

2

## Question 03

In [None]:
genome = list(Bio.SeqIO.parse('chr1.GRCh38.excerpt.fasta', 'fasta')).pop().seq

In [61]:
def overlap(seq_suffix, seq_prefix, min_length=3):
    """
    Check to see if a suffix of seq_suffix matches a prefix
    of seq_prefix exactly. Note that no differences are tolerated.

    Args:
        seq_suffix (str): sequence whose suffix will be compared
                          against the seq_prefix
        seq_prefix (str): sequence whose prefix will be compared
                          against the seq_suffix
        min_length (int): shortest overlap that counts as a match

    Returns:
        is_overlap (bool): True if some suffix of seq_suffix that is at
                           least min_length long equals a prefix of
                           seq_prefix. Otherwise, False
    """

    # A prefix shorter than min_length can never supply a long-enough overlap.
    # Without this guard the slice below would silently shrink the seed and
    # let an overlap shorter than min_length be reported as a match.
    if len(seq_prefix) < min_length:
        return False

    # Any qualifying overlap must begin with the first min_length characters
    # of seq_prefix, so only occurrences of that seed need to be checked
    seed = seq_prefix[:min_length]

    start = seq_suffix.find(seed)  # leftmost occurrence first
    while start != -1:
        # found occurrence; check for full suffix/prefix match
        if seq_prefix.startswith(seq_suffix[start:]):
            return True

        # move one past this occurrence and look further right
        start = seq_suffix.find(seed, start + 1)

    # no more occurrences to the right
    return False

In [70]:
# As mentioned in question 3, it is very slow to compare all possible combinations of reads from even a small
# set of reads. Instead, build an index out of the reads. To shorten our search path, we'll build a custom
# index class that will create a lookup for the desired k-mer.
#
# To do this, I modified previous work using a k-mer index.
class Index(object):
    
    # Build a k-mer index
    def __init__(self, genome, k):
        """
        Record the k-mer length for this index.

        Args:
            genome: input the index is meant to cover; it is not read by
                    the code visible in this constructor
            k: k-mer length, stored as self.k (presumably an int)
        """

        # Track k_mer length
        self.k = k
        
        # For every read, build the index
        # NOTE(review): no index-building code follows this comment in the
        # visible source -- confirm whether that step is still to be written

    def _get_read_kmers(seq, k):

        k_mers = set()

        for i in range(len(t) - k + 1):  # for each k-mer
            k_mers.add((t[i:i+k], i))  # add (k-mer, offset) pair

        return k_mers

#         self.index.sort()  # alphabetize by k-mer
    # Identify sequences that contain