## Overview

This workbook contains code and notes used to answer week 3's programming assignment.

In [1]:
# Need to load the genome
from Bio.Seq import Seq
import Bio.SeqIO

# Parse every FASTA record in the file and keep the sequence of the last one
genome = list(Bio.SeqIO.parse('chr1.GRCh38.excerpt.fasta', 'fasta'))[-1].seq

In [50]:
# Prefer numpy arrays to a list of lists
import numpy as np

# We also need an approximate matching algorithm.
# As mentioned in the programming notes, this is similar to the edit distance algorithm
# but, critically, the first row is initialized to zeros.
def approximate_match(P, T):
    """
    Adaptation of the edit distance function to do approximate matching.

    The dynamic-programming recurrence is the same as for edit distance, but
    the first row is all zeros so that an occurrence of P may start at any
    offset of T without penalty. The answer is the smallest value in the last
    row, i.e. the cheapest alignment of the whole pattern ending anywhere in T.

    Only the previous row is needed to fill in the current one, so two numpy
    rows of length len(T) + 1 are kept instead of the full
    (len(P) + 1) x (len(T) + 1) matrix. This keeps memory proportional to
    len(T), which matters when T is a genome-sized sequence.

    Args:
        P (str): pattern to match to T
        T (str): reference sequence (e.g., genome)

    Returns:
        edit_distance (int): fewest edits (substitutions, insertions,
            deletions) between P and any substring of T, as a plain Python int
    """

    # Need len + 1 so we can account for the initialization term
    n_cols = len(T) + 1

    # Row 0: matching the empty pattern costs nothing at any offset of T.
    # Allocating with dtype=int directly avoids a float array plus a converted copy.
    prev_row = np.zeros(n_cols, dtype=int)

    for row in range(1, len(P) + 1):
        cur_row = np.zeros(n_cols, dtype=int)

        # Column 0: aligning the first `row` pattern characters against an
        # empty prefix of T costs `row` edits
        cur_row[0] = row

        p_char = P[row - 1]
        for col in range(1, n_cols):
            dist_vert = prev_row[col] + 1
            dist_hor = cur_row[col - 1] + 1

            # Compare against last alignment; a mismatch costs one substitution
            dist_diag = prev_row[col - 1] + (0 if p_char == T[col - 1] else 1)

            # Finally, assign the distance to this particular cell in the row
            cur_row[col] = min(dist_vert, dist_hor, dist_diag)

        prev_row = cur_row

    # prev_row now holds the last row; convert the numpy scalar to a Python int
    # so the return value matches the documented type
    return int(prev_row.min())

## Example 01

This example is provided in the programming reading section. The function above is tested against it to make sure it works as expected.

In [52]:
# Worked example inputs: a short pattern and a short reference text
P, T = 'GCGTATGC', 'TATTGGCTATACGGTT'

approximate_match(P=P, T=T)

2

## Question 01

In [58]:
# Fewest edits needed to match this pattern anywhere in the loaded genome
P = 'GCTGATCGATCGTACG'
approximate_match(P=P, T=genome)

3

## Question 02

In [None]:
# Same search as Question 01, with a different pattern
P = 'GATTTACCAGATTGAG'
approximate_match(P=P, T=genome)

In [48]:

# Rebuild the full distance matrix for the Example 01 inputs so it can be inspected.
# The inputs are set explicitly here: otherwise this cell silently uses whatever
# P and T were assigned last (e.g. the Question 02 pattern) when the notebook is
# run from top to bottom, and the matrix no longer corresponds to the example.
example_P = 'GCGTATGC'
example_T = 'TATTGGCTATACGGTT'

# Need len + 1 so we can account for the initialization term
D = np.zeros([len(example_P)+1, len(example_T)+1], dtype=int)

# First column: matching the first `row` pattern characters against an empty
# prefix of the text costs `row` edits. The first row stays all zeros so a
# match may start at any offset of the text.
D[:, 0] = range(len(example_P)+1)

for row in range(1, len(example_P)+1):
    for col in range(1, len(example_T)+1):
        dist_vert = D[row-1, col] + 1
        dist_hor = D[row, col-1] + 1

        # Compare against last alignment
        dist_diag = D[row-1, col-1]

        # Do the letters mismatch?
        if example_P[row-1] == example_T[col-1]:
            dist_diag += 0
        else:
            dist_diag += 1

        # Finally, assign the distance to this particular cell in the array
        D[row, col] = min(dist_vert, dist_hor, dist_diag)

In [49]:
# Display the distance matrix built in the previous cell
D

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1],
       [2, 2, 2, 2, 2, 1, 1, 0, 1, 2, 2, 2, 1, 1, 1, 1, 2],
       [3, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 3, 2, 1, 1, 2, 2],
       [4, 3, 4, 3, 3, 3, 2, 2, 1, 2, 2, 3, 3, 2, 2, 1, 2],
       [5, 4, 3, 4, 4, 4, 3, 3, 2, 1, 2, 2, 3, 3, 3, 2, 2],
       [6, 5, 4, 3, 4, 5, 4, 4, 3, 2, 1, 2, 3, 4, 4, 3, 2],
       [7, 6, 5, 4, 4, 4, 5, 5, 4, 3, 2, 2, 3, 3, 4, 4, 3],
       [8, 7, 6, 5, 5, 5, 5, 5, 5, 4, 3, 3, 2, 3, 4, 5, 4]])