In [12]:
import numpy as np

In [14]:
#BLOSUM50 substitution matrix: the log odds of one amino acid being substituted by another,
#read from a whitespace-delimited text file as C-int values
blosum50 = np.loadtxt("blosum50.txt", dtype=np.intc)

In [15]:
#show the loaded substitution matrix
blosum50

array([[ 5, -2, -1, -2, -1, -1, -1,  0, -2, -1, -2, -1, -1, -3, -1,  1,
         0, -3, -2,  0],
       [-2,  7, -1, -2, -4,  1,  0, -3,  0, -4, -3,  3, -2, -3, -3, -1,
        -1, -3, -1, -3],
       [-1, -1,  7,  2, -2,  0,  0,  0,  1, -3, -4,  0, -2, -4, -2,  1,
         0, -4, -2, -3],
       [-2, -2,  2,  8, -4,  0,  2, -1, -1, -4, -4, -1, -4, -5, -1,  0,
        -1, -5, -3, -4],
       [-1, -4, -2, -4, 13, -3, -3, -3, -3, -2, -2, -3, -2, -2, -4, -1,
        -1, -5, -3, -1],
       [-1,  1,  0,  0, -3,  7,  2, -2,  1, -3, -2,  2,  0, -4, -1,  0,
        -1, -1, -1, -3],
       [-1,  0,  0,  2, -3,  2,  6, -3,  0, -4, -3,  1, -2, -3, -1, -1,
        -1, -3, -2, -3],
       [ 0, -3,  0, -1, -3, -2, -3,  8, -2, -4, -4, -2, -3, -4, -2,  0,
        -2, -3, -3, -4],
       [-2,  0,  1, -1, -3,  1,  0, -2, 10, -4, -3,  0, -1, -1, -2, -1,
        -2, -3,  2, -4],
       [-1, -4, -3, -4, -2, -3, -4, -4, -4,  5,  2, -3,  2,  0, -3, -3,
        -1, -3, -1,  4],
       [-2, -3, -4, -4, -2, -2

In [32]:
#map each one-letter amino acid code to its row/column index in the blosum50 matrix
#(assumes blosum50.txt lists its rows and columns in exactly this order - confirm against the file)
blosum_index_map = {
    residue: index for index, residue in enumerate("ARNDCQEGHILKMFPSTWYV")
}

In [60]:
#show the residue -> matrix index lookup table
blosum_index_map

{'A': 0,
 'R': 1,
 'N': 2,
 'D': 3,
 'C': 4,
 'Q': 5,
 'E': 6,
 'G': 7,
 'H': 8,
 'I': 9,
 'L': 10,
 'K': 11,
 'M': 12,
 'F': 13,
 'P': 14,
 'S': 15,
 'T': 16,
 'W': 17,
 'Y': 18,
 'V': 19}

In [36]:
#given the two residues to consider swapping, get the blosum50 score of swapping them:
#each character is translated to its index, and the pair of indices addresses the matrix
def get_blosum_score(char1, char2):
    """Return the blosum50 entry at (index of char1, index of char2).

    Both characters must be keys of blosum_index_map; any other character
    raises KeyError from the dictionary lookup.
    """
    row = blosum_index_map[char1]
    col = blosum_index_map[char2]
    return blosum50[row, col]

In [147]:
#assumes that get_blosum_score (and the blosum50 data it reads) is accessible in its environment
def needleman_wunsch(str1, str2, gap_penalty=-8, verbose=True):
    """Fill a Needleman-Wunsch alignment matrix and trace back the best path.

    Parameters
    ----------
    str1, str2 : str
        The two sequences. str1 runs along the rows of the matrix and str2
        along its columns. Every character pair is scored with
        get_blosum_score, so each character must be accepted by that function.
    gap_penalty : int, optional
        Score added for every gap step. Defaults to -8, the value that was
        previously hard-coded.
    verbose : bool, optional
        When True (the default, matching the earlier behaviour) the filled
        matrix and every traceback step are printed.

    Returns
    -------
    list
        The traceback as a list of ((i, j), (prev_i, prev_j)) cell transitions,
        ordered from the bottom-right cell back to cell (0, 0). A diagonal
        transition pairs str1[i-1] with str2[j-1]; a transition that only
        lowers i or only lowers j is a gap step. Empty when both inputs are empty.
    """
    height = len(str1) + 1
    width = len(str2) + 1

    #explicitly object data type so that each cell can store a (score, direction) tuple
    matrix = np.zeros((height, width), dtype = "object")

    #initialization step: the top row and left column accumulate the gap penalty,
    #with their "arrow" directions pointing west / north respectively.
    #the column loop runs second, so cell (0, 0) ends up as (0, "n");
    #its direction is never read because the traceback stops there
    for j in range(width):
        matrix[0][j] = (j * gap_penalty, "w")
    for i in range(height):
        matrix[i][0] = (i * gap_penalty, "n")

    for i in range(1, height):
        for j in range(1, width):
            score = get_blosum_score(str2[j - 1], str1[i - 1])

            #direct implementation of the dynamic programming step.
            #max returns the first maximal candidate, so ties are resolved
            #in the order diagonal, north, west
            matrix[i][j] = max([
                (matrix[i-1][j-1][0] + score, "nw"),
                (matrix[i-1][j][0] + gap_penalty, "n"),
                (matrix[i][j-1][0] + gap_penalty, "w")
            ], key = lambda tup: tup[0])

    if verbose:
        print(matrix)

    #matrix has been generated, now walk the stored directions backwards
    #starting at the bottom right cell
    steps = {"nw": (-1, -1), "n": (-1, 0), "w": (0, -1)}
    i = height - 1
    j = width - 1

    #path of cell transitions we will build up
    path = []

    while True:
        if verbose:
            print("Path contents:", path)

        #end condition, we have retraced the path
        if i == 0 and j == 0:
            break

        #the direction we need to go in
        dr = matrix[i][j][1]

        if verbose:
            print("at indices:",i,j,"and the dir is:",dr)

        di, dj = steps[dr]
        path.append(((i, j), (i + di, j + dj)))
        i += di
        j += dj

    #previously the path was built but discarded; hand it back to the caller
    return path
        
        

In [148]:
#run the alignment on a pair of example sequences
needleman_wunsch("PAWHEAE", "HEAGAWGHEE")

[[(0, 'n') (-8, 'w') (-16, 'w') (-24, 'w') (-32, 'w') (-40, 'w')
  (-48, 'w') (-56, 'w') (-64, 'w') (-72, 'w') (-80, 'w')]
 [(-8, 'n') (-2, 'nw') (-9, 'nw') (-17, 'nw') (-25, 'w') (-33, 'nw')
  (-41, 'w') (-49, 'w') (-57, 'w') (-65, 'nw') (-73, 'nw')]
 [(-16, 'n') (-10, 'nw') (-3, 'nw') (-4, 'nw') (-12, 'w') (-20, 'nw')
  (-28, 'w') (-36, 'w') (-44, 'w') (-52, 'w') (-60, 'w')]
 [(-24, 'n') (-18, 'n') (-11, 'n') (-6, 'nw') (-7, 'nw') (-15, 'nw')
  (-5, 'nw') (-13, 'w') (-21, 'w') (-29, 'w') (-37, 'w')]
 [(-32, 'n') (-14, 'nw') (-18, 'nw') (-13, 'nw') (-8, 'nw') (-9, 'nw')
  (-13, 'n') (-7, 'nw') (-3, 'nw') (-11, 'w') (-19, 'w')]
 [(-40, 'n') (-22, 'n') (-8, 'nw') (-16, 'w') (-16, 'nw') (-9, 'nw')
  (-12, 'nw') (-15, 'n') (-7, 'nw') (3, 'nw') (-5, 'nw')]
 [(-48, 'n') (-30, 'n') (-16, 'n') (-3, 'nw') (-11, 'w') (-11, 'nw')
  (-12, 'nw') (-12, 'nw') (-15, 'n') (-5, 'n') (2, 'nw')]
 [(-56, 'n') (-38, 'n') (-24, 'nw') (-11, 'n') (-6, 'nw') (-12, 'nw')
  (-14, 'nw') (-15, 'nw') (-12, 'nw') (-