# Space-Efficient Fitting Alignment Using the Hirschberg Algorithm

The fitting alignment problem asks for the highest-scoring global alignment between a string v and any substring of a longer reference string w. Space-efficient algorithms are designed to reduce the amount of memory needed to achieve a given functionality. The goal of this final project is to implement an algorithm that determines an optimal fitting alignment in linear space and polynomial time. Specifically, the algorithm runs in O(m) space and O(mn) time, where m is the length of v and n is the length of w. The algorithm is built on the Hirschberg Algorithm described in class and drawn from the paper “A Linear Space Algorithm for Computing Maximal Common Subsequences”. The Hirschberg Algorithm requires an initial call containing the beginning position and the ending position of the alignment. In a global alignment problem, these values are inherently known. However, this is not true for a fitting alignment problem, which is why we must define subroutines to find the beginning and the ending of the optimal fitting alignment while preserving O(m) memory usage and O(mn) time complexity. This is done below.

In [248]:
import numpy as np

def findEnd(short, reference, delta, *, verbose=True):
    """
    Finds where the best fitting alignment of short within reference ends,
    keeping only two columns of the dynamic-programming table in memory.

    Entry (i, j) of the implicit table is the best score for aligning
    short[:i] against a substring of reference that ends at position j.
    The top row is 0 (the alignment may begin anywhere in reference) and the
    answer is the largest value found in the bottom row.

    :param: short the string that has to be aligned in full
    :param: reference the string whose substrings are the candidates
    :param: delta nested dict of scores; delta[a][b] scores a against b and
            '-' is the gap symbol
    :param: verbose when True the bottom-row score of every column is
            printed; True is the default so existing callers keep seeing
            that trace, pass verbose=False to silence it

    :returns: a tuple (score, (len(short), j)) where j is the first column at
              which the bottom row reaches its maximum; for an empty short
              this is (0, (0, 0))
    """
    rows = len(short)

    # Plain Python lists (not an integer ndarray) so that non-integer scores
    # in delta are kept exactly instead of being truncated on assignment.
    # Column 0: a prefix of short against the empty string is all gaps.
    prev_col = [0] * (rows + 1)
    for i in range(1, rows + 1):
        prev_col[i] = prev_col[i - 1] + delta[short[i - 1]]['-']

    # The bottom row of column 0 is always a candidate, including rows == 0.
    max_score = prev_col[rows]
    best_end = (rows, 0)
    if verbose:
        print(max_score)

    for j in range(1, len(reference) + 1):
        ref_char = reference[j - 1]
        cur_col = [0] * (rows + 1)  # cur_col[0] stays 0: free start
        for i in range(1, rows + 1):
            short_char = short[i - 1]
            diag = prev_col[i - 1] + delta[short_char][ref_char]
            delete = cur_col[i - 1] + delta[short_char]['-']
            insert = prev_col[i] + delta[ref_char]['-']
            cur_col[i] = max(diag, delete, insert)

        if verbose:
            print(cur_col[rows])
        # Strict '>' keeps the leftmost column among equally good ends.
        if cur_col[rows] > max_score:
            max_score = cur_col[rows]
            best_end = (rows, j)
        prev_col = cur_col

    return max_score, best_end
                

In [249]:
# Unit scoring table over the alphabet plus the gap symbol '-':
# +1 when the two symbols are identical, -1 otherwise.
keys = ['A', 'C', 'T', 'G', '-']
delta = {
    row_key: {col_key: (1 if row_key == col_key else -1) for col_key in keys}
    for row_key in keys
}

# Best fitting score and the table cell at which that alignment ends.
score, fend = findEnd("TAGATA", "GTAGGCTTAAGGTTAT", delta)
score, fend

-6
-4
-2
0
0
0
0
1
1
2
1
0
0
1
1
2
1


(2, (6, 9))

In [250]:
def findStart(short, reference, delta, end):
    """
    Finds where the best fitting alignment that ends at end begins, keeping
    only two columns of the dynamic-programming table in memory.

    The table is filled backwards from end: entry (i, j) is the best global
    alignment score of the last i characters of short[:end[0]] against the
    j characters of reference that finish at column end[1]. Unlike findEnd,
    the top row is charged gap penalties, because the alignment is pinned to
    end and may not float. The start is the column whose bottom-row score
    is largest.

    :param: short the string that has to be aligned in full
    :param: reference the string whose substrings are the candidates
    :param: delta nested dict of scores; delta[a][b] scores a against b and
            '-' is the gap symbol
    :param: end the (row, column) cell where the alignment ends, in the form
            returned by findEnd

    :returns: a tuple (0, start) such that short[:end[0]] is aligned against
              reference[start:end[1]]; among equally good starts the one
              closest to end (the shortest substring) is returned
    """
    rows, end_col = end

    # Column 0 of the backward table: a suffix of short against nothing.
    prev_col = [0] * (rows + 1)
    for i in range(1, rows + 1):
        prev_col[i] = prev_col[i - 1] + delta[short[rows - i]]['-']

    max_score = prev_col[rows]
    best_start = (0, end_col)

    for j in range(1, end_col + 1):
        ref_char = reference[end_col - j]
        ref_gap = delta[ref_char]['-']
        cur_col = [0] * (rows + 1)
        # Anchored at end: skipping reference characters is not free here.
        cur_col[0] = prev_col[0] + ref_gap
        for i in range(1, rows + 1):
            short_char = short[rows - i]
            diag = prev_col[i - 1] + delta[short_char][ref_char]
            delete = cur_col[i - 1] + delta[short_char]['-']
            insert = prev_col[i] + ref_gap
            cur_col[i] = max(diag, delete, insert)

        # Strict '>' keeps the first (shortest) substring among ties.
        if cur_col[rows] > max_score:
            max_score = cur_col[rows]
            best_start = (0, end_col - j)
        prev_col = cur_col

    return best_start

In [251]:
# Unit scoring table over the alphabet plus the gap symbol '-':
# +1 when the two symbols are identical, -1 otherwise.
keys = ['A', 'C', 'T', 'G', '-']
delta = {
    row_key: {col_key: (1 if row_key == col_key else -1) for col_key in keys}
    for row_key in keys
}

# start = findStart("TAGATA", "GTAGGCTTAAGGTTA", delta, fend)
# start

In [252]:
# Traceback moves, written as (row offset, column offset) to add to the
# current cell. ORIGIN is the "no move" value used to pre-fill pointer grids.
ORIGIN = (0, 0)
TOPLEFT = (-1, -1)
UP = (-1, 0)
LEFT = (0, -1)

def traceback_fitting(v, w, M, init_j, pointers):
    """
    Rebuilds the two aligned strings by following pointers from the bottom
    row back up to the top row.

    :param: v the string written on the first line of the result
    :param: w the string written on the second line of the result
    :param: M the score table; accepted but not read by this function
    :param: init_j the column of the bottom row where the walk begins
    :param: pointers grid of (row offset, column offset) moves; every cell
            on the path with a row above 0 must hold UP, LEFT or TOPLEFT,
            since a (0, 0) move there would never let the loop finish

    :returns: the aligned v and aligned w joined by a newline
    """
    row, col = len(v), init_j
    v_columns = []
    w_columns = []

    while True:
        d_row, d_col = pointers[row][col]
        move = (d_row, d_col)

        if move == TOPLEFT:
            # Both strings contribute a character.
            v_columns.append(v[row - 1])
            w_columns.append(w[col - 1])
        elif move == UP:
            # Character of v against a gap.
            v_columns.append(v[row - 1])
            w_columns.append('-')
        elif move == LEFT:
            # Gap against a character of w.
            v_columns.append('-')
            w_columns.append(w[col - 1])

        row += d_row
        col += d_col
        if row <= 0:
            break

    # Columns were collected end-to-start, so flip them before joining.
    aligned_v = ''.join(reversed(v_columns))
    aligned_w = ''.join(reversed(w_columns))
    return aligned_v + '\n' + aligned_w

def fitting_align(short, reference, delta):
    """
    Computes the best alignment of short against any substring of reference
    using the full (len(short)+1) x (len(reference)+1) score table.

    The last row of the table and the chosen end cell are printed before the
    traceback runs.

    :param: short the shorter of the two strings, aligned in full
    :param: reference the longer string whose substrings are the candidates
    :param: delta the scoring function for the alphabet of the two strings;
            delta[a][b] scores a against b and '-' is the gap symbol

    :returns: a tuple (score, alignment) where alignment is the two aligned
              strings separated by a newline
    """
    n_rows = len(short) + 1
    n_cols = len(reference) + 1
    M = [[0] * n_cols for _ in range(n_rows)]
    pointers = [[ORIGIN] * n_cols for _ in range(n_rows)]

    # First column: every character of short against a gap. The first row
    # stays 0 so the alignment may begin anywhere in reference.
    for r in range(1, n_rows):
        M[r][0] = M[r - 1][0] + delta[short[r - 1]]['-']

    for r in range(1, n_rows):
        short_char = short[r - 1]
        for c in range(1, n_cols):
            ref_char = reference[c - 1]
            M[r][c] = max(
                M[r - 1][c - 1] + delta[short_char][ref_char],
                M[r - 1][c] + delta[short_char]['-'],
                M[r][c - 1] + delta[ref_char]['-'],
            )

    # The alignment ends at the first maximum of the bottom row.
    last_row = n_rows - 1
    bottom = M[last_row]
    init_j = bottom.index(max(bottom))
    print(bottom)
    score = bottom[init_j]
    print(last_row, init_j)

    # Walk back to the top row, recording the move taken out of each cell.
    # Preference order on ties: diagonal, then up, then left.
    r, c = last_row, init_j
    while r > 0:
        if c <= 0:
            # Column 0 can only be left by moving up.
            pointers[r][c] = UP
            r -= 1
            continue

        short_char = short[r - 1]
        ref_char = reference[c - 1]
        here = M[r][c]
        if here == M[r - 1][c - 1] + delta[short_char][ref_char]:
            pointers[r][c] = TOPLEFT
            r -= 1
            c -= 1
        elif here == M[r - 1][c] + delta[short_char]['-']:
            pointers[r][c] = UP
            r -= 1
        else:
            pointers[r][c] = LEFT
            c -= 1

    return score, traceback_fitting(short, reference, M, init_j, pointers)

In [253]:
# Unit scoring table over the alphabet plus the gap symbol '-':
# +1 when the two symbols are identical, -1 otherwise.
keys = ['A', 'C', 'T', 'G', '-']
delta = {
    row_key: {col_key: (1 if row_key == col_key else -1) for col_key in keys}
    for row_key in keys
}

# Full-table fitting alignment of the same pair of strings.
fitting_align("TAGATA", "GTAGGCTTAAGGTTAT", delta)

[-6, -4, -2, 0, 0, 0, 0, 1, 1, 2, 1, 0, 0, 1, 1, 2, 1]
6 9


(2, 'TA-G-ATA\nTAGGCTTA')