In [1]:
import gzip

import numpy as np


def load_fasta(filename, verbose=0):
    """
    Parse a classically formatted, possibly gzip-compressed FASTA file.

    Everything before the first '>' is ignored. If the same identifier
    occurs more than once, the last record with that identifier wins.

    :param filename: path of the FASTA file (string or path-like); a name
        ending in ".gz" is opened through gzip in text mode, anything else
        is opened as plain text
    :param verbose: if nonzero, the identifier and the length of every
        sequence read are printed
    :return: dictionary mapping the first whitespace-delimited token of each
        header to its sequence; all whitespace is removed from the sequence
    :raises IndexError: if the header line following a '>' is empty
    """
    opener = gzip.open if str(filename).endswith(".gz") else open
    # the context manager closes the handle even when reading fails
    with opener(filename, 'rt') as fp:
        # split at headers
        records = fp.read().split('>')
    # ignore whatever appears before the 1st header
    records.pop(0)
    # prepare the dictionary
    D = {}
    for record in records:
        lines = record.split('\n')
        header = lines.pop(0).split()
        key = header[0]
        # join the sequence lines and drop blanks, tabs and stray carriage
        # returns so that they are not counted as residues
        D[key] = ''.join(''.join(lines).split())
        if verbose:
            print("Sequence %s of length %d read" % (key, len(D[key])))
    return D

#### 1. Global and Local Alignment  
  
  I implemented a solution for global and local alignment as described in the lectures. For global alignment I implemented the Needleman-Wunsch algorithm, which is based on dynamic programming. Given two sequences seqA and seqB, we initialize a matrix F of size (n+1)x(m+1), where n is the length of seqA and m is the length of seqB.  
  Let d be the (negative) gap penalty. We set F[i, 0] = i·d for i = 0, ..., n and F[0, j] = j·d for j = 0, ..., m. The remaining entries of the matrix are computed recursively using the following rule:

      F[i, j] = max(
          F[i-1, j-1] + s(seqA[i-1], seqB[j-1]),
          F[i-1, j] + d,
          F[i, j-1] + d
          )
Score of the alignment can be found in F[n, m].
  
  For local alignment we use the Smith-Waterman algorithm, which is very similar to the previous one. The difference is that the matrix F cannot contain negative values (every entry is floored at 0), and the score returned is the maximum value in F.

In [2]:
def NeedlemanWunsh(seqA, seqB, match, mismatch, gap):
    """
    Score the best global alignment of two sequences with the
    Needleman-Wunsch dynamic programme and a linear gap penalty.

    :param seqA: string with the first sequence
    :param seqB: string with the second sequence
    :param match: a positive score when two symbols match
    :param mismatch: a negative penalty for mismatch (may be -inf to
        forbid mismatches)
    :param gap: a negative penalty for a gap of length 1
    :return: the score of global alignment of the sequences seqA and seqB
    """
    n = len(seqA) + 1
    m = len(seqB) + 1

    # F[i, j] holds the best score of aligning seqA[:i] with seqB[:j]
    F = np.zeros((n, m))

    # a prefix aligned against nothing costs one gap per symbol;
    # F[0, 0] is left at 0
    F[1:, 0] = gap * np.arange(1, n)
    F[0, 1:] = gap * np.arange(1, m)

    for i in range(1, n):
        symbolA = seqA[i - 1]
        for j in range(1, m):
            # select the score instead of multiplying booleans by the scores:
            # False * inf is NaN and would poison the whole matrix
            substitution = match if symbolA == seqB[j - 1] else mismatch
            F[i, j] = max(
                F[i - 1, j - 1] + substitution,
                F[i - 1, j] + gap,
                F[i, j - 1] + gap
            )

    return F[n - 1, m - 1]


In [3]:
def SmithWatermann(seqA, seqB, match, mismatch, gap):
    """
    Score the best local alignment of two sequences with the
    Smith-Waterman dynamic programme and a linear gap penalty.

    :param seqA: string with the first sequence
    :param seqB: string with the second sequence
    :param match: a positive score when two symbols match
    :param mismatch: a negative penalty for mismatch (may be -inf to
        forbid mismatches)
    :param gap: a negative penalty for a gap of length 1
    :return: the score of the best local alignment of the sequences seqA
        and seqB, i.e. the largest entry of the score matrix (never negative)
    """
    n = len(seqA) + 1
    m = len(seqB) + 1

    # F[i, j] holds the best score of a local alignment ending at
    # seqA[i - 1] / seqB[j - 1]; row 0 and column 0 stay at 0
    F = np.zeros((n, m))

    for i in range(1, n):
        symbolA = seqA[i - 1]
        for j in range(1, m):
            # select the score instead of multiplying booleans by the scores:
            # False * inf is NaN, which max() would silently skip
            substitution = match if symbolA == seqB[j - 1] else mismatch
            F[i, j] = max(
                0,  # a local alignment may restart at any cell
                F[i - 1, j - 1] + substitution,
                F[i - 1, j] + gap,
                F[i, j - 1] + gap
            )

    return F.max()

In [4]:
# Each of the three FASTA files must contain a record whose identifier is 'seq'.
H, I, J = (
    load_fasta(fasta_path)['seq']
    for fasta_path in ('H.fasta', 'I.fasta', 'J.fasta')
)

# Linear-gap scoring scheme used for the pairwise comparisons.
match, mismatch, gap = 1, -1, -2

print('Needleman-Wunsh global alignment: ')
r1 = NeedlemanWunsh(H, I, match, mismatch, gap)
print(f'H-I score: {r1!s}')
r2 = NeedlemanWunsh(H, J, match, mismatch, gap)
print(f'H-J score: {r2!s}')
r3 = NeedlemanWunsh(I, J, match, mismatch, gap)
print(f'I-J score: {r3!s}')

Needleman-Wunsh global alignment: 
H-I score: -214.0
H-J score: -206.0
I-J score: 1020.0


In [5]:
# Same sequences and scoring scheme as above, scored locally this time.
print('Smith-Watermann local alignment: ')
r4 = SmithWatermann(H, I, match, mismatch, gap)
print(f'H-I score: {r4!s}')
r5 = SmithWatermann(H, J, match, mismatch, gap)
print(f'H-J score: {r5!s}')
r6 = SmithWatermann(I, J, match, mismatch, gap)
print(f'I-J score: {r6!s}')

Smith-Watermann local alignment: 
H-I score: 1292.0
H-J score: 727.0
I-J score: 1042.0


#### 2. Needleman-Wunsch Algorithm with Affine Gap Penalty  
The idea of this algorithm is that the penalties for opening a gap and for extending a gap should not be the same. Instead of using one matrix F we use three matrices M, Ix and Iy, where:
*  M[i, j] is the best score up to [i, j] given seqA[i] is aligned to seqB[j]
* Ix[i, j] is the best score up to [i, j] given seqB[j] is aligned to a gap
* Iy[i, j] is the best score up to [i, j] given seqA[i] is aligned to a gap in seqB.  

Let d be the penalty of opening a gap and e a penalty of extending a gap.
We initialize the matrices as follows:
*  M[0, 0] = 0, Ix[0, 0] = Iy[0, 0] = -inf
*  Ix[0, j] = d + (j-1)e,  M[0, j] = Iy[0, j] = -inf ; j = 1, ..., m
*  Iy[i, 0] = d + (i-1)e,  M[i, 0] = Ix[i, 0] = -inf ; i = 1, ..., n
  
  For the rest of the fields we recursively calculate them using the following equations:  
  * Ix[i, j]=max( M[i, j-1] + d, Ix[i, j-1] + e) 
  * Iy[i, j]=max( M[i-1, j] + d, Iy[i-1, j] + e) 
  * M[i, j]=s(seqA[i-1], seqB[j-1]) + max( M[i-1, j-1], Ix[i-1, j-1], Iy[i-1, j-1]) 
  
  The score is the maximum of the bottom-right entries M[n, m], Ix[n, m] and Iy[n, m].


In [6]:
def NWAffine(seqA, seqB, match, mismatch, gapopen, gapext):
    """
    Score the best global alignment of two sequences with an affine gap
    penalty, using three dynamic-programming matrices.

    M[i, j] ends with seqA[i - 1] aligned to seqB[j - 1], Ix[i, j] ends with
    seqB[j - 1] aligned to a gap and Iy[i, j] ends with seqA[i - 1] aligned
    to a gap. Ix is only reached from M or Ix and Iy only from M or Iy, so a
    gap in one sequence is never immediately followed by a gap in the other.

    :param seqA: string with the first sequence
    :param seqB: string with the second sequence
    :param match: a positive score when two symbols match
    :param mismatch: a negative penalty for mismatch (may be -inf to
        forbid mismatches)
    :param gapopen: a negative penalty for opening a gap
    :param gapext: a negative penalty for a gap extension
    :return: the score of global alignment of the sequences seqA and seqB
    """
    n = len(seqA) + 1
    m = len(seqB) + 1

    # start every state as unreachable, then open the legal boundary states
    M = np.full((n, m), -np.inf)
    Ix = np.full((n, m), -np.inf)
    Iy = np.full((n, m), -np.inf)

    M[0, 0] = 0
    # a leading gap of length k costs gapopen + (k - 1) * gapext
    Iy[1:, 0] = gapopen + gapext * np.arange(n - 1)
    Ix[0, 1:] = gapopen + gapext * np.arange(m - 1)

    for i in range(1, n):
        symbolA = seqA[i - 1]
        for j in range(1, m):
            Ix[i, j] = max(
                M[i, j - 1] + gapopen,
                Ix[i, j - 1] + gapext
            )

            Iy[i, j] = max(
                M[i - 1, j] + gapopen,
                Iy[i - 1, j] + gapext
            )

            # select the score instead of multiplying booleans by the scores:
            # False * inf is NaN and would poison the whole matrix
            substitution = match if symbolA == seqB[j - 1] else mismatch
            M[i, j] = max(
                M[i - 1, j - 1],
                Ix[i - 1, j - 1],
                Iy[i - 1, j - 1]
            ) + substitution

    # the alignment may end in any of the three states
    return max(M[-1, -1], Ix[-1, -1], Iy[-1, -1])

In [7]:
# Affine-gap scoring scheme: opening a gap costs more than extending it.
match, mismatch = 1, -1
gapopen, gapext = -2, -1

# Take the first record of each file, whatever its identifier is.
Homo, Mus, Rattus = (
    list(load_fasta(fasta_path).values())[0]
    for fasta_path in ('Homo.fasta', 'Mus.fasta', 'Rattus.fasta')
)

r7 = NWAffine(Homo, Mus, match, mismatch, gapopen, gapext)
r8 = NWAffine(Homo, Rattus, match, mismatch, gapopen, gapext)
r9 = NWAffine(Mus, Rattus, match, mismatch, gapopen, gapext)
print(f'Homo sapiens insulin-Mus musculus insulin II score: {r7!s}')
print(f'Homo sapiens insulin-Rattus norvegicus insulin 1 score: {r8!s}')
print(f'Mus musculus insulin II-Rattus norvegicus insulin 1 score: {r9!s}')

Homo sapiens insulin-Mus musculus insulin II score: 207.0
Homo sapiens insulin-Rattus norvegicus insulin 1 score: 215.0
Mus musculus insulin II-Rattus norvegicus insulin 1 score: 323.0
