## Implementing Needleman-Wunsch

In [1]:
import numpy as np
from Bio.Align import substitution_matrices

blossum = substitution_matrices.load("BLOSUM62")
print(blossum)


def sim_blossum(a,b, gap=-8):
    if a == '-' or b == '-':
        return gap
    return blossum[a,b]

def sim_basic(a,b, id=1, sub=-1, gap=-2):
    if a == b:
        return id
    elif a == '-' or b == '-':
        return gap
    else:
        return sub

#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
     A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V    B    Z    X    *
A  4.0 -1.0 -2.0 -2.0  0.0 -1.0 -1.0  0.0 -2.0 -1.0 -1.0 -1.0 -1.0 -2.0 -1.0  1.0  0.0 -3.0 -2.0  0.0 -2.0 -1.0  0.0 -4.0
R -1.0  5.0  0.0 -2.0 -3.0  1.0  0.0 -2.0  0.0 -3.0 -2.0  2.0 -1.0 -3.0 -2.0 -1.0 -1.0 -3.0 -2.0 -3.0 -1.0  0.0 -1.0 -4.0
N -2.0  0.0  6.0  1.0 -3.0  0.0  0.0  0.0  1.0 -3.0 -3.0  0.0 -2.0 -3.0 -2.0  1.0  0.0 -4.0 -2.0 -3.0  3.0  0.0 -1.0 -4.0
D -2.0 -2.0  1.0  6.0 -3.0  0.0  2.0 -1.0 -1.0 -3.0 -4.0 -1.0 -3.0 -3.0 -1.0  0.0 -1.0 -4.0 -3.0 -3.0  4.0  1.0 -1.0 -4.0
C  0.0 -3.0 -3.0 -3.0  9.0 -3.0 -4.0 -3.0 -3.0 -1.0 -1.0 -3.0 -1.0 -2.0 -3.0 -1.0 -1.0 -2.0 -2.0 -1.0 -3.0 -3.0 -2.0 -4.0
Q -1.0  1.0  0.0  0.

In [2]:
def compute_matrix_needleman_wunsch(a, b, sim):
    """
    Efficiently compute the Needleman-Wunsch dynamic programming matrix. 
    Maximise similartity.

    Args:
      a: str, first sequence
      b: str, second sequence
      sim:fct, similarity function

    Returns:
      The dynamic programming matrix
    """

    # construct matrices
    D = np.zeros((len(a)+1, len(b)+1), dtype=int)
    A = np.zeros((len(a)+1, len(b)+1), dtype=int)

    # init matrix
    for i in range(1, len(a)+1):
        D[i,0] = D[i-1, 0] + sim(a[i-1], '-')
    for j in range(1, len(b)+1):
        D[0,j] = D[0, j-1] + sim('-', b[j-1])

    # fill matrix
    for i in range(1, len(a)+1):
        for j in range(1, len(b)+1):
            x1 = D[i-1,j-1]+sim(a[i-1], b[j-1])
            x2 = D[i-1,j]+sim(a[i-1], '-')
            x3 = D[i,j-1]+sim('-', b[j-1])
            if x1 >= x2 and x1 >= x3: # diagonal
                D[i,j] = x1
                A[i,j] = 1
            if x2 >= x1 and x2 >= x3: # up
                D[i,j] = x2
                A[i,j] = 2
            if x3 >= x1 and x3 >= x2: # left
                D[i,j] = x3
                A[i,j] = 3
    
    return D, A

def trace_back(a, b, A):
    """
    Determine the alignment given a dynamic programming matrix with directions
    Args:
      a: str, first sequence
      b: str, second sequence
      A: np.array with directions
    Print:
      Two strings with "-" symbols: The alignment between a and b
    Returns:
      The two new aligned strings
    """
    # perform trace back, construct alignment strings aa and ab
    aa = ""
    bb = ""
    
    i = len(a)
    j = len(b)
    while i>0 and j>0 :
        if A[i,j] == 1:
            aa = a[i-1] + aa
            bb = b[j-1] + bb
            i -= 1
            j -= 1
        elif A[i,j] == 2:
            aa = a[i-1] + aa
            bb = '-' + bb
            i -= 1
        else:
            aa = '-' + aa
            bb = b[j-1] + bb
            j -= 1
    if i == 0:
        aa = '-'*j + aa
        bb = b[:j] + bb
    elif j == 0:
        aa = a[:i] + aa
        bb = '-'*i + bb

    return aa, bb

def needleman_wunsch_global_alignment(a, b, cost):
    """
    Global alignment with linear gap cost.

    Args:
      a: str, first sequence
      b: str, second sequence
      cost: a cost function

    Returns:
      The optimal alignment and its score
    """

    D, A = compute_matrix_needleman_wunsch(a, b, cost)
    aa, bb = trace_back(a, b, A)
    print(aa)
    print(bb)
    return D, aa, bb

In [3]:
# test

a = "HER"
b = "CAT"
D, aa, bb = needleman_wunsch_global_alignment(a, b, sim_blossum)
print(D)

HER
CAT
[[  0  -8 -16 -24]
 [ -8  -3 -10 -18]
 [-16 -11  -4 -11]
 [-24 -19 -12  -5]]


In [4]:
def compute_matrix_gotoh(a, b, mismatch_cost, gap_opening, gap_extension ):
    """
    Global alignment with affine gap cost.

    Args:
      a: str, first sequence
      b: str, second sequence
      mismatch_cost: cost of a mismatch
      gap_opening: gap opening cost
      gap_extension: gap extension cost

    Returns:
      The optimal alignment and its score
    """
    D = np.zeros((len(a)+1, len(b)+1), dtype=int)
    A = np.zeros((len(a)+1, len(b)+1))
    B = np.zeros((len(a)+1, len(b)+1))

    # init matrix
    A[0,0] = gap_opening
    B[0,0] = gap_opening
    for i in range(1, len(a)+1):
        A[i,0] = A[i-1, 0] + gap_extension
        B[i,0] = np.inf
        D[i,0] = A[i,0]
    for j in range(1, len(b)+1):
        A[0,j] = np.inf
        B[0,j] = B[0, j-1] + gap_extension
        D[0,j] = B[0,j]      

    for i in range(1, len(a)+1):
        for j in range(1, len(b)+1):
          A[i,j] = min(D[i-1,j]+gap_opening+gap_extension, A[i-1,j]+gap_extension)
          B[i,j] = min(D[i,j-1]+gap_opening+gap_extension, B[i,j-1]+gap_extension)
          D[i,j] = min(D[i-1,j-1]+mismatch_cost(a[i-1], b[j-1]), A[i,j], B[i,j])

          
    # perform trace back, construct alignment strings aa and ab
    aa = ""
    bb = ""
    i = len(a)
    j = len(b)
    while i>0 and j>0 :
        x1 = D[i-1,j-1] + mismatch_cost(a[i-1], b[j-1])
        x2 = A[i,j]
        x3 = B[i,j]
        if x1 <= x2 and x1 <= x3:
            aa = a[i-1] + aa
            bb = b[j-1] + bb
            i -= 1
            j -= 1
        elif x2 <= x1 and x2 <= x3:
            aa = a[i-1] + aa
            bb = '-' + bb
            i -= 1
        else:
            aa = '-' + aa
            bb = b[j-1] + bb
            j -= 1
    if i == 0:
        aa = '-'*j + aa
        bb = b[:j] + bb
    elif j == 0:
        aa = a[:i] + aa
        bb = '-'*i + bb

    print(aa)
    print(bb)        

    return D

In [5]:
def sim_multiple(A, B, i, j, sim):
    """
    Args:
      A, B: list of aligned sequences
      i, j: int, index of the positions to compare
      sim: similarity function comparing two characters

    Returns:
      The average similarity, at the given index, between the two groups of alignements  
    """
    res = 0
    for a in A:
        for b in B:
            res += sim(a[i],b[j])
    return res/(len(A)*len(B))

def compute_matrix_needleman_wunsch_multiple(a, b, sim):
    """
    Compute the Needleman-Wunsch dynamic programming matrix for two sets of aligned sequences

    Args:
      a, b: list of aligned sequences
      sim:f ct, similarity function

    Returns:
      The dynamic programming matrix
    """
    len_a = len(a[0])
    len_b = len(b[0])

    # construct matrices
    D = np.zeros((len_a+1, len_b+1))
    A = np.zeros((len_a+1, len_b+1), dtype=int)

    # init matrix
    for i in range(1, len_a+1):
        D[i,0] = i * sim('a', '-')  # cost of a gap
    for j in range(1, len_b+1):
        D[0,j] = i * sim('a', '-')

    # fill matrix
    for i in range(1, len_a+1):
        for j in range(1, len_b+1):
            x1 = D[i-1,j-1] + sim_multiple(a, b, i-1, j-1, sim)
            x2 = D[i-1,j] + sim('a', '-')
            x3 = D[i,j-1] + sim('a', '-')
            if x1 >= x2 and x1 >= x3: # diagonal
                D[i,j] = x1
                A[i,j] = 1
            elif x2 >= x1 and x2 >= x3: # up
                D[i,j] = x2
                A[i,j] = 2
            else: # x3 >= x1 and x3 >= x2: # left
                D[i,j] = x3
                A[i,j] = 3
    
    return D, A


def trace_back_multiple(A, B, D):
    """
    Determine the alignment given a dynamic programming matrix with directions
    Args:
      A, B: list of aligned sequences
      D: np.array with directions
    Returns:
      A list with all the aligned sequences
    """

    AA = ["" for _ in range(len(A))]
    BB = ["" for _ in range(len(B))]
    
    i = len(A[0])
    j = len(B[0])
    while i>0 and j>0 :
        if D[i,j] == 1:
            for a in range(len(A)):
                AA[a] = A[a][i-1] + AA[a]
            for b in range(len(B)):
                BB[b] = B[b][j-1] + BB[b]
            i -= 1
            j -= 1
        elif D[i,j] == 2:
            for a in range(len(A)):
                AA[a] = A[a][i-1] + AA[a]
            for b in range(len(B)):
                BB[b] = '-' + BB[b]
            i -= 1
        else:
            for a in range(len(A)):
                AA[a] = '-' + AA[a]
            for b in range(len(B)):
                BB[b] = B[b][j-1] + BB[b]
            j -= 1
    if i == 0:
        for a in range(len(A)):
            AA[a] = '-'*j + AA[a]
        for b in range(len(B)):
            BB[b] = B[b][:j] + BB[b]
    elif j == 0:
        for a in range(len(A)):
            AA[a] = A[a][:i] + AA[a]
        for b in range(len(B)):
            BB[b] = '-'*i + BB[b]

    return AA + BB   # list concatenation

In [6]:
a = "CHAT"
b = "CAT"

D, A = compute_matrix_needleman_wunsch_multiple([a], [b], sim_blossum)
print(D)
print(A)

L = trace_back_multiple([a], [b], A)
print(L)

[[  0. -32. -32. -32.]
 [ -8.   9.   1.  -7.]
 [-16.   1.   7.  -1.]
 [-24.  -7.   5.   7.]
 [-32. -15.  -3.  10.]]
[[0 0 0 0]
 [0 1 3 3]
 [0 2 1 1]
 [0 2 1 1]
 [0 2 2 1]]
['CHAT', 'C-AT']


In [7]:
from time import time

def clustalW(seq, sim): # TODO: check if possible to improve efficiency
  """
  Computes multiple sequence alignement

  Args:
    seq: list of sequences to align
    sim: similarity function

  Returns:
    list of aligned sequences, score of alignment
  """
  time_mat = 0
  time_trace = 0

  # transforming each sequence into a cluster of one sequence
  clusters = [[s] for s in seq]  

  # initialising distance matrix
  matrix = np.zeros((len(seq), len(seq)))
  for i in range(len(seq)):
    for j in range(i+1, len(seq)):
      start = time()
      matrix[i,j] = compute_matrix_needleman_wunsch(seq[i], seq[j], sim)[0][-1,-1]
      time_mat += time() - start

  while len(clusters) > 1:
    # finding closest clusters
    max_sim = -np.inf
    for i in range(len(clusters)):
      for j in range(i+1,len(clusters)):
        if matrix[i,j] > max_sim:
          max_sim = matrix[i,j]
          best_i = i
          best_j = j

    # merging two clusters best_i < best_j
    a = clusters[best_i]
    b = clusters[best_j]
    start = time()
    D, A = compute_matrix_needleman_wunsch_multiple(a, b, sim)
    time_mat += time() - start
    start = time()
    new_cluster = trace_back_multiple(a, b, A)
    time_trace += time() - start

    clusters.remove(a)
    clusters.remove(b)
    clusters.append(new_cluster)

    # updating distance matrix
    matrix = np.delete(matrix, [best_i,best_j], axis=0)
    matrix = np.delete(matrix, [best_i,best_j], axis=1)

    new_column = []
    for cluster in clusters[:-1]:
      start = time()
      new_column.append(compute_matrix_needleman_wunsch_multiple(cluster, new_cluster, sim)[0][-1,-1])
      time_mat += time() - start
    matrix = np.concatenate((matrix, np.array([new_column]).T), axis=1)
    matrix = np.concatenate((matrix, np.zeros((1, len(clusters)))), axis=0)

  print("Time for matrix computation: ", time_mat)
  print("Time for trace back: ", time_trace)
  
  return clusters[0], D[-1,-1]

In [8]:
# test
seq = ["CAT", "CHAT", "HER"]

res = clustalW(seq, sim_blossum)

print(res)

Time for matrix computation:  0.0005829334259033203
Time for trace back:  2.9802322387695312e-05
(['H-ER', 'C-AT', 'CHAT'], -13.0)


In [9]:
from Bio import SeqIO

def read_fasta(filename):
    """
    Read a fasta file and return the sequences as a list of strings
    """
    sequences = []
    for record in SeqIO.parse(filename, "fasta"):
        sequences.append(str(record.seq))
    return sequences

In [10]:
seq = read_fasta("data/paralogs.fasta")
test = clustalW(seq, sim_blossum)
for s in test[0]:
  print(s)
print(f"Score of alignment: {test[1]}")

Time for matrix computation:  9.13224720954895
Time for trace back:  0.0013599395751953125
MG-LSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASEDLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKHPGDFGADAQGAMNKALELFRKDMASNYKELGFQG
MVHLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAH--K---YH-
MVHLTPEEKTAVNALWGKV--NVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLARNFGKEFTPQMQAAYQKVVAGVANALAH--K---YH-
MVHFTAEEKAAVTSLWSKM--NVEEAGGEALGRLLVVYPWTQRFFDSFGNLSSPSAILGNPKVKAHGKKVLTSFGDAIKNMDNLKPAFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFGKEFTPEVQAAWQKLVSAVAIALAH--K---YH-
MGHFTEEDKATITSLWGKV--NVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPKVKAHGKKVLTSLGDAIKHLDDLKGTFAQLSELHCDKLHVDPENFKLLGNVLVTVLAIHFGKEFTPEVQASWQKMVTAVASALSS--R---YH-
MS-LTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHF-DLH-P----GSAQLRAHGSKVVAAVGDAVKSIDDIGGALSKLSELHAYILRVDPVNFKLLSHCLLVTLAARFPADFTAEA

## Test protein with PDB

In [1]:
from Bio.PDB import *

parser = PDBParser()
structure = parser.get_structure("1MBD", "data/1mbd.pdb")

for model in structure:
    print(model)
    for chain in model:
        print(chain)
        for residue in chain:
            # print(residue)
            # print(residue.get_resname(), residue.get_id()[1], chain.id)
            pass

dssp = DSSP(model, )

SyntaxError: unexpected EOF while parsing (1304937512.py, line 12)