# Notes

slides (pdf) + code

similar to scientific articles:
- context/introduction (litterature review)
- results (take perspective, present your work as a story: problematic, progression...)
- evaluat algos on provided dataset ( balibase)
- discussion: critizise, propose solution, give perspective (lol), personnal thinking

TODO:
- benchmark: compare own alignments with clustalW (see scores on slides td0, or available on: https://www.genome.jp/tools-bin/clustalw) and balibase (reference); scores given by bali_score
- implement Needlemand-Wunsch using PDB

penalité de gap dans clustalW:
10 open et 0.1 extend

coef de similarité dépendant de l'enfouissement:
- *1.5 pour burial=1
- *0.5 pour burial=0


# Necessary imports and utility functions

In [1]:
import numpy as np
import pandas as pd
import os
import warnings
from Bio import SeqIO
from Bio.Align import substitution_matrices
from Bio.PDB import *
from bali_score import  compare_seq, read_fasta2dict

# Type hinting
from typing import List, Tuple, Set, Union, Callable

blossum = substitution_matrices.load("BLOSUM62")
print(blossum)

def sim_blossum(a:chr, b:chr)->float:
    return blossum[a,b]

def sim_basic(a:chr, b:chr, id=1, sub=-1)->float:
    if a == b:
        return id
    else:
        return sub

#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
     A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V    B    Z    X    *
A  4.0 -1.0 -2.0 -2.0  0.0 -1.0 -1.0  0.0 -2.0 -1.0 -1.0 -1.0 -1.0 -2.0 -1.0  1.0  0.0 -3.0 -2.0  0.0 -2.0 -1.0  0.0 -4.0
R -1.0  5.0  0.0 -2.0 -3.0  1.0  0.0 -2.0  0.0 -3.0 -2.0  2.0 -1.0 -3.0 -2.0 -1.0 -1.0 -3.0 -2.0 -3.0 -1.0  0.0 -1.0 -4.0
N -2.0  0.0  6.0  1.0 -3.0  0.0  0.0  0.0  1.0 -3.0 -3.0  0.0 -2.0 -3.0 -2.0  1.0  0.0 -4.0 -2.0 -3.0  3.0  0.0 -1.0 -4.0
D -2.0 -2.0  1.0  6.0 -3.0  0.0  2.0 -1.0 -1.0 -3.0 -4.0 -1.0 -3.0 -3.0 -1.0  0.0 -1.0 -4.0 -3.0 -3.0  4.0  1.0 -1.0 -4.0
C  0.0 -3.0 -3.0 -3.0  9.0 -3.0 -4.0 -3.0 -3.0 -1.0 -1.0 -3.0 -1.0 -2.0 -3.0 -1.0 -1.0 -2.0 -2.0 -1.0 -3.0 -3.0 -2.0 -4.0
Q -1.0  1.0  0.0  0.

In [2]:
def read_fasta(filename):
    """ Read a fasta file and return the sequences as a list of strings """
    sequences = []
    ids = []
    for record in SeqIO.parse(filename, "fasta"):
        sequences.append(str(record.seq))
        ids.append(record.id)
    return sequences, ids

def write_fasta(filename, sequences, ids=None, len_line=80):
    """ Write a list of sequences to a fasta file """
    if ids is None:
        ids = [f"seq{i}" for i in range(len(sequences))]
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w") as f:
        for i, seq in enumerate(sequences):
            f.write(">{}\n".format(ids[i]))
            while len(seq) > len_line:
                f.write(seq[:len_line] + "\n")
                seq = seq[len_line:]
            f.write(seq + "\n")

# Pairwise sequence alignment

## Needleman-Wunsch: alignment with linear gap cost
This algorithm tries to align to protein sequences $a_1^n$ and $b_1^m$.
By dynamic programming, the algorithm computes the maximum similarity between any pair of prefixes $a_{1...i}$ and $b_{1...j}$ and stores them in a matrix $M_{i,j}$

$M_{i,j}$ is defined by recursion with the following formula:
$$
M_{i,j} = \max \begin{cases}
  M_{i-1,j-1} + s(a_i,b_j) \\
  M_{i-1,j} - \text{gap cost} &\\
  M_{i,j-1} - \text{gap cost}&
\end{cases}
$$

To facilitate the traceback, we also compute a matrix $D$ *(like Directions)* indicating which case achieved the maximum.

In [3]:
def compute_matrix_needleman_wunsch(seq_a:str, seq_b:str, sim_fct=sim_basic, gap_cost=-2):
    """ compute the Needleman-Wunsch matrix """

    n_a = len(seq_a)
    n_b = len(seq_b)

    # construct matrices
    M = np.zeros((n_a+1, n_b+1), dtype=int)
    D = np.zeros((n_a+1, n_b+1), dtype=int)

    # init matrices
    for i in range(1, n_a+1):
        M[i,0] = M[i-1, 0] + gap_cost
    for j in range(1, n_b+1):
        M[0,j] = M[0, j-1] + gap_cost

    # fill matrices
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if seq_a[i-1] == '-' or seq_b[j-1] == '-':
                sim = gap_cost
            else:
                sim = sim_fct(seq_a[i-1], seq_b[j-1])
            x1 = M[i-1,j-1] + sim
            x2 = M[i-1,j] + gap_cost
            x3 = M[i,j-1] + gap_cost
            if x1 >= x2 and x1 >= x3: # diagonal
                M[i,j] = x1
                D[i,j] = 1
            if x2 >= x1 and x2 >= x3: # up
                M[i,j] = x2
                D[i,j] = 2
            if x3 >= x1 and x3 >= x2: # left
                M[i,j] = x3
                D[i,j] = 3
    
    return M, D

def trace_back(a:str, b:str, D:np.array):
    """ traceback matrix D to get the alignment """
    aa = ""
    bb = ""
    
    i = len(a)
    j = len(b)
    while i>0 and j>0 :
        if D[i,j] == 1:
            aa = a[i-1] + aa
            bb = b[j-1] + bb
            i -= 1
            j -= 1
        elif D[i,j] == 2:
            aa = a[i-1] + aa
            bb = '-' + bb
            i -= 1
        else:
            aa = '-' + aa
            bb = b[j-1] + bb
            j -= 1
    if i == 0:
        aa = '-'*j + aa
        bb = b[:j] + bb
    elif j == 0:
        aa = a[:i] + aa
        bb = '-'*i + bb

    return aa, bb

def needleman_wunsch_global_alignment(a:str, b:str, sim_fct=sim_blossum, gap_cost=-8):
    """ Global alignment with linear gap cost """
    M, D = compute_matrix_needleman_wunsch(a, b, sim_fct, gap_cost)
    aa, bb = trace_back(a, b, D)
    print(aa)
    print(bb)
    return M, aa, bb

## Gotoh: alignment with affine gap cost

Instead of one matrix, we need 3:
- $F_{i,j}$ storing the best score for $a_{1...i}$ and $b_{1...j}$ that ends with $a_i$ and $b_j$
- $A_{i,j}$ storing the best score that ends with $a_i$ and a gap
- $B_{i,j}$ storing the best score that ends with a gap and $b_j$

We have:
$$
A_{i,j} = \max \begin{cases}
  A_{i-1,j} + \text{gap extension} \\
  F_{i-1,j} + \text{gap opening} &\\
\end{cases}
$$

$$
B_{i,j} = \max \begin{cases}
  B_{i,j-1} + \text{gap extension} \\
  F_{i,j-1} + \text{gap opening} &\\
\end{cases}
$$
$$
F_{i,j} = \max \begin{cases}
  F_{i-1,j-1} + s(a_i,b_j)  \\
  F_{i-1,j-1} + s(a_i,b_j)  \\
  F_{i-1,j-1} + s(a_i,b_j)  \\
\end{cases}
$$

The matrix $M_{i,j}$ as defined previously is the max (element wise) of this three matrices.

In [4]:
def compute_matrix_gotoh(seq_a:str, seq_b:str, sim_fct, gap_opening:float=-2, gap_extension:float=-2):
    """ compute the Gotoh matrices """

    n_a = len(seq_a)
    n_b = len(seq_b)
    
    F = np.zeros((n_a+1, n_b+1))
    A = np.zeros((n_a+1, n_b+1))
    B = np.zeros((n_a+1, n_b+1))

    # init matrix A
    for j in range(0, n_b+1):
        A[0,j] = -np.inf
        A[1,j] = 2*gap_opening + (j-1)*gap_extension
    for i in range(1, n_a+1):    
        A[i,0] = gap_opening + (i-1)*gap_extension

    # init matrix B
    for i in range(0, n_a+1):
        B[i,0] = -np.inf
        B[i,1] = 2*gap_opening + (i-1)*gap_extension
    for j in range(1, n_b+1):
        B[0,j] = gap_opening + (j-1)*gap_extension

    # init matrix F
    F[0,0] = 0
    for i in range(1, n_a+1):
        F[i,0] = -np.inf
    for j in range(1, n_b+1):
        F[0,j] = -np.inf
        F[0,j] = -np.inf    

    # fill matrices
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if i>1:
                A[i,j] = max(F[i-1,j] + gap_opening, A[i-1,j] + gap_extension)
            if j>1:
                B[i,j] = max(F[i,j-1] + gap_opening, B[i,j-1] + gap_extension)
            sim = sim_fct(seq_a[i-1], seq_b[j-1])
            F[i,j] = max(F[i-1, j-1] + sim, A[i-1,j-1] + sim, B[i-1,j-1] + sim)

    # get score and traceback matrix
    M = np.maximum(F, np.maximum(A,B))
    D = np.zeros((n_a+1, n_b+1), dtype=int)
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if M[i,j] == F[i,j]:
                D[i,j] = 1
            elif M[i,j] == A[i,j]:
                D[i,j] = 2
            else:
                D[i,j] = 3
    
    return M, D

def gotoh_global_alignment(seq_a:str, seq_b:str, sim_fct=sim_basic, gap_opening=-2, gap_extension=-2):
    """ Global alignment with affine gap cost """
    M, D = compute_matrix_gotoh(seq_a, seq_b, sim_fct, gap_opening, gap_extension)
    seq_aa, seq_bb = trace_back(seq_a, seq_b, D)
    # print(seq_aa)
    # print(seq_bb)
    return M, seq_aa, seq_bb

In [5]:
# test
align_a = "CHCHCHAT"
align_b = "CCAT"

M, aa, bb = gotoh_global_alignment(align_a, align_b, sim_basic, -3, -1)
print(M)

M, aa, bb = gotoh_global_alignment(align_a, align_b, sim_basic, -3, -3)
print(M)

[[  0.  -3.  -4.  -5.  -6.]
 [ -3.   1.  -2.  -3.  -4.]
 [ -4.  -2.   0.  -3.  -4.]
 [ -5.  -3.  -1.  -1.  -4.]
 [ -6.  -4.  -4.  -2.  -2.]
 [ -7.  -5.  -3.  -5.  -3.]
 [ -8.  -6.  -6.  -4.  -6.]
 [ -9.  -7.  -7.  -5.  -5.]
 [-10.  -8.  -8.  -8.  -4.]]
[[  0.  -3.  -6.  -9. -12.]
 [ -3.   1.  -2.  -5.  -8.]
 [ -6.  -2.   0.  -3.  -6.]
 [ -9.  -5.  -1.  -1.  -4.]
 [-12.  -8.  -4.  -2.  -2.]
 [-15. -11.  -7.  -5.  -3.]
 [-18. -14. -10.  -8.  -6.]
 [-21. -17. -13.  -9.  -9.]
 [-24. -20. -16. -12.  -8.]]


# Pairwise profile alignment
We want to merge two alignments $A$ and $B$ into one by aligning them. To compute the similarity matrix, we just have to modify the previous algorithms by replacing the similarity cost of two bases by the average similarity over all the bases in the two alignments.

In [6]:
def append(aa, a):
    if type(aa) == str:     # aa, a = str, chr
        aa += a
    elif type(a) == list:   # aa, a = list of tuple, list of tuple
        aa.extend(a)
    else:
        aa.append(a)
    return aa

def compute_matrix_gotoh_multiple(align_a:List[str], align_b:List[str], sim_fct=sim_basic, gap_opening:float=-2, gap_extension:float=-2):
    """ compute the Gotoh matrices """

    n_a = len(align_a[0])
    n_b = len(align_b[0])
    
    F = np.zeros((n_a+1, n_b+1))
    A = np.zeros((n_a+1, n_b+1))
    B = np.zeros((n_a+1, n_b+1))

    # init matrix A
    for j in range(0, n_b+1):
        A[0,j] = -np.inf
        A[1,j] = 2*gap_opening + (j-1)*gap_extension
    for i in range(1, n_a+1):    
        A[i,0] = gap_opening + (i-1)*gap_extension

    # init matrix B
    for i in range(0, n_a+1):
        B[i,0] = -np.inf
        B[i,1] = 2*gap_opening + (i-1)*gap_extension
    for j in range(1, n_b+1):
        B[0,j] = gap_opening + (j-1)*gap_extension

    # init matrix F
    F[0,0] = 0
    for i in range(1, n_a+1):
        F[i,0] = -np.inf
    for j in range(1, n_b+1):
        F[0,j] = -np.inf
        F[0,j] = -np.inf    

    # fill matrices
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if i>1:
                A[i,j] = max(F[i-1,j] + gap_opening, A[i-1,j] + gap_extension)
            if j>1:
                B[i,j] = max(F[i,j-1] + gap_opening, B[i,j-1] + gap_extension)

            # sim = average similarity between the two sets of aligned sequences
            # problem: how do we handle gaps that might already exists in the alignment ?
            sim = 0
            for a in align_a:
                for b in align_b:
                    if a[i-1] == '-' and b[j-1] == '-':
                            sim += 0
                    elif a[i-1] == '-':
                        if i>0 and a[i-1] == '-':
                            sim += gap_extension
                        else:
                            sim += gap_opening            
                    elif b[j-1] == '-':
                        if j>0 and b[j-1] == '-':
                            sim += gap_extension
                        else:
                            sim += gap_opening
                    else:
                        sim += sim_fct(a[i-1], b[j-1])
            sim /= len(align_a) * len(align_b)

            F[i,j] = max(F[i-1, j-1] + sim, A[i-1,j-1] + sim, B[i-1,j-1] + sim)

    # get score and traceback matrix
    M = np.maximum(F, np.maximum(A,B))
    D = np.zeros((n_a+1, n_b+1), dtype=int)
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if M[i,j] == F[i,j]:
                D[i,j] = 1
            elif M[i,j] == A[i,j]:
                D[i,j] = 2
            else:
                D[i,j] = 3
    
    return M, D

def trace_back_multiple(align_a:List[str], align_b:List[str], D:np.array):
    """ Determine the alignment given a matrix with directions """

    if type(align_a[0]) == str:
        align_aa = ["" for _ in range(len(align_a))]
        align_bb = ["" for _ in range(len(align_b))]
    else:
        align_aa = [[] for _ in range(len(align_a))]
        align_bb = [[] for _ in range(len(align_b))]
    
    i = len(align_a[0])
    j = len(align_b[0])
    
    while i>0 and j>0 :
        if D[i,j] == 1:
            for a in range(len(align_a)):
                align_aa[a] = append(align_aa[a], align_a[a][i-1])
            for b in range(len(align_b)):
                align_bb[b] = append(align_bb[b], align_b[b][j-1])
            i -= 1
            j -= 1
        elif D[i,j] == 2:
            for a in range(len(align_a)):
                align_aa[a] = append(align_aa[a], align_a[a][i-1])
            for b in range(len(align_b)):
                align_bb[b] = append(align_bb[b], '-')
            i -= 1
        else:
            for a in range(len(align_a)):
                align_aa[a] = append(align_aa[a], '-')
            for b in range(len(align_b)):
                align_bb[b] = append(align_bb[b], align_b[b][j-1])
            j -= 1
    while j>0:
        for a in range(len(align_a)):
            align_aa[a] = append(align_aa[a],  '-')
        for b in range(len(align_b)):
            align_bb[b] = append(align_bb[b], align_b[b][j-1])
            j -= 1
    while i > 0:
        for a in range(len(align_a)):
            align_aa[a] = append(align_aa[a], align_a[a][i-1])
        for b in range(len(align_b)):
            align_bb[b] = append(align_bb[b], '-')
        i -= 1

    align = align_aa + align_bb    # list concatenation

    return [seq[::-1] for seq in align]
 

def gotoh_global_alignment_multiple(align_a:List[str], align_b:List[str], sim_fct=sim_basic, gap_opening=-2, gap_extension=-2):
    """ Global alignment with affine gap cost """
    M, D = compute_matrix_gotoh_multiple(align_a, align_b, sim_fct, gap_opening, gap_extension)
    align = trace_back_multiple(align_a, align_b, D)
    return M, align

In [7]:
# test
align_a = "CHAT"
align_b = "CAT"
seq_c = "CIT"

M, aa, bb = gotoh_global_alignment(align_a, align_b, sim_basic, -2, -1)
print(M)

M, align = gotoh_global_alignment_multiple([align_a], [align_b], sim_basic, -2, -1)
print(M)
print(align)

M, align = gotoh_global_alignment_multiple(align, [seq_c], sim_basic, -2, -1)
print(align)

[[ 0. -2. -3. -4.]
 [-2.  1. -1. -2.]
 [-3. -1.  0. -2.]
 [-4. -2.  0. -1.]
 [-5. -3. -2.  1.]]
[[ 0. -2. -3. -4.]
 [-2.  1. -1. -2.]
 [-3. -1.  0. -2.]
 [-4. -2.  0. -1.]
 [-5. -3. -2.  1.]]
['CHAT', 'C-AT']
['CHAT', 'C-AT', 'C-IT']


In [8]:
test = [('A',1), ('B',2)]
test2 = append(test, ('C',3))
print(test2)

[('A', 1), ('B', 2), ('C', 3)]


#  Progressive multiple alignment
To align N sequences, we do clustering. First, we create one cluster per sequence. Then, we align and merge the two closest clusters until we only have one big cluster, containing all the alignment sequences.

In [9]:
def progressive_alignment(seq:list, sim_fct=sim_blossum, gap_opening=-10, gap_extension=-0.5):
  """ Computes multiple sequence alignement """

  # transforming each sequence into a cluster of one sequence
  clusters = [[s] for s in seq]
  # order of the clusters, to keep track of the original order
  order = list(range(len(seq)))

  # initialising distance matrix
  matrix = np.zeros((len(seq), len(seq)))
  for i in range(len(seq)):
    for j in range(i+1, len(seq)):
      matrix[i,j] = compute_matrix_gotoh(seq[i], seq[j], sim_fct, gap_opening, gap_extension)[0][-1,-1]
      matrix[j,i] = matrix[i,j]

  while len(clusters) > 1:
    # finding closest clusters (cad with highest similarity)
    max_sim = -np.inf
    for i in range(len(clusters)):
      for j in range(i+1,len(clusters)):
        if matrix[i,j] > max_sim:
          max_sim = matrix[i,j]
          best_i = i
          best_j = j

    # merging two clusters best_i < best_j
    cluster_j = clusters.pop(best_j)
    cluster_i = clusters.pop(best_i)
    
    order_j = order.pop(best_j)
    order_i = order.pop(best_i)
    order.append([order_i, order_j])

    M, D = compute_matrix_gotoh_multiple(cluster_i, cluster_j, sim_fct, gap_opening, gap_extension)
    new_cluster = trace_back_multiple(cluster_i, cluster_j, D)

    clusters.append(new_cluster)

    # updating distance matrix
    new_line = (len(cluster_i)*matrix[best_i,:] + len(cluster_j)*matrix[best_j,:]) / (len(cluster_i)+len(cluster_j))  # weighted average
    new_column = np.append(new_line, 0)  # adding a column of 0
    new_line = new_line.reshape(1, -1)
    new_column = new_column.reshape(-1, 1)

    matrix = np.append(matrix, new_line, axis=0)
    matrix = np.append(matrix, new_column, axis=1)

    matrix = np.delete(matrix, [best_i, best_j], axis=0)
    matrix = np.delete(matrix, [best_i, best_j], axis=1)

  def get_order(gt):
    """((4, (0, 1)), (2, 3)) -> [4, 0, 1, 2, 3]"""
    if type(gt) == int:
        return [gt]
    else:
        return get_order(gt[0]) + get_order(gt[1])
    
  order = get_order(order[0])
  reverse_order = [order.index(i) for i in range(len(order))]
  
  alignment = clusters[0]
  alignment = [alignment[i] for i in reverse_order]
  
  return alignment, M[-1,-1]

In [10]:
# test: easy example
seq = ["CCCAT", "CHCHCHAT", "HER"]

align, score = progressive_alignment(seq, sim_blossum)
print(score)
print(align)

-11.5
['-CCCAT', 'THHHAT', '---HER']


In [11]:
# test: real example
seq, ids = read_fasta("data/paralogs.fasta")
# seq = read_fasta("data/paralogs.fasta")

for s in seq:
  print(s)
print()

alignment, score = progressive_alignment(seq, sim_blossum, gap_opening=-10, gap_extension=-0.5)
for s in alignment:
  print(s)
print(f"Score of alignment: {score}")
print(f"Size of alignment: {len(alignment[0])}")
print()

MALSAEDRALVRALWKKLGSNVGVYTTEALERTFLAFPATKTYFSHLDLSPGSSQVRAHGQKVADALSLAVERLDDLPHALSALSHLHACQLRVDPASFQLLGHCLLVTLARHYPGDFSPALQASLDKFLSHVISALVSEYR
MSLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHFDLHPGSAQLRAHGSKVVAAVGDAVKSIDDIGGALSKLSELHAYILRVDPVNFKLLSHCLLVTLAARFPADFTAEAHAAWDKFLSVVSSVLTEKYR
MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR
MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH
MVHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLARNFGKEFTPQMQAAYQKVVAGVANALAHKYH
MVHFTAEEKAAVTSLWSKMNVEEAGGEALGRLLVVYPWTQRFFDSFGNLSSPSAILGNPKVKAHGKKVLTSFGDAIKNMDNLKPAFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFGKEFTPEVQAAWQKLVSAVAIALAHKYH
MGHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPKVKAHGKKVLTSLGDAIKHLDDLKGTFAQLSELHCDKLHVDPENFKLLGNVLVTVLAIHFGKEFTPEV

# Include structural information

### Download and read PDB files
`DSSP(model, PDBfile)` is an algorithm to *Define Secondary Structure of Proteins*. It returns a dictionary describing the structure of the protein. In this dictionary:
- the keys are tupple (chain_id, residue_id)
- the values are tupple (DSSP index, Amino acid, Secondary structure, solvent accessibility, Phi, Psi, NH–>O_1_relidx,...) (see https://biopython.org/docs/1.75/api/Bio.PDB.DSSP.html)
    - for example: (3, 'V', '-', 0.7394366197183099, -62.9, 147.7, 1, -0.1, 2, -1.6, 0, 0.0, -1, -0.0)

In [12]:
def get_ssp(mol:str):
    # download PDB file
    PDBfile = f"data/pdb/{mol}.pdb"
    if not os.path.exists(PDBfile):
        pdbl = PDBList()
        oldPDBfile = pdbl.retrieve_pdb_file(mol, pdir='data/pdb/', file_format="pdb")
        os.rename(oldPDBfile, PDBfile)

    # parse PDB file
    parser = PDBParser()
    with warnings.catch_warnings():
        structure = parser.get_structure(mol, PDBfile)

    # get DSSP
    model = structure[0]
    ssp = DSSP(model, PDBfile)
    return model, ssp

In [13]:
# test        
mol = "1idy"
model, ssp = get_ssp(mol)

for chain in model:
    print(f"chain {chain.id}")
    i = 0
    for residue in chain:
        print(ssp[(chain.id, residue.id)])
        i += 1
        if i > 5:
            break

chain A
(1, 'M', '-', 1.0, 360.0, -66.0, 0, 0.0, 0, 0.0, 0, 0.0, 0, 0.0)
(2, 'E', '-', 0.7628865979381443, 64.4, 100.8, 2, -0.1, 3, -0.0, 1, -0.0, 0, 0.0)
(3, 'V', '-', 0.7394366197183099, -62.9, 147.7, 1, -0.1, 2, -1.6, 0, 0.0, -1, -0.0)
(4, 'K', '-', 0.9414634146341463, -65.1, 89.4, 2, -0.0, 2, -0.2, 3, -0.0, -1, -0.1)
(5, 'K', '-', 0.4926829268292683, -89.1, 156.7, -2, -1.6, 2, -1.9, 1, -0.1, -1, -0.0)
(6, 'T', '-', 0.6197183098591549, -70.5, 84.3, -2, -0.2, 2, -1.3, 1, -0.2, -1, -0.1)


In [14]:
def read_fasta_and_PDB(filename) -> Tuple[List[str], List[List[Tuple]]]:
    """
    Read a fasta file and the corresponding PDB files
    Returns:
    - seqFASTA, list of str: the sequences in the fasta file
    - SSPs, list of list of tuple: list of the secondary structure infos for the corresponding molecules
    """
    seqFASTA = []
    ids = []
    SSPs = []
    for record in SeqIO.parse(filename, "fasta"):

        seqFASTA.append(str(record.seq))
        ids.append(record.id)

        name = record.name
        mol, chain = name.split("_")
        if chain == "":
            chain = "A"
        model, ssp = get_ssp(mol)

        # transforms ssp dict into a list containing only the amino acids
        new_ssp = []
        for residue in model[chain]:
            if is_aa(residue):
                try:
                    new_ssp.append(ssp[(chain, residue.id)])                    
                except KeyError:
                    print(f"residue {residue} not found in DSSP for {mol}_{chain}")
        SSPs.append(new_ssp)
    return seqFASTA, SSPs, ids

### Align fasta and PDB
Some amino-acids might be missing in the PDB file. Therefore we have to use Needleman-Wunsch to align properly the sequence obtained from the fasta and from the PDB file.

In [15]:
def trace_back_SSP(seq:str, ssp:List[tuple], D:np.array) -> Tuple[str, List[tuple]]:
    """ traceback matrix D to align a sequence and a list of ssp lines """
    new_seq = ""
    new_ssp = []
    
    i = len(seq)
    j = len(ssp)
    while i>0 and j>0 :
        if D[i,j] == 1:
            new_seq += seq[i-1]
            new_ssp.append(ssp[j-1])
            i -= 1
            j -= 1
        elif D[i,j] == 2:   # a base is missing in the PDB
            new_seq += seq[i-1]
            missing_res = ['-' for _ in range(14)] # we don't know the secondary structure
            missing_res[1] = seq[i-1]               # but we know the amino acid
            new_ssp.append(tuple(missing_res)) 
            i -= 1
        else:               # a base is missing in the sequence but present in the PDB (not expected)
            new_seq += ssp[j-1][1]
            new_ssp.append(ssp[j-1])
            j -= 1
    if i == 0:
        new_seq += '-'*j
        new_ssp += ssp[:j]
    elif j == 0:
        new_seq += seq[:i]
        new_ssp += [tuple() for _ in range(i)]     
    return new_seq[::-1], new_ssp[::-1]

def align_SSP(seq:str, dssp:List[tuple]):
    """ align sequence and list of PDB lines"""
    sim_bool = lambda a, b: 1 if a == b else 0
    M, D = compute_matrix_needleman_wunsch(seq, "".join([d[1] for d in dssp]), sim_bool, gap_cost=-1)
    new_seq, new_dssp = trace_back_SSP(seq, dssp, D)
    return new_seq, new_dssp

def read_and_align_fasta_and_SSP(filename):
    seq_list, ssp_list, ids = read_fasta_and_PDB(filename)
    new_seq_list, new_ssp_list = [], []
    for i in range(len(seq_list)):
        seq, ssp = align_SSP(seq_list[i], ssp_list[i])
        new_seq_list.append(seq)
        new_ssp_list.append(ssp)
    return new_seq_list, new_ssp_list, ids

def ssp2tupple(ssp:List[tuple]) -> List[Tuple[str, float]]:
    """ extracts a list of tuples (amino-acid, burial) from a list of ssp lines """
    return [(res[1], res[3]) for res in ssp]

In [16]:
# test
seq_list, ssp_list, ids = read_and_align_fasta_and_SSP("data/balibase/RV11.unaligned/BB11003.fasta")
burial_list = [ssp2tupple(ssp) for ssp in ssp_list]



residue <Residue MSE het=H_MSE resseq=78 icode= > not found in DSSP for 1o20_A
residue <Residue MSE het=H_MSE resseq=189 icode= > not found in DSSP for 1o20_A


In [17]:
a = burial_list[0][0]
b = burial_list[1][0]

for i in range(len(seq_list)):
    print(seq_list[i])
    print("".join([ssp_list[i][aa][1] for aa in range(len(ssp_list[i]))]))
print()

print(burial_list[0])

SISDTVKRAREAFNSGKTRSLQFRIQQLEALQRMINENLKSISGALASDLGKNEWTSYYEEVAHVLEELDTTIKELPDWAEDEPVAKTRQTQQDDLYIHSEPLGVVLVIGAWNYPFNLTIQPMVGAVAAGNAVILKPSEVSGHMADLLATLIPQYMDQNLYLVVKGGVPETTELLKERFDHIMYTGSTAVGKIVMAAAAKHLTPVTLELGGKSPCYVDKDCDLDVACRRIAWGKFMNSGQTCVAPDYILCDPSIQNQIVEKLKKSLKDFYGEDAKQSRDYGRIINDRHFQRVKGLIDNQKVAHGGTWDQSSRYIAPTILVDVDPQSPVMQEEIFGPVMPIVCVRSLEEAIQFINQREKPLALYVFSNNEKVIKKMIAETSSGGVTANDVIVHITVPTLPFGGVGNSGMGAYHGKKSFETFSHRRSCLVKSLLNEEAHKARYPPSPA
SISDTVKRAREAFNSGKTRSLQFRIQQLEALQRMINENLKSISGALASDLGKNEWTSYYEEVAHVLEELDTTIKELPDWAEDEPVAKTRQTQQDDLYIHSEPLGVVLVIGAWNYPFNLTIQPMVGAVAAGNAVILKPSEVSGHMADLLATLIPQYMDQNLYLVVKGGVPETTELLKERFDHIMYTGSTAVGKIVMAAAAKHLTPVTLELGGKSPCYVDKDCDLDVACRRIAWGKFMNSGQTCVAPDYILCDPSIQNQIVEKLKKSLKDFYGEDAKQSRDYGRIINDRHFQRVKGLIDNQKVAHGGTWDQSSRYIAPTILVDVDPQSPVMQEEIFGPVMPIVCVRSLEEAIQFINQREKPLALYVFSNNEKVIKKMIAETSSGGVTANDVIVHITVPTLPFGGVGNSGMGAYHGKKSFETFSHRRSCLVKSLLNEEAHKARYPPSPA
MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEWVDTKERMVSLNPSAPSEVVGTTAKAGKAEAEAALEAAWKAFKTWKDWPQEDRSRLLLKAA

### Use burial to modulate similarity function in Needleman-Wunsh
Idea: we keep the same algorithm but modify the gap or mutation cost depending of the structural information.

The most easy data to use is the relative solvent accessibility (RSA) : it indicates if a residue in on the surface of the molecule or in the inside. A mutation inside of the protein is more costly than on the outside. Therefore we use the same algorythm as previopusly but multiply the similarity by a coefficient depending on the burial.


$M_{i,j}$ is defined by recursion with the following formula:
$$
M_{i,j} = \max \begin{cases}
  M_{i-1,j-1} + f(x) \cdot s(a_i,b_j) \\
  M_{i-1,j} - f(x) \cdot \text{gap cost} &\\
  M_{i,j-1} - f(x) \cdot \text{gap cost}&
\end{cases}
$$

In [18]:
def sim_burial(a, b, sim_fct=sim_blossum)->float:
    ''' computes a similarity score ponderated by the burial score
    a, b are tuples (str: amino-acide, float: burial score)'''
    if a[1] == '-' or b[1] == '-':    # no burial score provided for this residue
        coeff = 1
    else:
        coeff = 1.5 - (a[1] + b[1])/2 # score between 0.5 and 1.5; 0.5 means that the two residues are very exposed
    return coeff * sim_fct(a[0], b[0])

In [19]:
alignment, score = progressive_alignment(burial_list, sim_burial, -10, -0.5)

In [20]:
seq_list = []
for seq in alignment:
    for i in range(len(seq)):
        if seq[i] == '-':
            seq[i] = ('-', '-')
    seq_list.append("".join([aa[0] for aa in seq]))

for seq in seq_list:
    print(seq)
    

A-------S-KRAREAF--------------------------------NSGKTRSLQFRIQQLEALQRMIN-------EN--------------LKS-ISGA-LASDLGKNEWTSYYEEV-AHVLEE-LDTTIKE---LPDWAED--EPVAKTRQTQQDDLYIHSEPLGVVLVIGAWNYPFNLT-I-QPMVGAVAAGNAVILK-----PSE----VSG------HMADL---LATLIPQYMDQ-NLYLVVKGGVPETTELLKERFDHIMYTGSTAVGKIVMAAAAKHLTP------VTLELG-GKSPCYVDKDC---DLDVACRRI---AWGKFMNSGQTCVAPDYILCDPSIQNQIVEK-LKKSLKDFYGEDAKQSRD--YGRIINDRHFQRVKGLI-----DNQKVAHGGTWDQSSRYIAPTIL-VDVDP--QSPVMQEEIFGPVMPIVCVRSLEEAIQFINQREKPLALY--VFSNNE---KVIKKM-IAETSSGGVTANDVIVHITVP---TL--PFGG----VGNS-GMGAYHGKKSFETFSHRRSCLVKSLLNEE---AHKARYPPSPA
-TPNIFERMARRVREEFGRHYPLYIGGEWVDTKERMVSLNPSAPSEVVGTTAKAGKAEAEA-ALEAAWKAFKTWKDWPQEDRSRLLLKAAALMRRRKRELEAT-LVYEVGKN-WVEASADV-AEAIDF-IEYYARA---ALRYRYPAVEVVPYPGED-NESFYV---PLGAGVVIAPWNFPVAIF-T-GMIVGPVAVGNTVIAK-----PAEDAVVVGAKVFEIFHEAGFPPGVVNFLPGVGEEVGAYL-V-----EHPR-I--RF--INFTGSLEVGLKIYEAAGRLAPGQTWFKRAYVETG-GKNAIIVDETA---DFDLAAEGVVVSAYG--F-QGQKCSAASRLILTQGAYEPVLERVLKRAERLSVGP-AEENPD--LGPVVSAEQERKVLSYIEIGKNEGQLVLGGKRLEGEGYFIA

# Run simulations

### With affine gap but without PDB
(takes about 3min30 for 5 alignments)

In [21]:
# train_data = 5

# for i in range(1, train_data+1):
#     data_file = f"data/balibase/RV11.unaligned/BB11{str(i).zfill(3)}.fasta"
#     seqs, ids = read_fasta(data_file)
#     for gap_opening in [-5, -8,-10,-12,-15, -20]:
#         for gap_extension in [-0.1,-0.5,-1,-2]:
#             result_file = f"data/results/affineWithoutPDB/results{-gap_opening}_{-gap_extension}/BB11{str(i).zfill(3)}.fasta"
#             alignment, score = progressive_alignment(seqs, sim_blossum, gap_opening, gap_extension)
#             write_fasta(result_file, alignment, ids)

### With affine gap and PDB
(takes about 5min for 5 alignments)

In [29]:
train_data = 5

for i in range(1, train_data+1):
    data_file = f"data/balibase/RV11.unaligned/BB11{str(i).zfill(3)}.fasta"
    seqs, ssps, ids = read_and_align_fasta_and_SSP(data_file)
    burial_list = [ssp2tupple(ssp) for ssp in ssps]
    for gap_opening in [-18, -20]:
        for gap_extension in [-1,-2,-4, -5]:
            result_file = f"data/results/affineWithPDB2/results{-gap_opening}_{-gap_extension}/BB11{str(i).zfill(3)}.fasta"
            alignment, score = progressive_alignment(burial_list, sim_burial, gap_opening, gap_extension)
            alignment = ["".join([aa[0] for aa in seq]) for seq in alignment]
            write_fasta(result_file, alignment, ids)



residue <Residue SER het=  resseq=283 icode= > not found in DSSP for 1ov3_A




residue <Residue MSE het=H_MSE resseq=78 icode= > not found in DSSP for 1o20_A
residue <Residue MSE het=H_MSE resseq=189 icode= > not found in DSSP for 1o20_A




residue <Residue LLP het=H_LLP resseq=253 icode= > not found in DSSP for 1bw0_A




residue <Residue MSE het=H_MSE resseq=112 icode= > not found in DSSP for 1fg3_A
residue <Residue MSE het=H_MSE resseq=237 icode= > not found in DSSP for 1fg3_A
residue <Residue MSE het=H_MSE resseq=265 icode= > not found in DSSP for 1fg3_A




residue <Residue MSE het=H_MSE resseq=92 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=93 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=95 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=239 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=265 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=272 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=288 icode= > not found in DSSP for 1h1c_A
residue <Residue MSE het=H_MSE resseq=326 icode= > not found in DSSP for 1h1c_A
residue <Residue LYS het=  resseq=335 icode= > not found in DSSP for 1h1c_A




residue <Residue LLP het=H_LLP resseq=203 icode= > not found in DSSP for 1jg8_A




residue <Residue LLP het=H_LLP resseq=266 icode= > not found in DSSP for 1ax4_A




### With ClustalW

In [23]:
# from Bio.Align.Applications import ClustalOmegaCommandline

# train_data = 38

# for i in range(1, train_data+1):
#     infile = f"data/balibase/RV11.unaligned/BB11{str(i).zfill(3)}.fasta"
#     outfile = f"data/balibase/results/clustalo/BB11{str(i).zfill(3)}.fasta"
#     cli = ClustalOmegaCommandline( "clustalo", infile=infile, outfile=outfile )
#     try:
#         stdout , stderr = cli()
#     except Exception as e:
#         print(e)

# Benchmark
For the sequences from balivase, we want to compare:
- our computed alignments (progressive alignment using affine/linear gap with/without structural information)
- the prediction from clustalW (best alignment without structural info)
- the consensus alignment given in the fasta file

`balibase2.py` contains a function `compare_seq` which takes in input a reference and a test alignment (in form of a dictionnary of sequences and its list of keys), and return two scores:
- the sum-of-pairs score (SPS): measures if *some* sequences are correctly aligned
- the column score: measures if *every* sequences are correctly aligned

In [25]:
# comparing clustal omega to balibase reference
train_data = 38

Res = {'id':[], 'SP':[], 'TC':[]}

for i in range(1, train_data+1):
    file_bali = f"data/balibase/RV11.aligned/BB11{str(i).zfill(3)}.fasta"
    file_clustal = f"data/results/clustalo/BB11{str(i).zfill(3)}.fasta"

    refkeylist, refseq = read_fasta2dict(file_bali)
    testkeylist, testseq = read_fasta2dict(file_clustal)

    sp, tc = compare_seq(refseq, refkeylist, testseq, testkeylist)
    Res['id'].append(i)
    Res['SP'].append(sp)
    Res['TC'].append(tc)

df_clustalo = pd.DataFrame(Res)
print("Results of clustal omega compared to balibase reference:")
print('____________________________________________________')
print(df_clustalo)

Results of clustal omega compared to balibase reference:
____________________________________________________
    id        SP    TC
0    1  0.901316  0.80
1    2  0.483283  0.00
2    3  0.604775  0.42
3    4  0.522447  0.38
4    5  0.400970  0.10
5    6  0.382857  0.20
6    7  0.607697  0.33
7    8  0.379630  0.18
8    9  0.324786  0.00
9   10  0.320652  0.17
10  11  0.217188  0.09
11  12  0.862975  0.76
12  13  0.053061  0.00
13  14  0.766741  0.62
14  15  0.719833  0.56
15  16  0.574000  0.32
16  17  0.768152  0.67
17  18  0.501323  0.25
18  19  0.574087  0.16
19  20  0.593892  0.19
20  21  0.493750  0.35
21  22  0.152047  0.00
22  23  0.530314  0.23
23  24  0.236842  0.00
24  25  0.104575  0.00
25  26  0.125908  0.00
26  27  0.355337  0.14
27  28  0.583053  0.00
28  29  0.479798  0.39
29  30  0.579601  0.21
30  31  0.474688  0.13
31  32  0.613805  0.36
32  33  0.226607  0.00
33  34  0.557555  0.22
34  35  0.415152  0.12
35  36  0.558796  0.23
36  37  0.480212  0.27
37  38  0.747050

In [27]:
# comparing our results to culstalo and balibase reference
train_data = 5

Res = {'id':[], 'ref':[], "pdb":[], 'gap_opening':[], 'gap_extension':[], 'SP':[], 'TC':[]}


for i in range(1, train_data+1):
    ref_file1 = f"data/balibase/RV11.aligned/BB11{str(i).zfill(3)}.fasta"
    ref_file2 = f"data/results/clustalo/BB11{str(i).zfill(3)}.fasta"

    refkeylist1, refseq1 = read_fasta2dict(ref_file1)
    refkeylist2, refseq2 = read_fasta2dict(ref_file2)

    for gap_opening in [-8, -10, -12, -15]:
        for gap_extension in [-2, -1, -0.5]:
            test_file = f"data/results/affineWithoutPDB/results{-gap_opening}_{-gap_extension}/BB11{str(i).zfill(3)}.fasta"
            testkeylist, testseq = read_fasta2dict(test_file)

            sp1, tc1 = compare_seq(refseq1, refkeylist1, testseq, testkeylist)
            sp2, tc2 = compare_seq(refseq2, refkeylist2, testseq, testkeylist)
            Res['id'].append(i)
            Res['ref'].append('balibase')
            Res['gap_opening'].append(gap_opening)
            Res['gap_extension'].append(gap_extension)
            Res['SP'].append(sp1)
            Res['TC'].append(tc1)
            Res['pdb'].append(False)

            Res['id'].append(i)
            Res['ref'].append('clustalo')
            Res['gap_opening'].append(gap_opening)
            Res['gap_extension'].append(gap_extension)
            Res['SP'].append(sp2)
            Res['TC'].append(tc2)
            Res['pdb'].append(False)

            test_file = f"data/results/affineWithPDB2/results{-gap_opening}_{-gap_extension}/BB11{str(i).zfill(3)}.fasta"
            testkeylist, testseq = read_fasta2dict(test_file)

            sp1, tc1 = compare_seq(refseq1, refkeylist1, testseq, testkeylist)
            sp2, tc2 = compare_seq(refseq2, refkeylist2, testseq, testkeylist)
            Res['id'].append(i)
            Res['ref'].append('balibase')
            Res['gap_opening'].append(gap_opening)
            Res['gap_extension'].append(gap_extension)
            Res['SP'].append(sp1)
            Res['TC'].append(tc1)
            Res['pdb'].append(True)

            Res['id'].append(i)
            Res['ref'].append('clustalo')
            Res['gap_opening'].append(gap_opening)
            Res['gap_extension'].append(gap_extension)
            Res['SP'].append(sp2)
            Res['TC'].append(tc2)
            Res['pdb'].append(True)

df = pd.DataFrame(Res)

In [42]:
# comparing our results to culstalo and balibase reference
train_data = 5

# Res = {'id':[], 'ref':[], "pdb":[], 'gap_opening':[], 'gap_extension':[], 'SP':[], 'TC':[], 'pdb':[]}
Res = []

for root, dirs, files in os.walk("data/results/affineWithPDB2"):
    for file in files:
        folder = root.split('/')[-1].lstrip('results')
        gap_opening, gap_extension = folder.split('_')
        ref_file_bali = f"data/balibase/RV11.aligned/{file}"
        ref_file_clustalo = f"data/results/clustalo/{file}"
        refkeylist_bali, refseq_bali = read_fasta2dict(ref_file_bali)
        refkeylist_clustalo, refseq_clustalo = read_fasta2dict(ref_file_clustalo)
        test_file = f"{root}/{file}"
        testkeylist, testseq = read_fasta2dict(test_file)

        sp_bali, tc_bali = compare_seq(refseq_bali, refkeylist_clustalo, testseq, testkeylist)
        sp_clustalo, tc_clustalo = compare_seq(refseq_clustalo, refkeylist_clustalo, testseq, testkeylist)

        Res.append([int(file.split('.')[0][-3:]),"clustalo",gap_opening,gap_extension,sp_clustalo,tc_clustalo,True])
        Res.append([int(file.split('.')[0][-3:]),"balibase",gap_opening,gap_extension,sp_bali,tc_bali,True])

for root, dirs, files in os.walk("data/results/affineWithoutPDB"):
    for file in files:
        folder = root.split('/')[-1].lstrip('results')
        gap_opening, gap_extension = folder.split('_')
        ref_file_bali = f"data/balibase/RV11.aligned/{file}"
        ref_file_clustalo = f"data/results/clustalo/{file}"
        refkeylist_bali, refseq_bali = read_fasta2dict(ref_file_bali)
        refkeylist_clustalo, refseq_clustalo = read_fasta2dict(ref_file_clustalo)
        test_file = f"{root}/{file}"
        testkeylist, testseq = read_fasta2dict(test_file)

        sp_bali, tc_bali = compare_seq(refseq_bali, refkeylist_clustalo, testseq, testkeylist)
        sp_clustalo, tc_clustalo = compare_seq(refseq_clustalo, refkeylist_clustalo, testseq, testkeylist)

        Res.append([int(file.split('.')[0][-3:]),"clustalo",gap_opening,gap_extension,sp_clustalo,tc_clustalo,False])
        Res.append([int(file.split('.')[0][-3:]),"balibase",gap_opening,gap_extension,sp_bali,tc_bali,False])

df = pd.DataFrame(Res, columns =['mol', 'ref', 'gap_opening', 'gap_extension', 'SP', 'TC', 'PDB'])
print(df)

     mol       ref gap_opening gap_extension        SP    TC    PDB
0      4  clustalo          15             1  0.015670  0.00   True
1      4  balibase          15             1  0.006734  0.00   True
2      5  clustalo          15             1  0.187913  0.00   True
3      5  balibase          15             1  0.159937  0.00   True
4      2  clustalo          15             1  0.000000  0.00   True
..   ...       ...         ...           ...       ...   ...    ...
435    2  balibase          12           0.1  0.117021  0.00  False
436    1  clustalo          12           0.1  0.655702  0.48  False
437    1  balibase          12           0.1  0.719298  0.60  False
438    3  clustalo          12           0.1  0.323699  0.08  False
439    3  balibase          12           0.1  0.263926  0.07  False

[440 rows x 7 columns]


In [45]:
df.groupby(['ref', 'gap_opening', 'gap_extension', 'PDB']).mean().sort_values(by='SP', ascending=False)
# df[df.ref=='clustalo'].groupby(['gap_opening', 'gap_extension']).mean().sort_values(ascending=False)
# df[df.ref=='balibase'].groupby(['gap_opening', 'gap_extension'])['SP'].mean().sort_values(ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mol,SP,TC
ref,gap_opening,gap_extension,PDB,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
clustalo,10,1,False,3.0,0.487251,0.252
clustalo,15,1,False,3.0,0.482489,0.246
balibase,10,1,False,3.0,0.480398,0.276
clustalo,12,0.5,False,3.0,0.477708,0.228
clustalo,8,1,False,3.0,0.460479,0.210
...,...,...,...,...,...,...
balibase,15,0.5,True,3.0,0.097811,0.000
clustalo,12,0.5,True,3.0,0.097159,0.000
balibase,12,0.5,True,3.0,0.092671,0.000
balibase,8,0.5,True,3.0,0.087103,0.000


When sorting the results by sum of pairs score, we find that the best parameters seem to be `gap_opening` = -12 and `gap_extension` = -1.

With PDB performs very badly. For better performance, we have to improve `sim_burial`.