# Content

slides (pdf) + code

similar to scientific articles:
- context/introduction (litterature review)
- results (take perspective, present your work as a story: problematic, progression...)
- evaluat algos on provided dataset ( balibase)
- discussion: critizise, propose solution, give perspective (lol), personnal thinking

TODO:
- benchmark: compare own alignments with clustalW (see scores on slides td0, or available on: https://www.genome.jp/tools-bin/clustalw) and balibase (reference); scores given by bali_score
- implement Needlemand-Wunsch using PDB

penalité de gap dans clustalW:
10 open et 0.1 extend

coef de similarité dépendant de l'enfouissement:
- *1.5 pour burial=1
- *0.5 pour burial=0


# Necessary imports and utility functions

In [1]:
import numpy as np
from Bio import SeqIO
from Bio.Align import substitution_matrices
from Bio.PDB import *
import os

# Type hinting
from typing import List, Tuple, Set, Union, Callable
# type sim_fct = Callable[[chr, chr], float]

blossum = substitution_matrices.load("BLOSUM62")
print(blossum)

def sim_blossum(seq_a:str, seq_b:str, i, j)->float:
    a = seq_a[i]
    b = seq_b[j]
    return blossum[a,b]

def sim_basic(seq_a:str, seq_b:str, i, j, id=1, sub=-1)->float:
    a = seq_a[i]
    b = seq_b[j]
    if a == b:
        return id
    else:
        return sub

# used for profile alignment: is the average similarity between two sets of aligned sequences
def sim_multiple(align_A:List[str], align_B:List[str], i:int, j:int, sim_fct=sim_basic, gap_opening=-2, gap_extension=-2):
    """ Compute the average similarity between two sets of aligned sequences, at a given position """
    res = 0
    n = 0
    for seq_a in align_A:
        for seq_b in align_B:
            if seq_a[i] == '-':
                if i>0 and seq_a[i-1] == '-':
                    res += gap_extension
                else:
                    res += gap_opening            
            elif seq_b[j] == '-':
                if j>0 and seq_b[j-1] == '-':
                    res += gap_extension
                else:
                    res += gap_opening
            else:
                res += sim_fct(seq_a, seq_b, i, j)
            n += 1
    return res/n

#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
     A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V    B    Z    X    *
A  4.0 -1.0 -2.0 -2.0  0.0 -1.0 -1.0  0.0 -2.0 -1.0 -1.0 -1.0 -1.0 -2.0 -1.0  1.0  0.0 -3.0 -2.0  0.0 -2.0 -1.0  0.0 -4.0
R -1.0  5.0  0.0 -2.0 -3.0  1.0  0.0 -2.0  0.0 -3.0 -2.0  2.0 -1.0 -3.0 -2.0 -1.0 -1.0 -3.0 -2.0 -3.0 -1.0  0.0 -1.0 -4.0
N -2.0  0.0  6.0  1.0 -3.0  0.0  0.0  0.0  1.0 -3.0 -3.0  0.0 -2.0 -3.0 -2.0  1.0  0.0 -4.0 -2.0 -3.0  3.0  0.0 -1.0 -4.0
D -2.0 -2.0  1.0  6.0 -3.0  0.0  2.0 -1.0 -1.0 -3.0 -4.0 -1.0 -3.0 -3.0 -1.0  0.0 -1.0 -4.0 -3.0 -3.0  4.0  1.0 -1.0 -4.0
C  0.0 -3.0 -3.0 -3.0  9.0 -3.0 -4.0 -3.0 -3.0 -1.0 -1.0 -3.0 -1.0 -2.0 -3.0 -1.0 -1.0 -2.0 -2.0 -1.0 -3.0 -3.0 -2.0 -4.0
Q -1.0  1.0  0.0  0.

In [45]:
def read_fasta(filename):
    """ Read a fasta file and return the sequences as a list of strings """
    sequences = []
    ids = []
    for record in SeqIO.parse(filename, "fasta"):
        sequences.append(str(record.seq))
        ids.append(record.id)
    return sequences, ids

def write_fasta(filename, sequences, ids=None, len_line=80):
    """ Write a list of sequences to a fasta file """
    if ids is None:
        ids = [f"seq{i}" for i in range(len(sequences))]
    with open(filename, "w") as f:
        for i, seq in enumerate(sequences):
            f.write(">{}\n".format(ids[i]))
            while len(seq) > len_line:
                f.write(seq[:len_line] + "\n")
                seq = seq[len_line:]
            f.write(seq + "\n")

# Pairwise sequence alignment

## Needleman-Wunsch: alignment with linear gap cost
This algorithm tries to align to protein sequences $a_1^n$ and $b_1^m$.
By dynamic programming, the algorithm computes the maximum similarity between any pair of prefixes $a_{1...i}$ and $b_{1...j}$ and stores them in a matrix $M_{i,j}$

$M_{i,j}$ is defined by recursion with the following formula:
$$
M_{i,j} = \max \begin{cases}
  M_{i-1,j-1} + s(a_i,b_j) \\
  M_{i-1,j} - \text{gap cost} &\\
  M_{i,j-1} - \text{gap cost}&
\end{cases}
$$

To facilitate the traceback, we also compute a matrix $D$ *(like Directions)* indicating which case achieved the maximum.

In [3]:
def compute_matrix_needleman_wunsch(seq_a:str, seq_b:str, fct=sim_basic, gap_cost=-2):
    """ compute the Needleman-Wunsch matrix """

    # check if we have a list of sequences (profile alignment)
    if type(seq_a) == list:
        n_a = len(seq_a[0])
        n_b = len(seq_b[0])
        sim_fct = lambda a, b, i, j: sim_multiple(a, b, i, j, fct, gap_cost, gap_cost)
    # or a single sequence (pairwise alignment)
    else:
        n_a = len(seq_a)
        n_b = len(seq_b)
        sim_fct = fct

    # construct matrices
    M = np.zeros((n_a+1, n_b+1), dtype=int)
    D = np.zeros((n_a+1, n_b+1), dtype=int)

    # init matrices
    for i in range(1, n_a+1):
        M[i,0] = M[i-1, 0] + gap_cost
    for j in range(1, n_b+1):
        M[0,j] = M[0, j-1] + gap_cost

    # fill matrices
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            x1 = M[i-1,j-1] + sim_fct(seq_a, seq_b, i-1, j-1)
            x2 = M[i-1,j] + gap_cost
            x3 = M[i,j-1] + gap_cost
            if x1 >= x2 and x1 >= x3: # diagonal
                M[i,j] = x1
                D[i,j] = 1
            if x2 >= x1 and x2 >= x3: # up
                M[i,j] = x2
                D[i,j] = 2
            if x3 >= x1 and x3 >= x2: # left
                M[i,j] = x3
                D[i,j] = 3
    
    return M, D

def trace_back(a:str, b:str, D:np.array):
    """ traceback matrix D to get the alignment """
    aa = ""
    bb = ""
    
    i = len(a)
    j = len(b)
    while i>0 and j>0 :
        if D[i,j] == 1:
            aa = a[i-1] + aa
            bb = b[j-1] + bb
            i -= 1
            j -= 1
        elif D[i,j] == 2:
            aa = a[i-1] + aa
            bb = '-' + bb
            i -= 1
        else:
            aa = '-' + aa
            bb = b[j-1] + bb
            j -= 1
    if i == 0:
        aa = '-'*j + aa
        bb = b[:j] + bb
    elif j == 0:
        aa = a[:i] + aa
        bb = '-'*i + bb

    return aa, bb

def needleman_wunsch_global_alignment(a:str, b:str, sim_fct=sim_blossum, gap_cost=-8):
    """ Global alignment with linear gap cost """
    M, D = compute_matrix_needleman_wunsch(a, b, sim_fct, gap_cost)
    aa, bb = trace_back(a, b, D)
    print(aa)
    print(bb)
    return M, aa, bb

## Gotoh: alignment with affine gap cost

Instead of one matrix, we need 3:
- $F_{i,j}$ storing the best score for $a_{1...i}$ and $b_{1...j}$ that ends with $a_i$ and $b_j$
- $A_{i,j}$ storing the best score that ends with $a_i$ and a gap
- $B_{i,j}$ storing the best score that ends with a gap and $b_j$

We have:
$$
A_{i,j} = \max \begin{cases}
  A_{i-1,j} + \text{gap extension} \\
  F_{i-1,j} + \text{gap opening} &\\
\end{cases}
$$

$$
B_{i,j} = \max \begin{cases}
  B_{i,j-1} + \text{gap extension} \\
  F_{i,j-1} + \text{gap opening} &\\
\end{cases}
$$
$$
F_{i,j} = \max \begin{cases}
  F_{i-1,j-1} + s(a_i,b_j)  \\
  F_{i-1,j-1} + s(a_i,b_j)  \\
  F_{i-1,j-1} + s(a_i,b_j)  \\
\end{cases}
$$

The matrix $M_{i,j}$ as defined previously is the max (element wise) of this three matrices.

In [4]:
def compute_matrix_gotoh(seq_a:str, seq_b:str, fct, gap_opening:float=-2, gap_extension:float=-2):
    """ compute the Gotoh matrices """

    # check if we have a list of sequences (profile alignment)
    if type(seq_a) == list:
        n_a = len(seq_a[0])
        n_b = len(seq_b[0])
        sim_fct = lambda a, b, i, j: sim_multiple(a, b, i, j, fct, gap_opening, gap_extension)
    # or a single sequence (pairwise alignment)
    else:
        n_a = len(seq_a)
        n_b = len(seq_b)
        sim_fct = fct
    
    F = np.zeros((n_a+1, n_b+1))
    A = np.zeros((n_a+1, n_b+1))
    B = np.zeros((n_a+1, n_b+1))

    # init matrix A
    for j in range(0, n_b+1):
        A[0,j] = -np.inf
        A[1,j] = 2*gap_opening + (j-1)*gap_extension
    for i in range(1, n_a+1):    
        A[i,0] = gap_opening + (i-1)*gap_extension

    # init matrix B
    for i in range(0, n_a+1):
        B[i,0] = -np.inf
        B[i,1] = 2*gap_opening + (i-1)*gap_extension
    for j in range(1, n_b+1):
        B[0,j] = gap_opening + (j-1)*gap_extension

    # init matrix F
    F[0,0] = 0
    for i in range(1, n_a+1):
        F[i,0] = -np.inf
    for j in range(1, n_b+1):
        F[0,j] = -np.inf
        F[0,j] = -np.inf    

    # fill matrices
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if i>1:
                A[i,j] = max(F[i-1,j] + gap_opening, A[i-1,j] + gap_extension)
            if j>1:
                B[i,j] = max(F[i,j-1] + gap_opening, B[i,j-1] + gap_extension)
            sim = sim_fct(seq_a, seq_b, i-1, j-1)
            F[i,j] = max(F[i-1, j-1] + sim, A[i-1,j-1] + sim, B[i-1,j-1] + sim)

    # get score and traceback matrix
    M = np.maximum(F, np.maximum(A,B))
    D = np.zeros((n_a+1, n_b+1), dtype=int)
    for i in range(1, n_a+1):
        for j in range(1, n_b+1):
            if M[i,j] == F[i,j]:
                D[i,j] = 1
            elif M[i,j] == A[i,j]:
                D[i,j] = 2
            else:
                D[i,j] = 3
    
    return M, D

def gotoh_global_alignment(seq_a:str, seq_b:str, sim_fct=sim_basic, gap_opening=-2, gap_extension=-2):
    """ Global alignment with affine gap cost """
    M, D = compute_matrix_gotoh(seq_a, seq_b, sim_fct, gap_opening, gap_extension)
    seq_aa, seq_bb = trace_back(seq_a, seq_b, D)
    print(seq_aa)
    print(seq_bb)
    return M, seq_aa, seq_bb

In [5]:
# test
align_a = "CHATSASA"
align_b = "CATAA"

M, aa, bb = needleman_wunsch_global_alignment(align_a, align_b, sim_blossum, -8)
print(M)

M, aa, bb = gotoh_global_alignment(align_a, align_b, sim_blossum, -8, -8)
print(M)

M, aa, bb = gotoh_global_alignment(align_a, align_b, sim_basic, -5, -2)
# print(M)

CHATSASA
C-AT-A-A
[[  0  -8 -16 -24 -32 -40]
 [ -8   9   1  -7 -15 -23]
 [-16   1   7  -1  -9 -17]
 [-24  -7   5   7   3  -5]
 [-32 -15  -3  10   7   3]
 [-40 -23 -11   2  11   8]
 [-48 -31 -19  -6   6  15]
 [-56 -39 -27 -14  -2   7]
 [-64 -47 -35 -22 -10   2]]
CHATSASA
C-AT-A-A
[[  0.  -8. -16. -24. -32. -40.]
 [ -8.   9.   1.  -7. -15. -23.]
 [-16.   1.   7.  -1.  -9. -17.]
 [-24.  -7.   5.   7.   3.  -5.]
 [-32. -15.  -3.  10.   7.   3.]
 [-40. -23. -11.   2.  11.   8.]
 [-48. -31. -19.  -6.   6.  15.]
 [-56. -39. -27. -14.  -2.   7.]
 [-64. -47. -35. -22. -10.   2.]]
CHATSASA
C-AT--AA


# Pairwise profile alignment
We want to merge two alignments $A$ and $B$ into one by aligning them. To compute the similarity matrix, we just have to modify the previous algorithms by replacing the similarity cost of two bases by the average similarity over all the bases in the two alignments.

In [6]:
def trace_back_multiple(align_a:List[str], align_b:List[str], D:np.array):
    """ Determine the alignment given a matrix with directions """

    align_aa = ["" for _ in range(len(align_a))]
    align_bb = ["" for _ in range(len(align_b))]
    
    i = len(align_a[0])
    j = len(align_b[0])
    
    while i>0 and j>0 :
        if D[i,j] == 1:
            for a in range(len(align_a)):
                align_aa[a] = align_a[a][i-1] + align_aa[a]
            for b in range(len(align_b)):
                align_bb[b] = align_b[b][j-1] + align_bb[b]
            i -= 1
            j -= 1
        elif D[i,j] == 2:
            for a in range(len(align_a)):
                align_aa[a] = align_a[a][i-1] + align_aa[a]
            for b in range(len(align_b)):
                align_bb[b] = '-' + align_bb[b]
            i -= 1
        else:
            for a in range(len(align_a)):
                align_aa[a] = '-' + align_aa[a]
            for b in range(len(align_b)):
                align_bb[b] = align_b[b][j-1] + align_bb[b]
            j -= 1
    if i == 0:
        for a in range(len(align_a)):
            align_aa[a] = '-'*j + align_aa[a]
        for b in range(len(align_b)):
            align_bb[b] = align_b[b][:j] + align_bb[b]
    elif j == 0:
        for a in range(len(align_a)):
            align_aa[a] = align_a[a][:i] + align_aa[a]
        for b in range(len(align_b)):
            align_bb[b] = '-'*i + align_bb[b]

    return align_aa + align_bb   # list concatenation

def needleman_wunsch_global_alignment_multiple(align_a:List[str], align_b:List[str], sim_fct=sim_basic, gap_cost=-2):
    """ Global alignment with linear gap cost """
    M, D = compute_matrix_needleman_wunsch(align_a, align_b, sim_fct, gap_cost)
    aa, bb = trace_back_multiple(align_a, align_b, D)
    print(aa)
    print(bb)
    return M, aa, bb

def gotoh_global_alignment_multiple(align_a, align_b, sim_fct=sim_basic, gap_opening=-2, gap_extension=-2):
    """ Global alignment with affine gap cost """
    M, D = compute_matrix_gotoh(align_a, align_b, sim_fct, gap_opening, gap_extension)
    aa, bb = trace_back_multiple(align_a, align_b, D)
    print(aa)
    print(bb)
    return M, aa, bb

In [7]:
# test
align_a = "CHAT"
align_b = "CAT"

M, aa, bb = needleman_wunsch_global_alignment(align_a, align_b, sim_basic, -2)

M, aa, bb = needleman_wunsch_global_alignment_multiple([align_a], [align_b], sim_basic, -2)

M, aa, bb = gotoh_global_alignment(align_a, align_b, sim_basic, -2)

M, aa, bb = gotoh_global_alignment_multiple([align_a], [align_b], sim_basic, -2)

CHAT
C-AT
CHAT
C-AT
CHAT
C-AT
CHAT
C-AT


#  Progressive multiple alignment
To align N sequences, we do clustering. First, we create one cluster per sequence. Then, we align and merge the two closest clusters until we only have one big cluster, containing all the alignment sequences.

In [39]:
def progressive_alignment(seq:list, sim_fct=sim_blossum, gap_opening=-10, gap_extension=-0.5, linear=False):
  """ Computes multiple sequence alignement """

  # transforming each sequence into a cluster of one sequence
  clusters = [[s] for s in seq]
  # order of the clusters, to keep track of the original order
  order = list(range(len(seq)))

  # initialising distance matrix
  matrix = np.zeros((len(seq), len(seq)))
  for i in range(len(seq)):
    for j in range(i+1, len(seq)):
      if linear:
        matrix[i,j] = compute_matrix_needleman_wunsch(seq[i], seq[j], sim_fct, gap_opening)[0][-1,-1]
      else:
        matrix[i,j] = compute_matrix_gotoh(seq[i], seq[j], sim_fct, gap_opening, gap_extension)[0][-1,-1]
      matrix[j,i] = matrix[i,j]

  while len(clusters) > 1:
    # finding closest clusters (cad with highest similarity)
    max_sim = -np.inf
    for i in range(len(clusters)):
      for j in range(i+1,len(clusters)):
        if matrix[i,j] > max_sim:
          max_sim = matrix[i,j]
          best_i = i
          best_j = j

    # merging two clusters best_i < best_j
    cluster_j = clusters.pop(best_j)
    cluster_i = clusters.pop(best_i)
    
    order_j = order.pop(best_j)
    order_i = order.pop(best_i)
    order.append([order_i, order_j])

    if linear:
      M, D = compute_matrix_needleman_wunsch(cluster_i, cluster_j, sim_fct, gap_opening)
    else:
      M, D = compute_matrix_gotoh(cluster_i, cluster_j, sim_fct, gap_opening, gap_extension)
    new_cluster = trace_back_multiple(cluster_i, cluster_j, D)

    clusters.append(new_cluster)

    # updating distance matrix
    new_line = (len(cluster_i)*matrix[best_i,:] + len(cluster_j)*matrix[best_j,:]) / (len(cluster_i)+len(cluster_j))  # weighted average
    new_column = np.append(new_line, 0)  # adding a column of 0
    new_line = new_line.reshape(1, -1)
    new_column = new_column.reshape(-1, 1)

    matrix = np.append(matrix, new_line, axis=0)
    matrix = np.append(matrix, new_column, axis=1)

    matrix = np.delete(matrix, [best_i, best_j], axis=0)
    matrix = np.delete(matrix, [best_i, best_j], axis=1)

  def get_order(gt):
    """((4, (0, 1)), (2, 3)) -> [4, 0, 1, 2, 3]"""
    if type(gt) == int:
        return [gt]
    else:
        return get_order(gt[0]) + get_order(gt[1])
    
  order = get_order(order[0])
  reverse_order = [order.index(i) for i in range(len(order))]
  
  alignment = clusters[0]
  alignment = [alignment[i] for i in reverse_order]
  
  return alignment, D[-1,-1]

In [40]:
# test: easy example
seq = ["CAT", "CHAT", "HER", "CATS"]

align, score = progressive_alignment(seq, sim_blossum)

for seq in align:
    print(seq)

C-AT
CHAT
-HER
CATS


In [46]:
# test: real example
seq, ids = read_fasta("data/balibase/RV11.unaligned/BB11003.fasta")
#seq = read_fasta("data/paralogs.fasta")

for s in seq:
  print(s)
print()

alignment, score = progressive_alignment(seq, sim_blossum, gap_opening=-10, gap_extension=-0.5, linear=False)
for s in alignment:
  print(s)
print(f"Score of alignment: {score}")
print(f"Size of alignment: {len(alignment[0])}")

filename = "data/balibase/results/BB11003.fasta"
write_fasta(filename, alignment, ids)

SISDTVKRAREAFNSGKTRSLQFRIQQLEALQRMINENLKSISGALASDLGKNEWTSYYEEVAHVLEELDTTIKELPDWAEDEPVAKTRQTQQDDLYIHSEPLGVVLVIGAWNYPFNLTIQPMVGAVAAGNAVILKPSEVSGHMADLLATLIPQYMDQNLYLVVKGGVPETTELLKERFDHIMYTGSTAVGKIVMAAAAKHLTPVTLELGGKSPCYVDKDCDLDVACRRIAWGKFMNSGQTCVAPDYILCDPSIQNQIVEKLKKSLKDFYGEDAKQSRDYGRIINDRHFQRVKGLIDNQKVAHGGTWDQSSRYIAPTILVDVDPQSPVMQEEIFGPVMPIVCVRSLEEAIQFINQREKPLALYVFSNNEKVIKKMIAETSSGGVTANDVIVHITVPTLPFGGVGNSGMGAYHGKKSFETFSHRRSCLVKSLLNEEAHKARYPPSPA
MTVEPFRNEPIETFQTEEARRAMREALRRVREEFGRHYPLYIGGEWVDTKERMVSLNPSAPSEVVGTTAKAGKAEAEAALEAAWKAFKTWKDWPQEDRSRLLLKAAALMRRRKRELEATLVYEVGKNWVEASADVAEAIDFIEYYARAALRYRYPAVEVVPYPGEDNESFYVPLGAGVVIAPWNFPVAIFTGMIVGPVAVGNTVIAKPAEDAVVVGAKVFEIFHEAGFPPGVVNFLPGVGEEVGAYLVEHPRIRFINFTGSLEVGLKIYEAAGRLAPGQTWFKRAYVETGGKNAIIVDETADFDLAAEGVVVSAYGFQGQKCSAASRLILTQGAYEPVLERVLKRAERLSVGPAEENPDLGPVVSAEQERKVLSYIEIGKNEGQLVLGGKRLEGEGYFIAPTVFTEVPPKARIAQEEIFGPVLSVIRVKDFAEALEVANDTPYGLTGGVYSRKREHLEWARREFHVGNLYFNRKITGALVGVQPFGGFKLSGTNAKTGALDYLRLFLEMKAVAERF
TDNVFYATNAFTGEALPLAFPVHTEVEVNQAATAAA

# Include structural information

### Download and read PDB files
DSSP keys:
indice de chaine, indice de residue (avec code d'insertion, souvent vide)

indice de residu, type de residu, structure secondaire (E if beta strain, H, other), solvant accessibility, phi, psi, other...

In [11]:
def get_dssp(mol:str):
    # download PDB file
    PDBfile = f"data/pdb/{mol}.pdb"
    if not os.path.exists(PDBfile):
        pdbl = PDBList()
        oldPDBfile = pdbl.retrieve_pdb_file(mol, pdir='data/pdb/', file_format="pdb")
        os.rename(oldPDBfile, PDBfile)

    # parse PDB file
    parser = PDBParser()
    structure = parser.get_structure(mol, PDBfile)

    # get DSSP
    model = structure[0]
    dssp = DSSP(model, PDBfile)
    return model, dssp

In [12]:
# test        
mol = "1idy"
model, dssp = get_dssp(mol)

for chain in model:
    i = 0
    for residue in chain:
        print(dssp[(chain.id, residue.id)])
        i += 1
        if i > 5:
            break

(1, 'M', '-', 1.0, 360.0, -66.0, 0, 0.0, 0, 0.0, 0, 0.0, 0, 0.0)
(2, 'E', '-', 0.7628865979381443, 64.4, 100.8, 2, -0.1, 3, -0.0, 1, -0.0, 0, 0.0)
(3, 'V', '-', 0.7394366197183099, -62.9, 147.7, 1, -0.1, 2, -1.6, 0, 0.0, -1, -0.0)
(4, 'K', '-', 0.9414634146341463, -65.1, 89.4, 2, -0.0, 2, -0.2, 3, -0.0, -1, -0.1)
(5, 'K', '-', 0.4926829268292683, -89.1, 156.7, -2, -1.6, 2, -1.9, 1, -0.1, -1, -0.0)
(6, 'T', '-', 0.6197183098591549, -70.5, 84.3, -2, -0.2, 2, -1.3, 1, -0.2, -1, -0.1)


In [13]:
def read_fasta_and_PDB(filename) -> Tuple[List[str], List[List[Tuple]]]:
    """
    Read a fasta file and the corresponding PDB files
    Returns:
    - seqFASTA, list of str: the sequences in the fasta file
    - DSSPs, list of list of tuple: each tuple corresponds to the DSSP data of a residue (index, name, secondary structure, burial, ... )
    """
    seqFASTA = []
    DSSPs = []
    for record in SeqIO.parse(filename, "fasta"):

        seqFASTA.append(str(record.seq))

        name = record.name
        mol, chain = name.split("_")
        if chain == "":
            chain = "A"
        model, dssp = get_dssp(mol)

        DSSPextract = []
        for residue in model[chain]:
            if is_aa(residue):
                try:
                    DSSPextract.append(dssp[(chain, residue.id)])
                except KeyError:
                    print(f"residue {residue} not found in DSSP for {mol}_{chain}")
        DSSPs.append(DSSPextract)
    return seqFASTA, DSSPs

### Align fasta and PDB
Some amino-acids might be missing in the PDB file. Therefore we have to use Needleman-Wunsch to align properly the sequence obtained from the fasta and from the PDB file.

In [14]:
def trace_back_DSSP(seq:str, dssp:List[tuple], D:np.array) -> Tuple[str, List[tuple]]:
    """ traceback matrix D to align a sequence and a list with the data from the PDB file """
    new_seq = ""
    new_dssp = []
    
    i = len(seq)
    j = len(dssp)
    while i>0 and j>0 :
        if D[i,j] == 1:
            new_seq += seq[i-1]
            new_dssp.append(dssp[j-1])
            i -= 1
            j -= 1
        elif D[i,j] == 2:
            new_seq += seq[i-1]
            new_dssp.append(tuple(['-' for _ in range(14)]))
            i -= 1
        else:
            new_seq += '-'
            new_dssp.append(dssp[j-1])
            j -= 1
    if i == 0:
        new_seq += '-'*j
        new_dssp += dssp[:j]
    elif j == 0:
        new_seq += seq[:i]
        new_dssp += [tuple() for _ in range(i)]     
    return new_seq[::-1], new_dssp[::-1]

def align_DSSP(seq:str, dssp:List[tuple]):
    """ align dssp data with brute sequence"""
    sim_bool = lambda a, b, i, j: 1 if a[i] == b[j] else 0
    M, D = compute_matrix_needleman_wunsch(seq, "".join([d[1] for d in dssp]), sim_bool, gap_cost=-1)
    new_seq, new_dssp = trace_back_DSSP(seq, dssp, D)
    return new_seq, new_dssp

def read_and_align_fasta_and_PDB(filename) -> Tuple[List[str], List[List[Tuple]]]:
    Lseq, Lddps = read_fasta_and_PDB(filename)
    new_Lseq, new_Ldssp = [], []
    for i in range(len(Lseq)):
        seq, dssp = align_DSSP(Lseq[i], Lddps[i])
        new_Lseq.append(seq)
        new_Ldssp.append(dssp)
    return new_Lseq, new_Ldssp

In [15]:
# test
seqFASTA, Dssps = read_and_align_fasta_and_PDB("data/balibase/RV11.unaligned/BB11003.fasta")

seqPDB = []
burials = []
for dssp in Dssps:
    seqPDB.append("".join([d[1] for d in dssp]))
    burials.append([d[3] for d in dssp])

for i in range(len(seqFASTA)):
    print(seqFASTA[i])
    print(seqPDB[i])



residue <Residue MSE het=H_MSE resseq=78 icode= > not found in DSSP for 1o20_A
residue <Residue MSE het=H_MSE resseq=189 icode= > not found in DSSP for 1o20_A
SISDTVKRAREAFNSGKTRSLQFRIQQLEALQRMINENLKSISGALASDLGKNEWTSYYEEVAHVLEELDTTIKELPDWAEDEPVAKTRQTQQDDLYIHSEPLGVVLVIGAWNYPFNLTIQPMVGAVAAGNAVILKPSEVSGHMADLLATLIPQYMDQNLYLVVKGGVPETTELLKERFDHIMYTGSTAVGKIVMAAAAKHLTPVTLELGGKSPCYVDKDCDLDVACRRIAWGKFMNSGQTCVAPDYILCDPSIQNQIVEKLKKSLKDFYGEDAKQSRDYGRIINDRHFQRVKGLIDNQKVAHGGTWDQSSRYIAPTILVDVDPQSPVMQEEIFGPVMPIVCVRSLEEAIQFINQREKPLALYVFSNNEKVIKKMIAETSSGGVTANDVIVHITVPTLPFGGVGNSGMGAYHGKKSFETFSHRRSCLVKSLLNEEAHKARYPPSPA
SISDTVKRAREAFNSGKTRSLQFRIQQLEALQRMINENLKSISGALASDLGKNEWTSYYEEVAHVLEELDTTIKELPDWAEDEPVAKTRQTQQDDLYIHSEPLGVVLVIGAWNYPFNLTIQPMVGAVAAGNAVILKPSEVSGHMADLLATLIPQYMDQNLYLVVKGGVPETTELLKERFDHIMYTGSTAVGKIVMAAAAKHLTPVTLELGGKSPCYVDKDCDLDVACRRIAWGKFMNSGQTCVAPDYILCDPSIQNQIVEKLKKSLKDFYGEDAKQSRDYGRIINDRHFQRVKGLIDNQKVAHGGTWDQSSRYIAPTILVDVDPQSPVMQEEIFGPVMPIVCVRSLEEAIQFINQREKPLALYVFSNNEKVIKKMIAETSSGGVTANDVIVHIT

### Use burial to modulate similarity function in Needleman-Wunsh
Idea: we keep the same algorithm but modify the gap or mutation cost depending of the structural information.

The most easy data to use is the burial score : a mutation inside of the protein is more costly than on the outside. Therefore we use the same algorythm as previopusly but multiply the similarity by a coefficient depending on the burial.


$M_{i,j}$ is defined by recursion with the following formula:
$$
M_{i,j} = \max \begin{cases}
  M_{i-1,j-1} + f(x) \cdot s(a_i,b_j) \\
  M_{i-1,j} - f(x) \cdot \text{gap cost} &\\
  M_{i,j-1} - f(x) \cdot \text{gap cost}&
\end{cases}
$$

In [16]:
def sim_burial(dssp_a, dssp_b, i, j, gap=-8)->float:
    ''' a, b tuples (sequence, list of burial values)'''
    print(dssp_a)
    bur = 0.5 + (dssp_a[i][3] + dssp_b[j][3])/2
    return bur * sim_blossum([dssp_a[i][1]], [dssp_b[j][1]], 0, 0, gap)

# Benchmark !!!
For the sequence from balivase, we want to compare:
- our computed alignment (progressive alignment using affine/linear gap with/without structural information)
- the prediction from clustalW (best alignment without structural info)
- the consensus alignment given in the fasta file

In [52]:
from bali_score2 import  compare_seq, read_fasta2dict, test

test()

file = "BB11003.fasta"

unaligned = "data/balibase/RV11.unaligned/" + file
test = "data/balibase/results/" + file
reference = "data/balibase/RV11.aligned/" + file

seqs = read_fasta(unaligned)
alignment, score = progressive_alignment(seq, sim_blossum, gap_opening=-10, gap_extension=-0.5, linear=False)
write_fasta(test, alignment)

refkeylist, refseq = read_fasta2dict(reference)
testkeylist, testseq = read_fasta2dict(test)

sp, tc = compare_seq(refseq, refkeylist, testseq, testkeylist)
print(sp, tc)


ImportError: cannot import name 'test' from 'bali_score2' (/home/flopau/Documents/3A/P2/INFBIO588_projet_bioinfo/project/aligning_proteins/bali_score2.py)