## Main code

In [24]:
import numpy as np
def initialize_matrix(x: str, y:str)->np.ndarray:
    dim = (len(x)+1, len(y)+1)
    return np.full(dim, None)
def convert_dna(x: str)-> list[int]:
        alphabet = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        return list(alphabet.get(char) for char in x)

def fill_matrix(
    mat: np.ndarray, A: list[int], B: list[int],
    score_matrix: np.ndarray, gap_cost: int,
    opt_fn = np.min
    ):
    def cost(x: int, y: int):
        return score_matrix[x, y]
    def C(i: int, j: int):
        if mat[i, j] is not None: return mat[i, j]
        match i:
            case 0 if j == 0:
                return 0
            case _ if j == 0:
                return i * gap_cost
            case 0 if j != 0:
                return j * gap_cost
        return opt_fn([
            C(i-1, j) + gap_cost,
            C(i, j-1) + gap_cost,
            C(i-1, j-1) + cost(A[i-1],B[j-1])
        ])

    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            mat[i, j] = C(i, j)
    return mat
    
def compute_optimal_aligment_cost(
    x: str, y: str, score_mat: np.ndarray,
    gap_cost = 1, opt_fn = np.min
    ):
    dyn_mat = initialize_matrix(x, y)
    dyn_mat = fill_matrix(
        dyn_mat, convert_dna(x), convert_dna(y), score_mat,
        gap_cost, opt_fn
    )
    return dyn_mat[len(x), len(y)]


Example 1:

In [25]:
score_mat = np.matrix(
[[10, 2, 5, 2],
[2, 10, 2, 5],
[5, 2, 10, 2],
[2, 5, 2, 10]]
)

compute_optimal_aligment_cost('TCGAT', 'TCCAGAGA', score_mat, gap_cost=-5, opt_fn=np.max)

27

Example 2

In [26]:
compute_optimal_aligment_cost('CGTGTCAAGTCT', 'ACGTCGTAGCTAGG', score_mat, gap_cost=-5, opt_fn=np.max)

61

Question 1: What is the optimal (here maximal) cost of an alignment of AATAAT and AAGG using the above substitution matrix and gap cost -5?

In [27]:
seq1 = "AATAAT"
seq2 = "AAGG"
compute_optimal_aligment_cost(seq1, seq2,  score_mat, gap_cost=-5, opt_fn=np.max)

20

In [28]:
!pip3.10 install biopython

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [29]:
from Bio import SeqIO
sequences1 = SeqIO.parse("seq1.fasta",'fasta')
sequences2 = SeqIO.parse("seq2.fasta",'fasta')
compute_optimal_aligment_cost(
    next(sequences1), next(sequences2),  score_mat, gap_cost=-5, opt_fn=np.max
    )

1346

Question 3 (optional): How does an optimal alignment look like for the above two pairs of sequences using the given substitution matrix and gap cost -5? (you probably want to implement the algorithm for finding an optimal alignment by backtracking through the dynamic programming table.)

In [30]:
def get_optimal_alignment(
    A: str, B: str,
    score_mat: np.ndarray, gap_cost=-5, opt_fn=np.max
    ) -> str:
    T = initialize_matrix(A, B)
    A, B = convert_dna(A), convert_dna(B)
    T = fill_matrix(
        T, A, B, score_mat,
        gap_cost, opt_fn
    )
    def cost(x: int, y: int):
        return score_mat[x, y]
    def backtracking(i: int, j: int):
        if (i > 0) and (j > 0) and (T[i,j] == T[i-1, j-1] + cost(A[i], B[j])):
            return backtracking(i-1, j-1)
        if (i > 0) and (j >= 0) and (T[i,j] == T[i-1,j] + gap_cost):
            return backtracking(i-1, j)
        if (i>=0) and (j > 0) and (T[i,j] == T[i,j-1] + gap_cost):
            return backtracking(i, j-1)
    return backtracking(len(A), len(B))

In [31]:
def from_number_to_dna(x: int):
    match x:
        case -1: return "-"
        case 0: return "A"
        case 1: return "C"
        case 2: return "G"
        case 3: return "T"

def get_optimal_alignment(
    A: str, B: str,
    score_mat: np.ndarray, gap_cost=-5, opt_fn=np.max
    ) -> str:
    T = initialize_matrix(A, B)
    A, B = convert_dna(A), convert_dna(B)
    T = fill_matrix(
        T, A, B, score_mat,
        gap_cost, opt_fn
    )
    def cost(x: int, y: int):
        return score_mat[x, y]
    path = []
    def backtracking(i: int, j: int):
        if (i > 0) and (j > 0) and (T[i,j] == T[i-1, j-1] + cost(A[i-1], B[j-1])):
            path.append((i-1, j-1))
            #print(f"i: {i-1}, j: {j-1}")
            return backtracking(i-1, j-1)
        if (i > 0) and (j >= 0) and (T[i,j] == T[i-1,j] + gap_cost):
            #print(f"i: {i-1}, j: {j}")
            path.append((i-1, j))
            return backtracking(i-1, j)
        if (i>=0) and (j > 0) and (T[i,j] == (T[i,j-1] + gap_cost)):
            #print(f"i: {i}, j: {j-1}")
            path.append((i, j-1))
            return backtracking(i, j-1)
    backtracking(len(A), len(B))
    path.reverse()
    old_index1, old_index2 = 0, 0
    new_seq1, new_seq2 = [], []
    for new_index1, new_index2 in path[1:]:
        if (old_index1 +1) == new_index1 and (old_index2+1) == new_index2:
            new_seq1.append(A[old_index1])
            new_seq2.append(B[old_index2])
        if old_index1 == new_index1 and (old_index2+1) == new_index2:
            new_seq1.append(-1)
            new_seq2.append(B[old_index2])
        if (old_index1 +1) == new_index1 and old_index2 == new_index2:
            new_seq1.append(A[old_index1])
            new_seq2.append(-1)
        old_index1, old_index2 = new_index1, new_index2
    return (
        "".join(from_number_to_dna(x) for x in new_seq1),
        "".join(from_number_to_dna(x) for x in new_seq2)
        )


score_mat = np.matrix(
[[10, 2, 5, 2],
[2, 10, 2, 5],
[5, 2, 10, 2],
[2, 5, 2, 10]]
)

print(get_optimal_alignment(
    'TCCAGAGA', 'TCGAT',
    score_mat, gap_cost=-5, opt_fn=np.max
    ))

('TCCAGAG', 'T-C-GA-')
