## Main code

Import required modules:

In [58]:
import numpy as np
from Bio import SeqIO
#import array_to_latex as a2l 

Define general functions for global pairwise alignment:

In [59]:
# Symbol -> integer code. '-' (code 4) stands for a gap in an alignment.
# `alphabet` is an alias of the very same dictionary.
ALPHABET = alphabet = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-': 4}
def dna2int(x: str)-> list[int]:
    '''
    Encode a DNA string as the list of integer codes defined in ALPHABET.

    Lower-case symbols are accepted and encoded like their upper-case form.

    Raises:
        ValueError: if `x` contains a symbol that is not a key of ALPHABET.
            Failing loudly here avoids handing `None` codes to later steps,
            where numpy would interpret `None` as an index and not raise.

    >>> dna2int('ACGT-A')
    [0, 1, 2, 3, 4, 0]
    '''
    codes = []
    for position, char in enumerate(x):
        code = ALPHABET.get(char.upper())
        if code is None:
            raise ValueError(
                f"Unsupported symbol {char!r} at position {position}; "
                f"expected one of {''.join(ALPHABET)!r}"
            )
        codes.append(code)
    return codes
def int2dna(x: list[int])-> str:
    '''
    Decode a sequence of integer codes back into its DNA string.

    >>> int2dna([0, 1, 2, 3, 4, 0])
    'ACGT-A'
    '''
    # Keys of ALPHABET concatenated in dictionary order, so that the
    # character at position k is the symbol whose code is k.
    symbols = "".join(ALPHABET.keys())
    letters = [symbols[code] for code in x]
    return "".join(letters)
    
def generalized_C(
    i: int, j: int, dyn_mat: np.ndarray,
    gap_cost: int, cost: callable, optimize_fn: callable,
    sequence1: list[int], sequence2: list[int]
    )-> int:
    '''
    Calculate the cost of cell (i, j) of the dynamic table for a global
    pairwise alignment.

    Args:
        i, j: row / column of the requested cell; cell (i, j) covers the first
            `i` symbols of `sequence1` and the first `j` symbols of `sequence2`.
        dyn_mat: memo table. A cell holding `None` is treated as not yet
            computed; any other value is returned as is.
        gap_cost: cost added for each gap column.
        cost: callable `cost(a, b)` giving the cost of aligning codes a and b.
        optimize_fn: callable picking the best value from a list of three
            candidates (e.g. `np.max` or `np.min`).
        sequence1, sequence2: the two encoded sequences.

    Returns:
        The optimal cost for cell (i, j).

    Side effect:
        Every cell computed on the way is stored in `dyn_mat`, so each cell is
        evaluated at most once even when this function is called directly on
        an unfilled table.
    '''
    # Already computed: reuse the memoised value.
    if dyn_mat[i, j] is not None:
        return dyn_mat[i, j]

    if i == 0 and j == 0:
        value = 0
    elif j == 0:
        # First column: the prefix of sequence1 is aligned against gaps only.
        value = i * gap_cost
    elif i == 0:
        # First row: the prefix of sequence2 is aligned against gaps only.
        value = j * gap_cost
    else:
        args = (dyn_mat, gap_cost, cost, optimize_fn, sequence1, sequence2)
        value = optimize_fn(
            [generalized_C(i-1, j, *args) + gap_cost,
            generalized_C(i, j-1, *args) + gap_cost,
            generalized_C(i-1, j-1, *args) + cost(sequence1[i-1], sequence2[j-1])
            ])

    # Memoise before returning; without this the recursion would recompute
    # the same sub-problems over and over.
    dyn_mat[i, j] = value
    return value


def fill_global_alignment_matrix(
    x: list[int], y: list[int], score_matrix: np.ndarray, gap_cost: int, optimize_fn: callable
    ) -> np.ndarray:
    '''
    Build the complete dynamic table of a global pairwise alignment.

    The table has shape (len(x) + 1, len(y) + 1) and is filled row by row,
    each cell being computed by `generalized_C`.
    '''
    n_rows, n_cols = len(x) + 1, len(y) + 1
    # Object array pre-filled with None; None marks a cell as "not computed".
    table = np.full((n_rows, n_cols), None)

    def pair_score(code_a: int, code_b: int):
        # Cost of aligning two symbol codes, read from the score matrix.
        return score_matrix[code_a, code_b]

    for row in range(n_rows):
        for col in range(n_cols):
            table[row, col] = generalized_C(
                row, col, table, gap_cost,
                cost=pair_score,
                optimize_fn=optimize_fn,
                sequence1=x, sequence2=y)
    return table

def generalized_optimal_aligment_cost(
    x: str, y: str, score_matrix: np.ndarray, gap_cost: int, optimize_fn: callable
    ) -> int:
    '''
    Return the optimal cost of a global pairwise alignment of `x` and `y`.

    Both sequences are encoded with `dna2int`, the dynamic table is filled
    with `fill_global_alignment_matrix`, and its last cell is returned.
    '''
    encoded_x = dna2int(x)
    encoded_y = dna2int(y)
    table = fill_global_alignment_matrix(
        encoded_x, encoded_y, score_matrix, gap_cost, optimize_fn
    )
    # The bottom-right cell covers both sequences in full.
    return table[-1, -1]


## Optimal alignment cost

Finally, we define a concrete function that solves our problem under this project's specifications.

We define the score matrix as follows:

In [60]:
# Substitution scores, indexed by the integer codes of ALPHABET in the order
# A, C, G, T (rows and columns). A plain ndarray is used instead of np.matrix:
# the notebook only ever reads single entries as SCORE_MATRIX[a, b], and every
# function that receives the matrix annotates it as np.ndarray.
SCORE_MATRIX = np.array(
    [[10, 2, 5, 2],
    [2, 10, 2, 5],
    [5, 2, 10, 2],
    [2, 5, 2, 10]]
    )
#a2l.to_ltx(SCORE_MATRIX)

\begin{bmatrix}
  10.00 &  2.00 &  5.00 &  2.00\\
  2.00 &  10.00 &  2.00 &  5.00\\
  5.00 &  2.00 &  10.00 &  2.00\\
  2.00 &  5.00 &  2.00 &  10.00
\end{bmatrix}

In [61]:

def optimal_aligment_cost(x: str, y: str) -> int:
    '''
    Calculate the optimal alignment cost under this project's specifications:
    scores from SCORE_MATRIX, a gap cost of -5, and maximisation (np.max).

    The result is converted to a built-in int so that it matches the declared
    return type and prints as a bare number whatever scalar type numpy returns.

    >>> optimal_aligment_cost("", "")
    0
    >>> optimal_aligment_cost("TCCAGAGA", "TCGAT")
    27
    >>> optimal_aligment_cost("CGTGTCAAGTCT", "ACGTCGTAGCTAGG")
    61
    '''
    return int(generalized_optimal_aligment_cost(
        x, y, SCORE_MATRIX, -5, np.max
    ))


## Running doctest

In [83]:
# Run the doctest examples found in the docstrings of the current module and
# print a verbose report of every example tried.
import doctest
doctest.testmod(verbose=True)

Trying:
    dna2int('ACGT-A')
Expecting:
    [0, 1, 2, 3, 4, 0]
ok
Trying:
    int2dna([0, 1, 2, 3, 4, 0])
Expecting:
    'ACGT-A'
ok
Trying:
    optimal_aligment("", "")
Expecting:
    ('', '')
ok
Trying:
    optimal_aligment("TCCAGAGA", "TCGAT")
Expecting:
    ('TCCAGAGA', 'T-C-GA-T')
ok
Trying:
    optimal_aligment("CGTGTCAAGTCT", "ACGTCGTAGCTAGG")
Expecting:
    ('-CGT-GTCAAGT-CT', 'ACGTCGT-AGCTAGG')
ok
Trying:
    optimal_aligment_cost("", "")
Expecting:
    0
ok
Trying:
    optimal_aligment_cost("TCCAGAGA", "TCGAT")
Expecting:
    27
ok
Trying:
    optimal_aligment_cost("CGTGTCAAGTCT", "ACGTCGTAGCTAGG")
Expecting:
    61
ok
7 items had no tests:
    __main__
    __main__.C
    __main__.fill_global_alignment_matrix
    __main__.generalized_C
    __main__.generalized_optimal_aligment_cost
    __main__.optimal_aligment_cost_from_fasta
    __main__.recursive_back_tracking
4 items passed all tests:
   1 tests in __main__.dna2int
   1 tests in __main__.int2dna
   3 tests in __main__.op

TestResults(failed=0, attempted=8)

## Question 1
What is the optimal (here maximal) cost of an alignment of AATAAT and AAGG using the above substitution matrix and gap cost -5?

In [63]:
# Question 1: optimal cost for the pair AATAAT / AAGG.
optimal_aligment_cost("AATAAT", "AAGG")

20

## Question 2
What is the optimal (here maximal) cost of an alignment of seq1.fasta and seq2.fasta using the same substitution matrix and gap cost? (You probably want to implement the algorithm for computing the cost of an optimal alignment.)

In [64]:
def _first_fasta_sequence(path: str) -> str:
    '''Return the sequence of the first record of the FASTA file at `path` as a plain string.'''
    # Open the file ourselves so that it is closed as soon as the first record
    # has been read, instead of staying open inside an un-exhausted parser.
    with open(path) as handle:
        record = next(SeqIO.parse(handle, 'fasta'), None)
    if record is None:
        raise ValueError(f"No FASTA record found in {path!r}")
    return str(record.seq)


def optimal_aligment_cost_from_fasta(path1: str, path2: str)-> int:
    '''
    Calculate the optimal alignment cost of the first sequences stored in two
    FASTA files.

    Args:
        path1, path2: paths of the FASTA files; only the first record of each
            file is used.

    Returns:
        The value of `optimal_aligment_cost` for the two sequences.

    Raises:
        ValueError: if one of the files contains no FASTA record.
    '''
    return optimal_aligment_cost(
        _first_fasta_sequence(path1), _first_fasta_sequence(path2)
    )


In [65]:
# Question 2: optimal cost for the sequences stored in seq1.fasta and seq2.fasta.
optimal_aligment_cost_from_fasta("seq1.fasta", "seq2.fasta")

1346

In [66]:


# Scratch cell: rebuild the dynamic table for the pair AATAAT / AAGG at module
# level and backtrack through it. Every name assigned here (x, y, dim, dyn_mat,
# gap_cost, C, i, j) is a notebook-level global and stays visible to later cells.
x, y = dna2int("AATAAT"), dna2int("AAGG")
# Table with one extra row and column for the empty prefixes; None = not computed yet
dim = (len(x)+1, len(y)+1)
dyn_mat = np.full(dim, None)
gap_cost = -5
# C(i, j): cost of cell (i, j) for the sequences and table defined above
def C(i: int, j: int)-> int:
    return generalized_C(
        i, j, dyn_mat, gap_cost,
        # the lambda's x, y are symbol codes; they shadow the sequences x, y only inside the lambda
        cost= lambda x, y : SCORE_MATRIX[x, y],
        optimize_fn=np.max,sequence1=x, sequence2= y)
# Fill the table row by row
for i in range(dyn_mat.shape[0]):
    for j in range(dyn_mat.shape[1]):
        dyn_mat[i, j] = C(i, j)

# NOTE(review): recursive_back_tracking is defined in a later cell (In [71]);
# on a fresh kernel this call raises NameError unless that cell has been run first.
# The returned lists hold the alignment columns from the last one to the first.
recursive_back_tracking(len(x), len(y), dyn_mat, x, y, SCORE_MATRIX, gap_cost)


([3, 0, 0, 3, 0, 0], [4, 2, 2, 4, 0, 0])

In [71]:
from typing import Tuple
def recursive_back_tracking(
    i: int, j: int, T: np.ndarray,
    A: list[int], B: list[int], score_matrix: np.ndarray, gap_cost: int,
    aligned_1 = None, aligned_2 = None
    )-> Tuple[list[int], list[int]]:
    '''
    Backtrack through a filled dynamic table and recover one optimal alignment.

    Starting at cell (i, j), the walk moves to the predecessor cell that
    explains the current value, preferring the diagonal (symbol vs symbol),
    then the cell above (gap in B), then the cell to the left (gap in A).

    The name is kept for compatibility, but the walk is implemented as a loop:
    one Python stack frame per alignment column would exceed the interpreter's
    recursion limit on long sequences.

    Args:
        i, j: cell to start from, normally (len(A), len(B)).
        T: filled dynamic table.
        A, B: the two encoded sequences.
        score_matrix: substitution scores, indexed by symbol codes.
        gap_cost: cost of one gap column.
        aligned_1, aligned_2: optional lists to append to (mutated in place);
            each one independently defaults to a fresh empty list.

    Returns:
        (aligned_1, aligned_2): the aligned codes of A and B, ordered from the
        LAST alignment column to the first, with gaps encoded as ALPHABET["-"].
        Reverse both lists to read the alignment left to right.
    '''
    if aligned_1 is None:
        aligned_1 = []
    if aligned_2 is None:
        aligned_2 = []

    while True:
        if (i > 0) and (j > 0) and T[i, j] == T[i-1, j-1] + score_matrix[A[i-1], B[j-1]]:
            # Diagonal move: A[i-1] is aligned with B[j-1].
            aligned_1.append(A[i-1])
            aligned_2.append(B[j-1])
            i, j = i - 1, j - 1
        elif (i > 0) and (j >= 0) and T[i, j] == T[i-1, j] + gap_cost:
            # Upward move: A[i-1] is aligned with a gap.
            aligned_1.append(A[i-1])
            aligned_2.append(ALPHABET["-"])
            i -= 1
        elif (i >= 0) and (j > 0) and T[i, j] == T[i, j-1] + gap_cost:
            # Leftward move: B[j-1] is aligned with a gap.
            aligned_1.append(ALPHABET["-"])
            aligned_2.append(B[j-1])
            j -= 1
        else:
            # Cell (0, 0) reached, or no predecessor explains T[i, j].
            break
    return (aligned_1, aligned_2)


In [82]:
def optimal_aligment(x: str, y: str) -> Tuple[str, str]:
    '''
    Calculate one optimal alignment under this project's specifications:
    scores from SCORE_MATRIX, a gap cost of -5, and maximisation (np.max).

    Returns:
        The two aligned sequences as strings of equal length, with '-'
        marking gaps.

    >>> optimal_aligment("", "")
    ('', '')
    >>> optimal_aligment("TCCAGAGA", "TCGAT")
    ('TCCAGAGA', 'T-C-GA-T')
    >>> optimal_aligment("CGTGTCAAGTCT", "ACGTCGTAGCTAGG")
    ('-CGT-GTCAAGT-CT', 'ACGTCGT-AGCTAGG')
    '''
    # Use one local gap cost for both filling and backtracking, so that the
    # function does not depend on a notebook-level variable being defined.
    gap_cost = -5
    # Convert DNA to list of int
    seq_1, seq_2 = dna2int(x), dna2int(y)
    dyn_mat = fill_global_alignment_matrix(seq_1, seq_2, SCORE_MATRIX, gap_cost, np.max)
    aln_1, aln_2 = recursive_back_tracking(
        len(seq_1), len(seq_2), dyn_mat, seq_1, seq_2, SCORE_MATRIX, gap_cost
        )
    # Backtracking yields the columns from last to first; reverse before decoding.
    return (int2dna(reversed(aln_1)), int2dna(reversed(aln_2)))

## Question 3 (optional)
What does an optimal alignment look like for the above two pairs of sequences using the given substitution matrix and gap cost -5? (You probably want to implement the algorithm for finding an optimal alignment by backtracking through the dynamic programming table.)

In [12]:
# def get_optimal_alignment(
#     A: str, B: str,
#     score_mat: np.ndarray, gap_cost=-5, opt_fn=np.max
#     ) -> str:
#     T = initialize_matrix(A, B)
#     A, B = dna2int(A), dna2int(B)
#     T = fill_matrix(
#         T, A, B, score_mat,
#         gap_cost, opt_fn
#     )
#     def cost(x: int, y: int):
#         return score_mat[x, y]
#     def backtracking(i: int, j: int):
#         if (i > 0) and (j > 0) and (T[i,j] == T[i-1, j-1] + cost(A[i], B[j])):
#             return backtracking(i-1, j-1)
#         if (i > 0) and (j >= 0) and (T[i,j] == T[i-1,j] + gap_cost):
#             return backtracking(i-1, j)
#         if (i>=0) and (j > 0) and (T[i,j] == T[i,j-1] + gap_cost):
#             return backtracking(i, j-1)
#     return backtracking(len(A), len(B))

In [11]:
# def get_optimal_alignment(
#     A: str, B: str,
#     score_mat: np.ndarray, gap_cost=-5, opt_fn=np.max
#     ) -> str:
#     T = initialize_matrix(A, B)
#     A, B = dna2int(A), dna2int(B)
#     T = fill_matrix(
#         T, A, B, score_mat,
#         gap_cost, opt_fn
#     )
#     def cost(x: int, y: int):
#         return score_mat[x, y]
#     path = []
#     def backtracking(i: int, j: int):
#         if (i > 0) and (j > 0) and (T[i,j] == T[i-1, j-1] + cost(A[i-1], B[j-1])):
#             path.append((i-1, j-1))
#             #print(f"i: {i-1}, j: {j-1}")
#             return backtracking(i-1, j-1)
#         if (i > 0) and (j >= 0) and (T[i,j] == T[i-1,j] + gap_cost):
#             #print(f"i: {i-1}, j: {j}")
#             path.append((i-1, j))
#             return backtracking(i-1, j)
#         if (i>=0) and (j > 0) and (T[i,j] == (T[i,j-1] + gap_cost)):
#             #print(f"i: {i}, j: {j-1}")
#             path.append((i, j-1))
#             return backtracking(i, j-1)
#     backtracking(len(A), len(B))
#     path.reverse()
#     old_index1, old_index2 = 0, 0
#     new_seq1, new_seq2 = [], []
#     for new_index1, new_index2 in path[1:]:
#         if (old_index1 +1) == new_index1 and (old_index2+1) == new_index2:
#             new_seq1.append(A[old_index1])
#             new_seq2.append(B[old_index2])
#         if old_index1 == new_index1 and (old_index2+1) == new_index2:
#             new_seq1.append(-1)
#             new_seq2.append(B[old_index2])
#         if (old_index1 +1) == new_index1 and old_index2 == new_index2:
#             new_seq1.append(A[old_index1])
#             new_seq2.append(-1)
#         old_index1, old_index2 = new_index1, new_index2
#     return (
#         "".join(from_number_to_dna(x) for x in new_seq1),
#         "".join(from_number_to_dna(x) for x in new_seq2)
#         )


# score_mat = np.matrix(
# [[10, 2, 5, 2],
# [2, 10, 2, 5],
# [5, 2, 10, 2],
# [2, 5, 2, 10]]
# )

# print(get_optimal_alignment(
#     'TCCAGAGA', 'TCGAT',
#     score_mat, gap_cost=-5, opt_fn=np.max
#     ))