In [22]:
import numpy as np
from typing import Tuple
import itertools
from Bio import SeqIO, SeqRecord, Seq

In [23]:
# Substitution costs indexed by integer symbol codes (rows/columns in the
# order A, C, G, T, '-'): 0 on the diagonal, 2 for A<->G and C<->T, and 5 for
# every other pair, including any pairing of a nucleotide with a gap.
# A plain ndarray is used instead of np.matrix, which NumPy no longer
# recommends; scalar lookups such as SCORE_MATRIX[x, y] behave the same.
SCORE_MATRIX = np.array(
    [[0, 5, 2, 5, 5],  # A
     [5, 0, 5, 2, 5],  # C
     [2, 5, 0, 5, 5],  # G
     [5, 2, 5, 0, 5],  # T
     [5, 5, 5, 5, 0]]  # -
)
# Symbol -> integer code (the code is the row/column in SCORE_MATRIX).
# Both spellings are bound because the notebook refers to the mapping
# under either name.
ALPHABET = alphabet = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-': 4}
# Integer code of the gap symbol, derived from the alphabet so the two
# definitions cannot drift apart.
GAP_CHAR = ALPHABET['-']
def dna2int(x: str)-> list[int]:
    '''
    Encode a DNA string as a list of integer symbol codes.

    Every character is looked up in ``ALPHABET`` (A, C, G, T and the gap
    symbol '-'). The lookup is case-insensitive, so lowercase bases get the
    same codes as their uppercase forms.

    Args:
        x: the sequence to encode.

    Returns:
        One integer code per character of ``x``, in order.

    Raises:
        ValueError: if ``x`` contains a character that is not in ``ALPHABET``.

    >>> dna2int('ACGT-A')
    [0, 1, 2, 3, 4, 0]
    '''
    codes = []
    for position, char in enumerate(x):
        code = ALPHABET.get(char.upper())
        if code is None:
            # Fail here rather than letting None flow into NumPy indexing,
            # where it would be interpreted as "insert a new axis".
            raise ValueError(
                f"invalid DNA symbol {char!r} at position {position}"
            )
        codes.append(code)
    return codes
def int2dna(x: list[int])-> str:
    '''
    Decode integer symbol codes back into a DNA string.

    Args:
        x: iterable of integer codes as found in the values of ``ALPHABET``.

    Returns:
        The string made of the symbols matching each code, in order.

    Raises:
        ValueError: if a code is not a value of ``ALPHABET``.

    >>> int2dna([0, 1, 2, 3, 4, 0])
    'ACGT-A'
    '''
    # Invert the mapping explicitly instead of assuming that the insertion
    # order of ALPHABET's keys equals the numeric order of its codes; this
    # also stops negative codes from silently wrapping around.
    symbol_of = {code: symbol for symbol, code in ALPHABET.items()}
    symbols = []
    for code in x:
        if code not in symbol_of:
            raise ValueError(f"unknown symbol code {code!r}")
        symbols.append(symbol_of[code])
    return "".join(symbols)

def score_sum_pairs(x: int, y: int, z: int)-> int:
    """
    Sum-of-pairs cost of a single three-way alignment column.

    Args:
        x, y, z: integer symbol codes, used as indices into ``SCORE_MATRIX``.

    Returns:
        The sum of the three pairwise costs (x,y), (x,z) and (y,z) as a
        built-in ``int``.

    >>> score_sum_pairs(0, 0, 0)
    0
    >>> score_sum_pairs(0, 1, 2)
    12
    """
    # int() strips the NumPy scalar type so the result matches the declared
    # return type and prints as a plain number on every NumPy version.
    return int(SCORE_MATRIX[x, y] + SCORE_MATRIX[x, z] + SCORE_MATRIX[y, z])

Now we define the function that computes the exact three-dimensional cost matrix for a global alignment:

In [24]:
def get_plausible_combinations(index):
    """Enumerate the alignment columns that can end at ``index``.

    Each result is ``index`` with some components replaced by 0. Components
    are 1-indexed positions in the sequences, so 0 means "gap in this
    sequence". The column that is a gap in every sequence is never returned.
    Works for any number of sequences (``len(index)``) and yields tuples of
    plain Python ints when ``index`` holds plain ints.

    >>> get_plausible_combinations((0, 0, 0))
    set()
    >>> sorted(get_plausible_combinations((22, 1, 10)))
    [(0, 0, 10), (0, 1, 0), (0, 1, 10), (22, 0, 0), (22, 0, 10), (22, 1, 0), (22, 1, 10)]
    """
    combinations = set()
    # Masks are visited in itertools.product order and added one by one; keep
    # it that way, because callers iterate the set directly and the
    # insertion sequence can influence the resulting iteration order.
    for mask in itertools.product((0, 1), repeat=len(index)):
        combination = tuple(position * keep for position, keep in zip(index, mask))
        # Skip the all-gap column, whatever the number of sequences.
        if any(combination):
            combinations.add(combination)
    return combinations
def get_previous_index(index, comb):
    """Return the matrix cell that precedes ``index`` for the column ``comb``.

    Every component of ``index`` whose matching entry in ``comb`` is truthy
    (the sequence contributes a symbol) moves back by one; components whose
    entry is 0 (gap) stay where they are.

    >>> get_previous_index((1, 1, 1), (1, 1, 1))
    (0, 0, 0)
    >>> get_previous_index((15, 12, 1), (1, 0, 1))
    (14, 12, 0)
    """
    predecessor = []
    for position, consumed in zip(index, comb):
        if consumed:
            position = position - 1
        predecessor.append(position)
    return tuple(predecessor)

def compute_3k_matrix(seq1: str, seq2: str, seq3: str)-> np.ndarray:
    """
    Fill the dynamic-programming cost matrix for a three-way global alignment.

    ``D[i, j, k]`` is the minimum sum-of-pairs cost of aligning the first
    ``i``, ``j`` and ``k`` symbols of ``seq1``, ``seq2`` and ``seq3``; the
    cost of aligning the complete sequences is therefore ``D[-1, -1, -1]``.

    Args:
        seq1, seq2, seq3: the sequences, as strings accepted by ``dna2int``.

    Returns:
        Integer array of shape ``(len(seq1) + 1, len(seq2) + 1, len(seq3) + 1)``.

    The examples wrap the result in ``int()`` so that the printed value does
    not depend on how the installed NumPy version displays its scalars.

    >>> int(compute_3k_matrix("", "", "")[-1, -1, -1])
    0
    >>> D = compute_3k_matrix("GTTCCGAAAGGCTAGCGCTAGGCGCC", "ATGGATTTATCTGCTCTTCG", "TGCATGCTGAAACTTCTCAACCA")
    >>> int(D[-1, -1, -1])
    198
    """
    sequences = [dna2int(seq1), dna2int(seq2), dna2int(seq3)]
    D = np.zeros(tuple(len(sequence) + 1 for sequence in sequences), dtype="int")
    # np.ndindex walks the cells in C order, so every predecessor of a cell
    # (no component larger, at least one smaller) is final before it is read.
    for index in np.ndindex(D.shape):
        candidate_costs = []
        for comb in get_plausible_combinations(index):
            # Symbols of the column ending here: position v is 1-indexed, 0 is a gap.
            column = [
                sequence[v - 1] if v else GAP_CHAR
                for v, sequence in zip(comb, sequences)
            ]
            candidate_costs.append(
                D[get_previous_index(index, comb)] + score_sum_pairs(*column)
            )
        # The origin has no predecessor and keeps its initial cost of 0.
        if candidate_costs:
            D[index] = min(candidate_costs)
    return D



Now we are going to run a longer test (this should return a cost of 1482):

In [25]:
# long_test = "tests/testdata_long.txt"
# compute_3k_matrix(
#     *[str(x.seq) for x in SeqIO.parse(long_test,'fasta')]
#     )[-1, -1, -1]

In [26]:
def linear_backtrack(D: np.ndarray,A: str, B: str, C: str)-> Tuple[str, str, str]:
    """Recover one optimal alignment by walking back through the cost matrix.

    Starting from the last cell of ``D``, each step picks a column whose cost
    plus the cost stored in the preceding cell equals the cost of the current
    cell, and moves to that preceding cell. Every step decreases at least one
    index component, so the number of steps is at most the sum of the three
    dimensions of ``D``.

    Args:
        D: cost matrix as produced by ``compute_3k_matrix`` for A, B and C.
        A, B, C: the three sequences that were aligned.

    Returns:
        The three aligned sequences, with '-' marking gaps.

    Raises:
        ValueError: if some visited cell cannot be explained by any of its
            predecessors, i.e. ``D`` is not consistent with the sequences
            (previously this case looped forever).
    """
    sequences = [dna2int(A), dna2int(B), dna2int(C)]
    aligned_sequences = [[], [], []]
    index = tuple(size - 1 for size in D.shape)
    while any(index):
        for comb in get_plausible_combinations(index):
            previous_pos = get_previous_index(index, comb)
            new_aligned_col = [
                sequence[v - 1] if v else GAP_CHAR
                for v, sequence in zip(comb, sequences)
            ]
            if D[index] == D[previous_pos] + score_sum_pairs(*new_aligned_col):
                for aligned, code in zip(aligned_sequences, new_aligned_col):
                    aligned.append(code)
                index = previous_pos
                break
        else:
            # No predecessor reproduces this cell's cost: without this guard
            # the while loop would spin on the same cell indefinitely.
            raise ValueError(
                f"cost matrix is inconsistent with the sequences at cell {index}"
            )
    # Columns were collected from the end of the alignment towards the start.
    return tuple(int2dna(aligned[::-1]) for aligned in aligned_sequences)

def calculate_3k_alignment(seq1: str,seq2: str,seq3: str):
    """
    Globally align three sequences with minimum sum-of-pairs cost.

    The complete cost matrix is computed first and then backtracked; the
    result is a tuple with the three gapped sequences.
    >>> calculate_3k_alignment("GTTCCGAAAGGCTAGCGCTAGGCGCC", "ATGGATTTATCTGCTCTTCG", "TGCATGCTGAAACTTCTCAACCA")
    ('GTTCCGAAAGGCTAGCGCTAGGC-GCC-', 'AT---GGAT--TT-AT-CTGCTC-TTCG', '-T---GCATG-CTGAAACTTCTCAACCA')
    """
    return linear_backtrack(
        compute_3k_matrix(seq1, seq2, seq3), seq1, seq2, seq3
    )



In [27]:
# Run every docstring example (the ``>>>`` lines) found in the current module,
# which in a notebook is the interactive namespace holding the functions above.
# The call is left as the cell's last bare expression so that the returned
# TestResults tally is displayed as the cell output.
import doctest
doctest.testmod()

TestResults(failed=0, attempted=12)