-
Notifications
You must be signed in to change notification settings - Fork 1
/
mult.py
93 lines (73 loc) · 3.29 KB
/
mult.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import sys
from itertools import product
from rosalind_utility import parse_fasta
def multiple_alignment(str_list):
str_list = ["-" + string for string in str_list]
score_mat = {}
backtrack_mat = {}
def add_tuples_elemwise(t1, t2):
return tuple(sum(x) for x in zip(t1, t2))
## all possible "moves"
perm_list = list(product([0, -1], repeat=len(str_list)))[1:]
## fill n-dimensional score and backtrack matrices
for index in product(*map(range, map(lambda s: len(s) + 1, str_list))):
if index.count(0) >= len(str_list) - 1:
if sum(index) == 0:
score_mat[index] = 0
else:
score_mat[index] = -max(index)
move = tuple(0 if id == 0 else -1 for id in index)
bck = -1
for idx, perm in enumerate(perm_list):
if perm == move:
bck = idx
break
backtrack_mat[index] = bck
else:
possible_scores = []
for perm_idx, move in enumerate(perm_list):
prev_idx = add_tuples_elemwise(index, move)
if -1 not in prev_idx:
prev_score = score_mat[prev_idx]
chars = [str_list[i][index[i] - 1] if val == -1 else "-" for i, val in enumerate(move)]
current_cost = 0
for i in range(len(chars) - 1):
for j in range(i + 1, len(chars)):
if chars[i] != chars[j]:
current_cost -= 1
possible_scores.append((prev_score + current_cost, perm_idx))
score_mat[index], backtrack_mat[index] = max(possible_scores, key=lambda p: p[0])
## backtrack
alignment = ["" for x in str_list]
current_index = list(map(len, str_list))
max_score = score_mat[tuple(current_index)]
while sum(current_index) != 0:
back_perm_idx = backtrack_mat[tuple(current_index)]
permutation = perm_list[back_perm_idx]
for i, perm_value in enumerate(permutation):
if perm_value == 0:
alignment[i] = "-" + alignment[i]
else:
alignment[i] = str_list[i][current_index[i] - 1] + alignment[i]
current_index = add_tuples_elemwise(tuple(current_index), permutation)
## remove all "-" columns
to_rm_idx = []
for pos in range(len(alignment[0])):
temp = [x[pos] for x in alignment]
if all(x == "-" for x in temp):
to_rm_idx.append(pos)
for i in range(len(alignment)):
alignment[i] = "".join([char for idx, char in enumerate(alignment[i]) if idx not in to_rm_idx])
return max_score, alignment
if __name__ == "__main__":
'''
Given: A collection of four DNA strings of length at most 10 bp in FASTA format.
Return: A multiple alignment of the strings having maximum score, where we score matched symbols 0 (including
matched gap symbols) and all mismatched symbols -1 (thus incorporating a linear gap penalty of 1).
'''
input_lines = sys.stdin.read().splitlines()
DNA_strings_list = list(parse_fasta(input_lines).values())
score, alignment = multiple_alignment(DNA_strings_list)
print(score)
for aligned in alignment:
print(aligned)