In [None]:
#Helper functions

#convert list of bases ([0,0,1]) to string ("AAT")
def to_base_string(s):
    """Convert a sequence of base codes into its string form.

    Codes map as 0 -> 'A', 1 -> 'T', 2 -> 'G', 3 -> 'C', 4 -> '_'.

    Args:
        s: iterable of integer base codes, e.g. [0, 0, 1].

    Returns:
        The decoded string, e.g. "AAT". An empty input gives "".

    Raises:
        IndexError: if a code is outside 0..4. Negative codes are rejected
            explicitly because plain indexing would silently wrap around
            (-1 would decode as '_').
    """
    bases = "ATGC_"
    chars = []
    for val in s:
        if not 0 <= val < len(bases):
            raise IndexError(f"Invalid base code {val}, expected 0..{len(bases) - 1}.")
        chars.append(bases[val])
    # join once instead of growing a string with += for every base
    return "".join(chars)

#convert base string ("AAT") to list of bases ([0,0,1])
def to_base_list(base_s):
    """Convert a base string into a list of integer base codes.

    'A' -> 0, 'T' -> 1, 'G' -> 2, 'C' -> 3, '_' -> 4, so "AAT" becomes
    [0, 0, 1].

    Raises:
        Exception: if the string contains any other character.
    """
    code_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3, '_': 4}
    codes = []
    for b in base_s:
        code = code_of.get(b)
        if code is None:
            raise Exception(f"Invalid base string, contains character {b}.")
        codes.append(code)
    return codes

In [None]:
#Visualizing functions

def print_find_alignment(s,s1,s2,i1,i2):
    """Print one candidate alignment of s1 and s2 together with its score.

    s1 is padded on the left with i2 underscores and s2 with i1 underscores,
    so position i1 of s1 and position i2 of s2 end up in the same column.
    The score s is printed below, followed by a "----" separator line.
    """
    first_row = "_" * i2 + to_base_string(s1)
    second_row = "_" * i1 + to_base_string(s2)
    for row in (first_row, second_row, f"score:{s}", "----"):
        print(row)
    
def show_alignment(s,i):
    """Print sequence s at column offset i, followed by the offset itself.

    Every line starts with a margin of '+' characters: 10 of them for a
    non-negative offset, 10 + i for a negative one. The margin is followed
    by i underscores (none when i is negative) and the decoded sequence.
    """
    margin = 10 + i if i < 0 else 10
    line = '+' * margin + '_' * i + to_base_string(s) + f"    :{i}"
    print(line)

In [None]:
#score and alignment functions

#calculate number of bases overlapping
#AAA
# AAA
#overlap = 2
def calc_overlap(s1,s2,i1,i2):
    """Return how many positions overlap when s1[i1] is lined up with s2[i2].

    This is the smaller of the number of elements left in s1 from i1 onwards
    and the number left in s2 from i2 onwards.
    """
    left_in_first = len(s1) - i1
    left_in_second = len(s2) - i2
    if left_in_first <= left_in_second:
        return left_in_first
    return left_in_second

#score calculating function of a particular alignment eg
#AAAT
# AATT
def calc_score_basic(s1,s2,i1,i2):
    """Score the alignment that pairs s1 from index i1 with s2 from index i2.

    The two sequences are walked in lockstep until either one runs out.
    Each compared pair adds +1 when the bases are equal and -1 otherwise;
    the result of calc_overlap for the same arguments is added on top.
    """
    bonus = calc_overlap(s1, s2, i1, i2)
    paired_positions = zip(range(i1, len(s1)), range(i2, len(s2)))
    match_balance = sum(
        1 if s1[a] == s2[b] else -1
        for a, b in paired_positions
    )
    return match_balance + bonus

#Find the most suitable alignment of two base lists, eg AAAT, AATT
def find_alignment(s1,s2,with_print = False):
    """Slide s2 along s1 and return the best scoring placement.

    The walk starts with the last element of s1 lined up with the first
    element of s2, moves s2 leftwards until both starts coincide, and then
    keeps moving so that s2 starts before s1. Every placement is scored with
    calc_score_basic; the first placement with the strictly highest score wins.

    Args:
        s1: base list that stays fixed.
        s2: base list being placed.
        with_print: when true, print every candidate via print_find_alignment.

    Returns:
        A tuple (score, s2, shift). shift is the start of s2 relative to the
        start of s1 (negative when s2 starts before s1). If s2 is empty no
        placement is tried and (-100, [], 0) is returned.
    """
    best_score, best_seq, best_shift = -100, [], 0
    pos1, pos2 = len(s1) - 1, 0
    end2 = len(s2)

    while pos2 < end2:
        current = calc_score_basic(s1, s2, pos1, pos2)

        if with_print:
            print_find_alignment(current, s1, s2, pos1, pos2)

        if current > best_score:
            best_score, best_seq = current, s2
            best_shift = pos1 if pos1 > 0 else -1 * pos2

        # first walk back through s1, then advance through s2
        if pos1 > 0:
            pos1 -= 1
        else:
            pos2 += 1

    return best_score, best_seq, best_shift

#wrapper
def align(s1,s2, with_print = False):
    """Run find_alignment and return only the placed sequence and its shift.

    The score reported by find_alignment is dropped.
    """
    _, placed, shift = find_alignment(s1, s2, with_print)
    return placed, shift

In [None]:
#test
# Smoke test: s1 ends with "CCGG" and s2 starts with "CCGG".
s1 = to_base_list("AATTCCGG")
s2 = to_base_list("CCGGTT")
# The third argument is with_print: every candidate placement and its score
# is printed while searching. The call returns (s2, shift).
align(s1,s2,True)

In [None]:
import numpy as np

path = "d:/programming/python/data/assemblydata.npy" #path to the label / unassembled sequences

# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code embedded in the file. Only load files you trust.
file = np.load(path, allow_pickle=True)
# presumably entry 0 holds the unassembled reads and entry 1 the reference
# sequence; verify against the script that produced the file
sequences = file[0]
reference = file[1]

# encode every read string as a list of base codes
sequences_list = [to_base_list(read) for read in sequences]

In [None]:
import collections

# alignments collects [offset, sequence] pairs; offsets are column positions
# measured from the start of the first sequence (which sits at offset 0).
alignments = []
# one past the right-most column covered by any placed sequence
max_length = 0

f = 0
t = len(sequences_list) - 1

#align first 2 sequences
s1 = sequences_list[f]
s2 = sequences_list[f+1]
b2, index = align(s1,s2)
alignments.append([0, s1])
alignments.append([index,b2])
# The first two reads must count towards the layout width as well; otherwise
# max_length stays 0 when there are only two reads, or is too small when an
# early read reaches further right than all later ones.
max_length = max(max_length, len(s1), index + len(b2))

#align next sequence with the last aligned
for i in range(f+1, t):
    s2 = sequences_list[i+1]
    indexes = []
    for j in range(0,2):
        ref = i - j - f  # position in alignments of the neighbour to compare with
        if(ref < 0):
            continue
        ref_offset, s1 = alignments[ref]

        sequence, shift = align(s1,s2)
        # shift is relative to that neighbour; convert it to an absolute column
        # so that votes obtained from different neighbours are comparable
        indexes.append(ref_offset + shift)

    # most_common breaks ties by first occurrence, so when the two candidates
    # disagree the one derived from the most recently placed read wins
    common_index = collections.Counter(indexes).most_common(1)[0][0]
    index = common_index
    alignments.append([index,s2])

    max_length = max(max_length, index + len(s2))

#visualize alignments
for offset, placed in alignments:
    show_alignment(placed, offset)

In [None]:
#consensus - column by column, pick the most frequently occurring base in that column
import collections

# Take the column range from the alignments themselves. Offsets can be
# negative (a read placed left of the first one), and a range starting at 0
# would silently drop those columns from the consensus.
first_col = min((offset for offset, _ in alignments), default=0)
last_col = max((offset + len(seq) for offset, seq in alignments), default=0)

dna = []
for col in range(first_col, last_col):
    # bases of all reads that cover this column, in placement order
    col_bases = [
        seq[col - offset]
        for offset, seq in alignments
        if offset <= col < offset + len(seq)
    ]

    # columns covered by no read are skipped; ties go to the base seen first
    if col_bases:
        dna.append(collections.Counter(col_bases).most_common(1)[0][0])
print(to_base_string(dna))