In [2]:
def FASTA_iterator(fasta_filename):

    """
    A generator function that reads a FASTA file. At each iteration, the
    function yields a tuple with the format (identifier, sequence).

    The identifier is the header line without its leading '>' and without
    surrounding whitespace. The sequence is the concatenation of all
    following non-header lines, each stripped of surrounding whitespace.

    Notes on edge cases:
    - An empty file (no header and no residues) yields nothing.
    - Residue lines that appear before the first header are yielded with an
      empty identifier.
    - A header that is immediately followed by another header (no residues
      in between) is not yielded; a header with no residues at the very end
      of the file is yielded with an empty sequence.

    The file is opened lazily, on the first iteration, so errors from
    open() surface when the generator is first advanced.
    """

    identifier = ''
    # Collect the pieces of the current sequence and join them once per
    # record instead of growing a string with repeated '+='.
    chunks = []
    with open(fasta_filename, 'rt') as fasta:
        for line in fasta:
            if line.startswith('>'):
                # A new header closes the record collected so far, provided
                # that record actually has residues.
                if chunks:
                    yield (identifier, ''.join(chunks))
                    chunks = []
                identifier = line[1:].strip()
            else:
                residues = line.strip()
                if residues:
                    chunks.append(residues)
    # Emit the trailing record, but only if something was actually read;
    # this avoids producing a bogus ('', '') record for an empty file.
    if identifier or chunks:
        yield (identifier, ''.join(chunks))

In [69]:
def parsimony_add(s1, s2):
    """
    Append s2 to s1 while sharing their longest overlap.

    The overlap is the longest suffix of s1 that is also a prefix of s2.
    That shared part is written only once in the result. With no overlap
    the result is simply s1 followed by s2.
    """
    left_len = len(s1)
    # Try the largest candidate overlap first so the first hit is the longest.
    for size in range(min(left_len, len(s2)), 0, -1):
        if s1[left_len - size:] == s2[:size]:
            return s1[:left_len - size] + s2
    return s1 + s2

def overlap(s1, s2):
    """
    Tell whether s1 overlaps s2 by more than half of the longer string.

    The overlap is the longest suffix of s1 that equals a prefix of s2.
    Returns True when twice that overlap length exceeds the length of the
    longer of the two strings, and False otherwise.
    """
    left_len = len(s1)
    right_len = len(s2)
    shared = 0
    # Scan from the largest possible overlap downwards; the first match is
    # therefore the longest one.
    for size in range(min(left_len, right_len), 0, -1):
        if s1[left_len - size:] == s2[:size]:
            shared = size
            break
    return 2 * shared > max(left_len, right_len)

def build_graph(dict_of_nodes):
    """
    Build a directed overlap graph from a mapping of node name -> string.

    The result maps every node name to the set of other node names whose
    string is overlapped by this node's string, as decided by overlap().
    A node is never linked to itself.
    """
    return {
        source: {
            target
            for target in dict_of_nodes
            if source != target
            and overlap(dict_of_nodes[source], dict_of_nodes[target])
        }
        for source in dict_of_nodes
    }

In [70]:
def find_all_paths(graph, start, end, path=None):
    """
    Return every simple path from start to end in a directed graph.

    Parameters:
        graph: mapping of node -> iterable of successor nodes.
        start: node where the search begins.
        end: node where each returned path must finish.
        path: nodes already visited before start (used by the recursion);
            defaults to an empty prefix.

    Returns:
        A list of paths, each path being a list of nodes that begins with
        the given prefix followed by start and finishes at end. No node is
        visited twice within a path. The list is empty when end cannot be
        reached.

    Raises:
        KeyError: if a node that has to be expanded is not a key of graph.
    """
    # Use None as the default instead of a shared mutable list; a fresh
    # list is built on every call so the caller's prefix is never mutated.
    path = ([] if path is None else path) + [start]
    if start == end:
        return [path]
    paths = []
    # A node without successors simply contributes no paths.
    for node in graph[start]:
        if node not in path:
            paths.extend(find_all_paths(graph, node, end, path))
    return paths

In [85]:
def search_hamiltonian(graph):
    """
    Look for a path that visits every node of the graph exactly once.

    Every ordered pair of nodes is tried as (first, last) endpoints, and
    the paths between them come from find_all_paths(). The first path whose
    length equals the number of nodes is returned; if there is none, the
    result is None.
    """
    node_count = len(graph)
    # The generator is lazy, so the search stops at the first full-length path.
    full_length_paths = (
        route
        for first in graph
        for last in graph
        for route in find_all_paths(graph, first, last)
        if len(route) == node_count
    )
    return next(full_length_paths, None)

def superstring(fasta_filename):
    """
    Assemble the reads of a FASTA file into a single superstring.

    The reads are loaded with FASTA_iterator(), linked into an overlap
    graph with build_graph(), ordered along a path that visits every read
    once (search_hamiltonian()), and finally merged left to right with
    parsimony_add() so that each overlap is written only once.

    Parameters:
        fasta_filename: path of the FASTA file holding the reads.

    Returns:
        The assembled string, or '' when the file contains no reads.

    Raises:
        ValueError: if the reads cannot be chained into a single path that
            uses every read.

    Note: reads are stored in a dict keyed by identifier, so if two records
    share an identifier only the last one is kept.
    """
    reads = dict(FASTA_iterator(fasta_filename))
    if not reads:
        return ''
    graph = build_graph(reads)
    path = search_hamiltonian(graph)
    if path is None:
        # Fail with a clear message instead of a TypeError from iterating None.
        raise ValueError(
            'no ordering of the reads in %r chains all of them by overlap'
            % (fasta_filename,)
        )
    assembled = ''
    for node in path:
        assembled = parsimony_add(assembled, reads[node])
    return assembled

In [90]:
# Assemble the reads of the small example input. The call is the last
# expression of the notebook cell, so its return value is shown as the
# cell output.
fasta_filename = '/home/hanuman/docs/biomatics/rosalind/superstring/input-example.txt'
superstring(fasta_filename)

'ATTAGACCTGCCGGAATAC'

In [92]:
# Assemble the reads of the full input file and print the resulting
# superstring.
fasta_filename = '/home/hanuman/docs/biomatics/rosalind/superstring/input.txt'
print(superstring(fasta_filename))

ACCTCAACCGAACGCCTCAGACTATCTGTATCTATTTACGGACTATGTTGGGTGCTTGAGGGGTAAGCCCACGGTGTTGTTATACTGGCTGCCTGTACGGTGTTGCGGACAAGTATCTGTCGTTGGTGCGCCCTCCGCTAATCCGACTGAAGTATCGTCTAGAAAAGTCAACCCCGACTCGCTCTTGATGCCAATCTGATCTGCCAACTGTGTCCACACCGGTGCATCCTCTGCTTATCATCCTTGCCAAATTAACGTTCCAATGATCACTTCAATGGAATTGTGTATCTGCCTCGCGTTCGACGTGTATTTAGAGGAACTATTAAACGGTATTGTCACGTTTAGGTAGCGGCTCCTCGATAAGGCACCCCATGGATTGCTCCACAAGTATAAGGCGCACCGAGAGCTTCCAATCGGGACTCGAGGTGTATCTGGAGGGTATAGAAATCTTCTAAACATTGCATCTGGGTTTACCGTCAACCTCTTAAGCACTGCCCTGTATGTCCCAAATATTCAGAGACAGTATATGTTCCATGGCATGCGGGACTCAGGTATCATACCTCATAGGACTCTGTACGCGCCGGAAAAGCCTAGAAAGGTATATAGTACGCATAATGGTATGGCCCGCGCGAAGCTATAACCCGTTGAATCAATCAGCGTCTATGATCCTGCAGGTCTCCATATCTGCCTACCTTAAGGCAATTGTGTTCCTTACGCGGGCGTTCATGAGATTCCTTAAGCTCTGTTTGGGCGAATGCCGTAAGAGTGCCCATAAATACATGAAGGCGTCATGACGTGGTGCGGAACGTGTACCGTACGACCACCCTCGGCCACAAAGTGCTCGATCATCTTCTGCAAGTGGTGAATGGGAGATCCCAGTATGCTGGATCAAAATATTCGACACTCCTGCTTTTATCGCGAGAAGTCTCCCACCTTAGGCCAGACTTATCTCCCGTTTCAGGCAAATAAGACCGCGCCGATGGAGTGAAGGGCAAAGTTG