In [37]:
from collections import defaultdict

from utils import debruijn_from_patterns, path_to_genome

In [26]:
# Sample 3-mer reads used to try out find_contigs; "ATG" is listed twice.
# NOTE: this name shadows the built-in input(); a later cell reads it by this name.
input = [
    "ATG", "ATG", "TGT", "TGG",
    "CAT", "GGA", "GAT", "AGA",
]

In [28]:
def in_out_degree(a_list):
    """Count incoming and outgoing edges for a graph given as an adjacency mapping.

    Args:
        a_list: mapping of node -> collection of successor nodes. A successor
            that is listed several times is counted once per occurrence.

    Returns:
        A tuple ``(incoming, outgoing)``:
        * ``incoming`` is a ``defaultdict(int)`` holding, for every node that
          appears as a successor, how many edges point at it.
        * ``outgoing`` is a plain ``dict`` holding, for every key of
          ``a_list``, the number of successors listed for it.
        Nodes that only ever appear as successors have no ``outgoing`` entry.
    """
    # Out-degree is simply the length of each node's successor collection.
    outgoing = {source: len(targets) for source, targets in a_list.items()}

    # In-degree: tally every occurrence of a node among the successors.
    incoming = defaultdict(int)
    for targets in a_list.values():
        for target in targets:
            incoming[target] += 1

    return incoming, outgoing

In [34]:
def maximal_nonbranching_paths(a_list):
    """Return every maximal non-branching path of a directed graph.

    A node is "1-in-1-out" when it has exactly one incoming and exactly one
    outgoing edge. A maximal non-branching path starts and ends at nodes that
    are NOT 1-in-1-out and passes only through 1-in-1-out nodes in between.
    Isolated cycles (cycles made up solely of 1-in-1-out nodes, which no
    branching node can reach) are reported as well, as a path that starts and
    ends on the same node.

    Args:
        a_list: mapping of node -> list of successor nodes. Successor lists
            must be indexable, because the single successor of a 1-in-1-out
            node is read with ``[0]``. Repeated successors count as parallel
            edges and yield one path each.

    Returns:
        A list of paths, each a list of nodes. Paths that start at a
        non-1-in-1-out node come first, in the iteration order of ``a_list``;
        isolated cycles follow.
    """
    indegree, outdegree = in_out_degree(a_list)

    def is_one_in_one_out(node):
        # .get() keeps the degree mappings untouched for nodes they do not know.
        return indegree.get(node, 0) == 1 and outdegree.get(node, 0) == 1

    paths = []
    # 1-in-1-out nodes already consumed by some path; whatever is left over
    # after the first pass can only belong to an isolated cycle.
    visited = set()

    # Pass 1: one path per outgoing edge of every non-1-in-1-out node.
    for node in a_list:
        if is_one_in_one_out(node):
            continue
        for out in a_list[node]:
            path = [node, out]
            while is_one_in_one_out(out):
                visited.add(out)
                out = a_list[out][0]
                path.append(out)
            paths.append(path)

    # Pass 2: isolated cycles. An unvisited 1-in-1-out node has a unique
    # predecessor that is itself an unvisited 1-in-1-out node (otherwise pass 1
    # would have walked through it), so following successors returns to it.
    for node in a_list:
        if node in visited or not is_one_in_one_out(node):
            continue
        visited.add(node)
        cycle = [node]
        current = a_list[node][0]
        while current != node:
            visited.add(current)
            cycle.append(current)
            current = a_list[current][0]
        cycle.append(node)  # close the cycle on its starting node
        paths.append(cycle)

    return paths

In [38]:
def find_contigs(patterns):
    """Return the sorted contigs derived from a collection of patterns.

    The patterns are handed to ``debruijn_from_patterns`` (imported from
    ``utils``; presumably builds the de Bruijn adjacency list -- confirm
    there), the resulting graph is split with ``maximal_nonbranching_paths``,
    and each path is converted with ``path_to_genome``.

    Args:
        patterns: the reads/patterns passed on to ``debruijn_from_patterns``.

    Returns:
        A new list with one ``path_to_genome`` result per path, sorted
        ascending.
    """
    de_bruijn = debruijn_from_patterns(patterns)
    contigs = [
        path_to_genome(nonbranching)
        for nonbranching in maximal_nonbranching_paths(de_bruijn)
    ]
    contigs.sort()
    return contigs

In [41]:
# Try find_contigs on the `input` list and print the contigs separated by single spaces.
print(" ".join(find_contigs(input)))

AGA ATG ATG CAT GAT TGGA TGT


In [42]:
# Load one pattern per line from the dataset file (surrounding whitespace
# stripped), then print the contigs separated by single spaces.
# NOTE: the list is bound to `input`, which shadows the built-in of that name.
with open("../data/dataset_205_5.txt", "r") as fin:
    input = [line.strip() for line in fin]
print(" ".join(find_contigs(input)))

AAAAATTTAAACCTTAGCCCACAACAATGGTTTTCAGTGCAGGTTTGCTGACCTTCGTTGTTTGCAACACCGGACGTACGCGTCGTCAACGTCTAACGCTCGTCGATTGGTGATTACTAGGAACCCCGTGTCGAAGCAG AAAACTAGTTCCCATCCGCCCGGCAACTTGCATCTTCCAGCGAGCTCACCACACTCGCCCTGTGATTAACC AAAACTAGTTCCCATCCGCCCGGCAACTTGCATCTTCCAGCGAGCTCACCACACTCGCCCTGTGATTAACC AAAATCGGACTAGTTGAATAACGAGGTATCAATCAACGCGCTGTCACTCGTGCGGCAGACTGTTCACCAAA AAAATCGGACTAGTTGAATAACGAGGTATCAATCAACGCGCTGTCACTCGTGCGGCAGACTGTTCACCAAA AAACAGGAATGTATATTCTTATGTGTGGCGAAGTATGCGTGGTCTGGAATAGTCTATCCTTGATCACCACAAATGCCTTTAATGCCATTCGAGCTGGTCCGAGTATCTCTCGACCACCTTATATGCTGACAGTAACCGGT AAACTAGTTCCCATCCGCCCGGCAACTTGCATCTTCCAGCGAGCTCACCACACTCGCCCTGTGATTAACCC AAACTAGTTCCCATCCGCCCGGCAACTTGCATCTTCCAGCGAGCTCACCACACTCGCCCTGTGATTAACCC AAATCGGACTAGTTGAATAACGAGGTATCAATCAACGCGCTGTCACTCGTGCGGCAGACTGTTCACCAAAT AAATCGGACTAGTTGAATAACGAGGTATCAATCAACGCGCTGTCACTCGTGCGGCAGACTGTTCACCAAAT AAATCTGTGGAGATGACCCCCCTAAAAGCGACGACACGATGCGAGCAGGCGACTGGATCACACATCATGAC AAATCTGTGGAGATGACCCCCCTAAAAGCGACGACACGATGCGAGCAGGCGACTGGATCACACATCATGAC