In [1]:
def overlap(seq_a, seq_b, minlength):
    """
    Return the length of the longest suffix of seq_a that is also a prefix of seq_b.

    Candidate positions are the occurrences in seq_a of the first `minlength`
    characters of seq_b, scanned left to right, so the first position whose
    whole suffix matches the start of seq_b gives the longest overlap.
    Returns 0 when no such position exists.
    """
    seed = seq_b[:minlength]  # prefix of seq_b that every overlap must begin with
    pos = seq_a.find(seed)
    while pos != -1:
        # Seed found at pos: accept only if everything from pos to the end of
        # seq_a lines up with the beginning of seq_b.
        if seq_b.startswith(seq_a[pos:]):
            return len(seq_a) - pos
        # Otherwise resume the search one character to the right.
        pos = seq_a.find(seed, pos + 1)
    return 0

In [4]:
from itertools import permutations
# Brute-force shortest common superstring (SCS) algorithm
def brute_force_assembly(reads):
    """
    Return a shortest common superstring obtained by trying every ordering of reads.

    For each permutation, adjacent reads are glued together, collapsing their
    suffix/prefix overlap (minimum overlap length 1); reads that do not overlap
    are simply concatenated. The first shortest result in permutation order is
    returned. An empty input yields the empty string.

    Every permutation is visited, so the running time grows factorially with
    the number of reads.
    """
    reads = list(reads)
    if not reads:
        # permutations() of an empty input yields a single empty tuple, which
        # has no first read to start a superstring from.
        return ""

    shortest_superstring = None
    for ordering in permutations(reads):
        current_superstring = ordering[0]
        for left, right in zip(ordering, ordering[1:]):
            overlap_len = overlap(left, right, 1)
            # With no overlap (overlap_len == 0) the whole right read is appended.
            current_superstring += right[overlap_len:]
        if shortest_superstring is None or len(current_superstring) < len(shortest_superstring):
            shortest_superstring = current_superstring
    return shortest_superstring

In [5]:
# Greedy shortest common superstring (SCS) algorithm
def find_max_overlap(reads, min_len_overlap=2):
    """
    Find the ordered pair of reads with the largest suffix/prefix overlap.

    Every ordered pair of distinct positions in reads is scored with
    overlap(); on ties the pair seen first is kept. Returns a tuple
    (read_a, read_b, overlap_length), or (None, None, 0) when no pair has a
    positive overlap.
    """
    best = (None, None, 0)
    for left, right in permutations(reads, 2):  # ordered pairs: (x, y) and (y, x) are both tried
        shared = overlap(left, right, min_len_overlap)
        if shared > best[2]:
            best = (left, right, shared)
    return best

def greedy_SCS(reads, min_len_overlap):
    """
    Build a common superstring of reads by greedy merging.

    Repeatedly takes the pair of reads with the largest overlap (as reported by
    find_max_overlap with the given minimum overlap length), replaces the pair
    with their merged string, and stops once no pair overlaps; whatever remains
    is concatenated and returned.

    The result is a superstring of the reads but is not guaranteed to be the
    shortest one. The reads argument itself is not modified.
    """
    # Work on a private copy: the merging below removes and appends entries,
    # which would otherwise destroy the caller's list (and fail on tuples).
    pool = list(reads)
    read_a, read_b, max_overlap_len = find_max_overlap(pool, min_len_overlap)
    while max_overlap_len > 0:
        pool.remove(read_a)
        pool.remove(read_b)
        pool.append(read_a + read_b[max_overlap_len:])
        read_a, read_b, max_overlap_len = find_max_overlap(pool, min_len_overlap)
    # No overlapping pair is left: join the leftovers as they are.
    return "".join(pool)

In [7]:
# Three rotations of "ABC" with a minimum overlap of 2: greedy merging gives a 5-character superstring.
greedy_SCS(["ABC", "BCA", "CAB"], 2)

'CABCA'

In [8]:
# Counter-example: greedy first merges the overlap-3 pair ("ABCD", "BCDA"), after which "CDBC"
# overlaps nothing and is just concatenated, giving 9 characters. A shorter superstring,
# "ABCDBCDA" (8 characters), exists.
greedy_SCS(["ABCD", "CDBC", "BCDA"], 1) # This doesn't give the shortest common superstring!

'CDBCABCDA'