In [7]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0. """
    if len(b) < min_length:
        # b[:min_length] would silently shrink to all of b, which would let
        # an overlap shorter than min_length slip through
        return 0
    seed = b[:min_length]  # every valid overlap must begin with this prefix of b
    start = 0  # start all the way at the left
    while True:
        start = a.find(seed, start)  # look for b's prefix in a
        if start == -1:  # no more occurrences to right
            return 0
        # found occurrence; check for full suffix/prefix match
        if b.startswith(a[start:]):
            # leftmost hit is tried first, so this is the longest overlap
            return len(a) - start
        start += 1  # move just past previous match

import itertools

def scs(ss):
    """ Brute-force shortest common superstring of the strings in
        'ss' (which must all be the same length).  Every ordering is
        tried; neighbours are glued on their longest overlap and the
        first shortest result encountered is returned. """
    best = None
    for ordering in itertools.permutations(ss):
        candidate = ordering[0]  # seed with the first string of this ordering
        for left, right in zip(ordering, ordering[1:]):
            # length of the longest suffix of left that is a prefix of right
            shared = overlap(left, right, min_length=1)
            # only the part of right not already covered gets appended
            candidate += right[shared:]
        if best is None or len(candidate) < len(best):
            best = candidate  # strictly shorter than anything seen so far
    return best

In [8]:
# Question 1: length of the shortest common superstring of these six 3-mers
len(scs(["CCT", "CTT", "TGC", "TGG", "GAT", "ATT"]))

11

In [9]:
def scs_all_versions(ss):
    """ Brute-force search over every ordering of 'ss' (strings of equal
        length).  Returns a tuple (superstrings, length): the set of all
        distinct shortest common superstrings found, and their length. """
    winners = set()
    winning_len = 0
    for ordering in itertools.permutations(ss):
        candidate = ordering[0]  # seed with the first string of this ordering
        for left, right in zip(ordering, ordering[1:]):
            # glue right onto the candidate, skipping the part that overlaps left
            shared = overlap(left, right, min_length=1)
            candidate += right[shared:]
        cand_len = len(candidate)
        if not winners or cand_len < winning_len:
            # first candidate, or a strictly shorter one: start a fresh set
            winners = {candidate}
            winning_len = cand_len
        elif cand_len == winning_len:
            # ties with the current best; the set removes duplicates
            winners.add(candidate)
    return winners, winning_len

In [10]:
# Question 2: collect every distinct shortest common superstring and their shared length
shortest_sup_list, shortest_sup_len = scs_all_versions(["CCT", "CTT", "TGC", "TGG", "GAT", "ATT"])

In [11]:
# The distinct shortest superstrings (a set, despite the "_list" name)
shortest_sup_list

{'CCTTGGATTGC', 'GATTGCCTTGG', 'TGCCTTGGATT', 'TGGATTGCCTT'}

In [12]:
# Number of distinct shortest common superstrings
len(shortest_sup_list)

4

In [13]:
# One-time download of the reads file used below (uncomment to fetch it):
# !wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ads1_week4_reads.fq

In [14]:
class DenovoAssembly:
    """Finds all suffix/prefix overlaps between pairs of reads.

    A k-mer index (k = min_len) is used as a cheap pre-filter: read B can
    only overlap the end of read A by at least min_len characters if B's
    first min_len characters occur somewhere in A, so the full overlap
    check is run only for pairs that pass that test.

    Reads are indexed by their string value, so identical reads collapse
    into a single entry.
    """

    def __init__(self, reads, min_len = 3):
        self.min_len = min_len        # minimum overlap length, also the k-mer size
        self.reads = reads            # iterable of read strings
        self.k_mer_map = dict()       # read -> set of all its k-mers
        self.overlap_map = []         # (read_a, read_b) pairs with an overlap
        self.overlap_lenmap = []      # overlap length for each pair, same order

    def analyze_kmer_reads(self):
        """Rebuild k_mer_map from scratch for the current reads."""
        k = self.min_len
        # a read shorter than k yields an empty range and thus an empty set
        self.k_mer_map = {
            read: {read[i:i + k] for i in range(len(read) - k + 1)}
            for read in self.reads
        }

    def assemble(self):
        """Compute every ordered pair (a, b) where a suffix of a matches a
        prefix of b over at least min_len characters.

        Results are recomputed on every call, so calling this repeatedly
        does not accumulate duplicate entries.

        Returns:
            The list of (read_a, read_b) tuples; the matching overlap
            lengths are available from get_overlaplenmap().
        """
        self.analyze_kmer_reads()
        # start from a clean slate so repeated calls stay idempotent
        self.overlap_map = []
        self.overlap_lenmap = []
        for i, (current_read, current_kmers) in enumerate(self.k_mer_map.items()):
            for j, compared_read in enumerate(self.k_mer_map):
                # skip self-comparison, and pairs whose prefix k-mer is absent
                if i == j or compared_read[:self.min_len] not in current_kmers:
                    continue
                overlap_len = overlap(current_read, compared_read, min_length=self.min_len)
                if overlap_len > 0:
                    self.overlap_map.append((current_read, compared_read))
                    self.overlap_lenmap.append(overlap_len)
        return self.overlap_map

    def get_overlaplenmap(self):
        """Return the overlap lengths, parallel to the list from assemble()."""
        return self.overlap_lenmap

In [15]:
def read_fastq(filename):
    """Read a FASTQ file laid out as four lines per record.

    Only the 2nd line (sequence) and 4th line (encoded qualities) of each
    record are kept, with trailing whitespace stripped. Reading stops at
    the first record whose sequence line is empty (including end of file).

    Returns:
        (sequences, qualities): two parallel lists of strings.
    """
    sequences, qualities = [], []
    with open(filename, "r") as handle:
        while True:
            # consume one whole record: header, sequence, separator, qualities
            record = [handle.readline() for _ in range(4)]
            bases = record[1].rstrip()
            if not bases:
                break
            sequences.append(bases)
            qualities.append(record[3].rstrip())
    return sequences, qualities

In [16]:
# Parse the reads; the FASTQ file must already be present in the working directory
sequences, qualities = read_fastq("ads1_week4_reads.fq")

In [26]:
# Greedy shortest common superstring (SCS) algorithm
def find_max_overlap(reads, min_len_overlap=1, k_mer_len=30):
    """Find the pair of reads with the longest suffix/prefix overlap.

    Args:
        reads: iterable of read strings.
        min_len_overlap: overlaps shorter than this are ignored.
        k_mer_len: minimum overlap / k-mer size handed to DenovoAssembly.
            Overlaps shorter than k_mer_len are never detected, whatever
            min_len_overlap says, because the assembly only searches for
            overlaps of at least k_mer_len characters. The default of 30
            is the value that was previously hard-coded here.

    Returns:
        (read_a, read_b, overlap_length) for the best pair, where a suffix
        of read_a matches a prefix of read_b. On ties the first pair
        reported by the assembly wins. If no qualifying overlap exists the
        result is (None, None, 0).
    """
    assembly = DenovoAssembly(reads, k_mer_len)
    pairs = assembly.assemble()
    lengths = assembly.get_overlaplenmap()

    read_a_max, read_b_max, max_overlap_length = None, None, 0
    for (read_a, read_b), overlap_len in zip(pairs, lengths):
        if overlap_len < min_len_overlap:
            continue  # too short to count as an overlap for this caller
        if overlap_len > max_overlap_length:  # strict: keep the first maximum
            read_a_max, read_b_max = read_a, read_b
            max_overlap_length = overlap_len
    return read_a_max, read_b_max, max_overlap_length

def greedy_SCS(reads, min_len_overlap):
    """Greedy approximation of the shortest common superstring.

    Repeatedly merges the two reads with the longest suffix/prefix overlap
    (as reported by find_max_overlap) until no overlapping pair remains,
    then concatenates whatever is left.

    Args:
        reads: iterable of read strings; it is not modified.
        min_len_overlap: forwarded to find_max_overlap.

    Returns:
        The assembled string.
    """
    pool = list(reads)  # work on a copy so the caller's list is left intact
    read_a, read_b, max_overlap_len = find_max_overlap(pool, min_len_overlap)
    while max_overlap_len > 0:
        pool.remove(read_a)
        pool.remove(read_b)
        # merged read: all of read_a followed by the part of read_b it lacks
        pool.append(read_a + read_b[max_overlap_len:])
        read_a, read_b, max_overlap_len = find_max_overlap(pool, min_len_overlap)
    # nothing overlaps any more: just concatenate the remaining pieces
    return "".join(pool)

In [27]:
# Greedily merge the reads into one string (slow: overlaps are recomputed after every merge)
synthetic_genome = greedy_SCS(sequences, 1)

In [28]:
# Length of the assembled string
print(len(synthetic_genome))

15894


In [30]:
# Questions 3 & 4: number of A and T bases in the assembled string
A = synthetic_genome.count("A")
T = synthetic_genome.count("T")
print(A)
print(T)

4633
3723
