In [9]:
def phredQ(q):
    return ord(q) - 33

def readFastq(filename):
    sequenses = []
    qualities = []
    with open(filename) as f:
        while True :
            f.readline()
            seq = f.readline().rstrip()
            f.readline()
            qual = f.readline().rstrip()
            if len(seq) == 0:
                break
            sequenses.append(seq)
            qualities.append(qual)
    return sequenses, qualities


def overlap(a, b, min_length = 3):
    start = 0
    while True:
        start = a.find(b[:min_length],start)
        if start == -1:
            return 0
        if b.startswith(a[start:]):
            return len(a)-start
        start += 1


def createKmerIndex(reads, k):
    dic = {}
    for read in reads:
        for i in range(len(read) - k + 1):  # for each k-mer
            kmer = read[i:i+k]
            if kmer in dic:
                dic[kmer].add(read)
            else:
                dic[kmer] = set([read])    
    return dic  


import itertools

def pick_maximal_overlap(reads,k):
    reada, readb = None,None
    best_olen = 0
    for a,b in itertools.permutations(reads,2):
        olen = overlap(a,b, min_length=k)
        if olen == 100:
            return a,b,olen
        if olen > best_olen:
            reada, readb = a,b
            best_olen = olen
    return reada, readb, best_olen


def pick_maximal_overlap_kmer(reads,k):
    dic = createKmerIndex(reads,k)
    reada, readb = None,None
    best_olen = 0
    for kmer in dic:
        for a,b in itertools.permutations(dic[kmer],2):
            olen = overlap(a,b, min_length=k)
            if olen == 100:
                return a,b,olen
            if olen > best_olen:
                reada, readb = a,b
                best_olen = olen
    return reada, readb, best_olen


def gready_scs(reads,k):
    read_a, read_b, olen = pick_maximal_overlap(reads,k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a+read_b[olen:])
        read_a, read_b, olen= pick_maximal_overlap(reads,k)
    return ''.join(reads)

def gready_scs_kmer(reads,k):
    read_a, read_b, olen = pick_maximal_overlap_kmer(reads,k)
    while olen > 0:
        reads.remove(read_a)
        reads.remove(read_b)
        reads.append(read_a+read_b[olen:])
        read_a, read_b, olen= pick_maximal_overlap_kmer(reads,k)
    return ''.join(reads)

def removeExactMatches(reads,k):
    for a,b in itertools.permutations(reads,2):
        olen = overlap(a,b, min_length=k)
        if olen == len(a):
            reads.remove(a)
    return reads

In [16]:
readsAsm , quals = readFastq('ads1_week4_reads.fq')
print(len(readsAsm))
readsAsm = removeExactMatches(readsAsm,100)
print(len(readsAsm))


1881
1853


In [None]:
c = gready_scs_kmer(readsAsm,10)
print(len(c))

In [11]:
x = pick_maximal_overlap(readsAsm,10)
print(x)

('ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCTAGTACACTTAGGATTCAAGATCCTATTATCAGGGACAAGAGCAGGATTAGGGAT', 'ACCAAACAAAGTTGGGTAAGGATAGATCAATCAATGATCATATTCTAGTACACTTAGGATTCAAGATCCTATTATCAGGGACAAGAGCAGGATTAGGGAT', 100)


In [None]:
pick_maximal_overlap_kmer(readsAsm,10)


In [16]:
x,y = itertools.permutations(['A'],2)
print(x,y)

ValueError: not enough values to unpack (expected 2, got 0)