In [1]:
def overlap(a, b, min_length=3):
    """ Return length of longest suffix of 'a' matching
        a prefix of 'b' that is at least 'min_length'
        characters long.  If no such overlap exists,
        return 0.

        Parameters:
            a: string whose suffix is examined.
            b: string whose prefix is examined.
            min_length: minimum number of characters the overlap
                must span to be reported (default 3).

        Returns:
            The overlap length as an int, or 0 when no suffix of 'a'
            of at least 'min_length' characters is a prefix of 'b'. """
    # A string shorter than min_length cannot take part in a qualifying
    # overlap.  Without this guard a short 'b' would be used whole as the
    # search seed and an overlap shorter than min_length could be returned.
    if len(a) < min_length or len(b) < min_length:
        return 0
    start = 0  # start all the way at the left
    while True:
        # Look for the first min_length characters of b in a, at or after start.
        start = a.find(b[:min_length], start)
        if start == -1:  # no more occurrences to right
            return 0
        # Found an occurrence; it is an overlap only if everything from
        # here to the end of a is a prefix of b.  Scanning left to right
        # means the first hit is also the longest overlap.
        if b.startswith(a[start:]):
            return len(a) - start
        start += 1  # move just past previous match

In [2]:
# Sanity check: the suffix 'GT' of 'ACGT' is a prefix of 'GTAA', so with min_length=2 the result is 2.
overlap('ACGT','GTAA',2)

2

In [3]:
def createKmerIndex(reads, k):
    """Build a lookup from each k-mer to the reads that contain it.

    Parameters:
        reads: iterable of read strings.
        k: length of the substrings (k-mers) to index.

    Returns:
        A dict whose keys are the length-k substrings found in the reads
        and whose values are sets of the reads in which each one occurs.
        Reads shorter than k contribute nothing.
    """
    index = {}
    for read in reads:
        last_start = len(read) - k  # final offset at which a full k-mer fits
        for pos in range(last_start + 1):
            # Create the bucket on first sight of this k-mer, then record the read.
            index.setdefault(read[pos:pos + k], set()).add(read)
    return index

In [4]:
# Sanity check on a single read: every 3-mer of 'GATTA' should map to a set holding that read.
createKmerIndex({'GATTA'},3)

{'ATT': {'GATTA'}, 'GAT': {'GATTA'}, 'TTA': {'GATTA'}}

In [5]:
def phredQ(q):
    """Turn a single quality character into an integer score.

    The score is the character's code point minus 33 (the offset
    used by Phred+33 encoded quality strings).
    """
    ascii_offset = 33
    return ord(q) - ascii_offset

def readFastq(filename):
    """Read sequences and quality strings from a four-line-per-record file.

    Each record is consumed as four consecutive lines; the first and third
    are skipped, the second is taken as the sequence and the fourth as the
    quality string (both with trailing whitespace removed).  Reading stops
    at end of file or at the first record whose sequence line is empty.

    Parameters:
        filename: path of the file to read.

    Returns:
        A tuple (sequences, qualities) of two parallel lists of strings.
    """
    reads = []
    quality_strings = []
    with open(filename) as handle:
        # readline() yields '' only at end of file, which ends the iteration;
        # the first line of each record is otherwise ignored.
        for _header in iter(handle.readline, ''):
            bases = handle.readline().rstrip()
            handle.readline()  # third line of the record is skipped
            phred = handle.readline().rstrip()
            if not bases:
                break
            reads.append(bases)
            quality_strings.append(phred)
    return reads, quality_strings


In [6]:
# Load the sequences and their quality strings from the FASTQ file
# (the path is relative to the notebook's working directory).
readsAsm , quals = readFastq('ERR266411_1.for_asm.fastq')

# Number of sequences that were loaded.
len(readsAsm)

10000

In [7]:

def findOverlapSuffix(reads, k):
    """Find all ordered pairs of distinct reads that overlap by at least k.

    A k-mer index over all reads is built first, so each read is only
    compared against reads that contain its length-k suffix rather than
    against every other read.

    Parameters:
        reads: iterable of read strings.
        k: minimum overlap length; expected to be a positive integer.

    Returns:
        A tuple (result, count) where result is a list of (a, b) pairs
        such that a != b and a suffix of a of at least k characters is a
        prefix of b, and count is the number of reads a that appear as
        the left element of at least one pair.  Candidates for each read
        come from a set, so the relative order of pairs sharing the same
        left read is not guaranteed.
    """
    dic = createKmerIndex(reads, k)
    count = 0
    result = []
    for a in reads:
        # A read shorter than k has no length-k suffix; a[-k:] would be the
        # whole read, which is not an index key, so skip it rather than
        # fail with a KeyError.
        if len(a) < k:
            continue
        kmer = a[-k:]
        added = False
        # Only reads containing a's suffix k-mer can start with an overlap of
        # length >= k; .get keeps the lookup safe for a missing key.
        for v in dic.get(kmer, ()):
            if a != v and overlap(a, v, k) > 0:
                result.append((a, v))
                added = True
        if added:
            count += 1
    return result, count

In [9]:
# Small hand-checkable example: six reads of length 6, requiring overlaps of at least 5 characters.
r,c = findOverlapSuffix(['CGTACG', 'TACGTA', 'GTACGT', 'ACGTAC', 'GTACGA', 'TACGAT'],5)
# r: list of (a, b) read pairs where a suffix of a matches a prefix of b;
# c: number of reads that appear as the left element of at least one pair.
print(r)
print(c)

[('CGTACG', 'GTACGT'), ('CGTACG', 'GTACGA'), ('TACGTA', 'ACGTAC'), ('GTACGT', 'TACGTA'), ('ACGTAC', 'CGTACG'), ('GTACGA', 'TACGAT')]
5


In [25]:
# Same computation on the reads loaded from the FASTQ file, requiring overlaps of at least 30 characters.
# Note: this rebinds the notebook-level name c.
olaps,c = findOverlapSuffix(readsAsm,30)
print(len(olaps))  # number of overlapping (a, b) pairs
print(c)  # number of reads that are the left element of at least one pair

904746
7161
