In [None]:
from Py.de_bruijn_ize import de_bruijn_ize
from Py.pick_maximal_overlap import pick_maximal_overlap
from Py.scs import scs
from Py.greedy_scs import greedy_scs
from Py.visualize_de_bruijn import visualize_de_bruijn
from Py.overlap import overlap

#### What is the shortest common superstring? 

In [None]:
def overlap(a, b, min_length=3):
    """ Length of the longest suffix of 'a' that is also a
        prefix of 'b', provided that it is at least
        'min_length' characters long.  Returns 0 when there
        is no such overlap. """
    seed = b[:min_length]  # every qualifying overlap starts with this prefix of b
    pos = a.find(seed)  # leftmost hit corresponds to the longest suffix of a
    while pos != -1:
        # seed located; the entire tail of 'a' must also be a prefix of 'b'
        if b.startswith(a[pos:]):
            return len(a) - pos
        pos = a.find(seed, pos + 1)  # resume just to the right of this hit
    return 0  # no occurrence of the seed extends to the end of 'a'


In [None]:
import itertools

def scs(ss):
    """ Returns a shortest common superstring of the given
        strings, which must be the same length.

        Every ordering of 'ss' is tried.  Within one ordering each
        string is glued onto the previous one using their longest
        suffix/prefix overlap (overlap() with min_length=1).  The
        first shortest result encountered is returned; an empty
        'ss' gives the empty string.

        The number of orderings grows factorially with len(ss), so
        this is only usable for a handful of strings. """
    if len(ss) == 0:
        # permutations() of an empty sequence yields a single empty
        # tuple, which has no first string to seed the superstring
        return ''

    shortest_sup = None

    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]  # superstring starts as first string
        for left, right in zip(ssperm, ssperm[1:]):
            # overlap adjacent strings in the permutation
            olen = overlap(left, right, min_length=1)
            # add the non-overlapping portion of the right-hand string
            sup += right[olen:]
        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found shorter superstring
    return shortest_sup  # return shortest

In [None]:
# Toy input for scs(): six 3-character strings.
# The following cell rebinds ss, so run only the cell for the example you want.
ss = [ "CCT", "CTT", "TGC", "TGG", "GAT", "ATT" ]

In [None]:
# Smaller toy input: the three rotations of 'ABC'.
# When cells run top to bottom this replaces the six-string list assigned just before.
ss = ['ABC', 'BCA', 'CAB']

In [None]:
?ss

In [None]:
scs ( ss )

In [None]:
# Report only the length of the superstring; scs() is recomputed here from scratch.
print ( " Length of the scs for the set of strings, ss: ", len ( scs ( ss ) ) )

#### Is there more than one scs?

In [None]:
def scsList ( ss ) :

    """ Returns a list of every shortest common superstring of the
        given strings, assuming no string is a strict substring of
        another.

        One candidate superstring is built per permutation of 'ss'
        by gluing adjacent strings on their longest suffix/prefix
        overlap (overlap() with min_length=1).  All candidates whose
        length equals the minimum are returned, in permutation
        order.  The list has one entry per permutation, so the same
        text may appear more than once if different orderings
        produce it.  An empty 'ss' gives an empty list.

        Brute force: the number of permutations is factorial in
        len(ss), so only use this on very small inputs. """

    import itertools

    if len ( ss ) == 0 :

        # permutations() of nothing yields one empty tuple, which
        # has no first string to start a superstring from
        return []

    supList = []

    for ssperm in itertools.permutations ( ss ) :

        sup = ssperm [ 0 ]

        for i in range ( len ( ssperm ) - 1 ) :

            olen = overlap ( ssperm [ i ], ssperm [ i + 1 ], min_length = 1 )

            sup += ssperm [ i + 1 ] [ olen : ]

        supList . append ( sup )

    shortest_len = min ( len ( sup ) for sup in supList )

    # Scan every candidate, including the last one, which the
    # previous range ( len ( supList ) - 1 ) loop never looked at.
    return [ sup for sup in supList if len ( sup ) == shortest_len ]

In [None]:
# Toy read set: the three rotations of 'ABC'.
# Later cells rebind reads, so only the last cell executed determines its value.
reads = ['ABC', 'BCA', 'CAB']

In [None]:
# Alternative toy read set: six 3-character reads.
reads = [ "CCT", "CTT", "TGC", "TGG", "GAT", "ATT" ]

In [None]:
from Py.geneReader_Q import geneReader_Q

# Path is relative to the notebook's working directory.
filename = "SeqFiles/ads1_week4_reads.fq"

# Replaces whichever toy list reads held before; the inspection output further
# down the notebook shows the result as a list of 1881 read strings.
reads = geneReader_Q ( filename )

In [None]:
# NOTE(review): scsList enumerates every permutation of its input, so this is only
# feasible while reads is one of the toy lists; if reads holds the reads loaded
# from the FASTQ file this call will not finish in practical time.
print ( "All scs': ", scsList ( reads ) )

In [None]:
# Calls scsList again (no result is cached from the previous cell), so the same
# permutation search is repeated just to count the tied superstrings.
print ( "Count of strings that are tied for the shortest common superstring:",
      len ( scsList ( reads ) ) )

#### How many A's are there in the fully assembled genome?

In [12]:
from Py.geneReader_Q import geneReader_Q

# Same FASTQ file as loaded earlier in the notebook; reloading gives a fresh reads list.
filename = "SeqFiles/ads1_week4_reads.fq"

# The inspection output below this cell shows a list of 1881 read strings.
reads = geneReader_Q ( filename )

?reads

[1;31mType:[0m        list
[1;31mString form:[0m ['GTCCAGCAGAGCAAGTGATGCGAGAGCTGCCCATCCTCCAACCAGCATGCCCCTAGACATTGACACTGCATCGGAGTCAGGCCAAGATCCGCAGG <...> CGTTAGTGCAAGGGGACAATCAGACCATAGCCGTAACAAAAAGGGTACCCAGCACATGGCCTTACAACCTTAAGAAACGGGAAGCTGCTAGAGTA']
[1;31mLength:[0m      1881
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.

In [None]:
from itertools import permutations
from Py.overlap import overlap

def pick_maximal_overlap ( reads, k ) :

    """ Scan every ordered pair of reads and return the pair
        whose suffix/prefix overlap is longest, together with
        that overlap length, as (read_a, read_b, length).
        Only overlaps of at least k characters count; when
        there are none the result is (None, None, 0).  On a
        tie the pair encountered first is kept. """

    best = ( None, None, 0 )

    for left, right in permutations ( reads, 2 ) :

        length = overlap ( left, right, min_length = k )

        # strict comparison: an equally long later pair does not displace the first
        if length > best [ 2 ] :

            best = ( left, right, length )

    return best

In [None]:
from collections import defaultdict

# Index: k-mer -> set of reads that contain that k-mer.
kmerDict = defaultdict ( set )

reads = [ "CCT", "CTT", "TGC", "TGG", "GAT", "ATT" ]

# With k equal to the read length, each read contributes exactly one k-mer: itself.
k = 3

for read in reads : 
    
    # every start position at which a full k-mer fits inside the read
    for i in range ( 0, len ( read ) - k + 1 ) :
        
        kmerDict [ read [ i : i + k ] ] . add ( read )
        
?kmerDict        

In [None]:
%%time

# NOTE(review): this passes the k-mer index (a defaultdict), whereas the greedy_scs
# defined later in this notebook calls reads.remove() / reads.append(), which a dict
# does not provide. Whether the imported Py.greedy_scs accepts an index cannot be
# told from here; confirm, or pass the list of reads instead.
greedy_scs ( kmerDict, k )

In [None]:
# Reset reads to the six-read toy set for the pick_maximal_overlap demonstration.
reads = [ "CCT", "CTT", "TGC", "TGG", "GAT", "ATT" ]

# Minimum overlap length; smaller than the read length so partial overlaps qualify.
k = 2

In [None]:
pick_maximal_overlap ( reads, k )

In [14]:

def greedy_scs ( reads, k ) :

    """ Greedy shortest-common-superstring merge.

        Repeatedly asks pick_maximal_overlap for a pair of reads
        overlapping by at least k characters, replaces that pair
        with their merge, and stops when no such pair remains
        (reported overlap length 0).  The surviving strings are
        concatenated and returned.

        The merging is done on a private copy, so the caller's
        'reads' list is left unchanged even if the run is
        interrupted part-way or the cell is executed again. """

    pool = list ( reads )  # work on a copy; never edit the caller's list

    read_a, read_b, olen = pick_maximal_overlap ( pool, k )

    while olen > 0 :

        pool . remove ( read_a )

        pool . remove ( read_b )

        # merged read: all of read_a followed by the part of read_b past the overlap
        pool . append ( read_a + read_b [ olen : ] )

        read_a, read_b, olen = pick_maximal_overlap ( pool, k )

    return '' . join ( pool )

In [15]:
%%time

# The recorded run of this cell ended in KeyboardInterrupt (see the output below),
# so wholegenes may never have been assigned in that session.
wholegenes = greedy_scs ( reads, k = 30 )



KeyboardInterrupt: 

In [None]:
# 15,894 is the actual length of the genome, so that is the value to compare against.
# Requires the greedy_scs cell that assigns wholegenes to have run to completion.

len(wholegenes)

In [None]:
from Py.geneReader_Q import geneReader_Q

# Reload the reads from disk so the following cells start from the full, original read set.
filename = "SeqFiles/ads1_week4_reads.fq"

reads = geneReader_Q ( filename )

In [17]:
from collections import defaultdict

# Index: k-mer -> set of reads that contain that k-mer, built over the current reads.
kmerDict = defaultdict ( set )

# Same value as the minimum overlap length passed to greedy_scs elsewhere in the notebook.
k = 30

for read in reads : 
    
    # every start position at which a full k-mer fits; reads shorter than k add nothing
    for i in range ( 0, len ( read ) - k + 1 ) :
        
        kmerDict [ read [ i : i + k ] ] . add ( read )
        
?kmerDict

[1;31mType:[0m        defaultdict
[1;31mString form:[0m defaultdict(<class 'set'>, {'GTCCAGCAGAGCAAGTGATGCGAGAGCTGC': {'GAGAAGCCAGGGAGAGCTACAGAGAAACCGGGT <...> ACAACCCAGGACAGGACCGAGCCACCTGCAAGGAAGAGGAGGCAGGCAGTTCGGGTCTCAGCAAACCATGCCTCTCAGCAATTGGATCAACTG'}})
[1;31mLength:[0m      15865
[1;31mFile:[0m        c:\users\ecmos\anaconda3\lib\collections\__init__.py
[1;31mDocstring:[0m  
defaultdict(default_factory[, ...]) --> dict with default factory

The default factory is called without arguments to produce
a new value when a key is not present, in __getitem__ only.
A defaultdict compares equal to a dict with the same items.
All remaining arguments are treated the same as if they were
passed to the dict constructor, including keyword arguments.

In [13]:
from itertools import permutations

from Py.overlap import overlap

from collections import defaultdict


def pick_maximal_overlap ( reads, k = 30 ) :

    """ Return a pair of reads from the list with a
        maximal suffix/prefix overlap >= k, together with
        the overlap length, as (read_a, read_b, length).
        Returns (None, None, 0) if there are no such overlaps.

        Instead of testing every ordered pair, a k-mer index
        (k-mer -> set of reads containing it) is built first.
        A read b can only overlap the end of a read a by >= k
        characters if b contains a's final k characters, so
        only reads found under that suffix are compared.

        Reads equal in value to 'a' are skipped, so a read is
        never paired with itself or with an identical copy.
        Among pairs with the same overlap length, which one is
        returned depends on set iteration order. """

    kmerDict = defaultdict ( set )

    for read in reads :

        for i in range ( len ( read ) - k + 1 ) :

            kmerDict [ read [ i : i + k ] ] . add ( read )

    reada, readb, best_olen = None, None, 0

    for a in reads :

        # .get() rather than [] so that probing for a missing suffix
        # does not insert an empty set into the defaultdict
        for b in kmerDict . get ( a [ - k : ], () ) :

            if b == a :

                continue

            olen = overlap ( a, b, min_length = k )

            # keep scanning after a hit: the first overlap found
            # is not necessarily the longest one
            if olen >= k and olen > best_olen :

                reada, readb, best_olen = a, b, olen

    # always a 3-tuple, so callers can unpack it even when nothing overlaps
    return reada, readb, best_olen
        