# Assembly Algorithms: My Analysis

In [None]:
from Py.overlap import overlap

from Py.de_bruijn_ize import de_bruijn_ize

#from Py.pick_maximal_overlap import pick_maximal_overlap

#from Py.scs import scs

#from Py.greedy_scs import greedy_scs

from Py.visualize_de_bruijn import visualize_de_bruijn

#### What is the shortest common superstring? 

In [None]:
from itertools import permutations

def scs(ss):
    """Return a shortest common superstring of the strings in ``ss``.

    aka, the BRUTE FORCE VERSION: every ordering of ``ss`` produced by
    ``permutations`` is tried.  Within one ordering the strings are chained
    left to right; ``overlap(a, b, min_length=1)`` is taken as the number of
    leading characters of ``b`` that are already covered by the end of ``a``,
    and only the remainder of ``b`` is appended.  The shortest chained string
    over all orderings is returned; on a tie the ordering generated first is
    kept.

    The author's note for this function states that the given strings must
    be the same length.

    Args:
        ss: sequence of strings.

    Returns:
        The shortest chained string, or ``''`` when ``ss`` is empty.
    """
    if not ss:
        # permutations() of an empty input yields a single empty ordering,
        # which has no first string to start the chain from.
        return ''

    shortest_sup = None

    for ssperm in permutations(ss):
        sup = ssperm[0]  # superstring starts as the first string

        # Walk adjacent pairs (A, B) of the ordering.
        for left, right in zip(ssperm, ssperm[1:]):
            olen = overlap(left, right, min_length=1)
            sup += right[olen:]  # add the non-overlapping portion of B

        if shortest_sup is None or len(sup) < len(shortest_sup):
            shortest_sup = sup  # found a shorter superstring

    return shortest_sup

In [None]:
# Example input: six strings of three characters each.
ss = [ "CCT", "CTT", "TGC", "TGG", "GAT", "ATT" ]

In [None]:
# Smaller alternative example; when the cells are run top to bottom this
# rebinds ss, so the calls that follow operate on these three strings.
ss = ['ABC', 'BCA', 'CAB']

In [None]:
?ss

In [None]:
# Brute-force superstring of ss; being the last expression in the cell,
# its value is what the notebook displays.
scs ( ss )

In [None]:
# Runs scs(ss) again and reports only the length of the result.
print ( "Length of the scs for the set of strings, ss: ", len ( scs ( ss ) ) )

#### Is there more than one scs?

In [None]:

""" LIST VERSION OF SCS ( ) - BRUTE FORCE """

def scsList(ss):
    """Return every shortest common superstring found by brute force.

    One candidate superstring is built per ordering of ``ss`` (orderings come
    from ``itertools.permutations``).  Within an ordering the strings are
    chained left to right; ``overlap(a, b, min_length=1)`` is taken as the
    number of leading characters of ``b`` already covered by the end of
    ``a``, and only the remainder of ``b`` is appended.

    The author's note for this function: it assumes no string is a strict
    substring of another.

    Args:
        ss: sequence of strings.

    Returns:
        A list of all candidates whose length equals the minimum candidate
        length, in the order their orderings were generated.  If several
        orderings chain to the same string, that string appears once per
        ordering.  ``['']`` is returned when ``ss`` is empty.
    """
    import itertools

    if not ss:
        # The only ordering of nothing is the empty one, whose superstring
        # is the empty string.
        return ['']

    candidates = []

    for ssperm in itertools.permutations(ss):
        sup = ssperm[0]

        for left, right in zip(ssperm, ssperm[1:]):
            olen = overlap(left, right, min_length=1)
            sup += right[olen:]

        candidates.append(sup)

    shortest_len = min(len(sup) for sup in candidates)

    # Every candidate is examined, including the one from the final ordering;
    # stopping one short would silently drop it (and would return an empty
    # list for a single-string input).
    return [sup for sup in candidates if len(sup) == shortest_len]

In [None]:
# Three-string example input for scsList.
reads = ['ABC', 'BCA', 'CAB']

In [None]:
# Six-string example input; rebinds reads when the cells are run top to bottom.
reads = [ "CCT", "CTT", "TGC", "TGG", "GAT", "ATT" ]

In [None]:
# Project helper used to load the reads from a file.
# NOTE(review): presumably a FASTQ parser returning a list of read strings,
# judging by the .fq extension and how `reads` is used below -- confirm.
from Py.geneReader_Q import geneReader_Q

filename = "SeqFiles/ads1_week4_reads.fq"

# Rebinds reads to whatever the file contains, replacing the toy examples.
reads = geneReader_Q ( filename )

In [None]:
# NOTE(review): scsList builds one candidate per permutation of reads, so the
# work grows factorially with the number of reads -- only practical for a
# handful of reads.
print ( "All scs' in a fully assembled genome: ", scsList ( reads ) )

In [None]:
# Calls scsList(reads) again (repeating the whole brute-force search) and
# reports how many superstrings it returned.
print ( "Count of strings that are tied for the shortest common superstring:",
      len ( scsList ( reads ) ) )

### How many A's are there in a fully assembled genome?

In [None]:
from itertools import permutations
from Py.overlap import overlap


""" This is a helper function for the greedy_scs() function """

""" UNMODIFIED """

def pick_maximal_overlap(reads, k):
    """Find the ordered pair of reads with the largest overlap score.

    Every ordered pair taken from two different positions of ``reads`` is
    scored with ``overlap(a, b, min_length=k)``.  The pair with the highest
    score is kept; when several pairs share that score, the one encountered
    first wins.

    Returns:
        ``(read_a, read_b, overlap_length)``.  If no pair scores above zero
        the result is ``(None, None, 0)``.
    """
    best_pair = (None, None)
    best_len = 0

    for left, right in permutations(reads, 2):
        current = overlap(left, right, min_length=k)
        if current <= best_len:
            continue  # not a strict improvement
        best_pair = (left, right)
        best_len = current

    return best_pair[0], best_pair[1], best_len

In [None]:
from collections import defaultdict

def kmerdict(reads, k):
    """Index reads by the length-``k`` substrings they contain.

    Returns:
        A ``defaultdict(set)`` mapping each substring of length ``k`` to the
        set of reads in which it occurs.  Reads shorter than ``k`` contribute
        nothing.  Because the result is a ``defaultdict``, looking up a
        substring that was never seen yields (and stores) an empty set.
    """
    index = defaultdict(set)

    for read in reads:
        last_start = len(read) - k
        for start in range(last_start + 1):
            kmer = read[start:start + k]
            index[kmer].add(read)

    return index

In [None]:
def greedy_scs(reads, k):
    """Greedy shortest-common-superstring merge.

    Repeatedly asks ``pick_maximal_overlap`` for the pair of reads with the
    largest overlap of length >= ``k``, replaces that pair with the merged
    read (``read_a`` followed by the part of ``read_b`` after the overlap),
    and stops once the reported overlap length is 0.

    Args:
        reads: iterable of read strings.  It is copied, so the caller's
            collection is not modified.
        k: minimum overlap length passed on to ``pick_maximal_overlap``.

    Returns:
        The remaining reads concatenated into one string.
    """
    # Work on a private copy: the merge loop removes and appends elements,
    # which would otherwise consume the list the caller passed in.
    pending = list(reads)

    read_a, read_b, olen = pick_maximal_overlap(pending, k)

    while olen > 0:
        pending.remove(read_a)
        pending.remove(read_b)
        pending.append(read_a + read_b[olen:])

        read_a, read_b, olen = pick_maximal_overlap(pending, k)

    return ''.join(pending)

In [None]:
from itertools import permutations
from Py.overlap import overlap


""" This is a helper function for the greedy_scs_faster() function """

""" MODIFIED VERSION """

def pick_maximal_overlap_faster(reads, k):
    """Find a pair of reads with a maximal overlap, using a k-mer index.

    Instead of scoring every ordered pair, each read ``a`` is only compared
    with the reads that contain ``a``'s last ``k`` characters somewhere
    (looked up in the index built by ``kmerdict``).  A read is never paired
    with a read equal to itself.  Candidates are scored with
    ``overlap(a, b, min_length=k)``.

    A candidate replaces the current best whenever its score is greater than
    *or equal to* the best so far, so among equal scores the one examined
    last is returned, and a pair may be returned together with an overlap
    length of 0.

    Returns:
        ``(read_a, read_b, overlap_length)``; ``(None, None, 0)`` when no
        candidate pair was examined at all.
    """
    index = kmerdict(reads, k)

    best_a = best_b = None
    best_len = 0

    for left in reads:
        suffix = left[-k:]

        for right in index[suffix]:
            if right == left:
                continue  # skip pairing a read with an identical read

            current = overlap(left, right, min_length=k)
            if current < best_len:
                continue  # strictly worse than the best seen so far

            best_a, best_b, best_len = left, right, current

    return best_a, best_b, best_len

In [None]:
def greedy_scs_faster(reads, k):
    """Greedy shortest-common-superstring merge using the indexed pair picker.

    Same merge loop as the plain greedy version, but every pair selection --
    including the first one, which runs on the largest read list -- goes
    through ``pick_maximal_overlap_faster``.  The chosen pair is replaced by
    the merged read (``read_a`` followed by the part of ``read_b`` after the
    overlap) until the reported overlap length is 0.

    Args:
        reads: iterable of read strings.  It is copied, so the caller's
            collection is not modified.
        k: minimum overlap length passed on to the pair picker.

    Returns:
        The remaining reads concatenated into one string.
    """
    # Work on a private copy: the merge loop removes and appends elements,
    # which would otherwise consume the list the caller passed in.
    pending = list(reads)

    read_a, read_b, olen = pick_maximal_overlap_faster(pending, k)

    while olen > 0:
        pending.remove(read_a)
        pending.remove(read_b)
        pending.append(read_a + read_b[olen:])

        read_a, read_b, olen = pick_maximal_overlap_faster(pending, k)

    return ''.join(pending)

In [None]:
%%time 

# Small example read set for timing the greedy assembler that scores all
# ordered pairs of reads.
reads = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']

# k = 2: only overlaps of length >= 2 are requested from the pair picker.
res = greedy_scs ( reads , k = 2 )

# Last expression of the cell: shows the assembled string and its length.
res, len ( res )

In [None]:
%%time 

# Same example reads, assembled with greedy_scs_faster so the timing and
# result can be compared with the greedy_scs run.
reads = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']

res = greedy_scs_faster ( reads , k = 2 )

# Last expression of the cell: shows the assembled string and its length.
res, len ( res )

In [None]:
%%time

# Project helper used to load the reads from a file.
# NOTE(review): presumably returns a list of read strings -- confirm.
from Py.geneReader_Q import geneReader_Q

filename = "SeqFiles/ads1_week4_reads.fq"

reads = geneReader_Q ( filename )

# k = 30: only overlaps of length >= 30 are requested from the pair picker.
res = greedy_scs ( reads , k = 30 )

# Last expression of the cell: shows the assembled string and its length.
res, len ( res )

In [None]:
# Number of "A" characters in res (the result of whichever assembly cell ran last).
res.count("A")

In [None]:
# Number of "T" characters in res (the result of whichever assembly cell ran last).
res.count("T")

In [None]:
from itertools import permutations

def scs(ss):
    """Brute-force shortest common superstring, printing progress.

    Tries every ordering of ``ss``.  Within an ordering the strings are
    chained left to right: ``overlap(a, b, min_length=1)`` is taken as the
    number of leading characters of ``b`` already covered by the end of
    ``a``, and only the remainder of ``b`` is appended.  After each ordering
    the best superstring found so far is printed.

    The author's note for this function: it assumes no string is a strict
    substring of another.

    Returns:
        The shortest chained string; on ties, the one found first.
    """
    best = None

    for ordering in permutations(ss):
        merged = ordering[0]

        for pos in range(1, len(ss)):
            shared = overlap(ordering[pos - 1], ordering[pos], min_length=1)
            merged += ordering[pos][shared:]

        if best is None or len(best) > len(merged):
            best = merged

        # Progress trace: one line per ordering examined.
        print(best)

    return best

In [None]:
# Six example reads for the brute-force search.
reads = ['GAT', 'TAG', 'TCG', 'TGC', 'AAT', 'ATA']

# NOTE(review): if the scs in scope is the variant that prints after every
# permutation, six reads produce 720 (6!) lines of output here.
brute = scs ( reads )

# Last expression of the cell: shows the superstring and its length.
brute, len ( brute )

In [None]:
# Same two values as the previous cell, with the length shown first.
len ( brute ), brute