# Overlaps and Edit Distance - An Analysis



> In computational linguistics and computer science, edit distance is a string metric, i.e. a way of quantifying how dissimilar two strings (e.g., words) are to one another, that is measured by counting the minimum number of operations required to transform one string into the other. Edit distances find applications in natural language processing, where automatic spelling correction can determine candidate corrections for a misspelled word by selecting words from a dictionary that have a low distance to the word in question. In bioinformatics, it can be used to quantify the similarity of DNA sequences, which can be viewed as strings of the letters A, C, G and T.

*[Edit Distance - Wikipedia](https://en.wikipedia.org/wiki/Edit_distance)*



In [None]:
from Py.geneReader import geneReader

filename = 'SeqFiles/chr1.GRCh38.excerpt.fasta'

data = open ( filename, 'r' )

reads = geneReader ( filename )

data.close ()

In [None]:
from Py.editDistance import editDistance

import numpy as np

In [None]:
x = "GATTTACCAGATTGAG"

y = reads

D = [ ]

In [None]:
# Range covers the offset row plus the length of the pattern

for i in range ( len ( x ) + 1 ) :

    # Initializes the dimensions of the matrix with 0s. 

    D.append ( [ 0 ] * ( len ( y ) + 1 ) )

In [None]:
print ( 'Length of pattern:', len  ( x ) )

In [None]:
print ( 'Length of sequence:', len ( y ) )

In [None]:
D1 = np.matrix ( D )

D1 = D1.view ( )

print ( D1 )

In [None]:
np.shape ( D )

In [None]:
for i in range ( len ( x ) + 1 ) :

    D [ i ] [ 0 ] = i

In [None]:
D1 = np.matrix ( D )

D1 = D1.view ( )

print ( D1 )

In [None]:
for j in range ( len ( y ) + 1 ) :
        
    D [ 0 ] [ j ] = 0

In [None]:
D1 = np.matrix ( D )

D1 = D1.view ( )

print ( D1 )

In [None]:
# Fills in the rest of the matrix rows and columns.
#
# Starts at  row 1. 

for i in range ( 1, len ( x ) + 1 ) :

    # goes by column, starts at column 1

    for j in range ( 1, len ( y ) + 1 ) : 

    # value that is left adjacent to the current value, 
        # plus 1 is the penalty for character skipping

        distHor = D [ i ] [ j - 1 ] + 1 

        # value that is up adjacent to the current value, 
            # plus 1 is the penalty for character skipping

        distVer = D [ i - 1 ] [ j ] + 1

        # edit distance does not further increase if there is a match

            # aka, if matches, does not incur penalty

        if x [ i - 1 ] == y [ j - 1 ] : 

            # Diagonal up/left distance

            distDiag = D [ i - 1 ] [ j - 1 ] 


        # otherwise, diagonal distance value increases by 1

        else :

            distDiag = D [ i - 1 ] [ j - 1 ] + 1 


        # min () takes the minimum edit distance of the 3 possible values
        # so this value will be inserted for the current iteration
        # of row i, column j. 

        D [ i ] [ j ] = min ( distHor, distVer, distDiag ) 

In [None]:
D1 = np.matrix ( D )

D1 = D1.view ( )

print ( D1 )

In [None]:
# We are interested in the minimum value of the bottom row.

print ( min ( D [ -1 ] ) )

In [1]:
%%time

from Py.geneReader_Q import geneReader_Q

filename = 'SeqFiles/ERR266411_1.for_asm.fastq'

reads = geneReader_Q ( filename )

?reads

CPU times: total: 0 ns
Wall time: 20.5 ms


[1;31mType:[0m        list
[1;31mString form:[0m ['TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAA <...> GGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACTAGTAACAAAGTTTGGATTGCTACTG']
[1;31mLength:[0m      10000
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.

In [7]:
from itertools import permutations

list ( permutations ( [ 1, 2, 3 ], 2 ) )

[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)]

In [2]:
# Starting with an empty set object, we will then add every k-mer association to it

k = 30

setObj = set()

?setObj

[1;31mType:[0m        set
[1;31mString form:[0m set()
[1;31mLength:[0m      0
[1;31mDocstring:[0m  
set() -> new empty set object
set(iterable) -> new set object

Build an unordered collection of unique elements.

In [21]:
%%time

k = 30

# Example is for only the first read in the file. 

read = reads [ 0 ]

for i in range ( 0, len ( read ) - k + 1 ) :
    
    # We use the add method because we are dealing with a set, not a list.
    
    setObj.add ( read [ i : i + k ] )
    
?setObj

CPU times: total: 0 ns
Wall time: 0 ns


[1;31mType:[0m        set
[1;31mString form:[0m {'TTATCAAGATAATTTTTCGACTCATCAGAA', 'TCCTGCTTTATCAAGATAATTTTTCGACTC', 'CGAAAGTGTTAACTTCTGCGTCATGGA <...> GAAAGTGTTAACTTCTGCGTCATGGAA', 'TCAAGATAATTTTTCGACTCATCAGAAATA', 'TTAACTTCTGCGTCATGGAAGCGATAAAAC'}
[1;31mLength:[0m      71
[1;31mDocstring:[0m  
set() -> new empty set object
set(iterable) -> new set object

Build an unordered collection of unique elements.

In [17]:
# Starting with an empty set object, we will then add every k-mer association to it

k = 30

def kmerExtract ( read, k ) :

    setObj = set ()

    for i in range ( 0, len ( read ) - k + 1 ) :

        # We use the add method because we are dealing with a set, 
        # not a list.

        setObj.add ( read [ i : i + k ] )
    
    return setObj

?setObj

[1;31mType:[0m        set
[1;31mString form:[0m set()
[1;31mLength:[0m      0
[1;31mDocstring:[0m  
set() -> new empty set object
set(iterable) -> new set object

Build an unordered collection of unique elements.

In [5]:
%%time

kmerDictionary = { }

for read in reads :
    
    kmerSetObjs = kmerExtract ( read, k )
    
    for kmerSetObj in kmerSetObjs : 
        
        if not kmerSetObj in kmerDictionary.keys () :
            
            kmerDictionary [ kmerSetObj ] = set ()
            
            kmerDictionary [ kmerSetObj ].add ( read )
            
?kmerDictionary           

CPU times: total: 203 ms
Wall time: 588 ms


[1;31mType:[0m        dict
[1;31mString form:[0m {'AGTAGTAATTCCTGCTTTATCAAGATAATT': {'TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATC <...> AACGCGAACAATTCAGCGGCTTTAACCGGACGGTCGGCCCCGATAATAATGATTGCCGTAAATTCAGGGCTTTCCAGGATTAGGCAGGCCGTTT'}}
[1;31mLength:[0m      108344
[1;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [12]:
%%time

from Py.overlap import overlap

kmerPairs = [ ]

overgraph = { }

for a in reads :
    
    a_suffix = a [ - k : ]
    
    a_suffix_reads = kmerDictionary [ a_suffix ]
    
    for b in a_suffix_reads : 
        
        if ( not a == b and overlap ( a, b, k ) ) :
            
            kmerPairs.append ( ( a, b ) )
            
            overgraph [ a ] = b 
            

?overgraph

CPU times: total: 0 ns
Wall time: 21.6 ms


[1;31mType:[0m        dict
[1;31mString form:[0m {'CTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAG <...> GTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACTAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGC'}
[1;31mLength:[0m      5274
[1;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [26]:
len ( kmerPairs )

5274

In [None]:
for pair in kmerPairs : 
    
    print ( set ( pair [ 0 ] ) )

5274


In [14]:
?kmerDictionary

[1;31mType:[0m        dict
[1;31mString form:[0m {'AGTAGTAATTCCTGCTTTATCAAGATAATT': {'TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATC <...> AACGCGAACAATTCAGCGGCTTTAACCGGACGGTCGGCCCCGATAATAATGATTGCCGTAAATTCAGGGCTTTCCAGGATTAGGCAGGCCGTTT'}}
[1;31mLength:[0m      108344
[1;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [16]:
aSuffixDictionary = {}

for a in reads :
    
    aSuffixDictionary [ a ] = reads[a]
    
    
    
    

SyntaxError: incomplete input (3370027603.py, line 5)

In [None]:
%%time 


# Returns all kmers of of min length k for a read 

k = 30

kmerExtract ( read, k )

kmerDict = { }

i = 0

for read in reads :
    
    if ( not i > 0 ) :
        
        kmerDict [ read ] = kmerExtract ( read, k )
        
    i += 1
    
kmerDict

In [77]:
%%time 


# Returns all kmers of of min length k for a read 

k = 30

kmerExtract ( read, k )

kmerDict = { }

i = 0

for read in reads :    
    
    if ( not i > 1 ) :
        
        kmerDict [ read ] = kmerExtract ( read, k )
        
    
    i += 1
    
?kmerDict

CPU times: total: 0 ns
Wall time: 1.5 ms


[1;31mType:[0m        dict
[1;31mString form:[0m {'TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAA <...> AAAGTGTTAACTTCTGCGTCATGGAC', 'ACAAGCAGTAGTAATTCCTGCTTTATCAAG', 'TTTTCGACTCATCAGAAATATACGAAAGTG'}}
[1;31mLength:[0m      2
[1;31mDocstring:[0m  
dict() -> new empty dictionary
dict(mapping) -> new dictionary initialized from a mapping object's
    (key, value) pairs
dict(iterable) -> new dictionary initialized as if via:
    d = {}
    for k, v in iterable:
        d[k] = v
dict(**kwargs) -> new dictionary initialized with the name=value pairs
    in the keyword argument list.  For example:  dict(one=1, two=2)

In [69]:
i = 0

pairset = set()

for a in kmerDict ( ) :
    
    temp = a
    
    sfxA = a [ - k : ]
    
    
    

AACTTCTGCGTCATGGAAGCGATAAAACTC TAAACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATCCGAAAGTGTTAACTTCTGCGTCATGGAAGCGATAAAACTC
CTTCTGCGTCATGGACACGAAAAAACTCCC AACAAGCAGTAGTAATTCCTGCTTTATCAAGATAATTTTTCGACTCATCAGAAATATACGAAAGTGTTAACTTCTGCGTCATGGACACGAAAAAACTCCC


In [86]:
for i in kmerDict :
    
    
    print ( kmerDict[i] )
    
    i

{'AGTAGTAATTCCTGCTTTATCAAGATAATT', 'TTCCTGCTTTATCAAGATAATTTTTCGACT', 'TCAGAAATATCCGAAAGTGTTAACTTCTGC', 'CCTGCTTTATCAAGATAATTTTTCGACTCA', 'CTTTATCAAGATAATTTTTCGACTCATCAG', 'AATATCCGAAAGTGTTAACTTCTGCGTCAT', 'AAAGTGTTAACTTCTGCGTCATGGAAGCGA', 'TTAACTTCTGCGTCATGGAAGCGATAAAAC', 'ATTTTTCGACTCATCAGAAATATCCGAAAG', 'CGACTCATCAGAAATATCCGAAAGTGTTAA', 'ATCAGAAATATCCGAAAGTGTTAACTTCTG', 'GCTTTATCAAGATAATTTTTCGACTCATCA', 'AGAAATATCCGAAAGTGTTAACTTCTGCGT', 'CAAGCAGTAGTAATTCCTGCTTTATCAAGA', 'TTTATCAAGATAATTTTTCGACTCATCAGA', 'AAACAAGCAGTAGTAATTCCTGCTTTATCA', 'AAGCAGTAGTAATTCCTGCTTTATCAAGAT', 'ATAATTTTTCGACTCATCAGAAATATCCGA', 'AATTCCTGCTTTATCAAGATAATTTTTCGA', 'GAAATATCCGAAAGTGTTAACTTCTGCGTC', 'AACTTCTGCGTCATGGAAGCGATAAAACTC', 'TCCGAAAGTGTTAACTTCTGCGTCATGGAA', 'CAGAAATATCCGAAAGTGTTAACTTCTGCG', 'GTTAACTTCTGCGTCATGGAAGCGATAAAA', 'ACTCATCAGAAATATCCGAAAGTGTTAACT', 'ATATCCGAAAGTGTTAACTTCTGCGTCATG', 'AGTGTTAACTTCTGCGTCATGGAAGCGATA', 'TATCAAGATAATTTTTCGACTCATCAGAAA', 'AAGATAATTTTTCGACTCATCAGAAATATC', 'GTAGTAATTCCT