# Overlaps and Edit Distance - An Analysis



> In computational linguistics and computer science, edit distance is a string metric, i.e. a way of quantifying how dissimilar two strings (e.g., words) are to one another, that is measured by counting the minimum number of operations required to transform one string into the other. Edit distances find applications in natural language processing, where automatic spelling correction can determine candidate corrections for a misspelled word by selecting words from a dictionary that have a low distance to the word in question. In bioinformatics, it can be used to quantify the similarity of DNA sequences, which can be viewed as strings of the letters A, C, G and T.

*[Edit Distance - Wikipedia](https://en.wikipedia.org/wiki/Edit_distance)*



In [1]:
from Py.geneReader import geneReader

filename = 'SeqFiles/chr1.GRCh38.excerpt.fasta'

# Keep the file open only while the reader runs; the context manager closes
# the handle even if geneReader raises, which the manual close () did not.
#
# NOTE(review): geneReader is given the path, not this handle, so the handle
# looks unused here -- confirm against Py.geneReader before removing the open.
with open ( filename, 'r' ) as data :

    reads = geneReader ( filename )

In [2]:
from Py.editDistance import editDistance

import numpy as np

In [3]:
# Pattern whose best approximate occurrence is searched for.
x = "GATTTACCAGATTGAG"

# Sequence to search in: whatever the reader cell stored in `reads`.
y = reads

# Edit-distance matrix; its rows are created by a later cell.
D = list()

In [4]:
# One row per pattern character plus the offset row, and one column per
# sequence character plus the offset column, every entry starting at 0.
#
# The matrix is built in a single assignment rather than by appending to the
# list created in an earlier cell, so re-running this cell cannot stack extra
# rows onto an already-populated matrix.
D = [[0] * (len(y) + 1) for _ in range(len(x) + 1)]

In [5]:
# Report the number of characters in the pattern x.
print('Length of pattern:', len(x))

Length of pattern: 16


In [16]:
# Report the number of characters in the sequence y.
print('Length of sequence:', len(y))

Length of sequence: 800000


In [7]:
# Snapshot the nested list as a 2-D array purely for display.
# np.array replaces np.matrix (a class NumPy no longer recommends) and the
# no-op .view () call is dropped; the printed text is the same.
D1 = np.array ( D )

print ( D1 )

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# (rows, columns) of the matrix: the nested list is viewed as an array first.
np.asarray(D).shape

(17, 800001)

In [9]:
# First column: the entry in row i is set to i, for the offset row and every
# pattern row.
column_height = len(x) + 1

for i in range(column_height):
    D[i][0] = i

In [10]:
# Snapshot the nested list as a 2-D array purely for display.
# np.array replaces np.matrix (a class NumPy no longer recommends) and the
# no-op .view () call is dropped; the printed text is the same.
D1 = np.array ( D )

print ( D1 )

[[ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 2  0  0 ...  0  0  0]
 ...
 [14  0  0 ...  0  0  0]
 [15  0  0 ...  0  0  0]
 [16  0  0 ...  0  0  0]]


In [11]:
# First row: every entry is written as 0, one column at a time.
first_row = D[0]

for j in range(len(y) + 1):
    first_row[j] = 0

In [12]:
# Snapshot the nested list as a 2-D array purely for display.
# np.array replaces np.matrix (a class NumPy no longer recommends) and the
# no-op .view () call is dropped; the printed text is the same.
D1 = np.array ( D )

print ( D1 )

[[ 0  0  0 ...  0  0  0]
 [ 1  0  0 ...  0  0  0]
 [ 2  0  0 ...  0  0  0]
 ...
 [14  0  0 ...  0  0  0]
 [15  0  0 ...  0  0  0]
 [16  0  0 ...  0  0  0]]


In [13]:
# Fill every remaining entry of the matrix, walking the rows from 1 and,
# within each row, the columns from 1.
for i in range(1, len(x) + 1):
    for j in range(1, len(y) + 1):
        # Coming from the left neighbour: a character is skipped, penalty 1.
        distHor = D[i][j - 1] + 1

        # Coming from the neighbour above: a character is skipped, penalty 1.
        distVer = D[i - 1][j] + 1

        # Coming from the upper-left neighbour: no penalty when the two
        # characters match, penalty 1 when they differ.
        distDiag = D[i - 1][j - 1] + (0 if x[i - 1] == y[j - 1] else 1)

        # The entry at row i, column j is the smallest of the three options.
        D[i][j] = min(distHor, distVer, distDiag)

In [14]:
# Snapshot the nested list as a 2-D array purely for display.
# np.array replaces np.matrix (a class NumPy no longer recommends) and the
# no-op .view () call is dropped; the printed text is the same.
D1 = np.array ( D )

print ( D1 )

[[ 0  0  0 ...  0  0  0]
 [ 1  1  1 ...  1  0  0]
 [ 2  2  2 ...  0  1  1]
 ...
 [14 13 12 ...  8  7  7]
 [15 14 13 ...  7  8  8]
 [16 15 14 ...  8  7  8]]


In [15]:
# The answer is the smallest entry in the last row of the matrix.
bottom_row = D[-1]

print(min(bottom_row))

2


In [None]:
from Py.geneReaderQ import geneReaderQ

filename = 'SeqFiles/ERR266411_1.for_asm.fastq'

# Keep the file open only while the reader runs; the context manager closes
# the handle even if geneReaderQ raises, which the manual close () did not.
#
# NOTE(review): geneReaderQ is given the path, not this handle, so the handle
# looks unused here -- confirm against Py.geneReaderQ before removing the open.
with open ( filename, 'r' ) as data :

    reads = geneReaderQ ( filename )

In [17]:
# Draft set-up cell: binds a start offset, two operands `a` and `b`, and a
# length `k` that is also used as `min_length`.
# NOTE(review): presumably parameters for an overlap search between `a` and
# `b` -- confirm the intent before building on this cell.
start = 0  # start all the way at the left

# NOTE(review): the assignment below has no right-hand side, so this cell is
# a SyntaxError as written -- supply the intended value for `a`.
a = 

# `b` is whatever the reader cell stored in `reads`.
b = reads 

k = 30

# min_length simply mirrors k.
min_length = k



In [2]:
# Unfinished draft cell: it does not parse as written. Review notes below
# mark each problem; the code itself is left untouched.
setobj = [ ]
dictobj = { }

# NOTE(review): this rebinds `reads` to a three-string toy list, replacing
# whatever an earlier cell loaded.
reads = ['ABCDEFG', 'EFGHIJ', 'HIJABC']

k = 3


# NOTE(review): `i` is used here before the loop below binds it, and `return`
# at module level is a SyntaxError -- this looks like a function body whose
# `def` line is missing; confirm the intended signature.
if len ( reads [ i ] ) < k :
                
    return 0
    
else :
        
    # NOTE(review): iterating `reads` yields the strings themselves, so
    # `reads [ i ]` would index the list with a string rather than a position.
    for i in reads :

        for j in reads [ i ] :
            
            # NOTE(review): the condition below is unfinished -- it lacks a
            # right-hand operand and the trailing colon.
            if ( ( j + 2 ) < )
            setobj.append ( reads [ i ] [ j ] )

        



'B'