# String Metrics

Some examples for string similarity.

In [1]:
#!pip install python-Levenshtein

In [2]:
#!pip install nltk

In [3]:
#!pip install textdistance
# https://pypi.org/project/textdistance/

In [4]:
import numpy as np
import textdistance
import Levenshtein
import nltk
from scipy.spatial import distance
from sklearn.metrics import jaccard_score

## Levenshtein

In [5]:
string1 = "bbva"
string2 = "bbvaelmejorbanco"

In [6]:
def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    Fills a dynamic-programming table in which ``matrix[x, y]`` is the
    minimum number of single-element insertions, deletions and
    substitutions needed to turn ``seq1[:x]`` into ``seq2[:y]``.

    Parameters
    ----------
    seq1, seq2 : sequence
        Indexable sequences (typically ``str``) whose elements support ``==``.

    Returns
    -------
    int
        The edit distance; ``0`` when both sequences are equal.
    """
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    # Edit distances are counts, so keep the table integral instead of the
    # float64 default of np.zeros (which made the function return e.g. 12.0).
    matrix = np.zeros((size_x, size_y), dtype=int)
    # Turning a prefix into the empty sequence (or back) costs its length.
    matrix[:, 0] = np.arange(size_x)
    matrix[0, :] = np.arange(size_y)

    for x in range(1, size_x):
        for y in range(1, size_y):
            # Matching elements can be carried over for free.
            substitution_cost = 0 if seq1[x - 1] == seq2[y - 1] else 1
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,                      # deletion
                matrix[x - 1, y - 1] + substitution_cost,  # match / substitution
                matrix[x, y - 1] + 1,                      # insertion
            )
    # Convert the numpy scalar to a built-in int for a clean return type.
    return int(matrix[size_x - 1, size_y - 1])

levenshtein(string1, string2)

12.0

In [7]:
def levenshtein2(str1, str2):
    """Return the Levenshtein distance between ``str1`` and ``str2``.

    Pure-Python dynamic programme: ``table[r][c]`` holds the edit distance
    between the first ``r`` elements of ``str1`` and the first ``c``
    elements of ``str2``.
    """
    n_rows = len(str1) + 1
    n_cols = len(str2) + 1

    table = [[0] * n_cols for _ in range(n_rows)]
    # Distance to/from the empty prefix is just the prefix length.
    for r in range(n_rows):
        table[r][0] = r
    for c in range(n_cols):
        table[0][c] = c

    for r in range(1, n_rows):
        for c in range(1, n_cols):
            insertion = table[r][c - 1] + 1
            deletion = table[r - 1][c] + 1
            # A substitution is free when the two elements already match.
            mismatch = 0 if str1[r - 1] == str2[c - 1] else 1
            substitution = table[r - 1][c - 1] + mismatch
            table[r][c] = min(insertion, deletion, substitution)

    return table[n_rows - 1][n_cols - 1]

levenshtein2(string1, string2)

12

In [8]:
Levenshtein.distance(string1, string2)

12

In [9]:
textdistance.levenshtein(string1, string2)

12

In [10]:
"""
distance_ratio = (Levenshtein Distance)/(Alignment length )

p = (1 - L/m)

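Where L is the Levenshtein distance and m is the sum of the lengths of the two words.
Note: the equality holds for this example because string1 is a prefix of string2, so only
insertions are needed; in general Levenshtein.ratio counts a substitution as cost 2,
unlike Levenshtein.distance.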
"""
Levenshtein.ratio(string1, string2) == (1 - Levenshtein.distance(string1, string2)/len(string1+string2))

True

## Jaro

In [11]:
string1 = "bbva"
string2 = "bbvaelmejorbanco"

In [12]:
Levenshtein.jaro(string1, string2)

0.75

In [13]:
textdistance.jaro.similarity(string1, string2)

0.75

## Hamming

In [14]:
string1 = "bbqva"
string2 = "bBrve"

In [15]:
Levenshtein.hamming(string1, string2) # hamming expects two strings of the same length

3

In [16]:
textdistance.hamming(string1, string2)

3

In [17]:
textdistance.hamming.similarity(string1, string2)

2

## Jaccard

The Jaccard index, or Jaccard similarity coefficient, is defined as the size of the intersection divided by the size of the union of two label sets. It is used to compare the set of predicted labels for a sample to the corresponding set of real labels. For the strings below, each string is treated as the set of its distinct characters.

In [18]:
string1 = "bbva"
string2 = "bbvaelmejorbanco"

array1 = np.array([1, 1, 0])
array2 = np.array([1, 0, 1])

In [19]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two iterables, treated as sets.

    The similarity is ``|A ∩ B| / |A ∪ B|`` where ``A`` and ``B`` are the
    sets of distinct elements of ``list1`` and ``list2``.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Collections to compare (a ``str`` is compared by its characters).

    Returns
    -------
    float
        A value in ``[0, 1]``: ``1.0`` for identical sets, ``0.0`` for
        disjoint ones. Two empty inputs are considered identical and
        yield ``1.0``.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: the ratio would be 0/0, so return the
        # conventional value for two identical (empty) sets instead of
        # raising ZeroDivisionError.
        return 1.0
    return len(s1 & s2) / len(union)

1 - jaccard_similarity(string1, string2) # Jaccard distance: 0 = identical sets, 1 = disjoint sets

0.7272727272727273

In [20]:
# https://www.nltk.org/_modules/nltk/metrics/distance.html

word1 = set(string1)
word2 = set(string2)
 
nltk.jaccard_distance(word1, word2) # 0 = identical sets, 1 = disjoint sets

0.7272727272727273

In [21]:
# Function used by nltk
def jaccard_distance(label1, label2):
    """Return the Jaccard distance between two iterables, treated as sets.

    Computed as ``(|A ∪ B| - |A ∩ B|) / |A ∪ B|``: ``0.0`` when both sets
    hold the same elements and ``1.0`` when they share none.

    Raises
    ------
    ZeroDivisionError
        If both inputs are empty (the union has no elements).
    """
    first, second = set(label1), set(label2)
    union_size = len(first | second)
    shared_size = len(first & second)
    return (union_size - shared_size) / union_size

jaccard_distance(string1, string2)

0.7272727272727273

In [22]:
from scipy.spatial import distance
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jaccard.html

distance.jaccard(array1, array2)

0.6666666666666666

In [23]:
#from sklearn.metrics import jaccard_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html

1 - jaccard_score(array1, array2)

0.6666666666666667