# Testing the Levenshtein algorithm on the pharmacy CSV file

Levenshtein function

In [1]:
import numpy as np
def levenshtein(s, t, ratio_calc = False):
    """Compute the Levenshtein distance, or similarity ratio, between two strings.

    A (len(s)+1) x (len(t)+1) matrix is filled so that distance[i, j] holds the
    edit cost between the first i characters of s and the first j characters
    of t; the bottom-right cell is the cost for the full strings.

    Parameters
    ----------
    s, t : str
        The strings to compare. Either (or both) may be empty.
    ratio_calc : bool, optional
        If falsy (default), a substitution costs 1 and the plain edit distance
        is reported. If truthy, a substitution costs 2 (to align the results
        with those of the Python Levenshtein package) and a similarity ratio
        is returned instead.

    Returns
    -------
    str or float
        With ratio_calc falsy: the message "The strings are N edits away".
        With ratio_calc truthy: (len(s) + len(t) - cost) / (len(s) + len(t)),
        a value between 0 and 1 where 1 means identical strings. Two empty
        strings are treated as identical and yield 1.0.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Border cells: turning a prefix into the empty string (or back) costs one
    # edit per character. The two borders are filled independently so that an
    # empty s or t still gets a correctly initialised row/column.
    for i in range(1, rows):
        distance[i][0] = i
    for k in range(1, cols):
        distance[0][k] = k

    # The substitution cost depends only on the mode, so decide it once.
    substitution_cost = 2 if ratio_calc else 1

    # Fill the matrix column by column with the cheapest way to reach each cell.
    for col in range(1, cols):
        for row in range(1, rows):
            # Matching characters cost nothing; otherwise pay for a substitution.
            cost = 0 if s[row-1] == t[col-1] else substitution_cost
            distance[row][col] = min(distance[row-1][col] + 1,        # Cost of deletions
                                     distance[row][col-1] + 1,        # Cost of insertions
                                     distance[row-1][col-1] + cost)   # Cost of substitutions

    # Read the result from the bottom-right cell explicitly instead of relying
    # on the loop variables, which are never bound when a string is empty.
    edits = distance[rows - 1][cols - 1]

    if ratio_calc:
        total_length = len(s) + len(t)
        if total_length == 0:
            # Two empty strings are identical; avoid dividing by zero.
            return 1.0
        return (total_length - edits) / total_length

    # This is the minimum number of edits needed to convert string s to string t
    return "The strings are {} edits away".format(edits)

**Import the csv file into a pandas object**

In [2]:
import pandas as pd

In [12]:
# The file is semicolon-separated; use its first column as the row index.
df = pd.read_csv('ph1.csv', sep=';', index_col=0)

In [13]:
# Preview only the first rows; rendering the whole frame floods the notebook output.
df.head(10)

Unnamed: 0,pharmacie,lien,access,coordonnee
0,Pharmacie Nasri,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.562002091995716, -7.672472274085294"
1,Pharmacie Nasser,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.561755848280285, -7.6044817881133895"
2,Pharmacie Nassim Islane,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.51445505683616, -7.666007543630826"
3,Pharmacie Nassime,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.5999362186051, -7.588839631211711"
4,Pharmacie Nationale Diouri,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.570732870, -7.600641480"
5,Pharmacie Nejma,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.57454957680463, -7.676419742582766"
6,Pharmacie Nejmat Sidi Moumen,https://www.telecontact.ma/annonceur/pharmacie...,NO ACCESS,NO COORDINATES
7,Pharmacie Nice,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.5983, -7.64958"
8,Pharmacie Nigelle,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.53338031653804, -7.616745156671868"
9,Pharmacie Nisrine,https://www.telecontact.ma/annonceur/pharmacie...,https://www.telecontact.ma/plan-acces/pharmaci...,"33.56658458094873, -7.572862314430957"


The spelling of some words can vary from one source to another, especially when they are transliterated from other languages (in this case Arabic).

To solve this problem we can use a string-matching approach, and more specifically the Levenshtein algorithm, which computes a "closeness ratio" between two strings based on the number of edits needed to turn a string A into a string B.

## Example 

We are going to try to find the coordinates of the pharmacy "Anfa Bay", supposing it was misspelled as "Anfa Bey", by finding the name whose ratio is closest to 1.

In [21]:
# Misspelled name we want to resolve against the known pharmacy names.
target = 'Pharmacie Anfa Bey'

high = 0    # best similarity ratio seen so far
lev = None  # name achieving `high`; stays None if no name scores above 0
for i in df['pharmacie']:
    # Compute the ratio once and reuse it for both the display and the comparison.
    ratio = levenshtein(i, target, ratio_calc=True)
    print(ratio)
    if ratio > high:
        high = ratio
        lev = i

0.647058823529
0.685714285714
0.619047619048
0.666666666667
0.577777777778
0.705882352941
0.553191489362
0.727272727273
0.666666666667
0.666666666667
0.666666666667
0.594594594595
0.631578947368
0.611111111111
0.648648648649
0.666666666667
0.521739130435
0.585365853659
0.648648648649
0.578947368421
0.564102564103
0.634146341463
0.611111111111
0.615384615385
0.628571428571
0.648648648649
0.585365853659
0.648648648649
0.631578947368
0.5
0.615384615385
0.536585365854
0.604651162791
0.594594594595
0.651162790698
0.577777777778
0.578947368421
0.611111111111
0.594594594595
0.565217391304
0.6
0.558139534884
0.604651162791
0.634146341463
0.6
0.6
0.742857142857
0.577777777778
0.585365853659
0.631578947368
0.6
0.571428571429
0.631578947368
0.571428571429
0.6
0.648648648649
0.684210526316
0.6
0.545454545455
0.509803921569
0.727272727273
0.604651162791
0.648648648649
0.647058823529
0.631578947368
0.705882352941
0.648648648649
0.55
0.615384615385
0.48
0.611111111111
0.611111111111
0.666666666667
0.

In [23]:
# Show the best ratio first, then the pharmacy name that achieved it.
for best in (high, lev):
    print(best)

0.918918918919
Pharmacie  Anfa Bay


## Make the process more efficient

We do so by lowercasing all the strings before comparing them, so that the matching is case-insensitive.

In [24]:
# Lower-case the misspelled name once; it does not change between iterations.
target = 'Pharmacie Anfa Bey'.lower()

high = 0    # best similarity ratio seen so far
lev = None  # name achieving `high`; stays None if no name scores above 0
for i in df['pharmacie']:
    # Compare case-insensitively, computing the ratio only once per name.
    ratio = levenshtein(i.lower(), target, ratio_calc=True)
    print(ratio)
    if ratio > high:
        high = ratio
        # Keep the original (non-lowercased) name so it can be looked up in df.
        lev = i

0.705882352941
0.742857142857
0.666666666667
0.722222222222
0.622222222222
0.705882352941
0.595744680851
0.727272727273
0.666666666667
0.666666666667
0.666666666667
0.648648648649
0.684210526316
0.666666666667
0.702702702703
0.666666666667
0.608695652174
0.585365853659
0.648648648649
0.578947368421
0.564102564103
0.634146341463
0.666666666667
0.615384615385
0.685714285714
0.648648648649
0.585365853659
0.648648648649
0.631578947368
0.5
0.615384615385
0.585365853659
0.604651162791
0.648648648649
0.651162790698
0.622222222222
0.631578947368
0.611111111111
0.648648648649
0.608695652174
0.6
0.558139534884
0.604651162791
0.634146341463
0.65
0.6
0.742857142857
0.577777777778
0.634146341463
0.631578947368
0.6
0.571428571429
0.631578947368
0.571428571429
0.6
0.648648648649
0.684210526316
0.6
0.545454545455
0.549019607843
0.727272727273
0.651162790698
0.702702702703
0.647058823529
0.631578947368
0.705882352941
0.648648648649
0.6
0.615384615385
0.52
0.666666666667
0.611111111111
0.666666666667
0.

The effect is more noticeable on the farthest elements.

## Extracting the coordinates

In [30]:
# Pick the 'coordonnee' value(s) of every row whose name equals the best match.
is_best_match = df['pharmacie'] == lev
coord = df.loc[is_best_match, 'coordonnee'].values

In [38]:
# Convert the array of matching coordinate values into a plain Python list.
lcoord = coord.tolist()

In [39]:
# Sanity check: display the type produced by the conversion.
type(lcoord)

list

In [40]:
# Show the coordinates retrieved for the best-matching pharmacy name.
print(lcoord)

['33.56933945843312, -7.696156179686795']
