# Experimenting With Levenshtein Distance

---



In [None]:
import numpy as np
import pandas as pd 
!pip install names
import names
from random import randint, randrange, seed
!pip install python-levenshtein
import Levenshtein

#### First, let's create some random names and IDs<br> and put them into a DataFrame, denoted GT (ground truth):



In [None]:
# Build the ground-truth table: 10 random full names, each paired with a
# random 9-digit ID string.
# The seed is fixed so the draws from `random` repeat across runs
# (presumably `names` draws from the same generator -- verify).
seed(10)
data = {'Full_Name': [], 'ID': []}
for _ in range(10):
    # Draw the name before the digits so the order of random calls is stable.
    full_name = names.get_full_name()
    id_digits = [str(randint(0, 9)) for _ in range(9)]
    data['Full_Name'].append(full_name)
    data['ID'].append(''.join(id_digits))
GT = pd.DataFrame(data)
GT

Unnamed: 0,Full_Name,ID
0,Frank Huang,377420875
1,Thomas Twehous,506295664
2,Lois Tibbetts,527379609
3,Joseph Long,485358767
4,Johnny Fletcher,823630074
5,Jack Carrozza,269592117
6,Leopoldo Coleman,566734298
7,Ross Hart,422275563
8,Justin Davis,531476926
9,Arlene Ramos,848797127


#### Let's create an input DataFrame with a subset of the names, containing some typos

In [None]:
# Input names as they might be typed: a misspelling, an extra suffix and
# altered punctuation. NOTE: `input` shadows the builtin of the same name;
# it is kept because later cells refer to it.
typo_names = ['Jack Caroza', 'Leopoldo Coleman JR', 'John-Fletcher']
input = pd.DataFrame({'Full_Name': typo_names})
input

Unnamed: 0,Full_Name
0,Jack Caroza
1,Leopoldo Coleman JR
2,John-Fletcher


### Now, I'll print the Levenshtein distance between every input name and every ground-truth name

In [None]:
# For each input name, print its Levenshtein distance to every ground-truth
# name and record the ground-truth name(s) at the smallest distance.
# Result: dict_names maps input name -> list of closest ground-truth names.
dict_names = {name: [] for name in input.Full_Name.tolist()}
gt_names = GT.Full_Name.tolist()
for string1 in dict_names:
    # Compute each distance exactly once and reuse it for both the printed
    # report and the minimum search.
    scored = []
    for string2 in gt_names:
        dist = Levenshtein.distance(string1, string2)
        scored.append((string2, dist))
        print(f'The distance between {string1} and {string2} is: {dist}')
    if not scored:
        # Empty ground truth: nothing to match against, keep the empty list.
        continue
    # Use the true minimum instead of a hard-coded starting cap, and avoid
    # rebinding the builtins `min`/`max` as variables.
    best = min(dist for _, dist in scored)
    # Filtering by the minimum replaces an unbounded backwards index walk that
    # ran off the front of the list when every kept entry shared the minimum.
    # Ties are listed last-seen first, so element [0] is the tied name that
    # appears latest in GT.
    dict_names[string1] = [cand for cand, dist in reversed(scored) if dist == best]

The distance between Jack Caroza and Frank Huang is: 9
The distance between Jack Caroza and Thomas Twehous is: 12
The distance between Jack Caroza and Lois Tibbetts is: 12
The distance between Jack Caroza and Joseph Long is: 9
The distance between Jack Caroza and Johnny Fletcher is: 13
The distance between Jack Caroza and Jack Carrozza is: 2
The distance between Jack Caroza and Leopoldo Coleman is: 13
The distance between Jack Caroza and Ross Hart is: 8
The distance between Jack Caroza and Justin Davis is: 10
The distance between Jack Caroza and Arlene Ramos is: 10
The distance between Leopoldo Coleman JR and Frank Huang is: 16
The distance between Leopoldo Coleman JR and Thomas Twehous is: 16
The distance between Leopoldo Coleman JR and Lois Tibbetts is: 16
The distance between Leopoldo Coleman JR and Joseph Long is: 15
The distance between Leopoldo Coleman JR and Johnny Fletcher is: 15
The distance between Leopoldo Coleman JR and Jack Carrozza is: 17
The distance between Leopoldo Col

Finally, by minimizing the distance, I can match each input name to its ground-truth name:

In [None]:
# Print every input name next to its list of closest ground-truth names,
# with a 50-character rule of '#' before the first entry and after each one.
separator = "#" * 50
print(separator)
for typed_name, matches in dict_names.items():
    print(f"{typed_name}: {matches}")
    print(separator)

##################################################
Jack Caroza: ['Jack Carrozza']
##################################################
Leopoldo Coleman JR: ['Leopoldo Coleman']
##################################################
John-Fletcher: ['Johnny Fletcher']
##################################################


### After correcting the faulty inputs, I will extract the information I need for the input by joining the tables on the corrected name:

In [None]:
# Display the input DataFrame as entered (names not yet corrected).
input

Unnamed: 0,Full_Name
0,Jack Caroza
1,Leopoldo Coleman JR
2,John-Fletcher


In [None]:
# Replace each input name with the first of its closest ground-truth names
# and keep the result as a one-column DataFrame with a fresh 0..n-1 index.
corrected = input['Full_Name'].map(lambda typed: dict_names[typed][0])
df = corrected.reset_index(drop=True).to_frame(name='Full_Name')
df

Unnamed: 0,Full_Name
0,Jack Carrozza
1,Leopoldo Coleman
2,Johnny Fletcher


# Now, finally, we can get the corresponding IDs for the corrected input:

In [None]:
# Look up each corrected name in the ground truth (indexed by name) to
# attach its ID column.
gt_by_name = GT.set_index('Full_Name')
df.join(gt_by_name, on='Full_Name')


Unnamed: 0,Full_Name,ID
0,Jack Carrozza,269592117
1,Leopoldo Coleman,566734298
2,Johnny Fletcher,823630074
