In [1]:
import numpy as np
import pandas as pd
import sys

# Year whose names are clustered, and how many of the alphabetically first
# rows of that year are kept for the demo.
YEAR = 2012
N_NAMES = 100

# Filter to the target year *before* sorting so that only that year's rows are
# sorted rather than the whole file. A stable sort keeps rows sharing a name
# (e.g. one row per gender) in their original file order, making the selected
# rows deterministic across runs.
df = (
    pd.read_csv("data/national_names.csv")  # import our data
    .loc[lambda frame: frame["Year"] == YEAR]  # filter by the year
    .sort_values("Name", kind="mergesort")  # similar names end up adjacent
    .reset_index()  # previous row labels are preserved in an "index" column
    .iloc[:N_NAMES]  # select the first N_NAMES rows
    .copy()  # own the data: columns are added to df later, not to a slice
)
ls_Names = df["Name"].tolist()  # convert the column to a list


In [2]:
# Clustering threshold: form_groups adds a name to a group only when its
# Levenshtein distance to every current member is strictly below this value.
DISTANCE_LENGTH = 2

def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Parameters
    ----------
    seq1, seq2 : sequence
        Any sized, iterable sequences whose elements can be compared with
        ``==`` (typically strings).

    Returns
    -------
    int
        The edit distance; ``0`` when the sequences are equal, and the length
        of the other sequence when one of them is empty.
    """
    # Only the previous row of the dynamic-programming table is ever read, so
    # two rows are kept instead of a full (len(seq1)+1) x (len(seq2)+1) matrix.
    # previous[y] is the distance between the first x-1 items of seq1 and the
    # first y items of seq2; row 0 is "insert y items".
    previous = list(range(len(seq2) + 1))

    for x, item1 in enumerate(seq1, start=1):
        # Column 0: delete the first x items of seq1.
        current = [x]
        for y, item2 in enumerate(seq2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            current.append(
                min(
                    previous[y] + 1,                      # deletion
                    previous[y - 1] + substitution_cost,  # match / substitution
                    current[y - 1] + 1,                   # insertion
                )
            )
        previous = current

    return previous[-1]

def form_groups(data, threshold=None):
    """Greedily cluster names by Levenshtein distance.

    Names are processed in the order given. Each name is appended to the first
    existing group in which its distance to *every* member is strictly less
    than ``threshold``; if no group qualifies, the name starts a new group.
    Because the assignment is greedy, the result depends on the order of
    ``data``.

    Parameters
    ----------
    data : iterable of str
        Names to cluster. Repeated names are kept, so a group may contain the
        same name more than once.
    threshold : number, optional
        Exclusive upper bound on the distance between a name and every member
        of the group it joins. When omitted, the module-level
        ``DISTANCE_LENGTH`` is read at call time.

    Returns
    -------
    list of list
        The groups, in order of creation.

    Side effects
    ------------
    Prints the number of groups that were formed.
    """
    if threshold is None:
        threshold = DISTANCE_LENGTH

    grs = []
    for name in data:
        for g in grs:
            if all(levenshtein(name, w) < threshold for w in g):
                g.append(name)
                break
        else:
            # No existing group accepted the name (the loop never hit break).
            grs.append([name])

    print('output groups:', len(grs))
    return grs

def allocate_groups(grs, data):
    """Return, for every item of ``data``, the index of the group holding it.

    The result always has exactly one entry per item of ``data`` and in the
    same order, so it can be assigned directly as a column next to the data.

    Parameters
    ----------
    grs : sequence of sequences
        Groups of hashable items (e.g. the lists of names produced by
        ``form_groups``).
    data : iterable
        Items to look up.

    Returns
    -------
    list
        ``result[k]`` is the index in ``grs`` of the first group containing
        ``data[k]``, or ``None`` when no group contains it.
    """
    # Build the item -> group index table once instead of scanning every group
    # for every item. setdefault keeps the first (lowest) index when an item
    # occurs in several groups or several times within one group.
    group_of = {}
    for i, group in enumerate(grs):
        for member in group:
            group_of.setdefault(member, i)

    return [group_of.get(checkagainst) for checkagainst in data]


In [3]:
# Cluster the names, then look up the group index for every entry of ls_Names.
grs = form_groups(ls_Names)
allocated_groups = allocate_groups(grs, ls_Names)

# Store the group index next to each row; ls_Names was built from df["Name"],
# so the list is expected to line up with the rows of df.
df["allocated_group"] = allocated_groups


output groups: 57


In [4]:
# Preview the first ten groups.
grs[0:10]


[['Aaban', 'Aadan', 'Aahan', 'Aaran'],
 ['Aabha'],
 ['Aadam', 'Aagam'],
 ['Aadarsh', 'Aakarsh'],
 ['Aaden', 'Aadin', 'Aadon', 'Aadyn'],
 ['Aadhav'],
 ['Aadhavan'],
 ['Aadhira'],
 ['Aadhya', 'Aadya'],
 ['Aadi', 'Aadi', 'Aadil', 'Aadit', 'Aadiv']]

In [5]:
# Show the rows whose allocated_group is 0, i.e. the members of grs[0].
df.loc[df["allocated_group"] == 0]


Unnamed: 0,index,Id,Name,Year,Gender,Count,allocated_group
0,1751999,1752000,Aaban,2012,M,11,0
3,1752001,1752002,Aadan,2012,M,11,0
28,1749489,1749490,Aahan,2012,M,21,0
96,1752509,1752510,Aaran,2012,M,10,0
