# String matching of full names
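> 1. Load the LEIE and PartD identifier tables, which include full names
> 2. Build candidate name pairs for every city common to both tables
> 3. Score each candidate pair with fuzzywuzzy string-matching ratios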

In [1]:
import sys
sys.path.insert(0, '/healthcare-fraud/src/')
import scripts as src
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
!pip install tqdm
from tqdm import tqdm_notebook
import time
from itertools import product
%matplotlib inline

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Read the LEIE and PartD identifier tables through the project's EFS helper,
# then report the dimensions of each table as a quick sanity check.
LEIE_id_full = src.read_from_efs('LEIE_id_full.csv')
PartD_id_full = src.read_from_efs('PartD_id_full.csv')
print(
    'LEIE_id_full shape: %s, PartD_id_full shape: %s'
    % (LEIE_id_full.shape, PartD_id_full.shape)
)

LEIE_id_full shape: (14967, 7), PartD_id_full shape: (1294850, 8)


  mask |= (ar1 == a)


In [3]:
# Keep only the cities present in both tables: names are compared city by city,
# so a city missing from either table cannot contribute any candidate pair.
cities = list(set(LEIE_id_full['city']) & set(PartD_id_full['city']))
print('Number of cities common to LEIE and PartD: %d' % len(cities))

Number of cities common to LEIE and PartD: 3723


In [4]:
def build_pairs(city):
    """Return every (LEIE name, PartD name) combination for a single city.

    Reads the module-level ``LEIE_id_full`` and ``PartD_id_full`` frames,
    takes the ``'full'`` column of the rows whose ``'city'`` equals ``city``,
    and forms the cartesian product of the two selections.

    Parameters
    ----------
    city : value compared against the ``'city'`` column of both frames.

    Returns
    -------
    list of 2-tuples ``(LEIE full name, PartD full name)``; LEIE names vary
    slowest. The list is empty when either table has no row for ``city``.
    """
    leie_in_city = LEIE_id_full.loc[LEIE_id_full['city'] == city, 'full']
    partd_in_city = PartD_id_full.loc[PartD_id_full['city'] == city, 'full']
    return [
        (leie_name, partd_name)
        for leie_name in leie_in_city
        for partd_name in partd_in_city
    ]

In [5]:
# Show how many processors the host exposes, by counting the "processor"
# lines in /proc/cpuinfo through an IPython shell escape (this file exists on
# Linux only). The worker pools in this notebook are created with
# processes=64, which matches the count printed for the recorded run.
print("Number of available CPU cores: ")
!cat /proc/cpuinfo | grep processor | wc -l

Number of available CPU cores: 
64


In [13]:
# Construct the per-city name pairs concurrently, one task per common city
if __name__ == '__main__':
    tasks = cities
    function = build_pairs
    with Pool(processes=64) as workers:
        # imap yields results in task order, so pairs[i] holds the list of
        # name pairs built for cities[i]; tqdm only adds a progress bar.
        pairs = list(
            tqdm_notebook(workers.imap(function, tasks), total=len(tasks))
        )

HBox(children=(IntProgress(value=0, max=3723), HTML(value='')))

In [14]:
# Key each city's list of name pairs by its city (pairs[i] belongs to cities[i])
city_pairs_dict = dict(zip(cities, pairs))

# Flatten the per-city lists into one list of name pairs, in city order.
# NOTE: this rebinds `pairs` from a list of lists to a flat list of tuples.
pairs = [
    name_pair
    for city_name in cities
    for name_pair in city_pairs_dict[city_name]
]
print('There are %s million pairs!' % round(len(pairs) / 1000000, 1))

There are 26.0 million pairs!


## String matching using Fuzzywuzzy
---

In [15]:
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz

def get_fuzz_scores(str_pair):
    """Score one pair of strings with eight fuzzywuzzy scorers.

    Parameters
    ----------
    str_pair : indexable whose items 0 and 1 are the two strings to compare.

    Returns
    -------
    list with one score per scorer, in this fixed order: QRatio, UQRatio,
    UWRatio, WRatio, partial_ratio, partial_token_sort_ratio,
    token_set_ratio, token_sort_ratio.
    """
    first, second = str_pair[0], str_pair[1]
    # The order here defines the positional meaning of the returned scores.
    scorers = (
        fuzz.QRatio,
        fuzz.UQRatio,
        fuzz.UWRatio,
        fuzz.WRatio,
        fuzz.partial_ratio,
        fuzz.partial_token_sort_ratio,
        fuzz.token_set_ratio,
        fuzz.token_sort_ratio,
    )
    return [scorer(first, second) for scorer in scorers]

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
# Get fuzzywuzzy scores for every name pair in parallel using multi-processing
if __name__ == '__main__':
    tasks = pairs
    function = get_fuzz_scores
    with Pool(processes=64) as p:
        # Each task is one cheap string comparison. With imap's default
        # chunksize of 1, every one of the ~26 million pairs is pickled and
        # dispatched on its own, so inter-process overhead dominates the run.
        # Sending the pairs to the workers in batches removes that overhead.
        # imap still yields results in task order, so fuzz_scores[i]
        # continues to correspond to pairs[i].
        fuzz_scores = list(tqdm_notebook(p.imap(function, tasks, chunksize=10000),
                                         total=len(tasks)))

HBox(children=(IntProgress(value=0, max=25991160), HTML(value='')))

In [None]:
# Create a pandas dataframe of fuzz scores: one row per name pair (the pair
# itself is the index) and one column per scorer.
# The labels must follow the order of the list returned by get_fuzz_scores
# (QRatio, UQRatio, UWRatio, WRatio, partial_ratio, ...). The third and fourth
# labels were previously both 'UWQratio', which duplicated a column name and
# mislabelled the UWRatio and WRatio scores.
columns = ['Qratio','UQratio','UWratio','Wratio','partial_ratio','partial_token_sort_ratio','token_set_ratio','token_sort_ratio']
fuzz_scores_df = pd.DataFrame(data=fuzz_scores, index=pairs, columns=columns)

# save to efs
src.save_to_efs(fuzz_scores_df, 'fuzz_scores_df.csv')

In [None]:
# Box plot of each scorer's distribution over all candidate name pairs.
# DataFrame.boxplot returns the matplotlib Axes it drew on; use it to add the
# title and axis labels so the figure can be read on its own.
ax = fuzz_scores_df.boxplot(figsize=(12, 6))
ax.set_title('Distribution of fuzzywuzzy scores across candidate name pairs')
ax.set_xlabel('Scorer')
ax.set_ylabel('Score')
plt.show()

In [None]:
# Show the 100 name pairs with the highest partial_ratio score
(
    fuzz_scores_df
    .sort_values(by='partial_ratio', ascending=False)
    .head(100)
)