# String matching of full names
> 1. Load data with full names
> 2. Perform string matching

In [1]:
# import sys
# sys.path.insert(0, '/healthcare-fraud/src/')
import src.scripts as src
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load both identifier tables through the project's read_from_efs helper.
# Later cells read the `city` and `full` columns of each table.
LEIE_id_full = src.read_from_efs('LEIE_id_full.csv')
PartD_id_full = src.read_from_efs('PartD_id_full.csv')

leie_shape, partd_shape = LEIE_id_full.shape, PartD_id_full.shape
print('LEIE_id_full shape: %s, PartD_id_full shape: %s' % (leie_shape, partd_shape))

LEIE_id_full shape: (14967, 7), PartD_id_full shape: (1294850, 8)


  mask |= (ar1 == a)


In [4]:
# Cities that appear in both tables.
# Missing city values are dropped first: a NaN never equals itself in the
# per-city filter used later, so it could only produce an empty task.
leie_cities = set(LEIE_id_full['city'].dropna())
partd_cities = set(PartD_id_full['city'].dropna())
# Sets have no stable order between kernel restarts, so sort to make
# `city_intersection[0]` and the downstream pair order reproducible.
# key=str keeps the sort safe if a non-string value slipped into the column.
city_intersection = sorted(leie_cities & partd_cities, key=str)
print('Number of cities common to LEIE and PartD: %d' %len(city_intersection))

Number of cities common to LEIE and PartD: 3723


In [5]:
# Grab the first shared city as a single sample value.
# NOTE(review): `each_city` is not referenced by any later cell visible here — confirm it is still needed.
each_city = city_intersection[0]

In [6]:
def build_pairs(city, leie=None, partd=None):
    """Return every (LEIE name, PartD name) combination for one city.

    Parameters
    ----------
    city : value compared against the `city` column of both tables.
    leie : DataFrame with `city` and `full` columns, optional.
        Defaults to the notebook-level `LEIE_id_full`.
    partd : DataFrame with `city` and `full` columns, optional.
        Defaults to the notebook-level `PartD_id_full`.

    Returns
    -------
    list of tuple
        One `(leie_full, partd_full)` tuple per combination, LEIE-major
        (all PartD names for the first LEIE name, then the second, ...).
        Empty when either table has no row for `city`.
    """
    from itertools import product

    # Fall back to the notebook globals so the one-argument call keeps working.
    leie = LEIE_id_full if leie is None else leie
    partd = PartD_id_full if partd is None else partd

    leie_names = leie.loc[leie['city'] == city, 'full']
    partd_names = partd.loc[partd['city'] == city, 'full']

    # Cartesian product of the two name lists for this city.
    return list(product(leie_names, partd_names))

In [7]:
# Run build_pairs once per shared city on a thread pool
from joblib import Parallel, delayed
arg_instances = city_intersection
pair_builder = Parallel(n_jobs=-1, verbose=5, backend="threading")
city_pairs = pair_builder(delayed(build_pairs)(city) for city in arg_instances)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 520 tasks      | elapsed:   50.9s
[Parallel(n_jobs=-1)]: Done 754 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1330 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1672 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 2050 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2464 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 2914 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 3400 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 3723 out of 3723 | elapsed:  6.1min finished


In [19]:
# Flatten the per-city lists into one flat list of name pairs
pairs = [pair for per_city in city_pairs for pair in per_city]
n_millions = round(len(pairs)/1000000, 1)
print('There are %s million pairs!' % n_millions)

There are 26.0 million pairs!


## String matching using Fuzzywuzzy
---

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import time
import multiprocessing

In [90]:
# Score every pair with fuzz.token_sort_ratio, spread over 3 worker processes
started_at = time.time()
if __name__ == '__main__':
    worker_pool = multiprocessing.Pool(processes=3)
    with worker_pool:
        fuzz_scores_token_sort = worker_pool.starmap(fuzz.token_sort_ratio, pairs)

finished_at = time.time()
print('Elapsed time: ', finished_at - started_at)
# Timings recorded on 100K pairs:
# fuzz.ratio: t = 3.63sec (results are terrible)
# fuzz.partial_ratio: t = 19.47sec (results just as terrible)
# fuzz.token_sort_ratio: t = 5.01sec (interesting! it's very accurate on specialty but not so accurate on names)
# fuzz.token_set_ratio: t = 5.23sec (similarly accurate on specialty but not so accurate on names)

Elapsed time:  5.2377519607543945


In [None]:
# multi-processing for ratio
s_time = time.time()
if __name__ == '__main__':
    with multiprocessing.Pool(processes=3) as pool:
        fuzz_scores = pool.starmap(fuzz.ratio, pairs)
        
e_time = time.time()
print('Elapsed time: ', e_time - s_time)

# order pairs from highest to lowest score
# (fuzz_scores_token_sort comes from the token_sort_ratio cell above and must
# have been computed on the same `pairs` list)
fuzzy_token_sort_ratio_match_pairs = [pairs[idx] for idx in np.argsort(fuzz_scores_token_sort)[::-1]]
fuzzy_ratio_match_pairs = [pairs[idx] for idx in np.argsort(fuzz_scores)[::-1]]

# make a dataframe and save results to efs
# Both columns are cut to the same number of top-ranked pairs: pandas refuses
# to build a frame from columns of different lengths, which is what happened
# when only one column was sliced and `pairs` held more than 100000 entries.
TOP_N = 100000
fuzzywuzzy_pairs_df = pd.DataFrame({'ratio(LEIE:PartD)':fuzzy_ratio_match_pairs[:TOP_N],
                                    'token_sort_ratio(LEIE:PartD)':fuzzy_token_sort_ratio_match_pairs[:TOP_N]})
# NOTE(review): the '.src' extension looks unusual next to the '.csv' inputs — confirm the intended file name.
src.save_to_efs(fuzzywuzzy_pairs_df, 'fuzzywuzzy_pairs_df.src')

In [121]:
# Sanity check: how many ranked pairs the ratio ordering produced
len(fuzzy_ratio_match_pairs)

100000

100000

(100000, 2)