# Build out `full` names for string matching
---

## _Step 0. Load packages_

In [1]:
import sys
sys.path.insert(0, '/healthcare-fraud/src/')
import scripts as src
import pandas as pd
import numpy as np

## _Step 1. Load, clean, and prep data_
---

In [2]:
# Read the two unlabeled identifier tables through the project's `read_from_efs` helper
LEIE_id_unlabeled = src.read_from_efs('LEIE_id_unlabeled.csv')
PartD_id_unlabeled = src.read_from_efs('PartD_id_unlabeled.csv')

# Sanity check: report the shape of each table right after loading
print(
    'Shape of LEIE_id_unlabeled: %s, PartD_id_unlabeled: %s.'
    % (LEIE_id_unlabeled.shape, PartD_id_unlabeled.shape)
)

Shape of LEIE_id_unlabeled: (14968, 6), PartD_id_unlabeled: (1294885, 7).


  mask |= (ar1 == a)


## Build out `full` names
> 1. start with identifying companies first
> 2. then move on to people
> 3. merge the full names for people and companies <br>
> _some people will do business as a company, e.g.: {'John Smith Dentist Houston':'John Smith & Associate LLC. Dentistry Houston'}_
---

In [6]:
# Rows with no `first name` are treated as companies; keep only those rows
# and renumber them from zero.
LEIE_id_comp = (
    LEIE_id_unlabeled
    .loc[lambda frame: frame['first name'].isna()]
    .reset_index(drop=True)
)
PartD_id_comp = (
    PartD_id_unlabeled
    .loc[lambda frame: frame['first name'].isna()]
    .reset_index(drop=True)
)
print(LEIE_id_comp.shape, PartD_id_comp.shape)

(241, 6) (763, 7)


In [7]:
# Build the `full` match key for companies: "<last/org name> <city>".
# `Series.str.cat` joins the two columns in one vectorized call instead of
# invoking a Python lambda once per row.
# NOTE: a missing name or city produces a missing `full` value here instead of
# raising, so check `full` for nulls if the inputs may be incomplete.
LEIE_id_comp['full'] = LEIE_id_comp['last/org name'].str.cat(LEIE_id_comp['city'], sep=' ')
PartD_id_comp['full'] = PartD_id_comp['last/org name'].str.cat(PartD_id_comp['city'], sep=' ')
print(len(LEIE_id_comp), len(PartD_id_comp))

241 763


In [8]:
# Preview the first five Part D company rows
PartD_id_comp.head(n=5)

Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year,full
0,1003040478,MONA TABIB MD INC,,ENCINO,Family Practice,,,MONA TABIB MD INC ENCINO
1,1003074600,GERIATRIC RESEARCH & CONSULTING GROUP LLC,,BEAR CREEK TOWNSHIP,Family Medicine,,,GERIATRIC RESEARCH & CONSULTING GROUP LLC BEAR...
2,1003838053,AUBURN FAMILY DENTAL,,AUBURN,Dentist,,,AUBURN FAMILY DENTAL AUBURN
3,1003879941,BARRY L GETZOFF ASSOCIATES,,LANGHORNE,Internal Medicine,,,BARRY L GETZOFF ASSOCIATES LANGHORNE
4,1003887118,EVMS ACADEMIC PHYSICIANS AND SURGEONS HEALTH S...,,NORFOLK,Internal Medicine,,,EVMS ACADEMIC PHYSICIANS AND SURGEONS HEALTH S...


### Now, let's focus on people
> 1. build out `full` names
> 2. perform NLTK string matching. Fuzzywuzzy isn't installing.

In [14]:
# filter out people: keep only rows where last name, first name, and city are
# all present, then renumber the rows from zero.
# `dropna(subset=...)` already removes rows with a missing `first name`, so no
# separate first-name mask (and the extra intermediate copy it creates) is needed.
required_cols = ['last/org name', 'first name', 'city']
LEIE_id_peop = (
    LEIE_id_unlabeled
    .dropna(axis=0, how='any', subset=required_cols)
    .reset_index(drop=True)
)
PartD_id_peop = (
    PartD_id_unlabeled
    .dropna(axis=0, how='any', subset=required_cols)
    .reset_index(drop=True)
)
print(LEIE_id_peop.shape, PartD_id_peop.shape)

(14726, 6) (1294086, 7)


In [15]:
# Sanity check: after dropping any row with a missing value in the three key
# columns, the row count should still equal len(LEIE_id_peop).
LEIE_id_peop.loc[:, ['last/org name', 'first name', 'city']].dropna().shape

(14726, 3)

In [16]:
# Build the `full` match key for people: "<last/org name> <first name> <city>".
# `Series.str.cat` joins the columns in one vectorized call; the previous
# row-wise `apply` ran a Python lambda once per row, which is slow on the
# Part D table (over a million rows).
# NOTE: a missing value in any of the three columns produces a missing `full`
# value instead of raising.
LEIE_id_peop['full'] = LEIE_id_peop['last/org name'].str.cat(
    [LEIE_id_peop['first name'], LEIE_id_peop['city']], sep=' '
)
PartD_id_peop['full'] = PartD_id_peop['last/org name'].str.cat(
    [PartD_id_peop['first name'], PartD_id_peop['city']], sep=' '
)
print(len(LEIE_id_peop), len(PartD_id_peop))

14726 1294086


In [17]:
# Preview the first five Part D people rows
PartD_id_peop.head(n=5)

Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year,full
0,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,Internal Medicine,,,ENKESHAFI ARDALAN CUMBERLAND
1,1003000142,KHALIL,RASHID,TOLEDO,Anesthesiology,,,KHALIL RASHID TOLEDO
2,1003000167,ESCOBAR,JULIO,DAYTON,Dentist,,,ESCOBAR JULIO DAYTON
3,1003000282,BLAKEMORE,ROSIE,NASHVILLE,Nurse Practitioner,,,BLAKEMORE ROSIE NASHVILLE
4,1003000407,GIRARDI,DAVID,PATTON,Family Practice,,,GIRARDI DAVID PATTON


## Finally, let's merge the lists and save the results to efs
---

In [20]:
# Stack the company rows on top of the people rows.
# Both inputs are numbered from zero, so `ignore_index=True` is needed to give
# the combined table a fresh, unique 0..n-1 index; without it every label in
# the company block would appear twice and label-based lookups would be ambiguous.
LEIE_id_full = pd.concat([LEIE_id_comp, LEIE_id_peop], axis=0, ignore_index=True)
print(LEIE_id_full.shape, LEIE_id_comp.shape, LEIE_id_peop.shape)

(14967, 7) (241, 7) (14726, 7)


In [21]:
# Stack the company rows on top of the people rows.
# Both inputs are numbered from zero, so `ignore_index=True` is needed to give
# the combined table a fresh, unique 0..n-1 index; without it every label in
# the company block would appear twice and label-based lookups would be ambiguous.
PartD_id_full = pd.concat([PartD_id_comp, PartD_id_peop], axis=0, ignore_index=True)
print(PartD_id_full.shape, PartD_id_comp.shape, PartD_id_peop.shape)

(1294850, 8) (763, 8) (1294087, 8)


In [22]:
# save to efs
# Hand the merged LEIE and Part D tables to the project's `save_to_efs` helper,
# each with its target file name.
# NOTE(review): where the files land and how they are written (e.g. whether the
# index is included) is decided inside `save_to_efs`, which is not visible here — confirm.
src.save_to_efs(LEIE_id_full, "LEIE_id_full.csv")
src.save_to_efs(PartD_id_full, "PartD_id_full.csv")