# **Match as many records as possible.**

## _Step 0. Load packages_

In [6]:
import sys
sys.path.insert(0, '/healthcare-fraud/src/')
import scripts as src
import pandas as pd
import numpy as np

## _Step 1. Load, clean, and prep data_
---

In [None]:
# load data
# Both tables are read through the project helper `src.read_from_efs`.
# NOTE(review): presumably each call returns a pandas DataFrame (later cells
# index them by column lists) — confirm against `scripts.read_from_efs`.
LEIE_df = src.read_from_efs('LEIE_2014+.csv')
PartD_df = src.read_from_efs('raw_PartD.csv')

In [None]:
# Build one de-duplicated identifier table per source, keeping only the
# columns that describe who a provider is.
leie_id_columns = [
    'exclusion year',
    'NPI',
    'LASTNAME',
    'BUSNAME',
    'FIRSTNAME',
    'CITY',
    'SPECIALTY',
]
partd_id_columns = [
    'npi',
    'nppes_provider_last_org_name',
    'nppes_provider_first_name',
    'nppes_provider_city',
    'specialty_description',
]

# Drop repeated identifier rows and renumber the index from zero.
LEIE_id1 = LEIE_df[leie_id_columns].drop_duplicates().reset_index(drop=True)
PartD_id1 = PartD_df[partd_id_columns].drop_duplicates().reset_index(drop=True)

In [None]:
# Rename the PartD identifier columns to the shared schema used for matching.
PartD_names_dict = {'nppes_provider_last_org_name': 'last/org name',
                    'nppes_provider_first_name': 'first name',
                    'nppes_provider_city': 'city',
                    'specialty_description': 'specialty'}
PartD_id = PartD_id1.rename(columns=PartD_names_dict)

# Replace missing values in `LASTNAME` with the values from `BUSNAME`.
# The write goes through a single `.loc` call: the chained form
# `df['LASTNAME'][mask] = ...` assigns into a temporary Series, which triggers
# SettingWithCopyWarning and leaves the frame unchanged under Copy-on-Write.
tf_array = LEIE_id1['LASTNAME'].isna()  # True where `LASTNAME` is missing
LEIE_id1.loc[tf_array, 'LASTNAME'] = LEIE_id1.loc[tf_array, 'BUSNAME']
# `BUSNAME` has been folded into `LASTNAME`, so the column is no longer needed.
LEIE_id1.drop('BUSNAME', axis=1, inplace=True)

# Rename the LEIE identifier columns to the same shared schema.
# ('exclusion year' is not in the mapping and keeps its name.)
LEIE_names_dict = {'NPI': 'npi',
                   'LASTNAME': 'last/org name',
                   'FIRSTNAME': 'first name',
                   'CITY': 'city',
                   'SPECIALTY': 'specialty'}
LEIE_id = LEIE_id1.rename(columns=LEIE_names_dict)


In [None]:
# preview the first rows of the renamed PartD identifier table
PartD_id.head()

In [None]:
# preview the first rows of the renamed LEIE identifier table
LEIE_id.head()

## _Step 2. Match up IDs_
> 1. Add `exclusion_method` column to PartD_id
> 2. Add `exclusion_year` column to PartD_id
> 3. Update all exclusions by `npi`
> 4. Save data frames for `fuzzy-match`
---

In [None]:
# Placeholder label columns; every row starts out unlabeled (NaN).
# `exclusion_method` is later filled with strings such as 'npi', so it is
# created as an object column: writing a string into a float64 NaN column is an
# incompatible-dtype assignment that recent pandas versions deprecate.
PartD_id['exclusion_method'] = pd.Series(np.nan, index=PartD_id.index, dtype=object)
# `exclusion_year` stays numeric; a scalar NaN is broadcast to every row.
PartD_id['exclusion_year'] = np.nan

In [None]:
# preview PartD_id again, now including the two empty exclusion columns
PartD_id.head()

In [None]:
# NPIs that occur in both the PartD and the LEIE identifier tables
partd_npis = set(PartD_id['npi'])
leie_npis = set(LEIE_id['npi'])
npi_intersection = partd_npis & leie_npis

# per-row membership flags (plain lists of booleans, one entry per row)
tf_array_PartD = [value in npi_intersection for value in PartD_id['npi'].values]
tf_array_LEIE = [value in npi_intersection for value in LEIE_id['npi'].values]

# (exclusion year, npi) pairs for the LEIE rows whose NPI also appears in PartD
excluded_by_npi = LEIE_id.loc[tf_array_LEIE, ['exclusion year', 'npi']].values

In [None]:
# number of distinct NPIs among the PartD rows flagged as shared with LEIE
len(set(PartD_id[tf_array_PartD]['npi']))

In [None]:
# number of distinct NPIs among the LEIE rows flagged as shared with PartD
len(set(LEIE_id[tf_array_LEIE]['npi']))

In [None]:
# Look up the exclusion year for each matched NPI. If an NPI occurs more than
# once in `excluded_by_npi`, the last year listed wins, which is also what a
# row-by-row overwrite would produce.
year_by_npi = {npi: year for year, npi in excluded_by_npi}

# `exclusion_method` receives string labels, so make sure it is an object
# column before writing into it (it may have been created as float64 NaN).
PartD_id['exclusion_method'] = PartD_id['exclusion_method'].astype(object)

# Label all matching rows in one vectorized pass. Writing through `.loc`
# updates PartD_id itself; the chained form `df[col][mask] = ...` assigns into
# a temporary Series and does nothing under pandas Copy-on-Write. It also
# avoids rescanning the whole `npi` column once per excluded provider.
matched_by_npi = PartD_id['npi'].isin(list(year_by_npi))
PartD_id.loc[matched_by_npi, 'exclusion_method'] = 'npi'
PartD_id.loc[matched_by_npi, 'exclusion_year'] = (
    PartD_id.loc[matched_by_npi, 'npi'].map(year_by_npi)
)

In [None]:
# rows of PartD_id whose exclusion was assigned through an exact NPI match
labeled_by_npi = PartD_id['exclusion_method'].eq('npi')
PartD_id_labeled_npi = PartD_id.loc[labeled_by_npi].reset_index(drop=True)
PartD_id_labeled_npi.head()

In [None]:
# everything not labeled through an NPI match is kept for `fuzzy-match`
not_labeled_by_npi = PartD_id['exclusion_method'].ne('npi')
PartD_id_unlabeled = PartD_id.loc[not_labeled_by_npi].reset_index(drop=True)
# sanity check: the labeled and unlabeled row counts should add up to the total
print(PartD_id.shape, PartD_id_labeled_npi.shape, PartD_id_unlabeled.shape)

**_Now the data formats all match, so we can save the tables to EFS._**

In [None]:
# save the clean tables to EFS, one CSV file per data frame
tables_to_save = [
    (PartD_id, 'PartD_id.csv'),
    (PartD_id_labeled_npi, 'PartD_id_labeled_npi.csv'),
    (PartD_id_unlabeled, 'PartD_id_unlabeled.csv'),
    (LEIE_id, 'LEIE_id.csv'),
]
for table, filename in tables_to_save:
    src.save_to_efs(table, filename)