# **Match as many records as possible.**

## _Step 0. Load packages_

In [77]:
import src.scripts as src
import pandas as pd
import numpy as np

## _Step 1. Load, clean, and prep data_
---

In [6]:
# load data
# Read the two input tables through the project helper `src.read_from_efs`.
# NOTE(review): the file names suggest LEIE exclusion records from 2014 onward
# and the raw Part D table — confirm against the data source.
LEIE_df = src.read_from_efs('LEIE_2014+.csv')
PartD_df = src.read_from_efs('raw_PartD.csv')

  mask |= (ar1 == a)


In [70]:
# Build the identifier tables: keep only the identifying columns of each
# source frame and collapse them to one row per distinct combination.
leie_identifier_columns = [
    'exclusion year',
    'NPI',
    'LASTNAME',
    'BUSNAME',
    'FIRSTNAME',
    'CITY',
    'SPECIALTY',
]
partd_identifier_columns = [
    'npi',
    'nppes_provider_last_org_name',
    'nppes_provider_first_name',
    'nppes_provider_city',
    'specialty_description',
]

LEIE_id1 = (
    LEIE_df.loc[:, leie_identifier_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
PartD_id1 = (
    PartD_df.loc[:, partd_identifier_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [71]:
# Give both identifier tables the same column names so they can be matched.

# rename columns in PartD
PartD_names_dict = {'nppes_provider_last_org_name': 'last/org name',
                    'nppes_provider_first_name': 'first name',
                    'nppes_provider_city': 'city',
                    'specialty_description': 'specialty'}
PartD_id = PartD_id1.rename(columns=PartD_names_dict)

# rename columns in LEIE
LEIE_names_dict = {'NPI': 'npi',
                   'LASTNAME': 'last/org name',
                   'FIRSTNAME': 'first name',
                   'CITY': 'city',
                   'SPECIALTY': 'specialty'}

# Rows with no `LASTNAME` take their name from `BUSNAME`; `BUSNAME` is then
# dropped so LEIE has a single "last/org name" column like PartD.
# This is built as a new frame instead of the chained assignment
# `LEIE_id1['LASTNAME'][mask] = ...`, which raises SettingWithCopyWarning and
# may write to a temporary copy. `LEIE_id1` itself is left unmodified, so the
# cell can be re-run without failing on an already-dropped `BUSNAME` column.
LEIE_id = (
    LEIE_id1
    .assign(LASTNAME=LEIE_id1['LASTNAME'].fillna(LEIE_id1['BUSNAME']))
    .drop(columns='BUSNAME')
    .rename(columns=LEIE_names_dict)
)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [72]:
# Preview the first rows of PartD_id to check the renamed columns.
PartD_id.head()

Unnamed: 0,npi,last/org name,first name,city,specialty
0,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,Internal Medicine
1,1003000142,KHALIL,RASHID,TOLEDO,Anesthesiology
2,1003000167,ESCOBAR,JULIO,DAYTON,Dentist
3,1003000282,BLAKEMORE,ROSIE,NASHVILLE,Nurse Practitioner
4,1003000407,GIRARDI,DAVID,PATTON,Family Practice


In [73]:
# Preview the first rows of LEIE_id to check the renamed columns.
LEIE_id.head()

Unnamed: 0,exclusion year,npi,last/org name,first name,city,specialty
0,2018,1922348218,184TH STREET PHARMACY CORP,,BRONX,PHARMACY
1,2018,0,A & H VITAMINS SUPPLY CORP,,TEANECK,PHARMACY
2,2017,1942476080,"A & Y MEDICAL SUPPLY, INC",,FOREST HILLS,DME - GENERAL
3,2017,1891731758,"A FAIR DEAL PHARMACY, INC",,BROOKLYN,PHARMACY
4,2017,0,"A SERVICE CAB CO, INC",,METAIRIE,TRANSPORTATION CO


## _Step 3. Match up ids_
> 1. Add `exclusion_method` column to PartD_id
> 2. Add `exclusion_year` column to PartD_id
> 3. Update all exclusions by `npi`
> 4. Save data frames for `fuzzy-match`
---

In [80]:
# Add the two label columns to PartD_id, filled with NaN for every row;
# they are populated later for the rows that match an exclusion record.
row_count = PartD_id.shape[0]
for label_column in ('exclusion_method', 'exclusion_year'):
    PartD_id[label_column] = [np.nan] * row_count

In [81]:
# Preview PartD_id again to confirm the two new exclusion columns are present.
PartD_id.head()

Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year
0,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,Internal Medicine,,
1,1003000142,KHALIL,RASHID,TOLEDO,Anesthesiology,,
2,1003000167,ESCOBAR,JULIO,DAYTON,Dentist,,
3,1003000282,BLAKEMORE,ROSIE,NASHVILLE,Nurse Practitioner,,
4,1003000407,GIRARDI,DAVID,PATTON,Family Practice,,


In [168]:
# NPIs that occur in both tables.
npi_intersection = set(PartD_id['npi']) & set(LEIE_id['npi'])

# Row-wise membership flags (plain lists of booleans) for each table.
tf_array_PartD = PartD_id['npi'].isin(npi_intersection).tolist()
tf_array_LEIE = LEIE_id['npi'].isin(npi_intersection).tolist()

# (exclusion year, npi) pairs of the LEIE rows whose NPI is also in PartD.
excluded_by_npi = LEIE_id.loc[tf_array_LEIE, ['exclusion year', 'npi']].values

In [163]:
# Number of distinct NPIs among the matched PartD rows.
len(set(PartD_id.loc[tf_array_PartD, 'npi']))

1079

In [164]:
# Number of distinct NPIs among the matched LEIE rows.
len(set(LEIE_id.loc[tf_array_LEIE, 'npi']))

1079

In [176]:
# Label every PartD row whose NPI appears in `excluded_by_npi`.
# Each entry of `excluded_by_npi` is an (exclusion year, npi) pair; if an NPI
# occurs more than once, the last pair wins, as in a row-by-row loop.
year_by_npi = {npi: year for year, npi in excluded_by_npi}
is_excluded = PartD_id['npi'].isin(list(year_by_npi))

# Assign whole columns so the write lands on PartD_id itself. The chained form
# `PartD_id[col][mask] = ...` raises SettingWithCopyWarning and may write to a
# temporary copy. A single map/mask pass also avoids rescanning the full NPI
# column once per excluded provider.
PartD_id['exclusion_method'] = PartD_id['exclusion_method'].mask(is_excluded, 'npi')
PartD_id['exclusion_year'] = PartD_id['exclusion_year'].mask(
    is_excluded, PartD_id['npi'].map(year_by_npi)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [184]:
# Rows of PartD_id that were labeled through an NPI match, re-indexed from 0.
labeled_by_npi = PartD_id['exclusion_method'].eq('npi')
PartD_id_labeled_npi = PartD_id.loc[labeled_by_npi].reset_index(drop=True)
PartD_id_labeled_npi.head()

Unnamed: 0,npi,last/org name,first name,city,specialty,exclusion_method,exclusion_year
0,1003042441,MANZO,MARK,GRAFTON,Physician Assistant,npi,2016.0
1,1003801044,AFZAL,FIAZ,KENNER,Internal Medicine,npi,2014.0
2,1003809195,MACKEY,JOSEPH,BLUE SPRINGS,Family Practice,npi,2014.0
3,1003811167,SORELL,PAUL,HUNTINGTON,Pain Management,npi,2017.0
4,1003817743,ECKMAN,CHARLES,OKLAHOMA CITY,Interventional Pain Management,npi,2017.0


In [190]:
# save data frames for `fuzzy-match`
PartD_id_unlabeled = PartD_id[PartD_id['exclusion_method'] != 'npi'].reset_index(drop=True)
print(PartD_id.shape, PartD_id_labeled_npi.shape, PartD_id_unlabeled.shape)

(1296102, 7) (1217, 7) (1294885, 7)


**_Now, the data formats are all matched up and we can save the tables to efs._**

In [182]:
# Save the cleaned tables to EFS via the project helper.
# NOTE(review): PartD_id_unlabeled is not written here — confirm whether the
# fuzzy-match step expects it as a file.
frames_to_save = [
    (PartD_id, 'PartD_id.csv'),
    (PartD_id_labeled_npi, 'PartD_id_labeled_npi.csv'),
    (LEIE_id, 'LEIE_id.csv'),
]
for frame, file_name in frames_to_save:
    src.save_to_efs(frame, file_name)