In [1]:
import numpy as np
import pandas as pd

In [2]:
#Loading the cleaned LEIE exclusion list from CSV
#(the file is expected to already have rows without an NPI and duplicate rows removed - TODO confirm against the cleaning step)
leie_NPI = pd.read_csv(
    'LEIE_NPI_Clean.csv',
)

In [3]:
#Previewing the first five rows to confirm the file was parsed into the expected columns
leie_NPI.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
0,,,,184TH STREET PHARMACY CORP,PHARMACY,1922348218,69 E 184TH ST,BRONX,NY,10468,2018,1128a1
1,,,,"A & Y MEDICAL SUPPLY, INC",DME - GENERAL,1942476080,"6310 108TH STREET, APT 6J",FOREST HILLS,NY,11375,2017,1128b8
2,,,,"A CARING ALTERNATIVE, INC",HOME HEALTH AGENCY,1275600959,"1229 HURON RD E, FLR 6TH",CLEVELAND,OH,44115,2013,1128a1
3,,,,"A FAIR DEAL PHARMACY, INC",PHARMACY,1891731758,"C/O P O BOX 329014, #69709-05",BROOKLYN,NY,11232,2017,1128b8
4,,,,"ACACIA MENTAL HEALTH CLINIC, L",MENTAL/BEHAVIORAL HE,1851631543,5228 W FOND DU LAC AVENUE,MILWAUKEE,WI,53216,2019,1128b7


In [4]:
#Checking the dimensions of the LEIE data; .shape is (number of rows, number of columns)
leie_NPI.shape

(4634, 12)

In [5]:
#Loading the 2017 Provider Summary Data, which is stored as a tab-delimited text file
#NOTE(review): nppes_provider_zip5 is displayed as a float (e.g. 21502.0) later in this notebook,
#so any leading zeros in ZIP codes would be lost - consider reading that column as a string
Provider = pd.read_csv(
    'Provider_2017_72Columns.txt',
    sep='\t',
)

In [6]:
#Checking the dimensions of the provider data; .shape is (number of rows, number of columns)
Provider.shape

(1162898, 72)

In [7]:
#Looking at columns for Provider data to find the key to join on (the NPI column here is lowercase 'npi')
Provider.columns

Index(['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name',
       'nppes_provider_mi', 'nppes_credentials', 'nppes_provider_gender',
       'nppes_entity_code', 'nppes_provider_street1', 'nppes_provider_city',
       'nppes_provider_zip5', 'nppes_provider_state', 'nppes_provider_country',
       'specialty_description', 'description_flag',
       'medicare_prvdr_enroll_status', 'total_claim_count',
       'total_30_day_fill_count', 'total_drug_cost', 'total_day_supply',
       'bene_count', 'ge65_suppress_flag', 'total_claim_count_ge65',
       'total_30_day_fill_count_ge65', 'total_drug_cost_ge65',
       'total_day_supply_ge65', 'bene_count_ge65_suppress_flag',
       'bene_count_ge65', 'brand_suppress_flag', 'brand_claim_count',
       'brand_drug_cost', 'generic_suppress_flag', 'generic_claim_count',
       'generic_drug_cost', 'other_suppress_flag', 'other_claim_count',
       'other_drug_cost', 'mapd_suppress_flag', 'mapd_claim_count',
       'mapd_drug_cost', 'pd

In [8]:
#Giving the LEIE key column the same name ('npi') as the key column in the Provider data,
#so that both dataframes can be joined on a single shared column heading
leie_NPI = leie_NPI.rename(columns={'NPI': 'npi'})

In [9]:
#Confirming renaming worked correctly: the key column should now be shown as 'npi'
leie_NPI.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,npi,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
0,,,,184TH STREET PHARMACY CORP,PHARMACY,1922348218,69 E 184TH ST,BRONX,NY,10468,2018,1128a1
1,,,,"A & Y MEDICAL SUPPLY, INC",DME - GENERAL,1942476080,"6310 108TH STREET, APT 6J",FOREST HILLS,NY,11375,2017,1128b8
2,,,,"A CARING ALTERNATIVE, INC",HOME HEALTH AGENCY,1275600959,"1229 HURON RD E, FLR 6TH",CLEVELAND,OH,44115,2013,1128a1
3,,,,"A FAIR DEAL PHARMACY, INC",PHARMACY,1891731758,"C/O P O BOX 329014, #69709-05",BROOKLYN,NY,11232,2017,1128b8
4,,,,"ACACIA MENTAL HEALTH CLINIC, L",MENTAL/BEHAVIORAL HE,1851631543,5228 W FOND DU LAC AVENUE,MILWAUKEE,WI,53216,2019,1128b7


In [10]:
#Left-joining the LEIE exclusions onto the 2017 provider data by npi.
#how='left' keeps every Provider row; indicator=True adds a '_merge' column that is
#'both' when the npi was also found in leie_NPI and 'left_only' when it was not.
Combined_2017_NPI = pd.merge(Provider, leie_NPI, how = 'left', on = 'npi', sort=False, indicator = True)

#A left join repeats a Provider row once per matching LEIE row, so a repeated npi in leie_NPI
#would silently inflate both the row count and the number of providers flagged as excluded.
assert len(Combined_2017_NPI) == len(Provider), 'merge changed the number of provider rows'

In [11]:
#Exploring the merge: previewing the first five rows, including the '_merge' indicator column
Combined_2017_NPI.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_city,nppes_provider_zip5,...,MIDNAME,BUSNAME,SPECIALTY,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type,_merge
0,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,CUMBERLAND,21502.0,...,,,,,,,,,,left_only
1,1003000142,KHALIL,RASHID,,M.D.,M,I,4126 N HOLLAND SYLVANIA RD,TOLEDO,43623.0,...,,,,,,,,,,left_only
2,1003000167,ESCOBAR,JULIO,E,DDS,M,I,5 PINE CONE RD,DAYTON,89403.0,...,,,,,,,,,,left_only
3,1003000175,REYES-VASQUEZ,BELINDA,,D.D.S.,F,I,322 N AZUSA AVE STE 202,LA PUENTE,91744.0,...,,,,,,,,,,left_only
4,1003000282,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,NASHVILLE,37243.0,...,,,,,,,,,,left_only


In [12]:
#Checking the dimensions of the merged data; a left join keeps every Provider row,
#so the row count should not be lower than Provider's
Combined_2017_NPI.shape

(1162898, 84)

In [13]:
#Checking to see how many matched: 'both' counts provider rows whose npi was found in leie_NPI,
#'left_only' counts provider rows with no LEIE match
Combined_2017_NPI['_merge'].value_counts()

left_only     1162451
both              447
right_only          0
Name: _merge, dtype: int64

In [14]:
#Looking at the distribution of exclusion years among matched providers to confirm the timeframe
#makes sense with 2017 provider data and decide if we should choose a different Provider year dataset.
#value_counts() leaves out missing values by default, so providers without a LEIE match are not counted.
Combined_2017_NPI['EXCLYear'].value_counts()

2019.0    161
2018.0    141
2017.0     87
2020.0     53
2016.0      3
2015.0      1
2012.0      1
Name: EXCLYear, dtype: int64

Provider claims data is publicly available for 2013-2017. A large share of the exclusions that matched this 2017 data are dated 2019 and 2020, so we should look at the other years to explore how long providers typically continue to receive claims before they are excluded. We need to balance the matching timeframe: it must be wide enough that we can label enough claims as fraud, but narrow enough that those claims are plausibly connected to the reason the provider was excluded.

In [15]:
#Adding a 0/1 indicator column: 1 when the merge found the provider's npi in the LEIE data
#('_merge' equals 'both'), 0 otherwise
Combined_2017_NPI['exclusion_flag'] = (Combined_2017_NPI['_merge'] == 'both').astype('int64')

In [16]:
#Confirming exclusion column added correctly: the number of 1s should equal the 'both' count from the merge
Combined_2017_NPI['exclusion_flag'].value_counts()

0    1162451
1        447
Name: exclusion_flag, dtype: int64

In [17]:
#Removing the LEIE name, specialty and address fields along with the '_merge' indicator;
#EXCLYear and excl_type are not in the drop list, so they stay in the result
Combined_2017_NPI_2 = Combined_2017_NPI.drop(
    columns=[
        'LASTNAME', 'FIRSTNAME', 'MIDNAME', 'BUSNAME', 'SPECIALTY',
        'ADDRESS', 'CITY', 'STATE', 'ZIP',
        '_merge',
    ]
)

In [18]:
#Confirming data looks ready to save in a new file: previewing the first five rows after the column drop
Combined_2017_NPI_2.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_city,nppes_provider_zip5,...,antipsych_bene_count_ge65,average_age_of_beneficiaries,beneficiary_female_count,beneficiary_male_count,beneficiary_nondual_count,beneficiary_dual_count,beneficiary_average_risk_score,EXCLYear,excl_type,exclusion_flag
0,1003000126,ENKESHAFI,ARDALAN,,M.D.,M,I,900 SETON DR,CUMBERLAND,21502.0,...,,72.0,142.0,92.0,143.0,91.0,2.1685,,,0
1,1003000142,KHALIL,RASHID,,M.D.,M,I,4126 N HOLLAND SYLVANIA RD,TOLEDO,43623.0,...,0.0,65.0,184.0,92.0,143.0,133.0,1.8029,,,0
2,1003000167,ESCOBAR,JULIO,E,DDS,M,I,5 PINE CONE RD,DAYTON,89403.0,...,0.0,72.0,16.0,17.0,,,1.0598,,,0
3,1003000175,REYES-VASQUEZ,BELINDA,,D.D.S.,F,I,322 N AZUSA AVE STE 202,LA PUENTE,91744.0,...,0.0,,,,,,,,,0
4,1003000282,BLAKEMORE,ROSIE,K,FNP,F,I,TENNESSEE PRISON FOR WOMEN,NASHVILLE,37243.0,...,0.0,62.0,,,,,4.5148,,,0


In [19]:
#Saving the combined table as a tab-separated text file (row index not written)
#so it can be used as the dataset moving forward
Combined_2017_NPI_2.to_csv(
    'Providers_Connected_byNPI.txt',
    sep='\t',
    index=False,
)