# Medicare Fraud Detection

### Data 698: Data Science Research Project - Spring 2019

### Student: Chunhui Zhu

Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2016.csv: https://data.cms.gov/Medicare-Physician-Supplier/Medicare-Provider-Utilization-and-Payment-Data-Phy/utc4-f9xp

LEIE2019.csv: https://oig.hhs.gov/exclusions/exclusions_list.asp

# PART I : Data Mining

In [112]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as plt

### Step 1: LEIE-Fraud Data - Label Data

In [109]:
leie=pd.read_csv('LEIE2019.csv',low_memory=False)
leie.columns.values

array(['LASTNAME', 'FIRSTNAME', 'MIDNAME', 'BUSNAME', 'GENERAL',
       'SPECIALTY', 'UPIN', 'NPI', 'DOB', 'ADDRESS', 'CITY', 'STATE',
       'ZIP', 'EXCLTYPE', 'EXCLDATE', 'REINDATE', 'WAIVERDATE',
       'WVRSTATE'], dtype=object)

In [110]:
#NPI is a very important unique id to identify a provider.
npi_leie= leie[leie.NPI!= 0]

In [145]:
fraud_list=npi_leie.NPI.values.tolist()

In [146]:
len(fraud_list)

5036

### Step 2: Detect fraud data between 2012-2016 using LEIE-Fraude Data

### 2016 

In [147]:
def find_fraud(checklist, fraud_list):
    fraud=[]   
    for x in checklist:
        if x in fraud_list:
            fraud.append(x)
    return (fraud)

In [177]:
data2016=pd.read_table('Medicare_Provider_Util_Payment_PUF_CY2016.txt',low_memory=False)

In [178]:
data2016.shape

(9714897, 26)

In [180]:
data2016.columns.values

array(['NPI', 'NPPES_PROVIDER_LAST_ORG_NAME', 'NPPES_PROVIDER_FIRST_NAME',
       'NPPES_PROVIDER_MI', 'NPPES_CREDENTIALS', 'NPPES_PROVIDER_GENDER',
       'NPPES_ENTITY_CODE', 'NPPES_PROVIDER_STREET1',
       'NPPES_PROVIDER_STREET2', 'NPPES_PROVIDER_CITY',
       'NPPES_PROVIDER_ZIP', 'NPPES_PROVIDER_STATE',
       'NPPES_PROVIDER_COUNTRY', 'PROVIDER_TYPE',
       'MEDICARE_PARTICIPATION_INDICATOR', 'PLACE_OF_SERVICE',
       'HCPCS_CODE', 'HCPCS_DESCRIPTION', 'HCPCS_DRUG_INDICATOR',
       'LINE_SRVC_CNT', 'BENE_UNIQUE_CNT', 'BENE_DAY_SRVC_CNT',
       'AVERAGE_MEDICARE_ALLOWED_AMT', 'AVERAGE_SUBMITTED_CHRG_AMT',
       'AVERAGE_MEDICARE_PAYMENT_AMT', 'AVERAGE_MEDICARE_STANDARD_AMT'],
      dtype=object)

In [181]:
checklist2016=data2016['NPI'].unique()
len(checklist2016)

1000925

In [182]:
fraud_2016=find_fraud(checklist2016,fraud_list)
print(fraud_2016)

[1003811167, 1003817743, 1003822834, 1003859885, 1003892746, 1003904830, 1013009729, 1013087741, 1013095975, 1013999002, 1023004587, 1023006129, 1033126651, 1043215650, 1043590250, 1053360966, 1053486704, 1063436517, 1063452167, 1063486090, 1063501823, 1063575561, 1063611499, 1063696508, 1063851285, 1073511192, 1083614770, 1083654826, 1083667562, 1083681423, 1093726556, 1093755134, 1104898634, 1104913789, 1104924349, 1114951407, 1124000427, 1124057245, 1124088380, 1124296298, 1124356043, 1134213796, 1134228794, 1134280779, 1134285059, 1134401136, 1134575681, 1144228834, 1144503418, 1154413110, 1154471621, 1164414769, 1164436960, 1164458006, 1164515003, 1164598140, 1174545271, 1174571285, 1184710691, 1184727745, 1184785354, 1194702076, 1194745695, 1194801662, 1205806163, 1205824117, 1205840451, 1205954955, 1205963402, 1215053665, 1215328323, 1215941349, 1215963715, 1215999578, 1225067044, 1225111933, 1225144264, 1235200775, 1235211095, 1245210764, 1245231406, 1245246925, 1245337864, 124

In [183]:
len(fraud_2016)

281

In [185]:
fraud_df16=pd.DataFrame(fraud_2016)

In [284]:
fraud_df16.columns=['Fraud_id_16']

In [285]:
import pickle
fraud_df16_pkl=open("fraud_df16_pickle","wb")
pickle.dump(fraud_df16,fraud_df16_pkl)
fraud_df16_pkl.close()

In [286]:
fraud_df16_pkl=open("fraud_df16_pickle","rb")
fraud_df16=pickle.load(fraud_df16_pkl)
print(fraud_df16.head(5))

   Fraud_id_16
0   1003811167
1   1003817743
2   1003822834
3   1003859885
4   1003892746


In [363]:
fraud_df16.to_csv('fraud_df16.csv')

### 2015

In [171]:
data2015=pd.read_table('Medicare_Provider_Util_Payment_PUF_CY2015.txt',low_memory=False)

In [261]:
data2015.shape

(9497892, 26)

In [172]:
checklist2015=data2015['npi'].unique()
len(checklist2015)

968418

In [173]:
fraud_2015=find_fraud(checklist2015,fraud_list)
print(fraud_2015)

[1003042441, 1003811167, 1003817743, 1003822834, 1003859885, 1003885344, 1003892746, 1003904830, 1003972670, 1013009729, 1013087741, 1013095975, 1013097708, 1013999002, 1023004587, 1023006129, 1023079274, 1023119898, 1023166410, 1033126651, 1033145487, 1043215650, 1043219405, 1043238421, 1043257744, 1053360966, 1053372201, 1053450346, 1053486704, 1063436517, 1063452167, 1063477503, 1063486090, 1063501823, 1063526580, 1063537215, 1063575561, 1063611499, 1063696508, 1063851285, 1073511192, 1073524823, 1083614770, 1083654826, 1083667562, 1083681423, 1093726556, 1093755134, 1093814717, 1093851164, 1104809706, 1104898634, 1104913789, 1104924349, 1114106689, 1114922606, 1114951407, 1124000427, 1124016480, 1124057245, 1124088380, 1124089016, 1124192042, 1124296298, 1124356043, 1124423983, 1134213796, 1134228794, 1134280779, 1134285059, 1134286958, 1144214578, 1144228834, 1144503418, 1144511700, 1154337129, 1154349330, 1154413110, 1154471621, 1164414769, 1164436960, 1164458006, 1164459350, 116

In [175]:
len(fraud_2015)

475

In [186]:
fraud_df15=pd.DataFrame(fraud_2015)

In [287]:
fraud_df15.columns=['Fraud_id_15']

In [288]:
fraud_df15_pkl=open("fraud_df15_pickle","wb")
pickle.dump(fraud_df15,fraud_df15_pkl)
fraud_df15_pkl.close()

In [289]:
fraud_df15_pkl=open("fraud_df15_pickle","rb")
fraud_df15=pickle.load(fraud_df15_pkl)
print(fraud_df15.head(5))

   Fraud_id_15
0   1003042441
1   1003811167
2   1003817743
3   1003822834
4   1003859885


In [None]:
fraud_df15.to_csv('fraud_df15.csv')

### 2014

In [165]:
data2014=pd.read_table('Medicare_Physician_and_Other_Supplier_NPI_Aggregate_CY2014.txt',low_memory=False)

In [262]:
data2014.shape

(986677, 70)

In [168]:
checklist2014=data2014['npi'].unique()
len(checklist2014)

986677

In [169]:
fraud_2014=find_fraud(checklist2014,fraud_list)
print(fraud_2014)

[1003811167, 1003817743, 1003822834, 1003859885, 1003878711, 1003885344, 1003892746, 1003904830, 1003972670, 1013009729, 1013021047, 1013056670, 1013087741, 1013095975, 1013097708, 1013957182, 1013999002, 1023004587, 1023006129, 1023079274, 1023119898, 1023166410, 1033126651, 1033145487, 1033261953, 1033295332, 1043213887, 1043215650, 1043217052, 1043219405, 1043238421, 1043257744, 1053303792, 1053360966, 1053372201, 1053450346, 1053458018, 1053486704, 1053499673, 1063436517, 1063452167, 1063477503, 1063482198, 1063486090, 1063499572, 1063501823, 1063526580, 1063537215, 1063562635, 1063575561, 1063611499, 1063696508, 1063851285, 1073511192, 1073524823, 1073589420, 1073808945, 1083614770, 1083654826, 1083667562, 1083681423, 1083857270, 1083949143, 1093726556, 1093755134, 1093779456, 1093814717, 1093851164, 1093927576, 1104809706, 1104834209, 1104898634, 1104913789, 1104924349, 1114106689, 1114922606, 1114932084, 1114951407, 1124000427, 1124016480, 1124055900, 1124057245, 1124064639, 112

In [170]:
len(fraud_2014)

722

In [187]:
fraud_df14=pd.DataFrame(fraud_2014)

In [290]:
fraud_df14.columns=['Fraud_id_14']

In [291]:
fraud_df14_pkl=open("fraud_df14_pickle","wb")
pickle.dump(fraud_df14,fraud_df14_pkl)
fraud_df14_pkl.close()

In [292]:
fraud_df14_pkl=open("fraud_df14_pickle","rb")
fraud_df14=pickle.load(fraud_df14_pkl)
print(fraud_df14.head(5))

   Fraud_id_14
0   1003811167
1   1003817743
2   1003822834
3   1003859885
4   1003878711


In [362]:
fraud_df14.to_csv('fraud_df14.csv')

### 2013

In [158]:
data2013=pd.read_table('Medicare_Provider_Util_Payment_PUF_CY2013.txt',low_memory=False)

In [263]:
data2013.shape

(9287877, 28)

In [161]:
data2013.columns.values

array(['NPI', 'NPPES_PROVIDER_LAST_ORG_NAME', 'NPPES_PROVIDER_FIRST_NAME',
       'NPPES_PROVIDER_MI', 'NPPES_CREDENTIALS', 'NPPES_PROVIDER_GENDER',
       'NPPES_ENTITY_CODE', 'NPPES_PROVIDER_STREET1',
       'NPPES_PROVIDER_STREET2', 'NPPES_PROVIDER_CITY',
       'NPPES_PROVIDER_ZIP', 'NPPES_PROVIDER_STATE',
       'NPPES_PROVIDER_COUNTRY', 'PROVIDER_TYPE',
       'MEDICARE_PARTICIPATION_INDICATOR', 'PLACE_OF_SERVICE',
       'HCPCS_CODE', 'HCPCS_DESCRIPTION', 'HCPCS_DRUG_INDICATOR',
       'LINE_SRVC_CNT', 'BENE_UNIQUE_CNT', 'BENE_DAY_SRVC_CNT',
       'AVERAGE_MEDICARE_ALLOWED_AMT', 'STDEV_MEDICARE_ALLOWED_AMT',
       'AVERAGE_SUBMITTED_CHRG_AMT', 'STDEV_SUBMITTED_CHRG_AMT',
       'AVERAGE_MEDICARE_PAYMENT_AMT', 'STDEV_MEDICARE_PAYMENT_AMT'],
      dtype=object)

In [162]:
checklist2013=data2013['NPI'].unique()
len(checklist2013)

909606

In [163]:
fraud_2013=find_fraud(checklist2013,fraud_list)
print(fraud_2013)

[1003042441, 1003809195, 1003811167, 1003817743, 1003822834, 1003854159, 1003859885, 1003878711, 1003886979, 1003892746, 1003902800, 1003904830, 1003972670, 1003999376, 1013009729, 1013056670, 1013059740, 1013063064, 1013087741, 1013093178, 1013095975, 1013097708, 1013957182, 1013999002, 1023000122, 1023004587, 1023006129, 1023079274, 1023094190, 1023119898, 1023166410, 1023208675, 1033126651, 1033136544, 1033145487, 1033206800, 1033261953, 1033295332, 1043213887, 1043215650, 1043217052, 1043219405, 1043238421, 1043257744, 1043312168, 1043369093, 1053303792, 1053354787, 1053357376, 1053360966, 1053372201, 1053393405, 1053417345, 1053450346, 1053458018, 1053486704, 1053499673, 1063417368, 1063436517, 1063452167, 1063477503, 1063482198, 1063486090, 1063499572, 1063501823, 1063526580, 1063537215, 1063562635, 1063575561, 1063583060, 1063611499, 1063696508, 1063851285, 1073511192, 1073524823, 1073589420, 1073808945, 1083614770, 1083654826, 1083667562, 1083681423, 1083726921, 1083763874, 108

In [164]:
len(fraud_2013)

906

In [188]:
fraud_df13=pd.DataFrame(fraud_2013)

In [293]:
fraud_df13.columns=['Fraud_id_13']

In [294]:
fraud_df13_pkl=open("fraud_df13_pickle","wb")
pickle.dump(fraud_df13,fraud_df13_pkl)
fraud_df13_pkl.close()

In [295]:
fraud_df13_pkl=open("fraud_df13_pickle","rb")
fraud_df13=pickle.load(fraud_df13_pkl)
print(fraud_df13.head(5))

   Fraud_id_13
0   1003042441
1   1003809195
2   1003811167
3   1003817743
4   1003822834


In [361]:
fraud_df13.to_csv('fraud_df13.csv')

### 2012

In [122]:
data2012=pd.read_table('Medicare_Provider_Util_Payment_PUF_CY2012.txt',low_memory=False)

In [264]:
data2012.shape

(9153272, 28)

In [149]:
checklist2012=data2012['National Provider Identifier'].unique()
len(checklist2012)

880644

In [150]:
fraud_2012=find_fraud(checklist2012,fraud_list)
print(fraud_2012)

[1255378592, 1871522540, 1154337129, 1689986945, 1225148562, 1063575561, 1760541312, 1508125600, 1598713430, 1730291261, 1972687754, 1851329429, 1124057245, 1134401136, 1346224987, 1578754362, 1063436517, 1043238421, 1770553521, 1225029275, 1932260064, 1972614451, 1104860253, 1245298371, 1538359104, 1477645836, 1376596726, 1346438140, 1346274966, 1154391001, 1982601704, 1326065913, 1114932084, 1629085105, 1316008766, 1306919162, 1144214578, 1063452167, 1588694343, 1588783351, 1578549051, 1205813441, 1164414769, 1093851164, 1770667677, 1356354252, 1841230166, 1679565204, 1700907771, 1164458006, 1558478529, 1033145487, 1417931213, 1922068733, 1750451613, 1922058767, 1528284502, 1881622090, 1619952561, 1902916984, 1205000353, 1306831300, 1982637856, 1306853221, 1831145929, 1295836245, 1205023512, 1245390020, 1083803563, 1205954955, 1316984750, 1841303310, 1407923568, 1699717843, 1528052735, 1760576755, 1699748228, 1538237482, 1215917042, 1356450274, 1669403457, 1124000427, 1659324747, 158

In [174]:
len(fraud_2012)

1149

In [296]:
fraud_df12.columns=['Fraud_id_12']

In [297]:
fraud_df12_pkl=open("fraud_df12_pickle","wb")
pickle.dump(fraud_df12,fraud_df12_pkl)
fraud_df12_pkl.close()

In [298]:
fraud_df12_pkl=open("fraud_df12_pickle","rb")
fraud_df12=pickle.load(fraud_df12_pkl)
print(fraud_df12.head(5))

   Fraud_id_12
0   1255378592
1   1871522540
2   1154337129
3   1689986945
4   1225148562


In [360]:
fraud_df12.to_csv('fraud_df12.csv')

### Step 3:  Fraud Data in between 2012-2016 Data Sets

#### Total number of pysicians found in 2_2019 LEIE between 2012-2016

In [277]:
combined_fraud_id=pd.concat([fraud_df12, fraud_df13], axis=0, ignore_index=True)
combined_fraud_id=pd.concat([combined_fraud_id, fraud_df14], axis=0, ignore_index=True)
combined_fraud_id=pd.concat([combined_fraud_id, fraud_df15], axis=0, ignore_index=True)
combined_fraud_id=pd.concat([combined_fraud_id, fraud_df16], axis=0, ignore_index=True)

In [278]:
combined_fraud_id.columns=['npi']

In [279]:
fraud_id_unique=combined_fraud_id['npi'].unique()

In [280]:
len(fraud_id_unique)

1235

#### Total number of pysicians between 2012-2016

In [230]:
checklist2012=pd.DataFrame(checklist2012)
checklist2013=pd.DataFrame(checklist2013)
checklist2014=pd.DataFrame(checklist2014)
checklist2015=pd.DataFrame(checklist2015)
checklist2016=pd.DataFrame(checklist2016)

In [269]:
total_id=pd.concat([checklist2012, checklist2013], axis=0, ignore_index=True)
total_id=pd.concat([total_id,checklist2014], axis=0, ignore_index=True)
total_id=pd.concat([total_id, checklist2015], axis=0, ignore_index=True)
total_id=pd.concat([total_id, checklist2016], axis=0, ignore_index=True)

In [270]:
total_id.columns=['npi']

In [271]:
total_unique=total_id['npi'].unique()

In [272]:
len(total_unique)

1221295

In [359]:
#total_id.to_csv('total_id_npi.csv')

#### Crime Rate between 2012-2016

In [281]:
len(fraud_id_unique)/len(total_unique)

0.0010112216950040735

#### Number of fraud npi  continous between 2012-2016

In [338]:
fraud_id_yrs=combined_fraud_id.groupby(['npi']).size()

In [345]:
a=fraud_id_yrs.index

In [344]:
b=fraud_id_yrs.tolist()

In [350]:
fraud_id_yrs=pd.DataFrame({'npi':a,'num_yrs':b })

In [352]:
fraud_id_yrs.head(5)

Unnamed: 0,npi,num_yrs
0,1003042441,2
1,1003809195,2
2,1003811167,5
3,1003817743,5
4,1003822834,5


In [356]:
len(fraud_id_yrs[fraud_id_yrs.num_yrs==5])

247

In [357]:
fraud_id_yrs.to_csv('fraud_id_yrs.csv')

## PART II: MAP Fraud Data 

### Subset tables by year 

### 2012 subset

In [237]:
sub_2012=data2012[['National Provider Identifier','Provider Type', 'Gender','HCPCS Code','Number of Services', 'Number of Medicare Beneficiaries','Number of Medicare Beneficiary/Day Services','Average Submitted Charge Amount','Average Medicare Payment Amount']]

In [239]:
sub_2012.columns=['NPI','Provider_Type', 'Gender','HCPCS_c','Num_Services', 'Num_Beneficiaries','Num_Service_Daily','Ave_Charge','Ave_Payment']

In [240]:
print(sub_2012.head(10))

          NPI         Provider_Type Gender HCPCS_c  Num_Services  \
0  1740238153  Diagnostic Radiology      M   72100          34.0   
1  1871676908     Internal Medicine      M   93000          67.0   
2  1396700191             Pathology      F   88311          48.0   
3  1548218720   Physician Assistant      F   99213         113.0   
4  1356363477      Vascular Surgery      M   75625          42.0   
5  1053401950  Diagnostic Radiology      M   72291          26.0   
6  1962522201   Clinical Laboratory    NaN   80069          80.0   
7  1881665719  Diagnostic Radiology      M   74160          17.0   
8  1770781452       Family Practice      M   93880          82.0   
9  1447205752  Diagnostic Radiology      M   72170          34.0   

   Num_Beneficiaries  Num_Service_Daily  Ave_Charge  Ave_Payment  
0                 33                 33   22.928235     9.346765  
1                 64                 67   57.137910    13.365075  
2                 45                 45   75.00000

### 2013 subset

In [242]:
sub_2013=data2013[['NPI','PROVIDER_TYPE','NPPES_PROVIDER_GENDER','HCPCS_CODE', 'LINE_SRVC_CNT','BENE_UNIQUE_CNT', 'BENE_DAY_SRVC_CNT', 'AVERAGE_SUBMITTED_CHRG_AMT', 'AVERAGE_MEDICARE_PAYMENT_AMT']]

In [243]:
sub_2013.columns=['NPI','Provider_Type', 'Gender','HCPCS_c','Num_Services', 'Num_Beneficiaries','Num_Service_Daily','Ave_Charge','Ave_Payment']

In [244]:
print(sub_2013.head(10))

          NPI      Provider_Type Gender HCPCS_c  Num_Services  \
0           1                NaN    NaN     NaN           NaN   
1  1003000126  Internal Medicine      M   99222         142.0   
2  1003000126  Internal Medicine      M   99223          96.0   
3  1003000126  Internal Medicine      M   99231          61.0   
4  1003000126  Internal Medicine      M   99232         777.0   
5  1003000126  Internal Medicine      M   99233         170.0   
6  1003000126  Internal Medicine      M   99238         219.0   
7  1003000126  Internal Medicine      M   99239         142.0   
8  1003000134          Pathology      M   88304         209.0   
9  1003000134          Pathology      M   88305        5786.0   

   Num_Beneficiaries  Num_Service_Daily  Ave_Charge  Ave_Payment  
0                NaN                NaN         NaN          NaN  
1              138.0              142.0  368.626761   104.299718  
2               95.0               96.0  524.604167   155.901146  
3               

### 2014 subset

In [246]:
data2014.columns.values

array(['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name',
       'nppes_provider_mi', 'nppes_credentials', 'nppes_provider_gender',
       'nppes_entity_code', 'nppes_provider_street1',
       'nppes_provider_street2', 'nppes_provider_city',
       'nppes_provider_zip', 'nppes_provider_state',
       'nppes_provider_country', 'provider_type',
       'medicare_participation_indicator', 'number_of_hcpcs',
       'total_services', 'total_unique_benes', 'total_submitted_chrg_amt',
       'total_medicare_allowed_amt', 'total_medicare_payment_amt',
       'total_medicare_stnd_amt', 'drug_suppress_indicator',
       'number_of_drug_hcpcs', 'total_drug_services',
       'total_drug_unique_benes', 'total_drug_submitted_chrg_amt',
       'total_drug_medicare_allowed_amt',
       'total_drug_medicare_payment_amt', 'total_drug_medicare_stnd_amt',
       'med_suppress_indicator', 'number_of_med_hcpcs',
       'total_med_services', 'total_med_unique_benes',
       'total_med_submitt

In [250]:
sub_2014=data2014[['npi','provider_type','nppes_provider_gender','number_of_hcpcs','total_services','total_med_unique_benes', 'total_med_services', 'total_drug_submitted_chrg_amt', 'total_med_medicare_payment_amt']]

In [251]:
sub_2014.columns=['NPI','Provider_Type', 'Gender','HCPCS_c','Num_Services', 'Num_Beneficiaries','Num_Service_Daily','Ave_Charge','Ave_Payment']

In [252]:
print(sub_2014.head(10))

          NPI          Provider_Type Gender  HCPCS_c  Num_Services  \
0  1003000126      Internal Medicine      M       16        2749.0   
1  1003000134              Pathology      M       13        8643.0   
2  1003000142         Anesthesiology      M       40         654.0   
3  1003000407        Family Practice      M       39        1012.0   
4  1003000423  Obstetrics/Gynecology      F       29         293.0   
5  1003000480        General Surgery      M       62         238.0   
6  1003000522        Family Practice      M       34        1619.0   
7  1003000530      Internal Medicine      F       32         848.0   
8  1003000639        Cardiac Surgery      M       29          48.0   
9  1003000704         Anesthesiology      M       43         140.0   

   Num_Beneficiaries  Num_Service_Daily  Ave_Charge  Ave_Payment  
0              913.0             2749.0         0.0    194073.09  
1             4276.0             8643.0         0.0    245364.34  
2              165.0        

### 2015 subset

In [253]:
data2015.columns.values

array(['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name',
       'nppes_provider_mi', 'nppes_credentials', 'nppes_provider_gender',
       'nppes_entity_code', 'nppes_provider_street1',
       'nppes_provider_street2', 'nppes_provider_city',
       'nppes_provider_zip', 'nppes_provider_state',
       'nppes_provider_country', 'provider_type',
       'medicare_participation_indicator', 'place_of_service',
       'hcpcs_code', 'hcpcs_description', 'hcpcs_drug_indicator',
       'line_srvc_cnt', 'bene_unique_cnt', 'bene_day_srvc_cnt',
       'average_Medicare_allowed_amt', 'average_submitted_chrg_amt',
       'average_Medicare_payment_amt', 'average_Medicare_standard_amt'],
      dtype=object)

In [254]:
sub_2015=data2015[['npi', 'provider_type', 'nppes_provider_gender','hcpcs_code','line_srvc_cnt', 'bene_unique_cnt','bene_day_srvc_cnt', 'average_Medicare_allowed_amt', 'average_Medicare_payment_amt']]

In [255]:
sub_2015.columns=['NPI','Provider_Type', 'Gender','HCPCS_c','Num_Services', 'Num_Beneficiaries','Num_Service_Daily','Ave_Charge','Ave_Payment']

In [256]:
print(sub_2015.head(10))

          NPI      Provider_Type Gender HCPCS_c  Num_Services  \
0           1                NaN    NaN     NaN           NaN   
1  1003000126  Internal Medicine      M   99217          23.0   
2  1003000126  Internal Medicine      M   99219          18.0   
3  1003000126  Internal Medicine      M   99221          59.0   
4  1003000126  Internal Medicine      M   99222         132.0   
5  1003000126  Internal Medicine      M   99223         220.0   
6  1003000126  Internal Medicine      M   99231          38.0   
7  1003000126  Internal Medicine      M   99232        1117.0   
8  1003000126  Internal Medicine      M   99233         580.0   
9  1003000126  Internal Medicine      M   99238         175.0   

   Num_Beneficiaries  Num_Service_Daily  Ave_Charge  Ave_Payment  
0                NaN                NaN         NaN          NaN  
1               23.0               23.0   72.680000    54.502609  
2               18.0               18.0  135.850000   100.958889  
3               

### 2016 subset

In [257]:
data2016.columns.values

array(['NPI', 'NPPES_PROVIDER_LAST_ORG_NAME', 'NPPES_PROVIDER_FIRST_NAME',
       'NPPES_PROVIDER_MI', 'NPPES_CREDENTIALS', 'NPPES_PROVIDER_GENDER',
       'NPPES_ENTITY_CODE', 'NPPES_PROVIDER_STREET1',
       'NPPES_PROVIDER_STREET2', 'NPPES_PROVIDER_CITY',
       'NPPES_PROVIDER_ZIP', 'NPPES_PROVIDER_STATE',
       'NPPES_PROVIDER_COUNTRY', 'PROVIDER_TYPE',
       'MEDICARE_PARTICIPATION_INDICATOR', 'PLACE_OF_SERVICE',
       'HCPCS_CODE', 'HCPCS_DESCRIPTION', 'HCPCS_DRUG_INDICATOR',
       'LINE_SRVC_CNT', 'BENE_UNIQUE_CNT', 'BENE_DAY_SRVC_CNT',
       'AVERAGE_MEDICARE_ALLOWED_AMT', 'AVERAGE_SUBMITTED_CHRG_AMT',
       'AVERAGE_MEDICARE_PAYMENT_AMT', 'AVERAGE_MEDICARE_STANDARD_AMT'],
      dtype=object)

In [258]:
sub_2016=data2016[['NPI', 'PROVIDER_TYPE','NPPES_PROVIDER_GENDER','HCPCS_CODE', 'LINE_SRVC_CNT', 'BENE_UNIQUE_CNT', 'BENE_DAY_SRVC_CNT','AVERAGE_MEDICARE_ALLOWED_AMT','AVERAGE_MEDICARE_PAYMENT_AMT']]

In [259]:
sub_2016.columns=['NPI','Provider_Type', 'Gender','HCPCS_c','Num_Services', 'Num_Beneficiaries','Num_Service_Daily','Ave_Charge','Ave_Payment']

In [260]:
print(sub_2016.head(10))

          NPI      Provider_Type Gender HCPCS_c  Num_Services  \
0           1                NaN    NaN     NaN           NaN   
1  1003000126  Internal Medicine      M   99217          57.0   
2  1003000126  Internal Medicine      M   99219          38.0   
3  1003000126  Internal Medicine      M   99220          23.0   
4  1003000126  Internal Medicine      M   99221          20.0   
5  1003000126  Internal Medicine      M   99222          96.0   
6  1003000126  Internal Medicine      M   99223         148.0   
7  1003000126  Internal Medicine      M   99225          11.0   
8  1003000126  Internal Medicine      M   99232         596.0   
9  1003000126  Internal Medicine      M   99233         117.0   

   Num_Beneficiaries  Num_Service_Daily  Ave_Charge  Ave_Payment  
0                NaN                NaN         NaN          NaN  
1               55.0               57.0   72.743158    54.474386  
2               38.0               38.0  135.010000   105.850000  
3               

### Reverse Method - To find new added Fraud NPI among 2012-2016

### 2015-2016

In [366]:
fraud_npi_1516=[i for i, j in zip(fraud_2015,fraud_2016 ) if i != j]
len(fraud_npi_1516)

281

In [368]:
#check fraud_npi_1516 list 
fraud_npi_1516

[1003042441,
 1003811167,
 1003817743,
 1003822834,
 1003859885,
 1003885344,
 1003892746,
 1003904830,
 1003972670,
 1013009729,
 1013087741,
 1013095975,
 1013097708,
 1013999002,
 1023004587,
 1023006129,
 1023079274,
 1023119898,
 1023166410,
 1033126651,
 1033145487,
 1043215650,
 1043219405,
 1043238421,
 1043257744,
 1053360966,
 1053372201,
 1053450346,
 1053486704,
 1063436517,
 1063452167,
 1063477503,
 1063486090,
 1063501823,
 1063526580,
 1063537215,
 1063575561,
 1063611499,
 1063696508,
 1063851285,
 1073511192,
 1073524823,
 1083614770,
 1083654826,
 1083667562,
 1083681423,
 1093726556,
 1093755134,
 1093814717,
 1093851164,
 1104809706,
 1104898634,
 1104913789,
 1104924349,
 1114106689,
 1114922606,
 1114951407,
 1124000427,
 1124016480,
 1124057245,
 1124088380,
 1124089016,
 1124192042,
 1124296298,
 1124356043,
 1124423983,
 1134213796,
 1134228794,
 1134280779,
 1134285059,
 1134286958,
 1144214578,
 1144228834,
 1144503418,
 1144511700,
 1154337129,
 1154349330,

In [369]:
#if NIP is not in fraud_2016 list, it means NPI provider has been detected by OIG in calendar year 2015
if '1003042441' not in fraud_2016:
     print("not extist")

not extist


### 2014-2015

In [372]:
temp=[i for i, j in zip(fraud_2014,fraud_2015 ) if i != j]
fraud_npi_1415=[i for i, j in zip(temp,fraud_2016 ) if i != j]
len(fraud_npi_1415)

275

### 2013-2014

In [373]:
temp=[i for i, j in zip(fraud_2013,fraud_2014) if i != j]
temp=[i for i, j in zip(temp,fraud_2015 ) if i != j]
fraud_npi_1314=[i for i, j in zip(temp,fraud_2016 ) if i != j]
len(fraud_npi_1314)

281

### 2012-2013

In [374]:
temp=[i for i, j in zip(fraud_2012,fraud_2013) if i != j]
temp=[i for i, j in zip(temp,fraud_2014) if i != j]
temp=[i for i, j in zip(temp,fraud_2015 ) if i != j]i
fraud_npi_1213=[i for i, j in zip(temp,fraud_2016 ) if  != j]
len(fraud_npi_1213)

281

## References:

[Medicare Fee-For-Service Provider Utilization & Payment Data Physician and Other Supplier Public Use File: A Methodological Overview] last updated: May 3, 2018, The Centers of Medicare and Medicare Services, Office of Enterprise Data and Analytics https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier.html

[Medicare Provider Utilization and Payment Data Physician and Other Supplier PUF: Frequently Asked Questions] last updated: May 4, 2018, The Centers of Medicare and Medicare Services, Office of Enterprise Data and Analytics https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Downloads/Physician_FAQ.pdf

[List of Excluded Individuals/Entities (LEIE)] 02-2019 Updated LEIE Database, https://oig.hhs.gov/exclusions/exclusions_list.asp; 

[The Detection of Medicare Fraud Using Machine Learning Methods with Excluded Provider Labels] by Richard A. Bauder, Taghi M. Khoshgoftaar, College of Engineering & Computer Science, Florida Atlantic University (2018)