In [1]:
import numpy as np
import pandas as pd

In [2]:
#Reading in the dataset that is already filtered to exclusions in 2011 and after and to fraud exclusion types only
#NOTE(review): ZIP is inferred as int64 by this read, and ZIPs print without a leading zero (e.g. 7666);
#confirm whether the source file keeps leading zeros and, if so, consider reading ZIP as text
leie = pd.read_csv('LEIE_filtered.csv')

In [3]:
#Peek at the first five rows to verify the file loaded as expected
leie.head(n=5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
0,,,,"#1 MARKETING SERVICE, INC",SOBER HOME,0,,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,2020,1128a1
1,,,,184TH STREET PHARMACY CORP,PHARMACY,1922348218,,69 E 184TH ST,BRONX,NY,10468,2018,1128a1
2,,,,2001 BATH AVENUE PHARMACY INC,PHARMACY,0,,2001 BATH AVENUE,BROOKLYN,NY,11214,2011,1128a1
3,,,,"716 TRANSPORTATION, INC",TRANSPORTATION CO,0,,540 VINE LANE,BUFFALO,NY,14228,2019,1128a1
4,,,,A & H VITAMINS SUPPLY CORP,PHARMACY,0,,592 PALISADE AVENUE,TEANECK,NJ,7666,2018,1128a1


In [4]:
#Checking the number of rows and columns that were read in
leie.shape

(28290, 13)

In [5]:
#Checking the data type pandas inferred for each column
print(leie.dtypes)

LASTNAME      object
FIRSTNAME     object
MIDNAME       object
BUSNAME       object
SPECIALTY     object
NPI            int64
DOB          float64
ADDRESS       object
CITY          object
STATE         object
ZIP            int64
EXCLYear       int64
excl_type     object
dtype: object


In [6]:
#Frequency of each NPI value; an NPI of 0 means there is no NPI available for that row
leie.NPI.value_counts()

0             23567
1801839139        3
1952477622        2
1174694863        2
1033186697        2
1447202205        2
1124052352        2
1053443002        2
1821097130        2
1427158377        2
1306963863        2
1487629796        2
1891743654        2
1578647038        2
1013059740        2
1982644415        2
1154351500        2
1811282098        2
1447390539        2
1730107558        2
1154396315        2
1518039783        2
1215968847        2
1033284930        2
1154379840        2
1356423784        2
1437276128        2
1801080247        2
1578785531        2
1790881605        2
              ...  
1649241431        1
1417379173        1
1417942376        1
1801919865        1
1225129349        1
1376640193        1
1942656392        1
1588692362        1
1639258445        1
1467493791        1
1528087970        1
1285791145        1
1114320207        1
1841268026        1
1942476080        1
1124246830        1
1437121837        1
1225004328        1
1588788517        1


To connect to our Provider Data, the easiest key to join on is the NPI; however, a large share of rows have no NPI. Therefore, we will make two separate connections: one using the NPI for the rows that have one, and one without the NPI for the rows that do not. I will create two separate tables to support these two connections.

In [7]:
#Boolean mask for the rows without an NPI (stored as 0), followed by the matching subset
no_NPI = leie['NPI'].eq(0)
leie_no_NPI = leie.loc[no_NPI]

In [8]:
#The subset should contain a single NPI value: 0
leie_no_NPI.NPI.value_counts()

0    23567
Name: NPI, dtype: int64

In [9]:
#Checking the shape of the no-NPI subset
leie_no_NPI.shape

(23567, 13)

In [10]:
#Saving filtered dataframe as new csv
#DataFrame.to_csv returns None when it is given a file path, so its result is deliberately not bound
#to a variable (doing so would only store None under that name)
leie_no_NPI.to_csv('LEIE_NoNPI.csv', index = False)

In [11]:
#Reloading the filtered csv. The file name matches the name used when the file was saved
#('LEIE_NoNPI.csv') exactly, including letter case, so the read also works on case-sensitive filesystems.
#dtype=object reads every column as text so the name and DOB fields can be concatenated into an identifier
leie_NPI_none = pd.read_csv('LEIE_NoNPI.csv', dtype=object)

In [12]:
#Need to create a unique identifier using Name/Busname variables and DOB; however, if one piece is NaN
#we don't want the whole identifier to become NaN, so missing values are replaced with empty strings first.
#The filled columns are assigned back to the frame rather than calling fillna(inplace=True) on a
#column selection: the chained in-place form acts on a temporary object under pandas Copy-on-Write
#and would leave the frame unchanged.
key_cols = ['FIRSTNAME', 'LASTNAME', 'BUSNAME', 'DOB']
leie_NPI_none[key_cols] = leie_NPI_none[key_cols].fillna('')

In [13]:
#Count the remaining missing values per column; the columns filled with empty strings should show 0
leie_NPI_none.isna().sum()

LASTNAME      0
FIRSTNAME     0
MIDNAME       0
BUSNAME       0
SPECIALTY    79
NPI           0
DOB           0
ADDRESS       0
CITY          0
STATE         0
ZIP           0
EXCLYear      0
excl_type     0
dtype: int64

In [14]:
#Building the identifier in steps (first name, last name, business name, DOB).
#Step 1: first name immediately followed by last name, with no separator
leie_NPI_none['Full_Name'] = leie_NPI_none['FIRSTNAME'].str.cat(others=leie_NPI_none['LASTNAME'], sep='')

In [15]:
#Step 2: append the business name to the combined person name
leie_NPI_none['Name_OR_Business'] = leie_NPI_none['Full_Name'].str.cat(others=leie_NPI_none['BUSNAME'], sep='')

In [16]:
#Step 3: append the DOB text to complete the identifier
leie_NPI_none['NameBus_DOB'] = leie_NPI_none['Name_OR_Business'].str.cat(others=leie_NPI_none['DOB'], sep='')

In [17]:
#Inspect the first ten rows to confirm the new columns were built as intended
leie_NPI_none.head(n=10)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type,Full_Name,Name_OR_Business,NameBus_DOB
0,,,,"#1 MARKETING SERVICE, INC",SOBER HOME,0,,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,2020,1128a1,,"#1 MARKETING SERVICE, INC","#1 MARKETING SERVICE, INC"
1,,,,2001 BATH AVENUE PHARMACY INC,PHARMACY,0,,2001 BATH AVENUE,BROOKLYN,NY,11214,2011,1128a1,,2001 BATH AVENUE PHARMACY INC,2001 BATH AVENUE PHARMACY INC
2,,,,"716 TRANSPORTATION, INC",TRANSPORTATION CO,0,,540 VINE LANE,BUFFALO,NY,14228,2019,1128a1,,"716 TRANSPORTATION, INC","716 TRANSPORTATION, INC"
3,,,,A & H VITAMINS SUPPLY CORP,PHARMACY,0,,592 PALISADE AVENUE,TEANECK,NJ,7666,2018,1128a1,,A & H VITAMINS SUPPLY CORP,A & H VITAMINS SUPPLY CORP
4,,,,"A SERVICE CAB CO, INC",TRANSPORTATION CO,0,,3704 AIRLINE DRIVE,METAIRIE,LA,70001,2017,1128a1,,"A SERVICE CAB CO, INC","A SERVICE CAB CO, INC"
5,,,,"AAA PLUS HOME HEALTH CARE, LLC",HOME HEALTH AGENCY,0,,6160 104TH CIRCLE NORTH,MINNEAPOLIS,MN,55443,2017,1128a1,,"AAA PLUS HOME HEALTH CARE, LLC","AAA PLUS HOME HEALTH CARE, LLC"
6,,,,"ABRANT CARE PHARMACY CORP, INC",PHARMACY,0,,"777 NE 79TH ST, STE 100",MIAMI,FL,33138,2011,1128b4,,"ABRANT CARE PHARMACY CORP, INC","ABRANT CARE PHARMACY CORP, INC"
7,,,,ACCESSIBILITY DESIGNS & MANAGE,ALLIED HEALTH RELATE,0,,6800 PITTSFORD PALMYRA RD,FAIRPORT,NY,14450,2011,1128a1,,ACCESSIBILITY DESIGNS & MANAGE,ACCESSIBILITY DESIGNS & MANAGE
8,,,,"ACE CLINIQUE OF MEDICINE, LLC",LAB - CLINICAL,0,,"P O BOX 14500, #17745-032",LEXINGTON,KY,40512,2018,1128a1,,"ACE CLINIQUE OF MEDICINE, LLC","ACE CLINIQUE OF MEDICINE, LLC"
9,,,,ADAMS SQUARE PHARMACY,PHARMACY,0,,1122A EAST CHEVY CHASE DR,GLENDALE,CA,91205,2015,1128b4,,ADAMS SQUARE PHARMACY,ADAMS SQUARE PHARMACY


In [18]:
#Identifiers that occur more than once are possible duplicates
leie_NPI_none.NameBus_DOB.value_counts()

JERMAINEDOLEMAN19770621.0        3
CHARLESHARRIS19680731.0          3
BRIDGETTEMITCHELL19801229.0      2
ANTHONYSMITH19721010.0           2
DEBRAVELASQUEZ19711128.0         2
LORINDAHERNANDEZ19691214.0       2
CELESTINEOKWILAGWE19680428.0     2
KENNETHDUISTERMARS19940410.0     2
ERINTIPTON19770712.0             2
CHARLVETTEMILLICAN19670223.0     2
ISAACCASTRO19741129.0            2
CHRISTINADENNISON19780620.0      2
MICHAELKLINGENSMITH19740302.0    2
LINDABATRIN19590115.0            2
IBISALVAREZ19600312.0            2
DANAMANGELS19771025.0            2
RENEEFRAIN19821222.0             2
ARMANDOOCEGUERA19600816.0        2
VICTORIAALTMAN19910527.0         2
RAMONDE LA GARZA19601224.0       2
MEOSCHAHINES19900923.0           2
ORLANDOTORRES19670625.0          2
MARTINTASIS19670105.0            2
OLENAKULAKOVA19781015.0          2
AMBERZWOLINSKI19870529.0         2
MANUELAALANA19690405.0           2
BARBARACURRIN19621222.0          2
RAYANAMOHAMED19801225.0          2
EMMAKING19481120.0  

In [19]:
#Keep only the rows whose identifier occurs more than once, so they can be explored and confirmed as duplicates
Name_dups = leie_NPI_none.groupby('NameBus_DOB').filter(lambda grp: grp.shape[0] > 1)

In [20]:
#Looking at the number of rows that share an identifier with at least one other row
Name_dups.shape

(348, 16)

In [21]:
#Display the first sixty suspected duplicates to judge by eye whether they really are duplicates
Name_dups.head(n=60)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type,Full_Name,Name_OR_Business,NameBus_DOB
103,,,,MEDLINK PROFESSIONAL MANAGEMEN,COMM MNTL HLTH CNTR,0,,1809 NE 2ND AVE,MIAMI,FL,33132,2012,1128a1,,MEDLINK PROFESSIONAL MANAGEMEN,MEDLINK PROFESSIONAL MANAGEMEN
104,,,,MEDLINK PROFESSIONAL MANAGEMEN,CLINIC,0,,1809 NE 2ND AVENUE,MIAMI,FL,33132,2016,1128b7,,MEDLINK PROFESSIONAL MANAGEMEN,MEDLINK PROFESSIONAL MANAGEMEN
237,ACEVEDO,ANGELICA,J,,NURSE/NURSES AIDE,0,19910801.0,3608 SW 31ST DRIVE,GAINESVILLE,FL,32608,2015,1128a2,ANGELICAACEVEDO,ANGELICAACEVEDO,ANGELICAACEVEDO19910801.0
238,ACEVEDO,ANGELICA,JUANITA,,NURSE/NURSES AIDE,0,19910801.0,"3608 SW 31ST DRIVE, APT 17A",GAINESVILLE,FL,32608,2016,1128a2,ANGELICAACEVEDO,ANGELICAACEVEDO,ANGELICAACEVEDO19910801.0
241,ACEVEDO,LISSETTE,,,DME - POWER VEHICLES,0,19690823.0,P O BOX 51343,TOA BAJA,PR,950,2014,1128a1,LISSETTEACEVEDO,LISSETTEACEVEDO,LISSETTEACEVEDO19690823.0
242,ACEVEDO,LISSETTE,,,DME - POWER VEHICLES,0,19690823.0,P O BOX 51343,TOA BAJA,PR,950,2014,1128a1,LISSETTEACEVEDO,LISSETTEACEVEDO,LISSETTEACEVEDO19690823.0
423,AHMED,AMIR,ABDI,,HOME HEALTH AGENCY,0,19641225.0,2619 SUMMIT HOLLOW DR,COLUMBUS,OH,43219,2013,1128a1,AMIRAHMED,AMIRAHMED,AMIRAHMED19641225.0
424,AHMED,AMIR,A,,HOME HEALTH AGENCY,0,19641225.0,P O BOX 10,LISBON,OH,44432,2017,1128a1,AMIRAHMED,AMIRAHMED,AMIRAHMED19641225.0
481,AL HADDAD,AHMAD,HAMIDI,,PHARMACIST,0,19621006.0,1866 N TAMIAMI TRAIL,NORTH FORT MYERS,FL,33903,2013,1128a4,AHMADAL HADDAD,AHMADAL HADDAD,AHMADAL HADDAD19621006.0
482,AL HADDAD,AHMAD,HAMIDI,,PARAMEDIC TECHNICIAN,0,19621006.0,"14458 REFLECTION LAKES DR, APT",FORT MYERS,FL,33907,2014,1128a4,AHMADAL HADDAD,AHMADAL HADDAD,AHMADAL HADDAD19621006.0


In [22]:
#The table above suggests these are genuine duplicates. Checking how many of them carry a DOB:
#it seems very unlikely that two different providers would share the same first name, last name
#and exact DOB, so a populated DOB gives greater confidence
Name_dups.DOB.value_counts()

19770621.0    3
19680731.0    3
19730810.0    2
19750924.0    2
19861130.0    2
19630802.0    2
19780308.0    2
19870614.0    2
19691214.0    2
19800912.0    2
19690913.0    2
19811201.0    2
19690823.0    2
19580505.0    2
19770303.0    2
19650508.0    2
19700310.0    2
19710630.0    2
19940525.0    2
19670105.0    2
19840103.0    2
19720113.0    2
19621224.0    2
19780815.0    2
19810103.0    2
19590115.0    2
19781015.0    2
19700317.0    2
19690528.0    2
19690811.0    2
             ..
19831214.0    2
19801225.0    2
19640121.0    2
19910527.0    2
19670502.0    2
19691026.0    2
19710121.0    2
19950629.0    2
19661020.0    2
19490620.0    2
19910801.0    2
19601224.0    2
19620222.0    2
19841105.0    2
19850316.0    2
19590316.0    2
19750612.0    2
19690608.0    2
19561123.0    2
19850715.0    2
19650206.0    2
19750217.0    2
19821222.0    2
19741004.0    2
19641225.0    2
19840209.0    2
19851213.0    2
19710505.0    2
19720410.0    2
              2
Name: DOB, Length: 173, 

They all do appear to be duplicates, so we will drop the duplicates, keeping only the row with the most recent exclusion.

In [23]:
#Need to convert exclusion year to a numeric in order to sort
#(the frame was read with dtype=object, so EXCLYear is text at this point)
leie_NPI_none['EXCLYear'] = pd.to_numeric(leie_NPI_none['EXCLYear'])

In [24]:
#Preview of the rows ordered by exclusion year, newest first.
#The sorted frame is only displayed here; it is not assigned, so leie_NPI_none keeps its existing row order
leie_NPI_none.sort_values('EXCLYear', ascending = False)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type,Full_Name,Name_OR_Business,NameBus_DOB
0,,,,"#1 MARKETING SERVICE, INC",SOBER HOME,0,,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,2020,1128a1,,"#1 MARKETING SERVICE, INC","#1 MARKETING SERVICE, INC"
18257,ROGERS,GWENDOLYN,FAYE,,NURSE/NURSES AIDE,0,19750609.0,542 WEST LAYTON ST,OLATHE,KS,66061,2020,1128b4,GWENDOLYNROGERS,GWENDOLYNROGERS,GWENDOLYNROGERS19750609.0
21944,VINES,SHELIA,,,PERSONAL CARE PROVID,0,19640225.0,2549 BARNESLEY PLACE,WINDSOR MILL,MD,21244,2020,1128a1,SHELIAVINES,SHELIAVINES,SHELIAVINES19640225.0
11275,KAWANGA,PATRICK,,,NURSE/NURSES AIDE,0,19810626.0,1364 E 171 ST,CLEVELAND,OH,44110,2020,1128a1,PATRICKKAWANGA,PATRICKKAWANGA,PATRICKKAWANGA19810626.0
21938,VINCELETTE,MICHAEL,ROBERT,,NURSE/NURSES AIDE,0,19600127.0,9729 STATE HIGHWAY 37,OGDENSBURG,NY,13669,2020,1128a2,MICHAELVINCELETTE,MICHAELVINCELETTE,MICHAELVINCELETTE19600127.0
18270,ROGERS,TRUNNOD,DEMARCO,,NO KNOWN AFFILIATION,0,19760917.0,3516 TOULON DRIVE,BATON ROUGE,LA,70816,2020,1128a1,TRUNNODROGERS,TRUNNODROGERS,TRUNNODROGERS19760917.0
11394,KENNY,TASHIMA,,,MARKETING FIRM,0,19840413.0,"4130 SHADE TREE LOUP, APT 68",ORLANDO,FL,32810,2020,1128a3,TASHIMAKENNY,TASHIMAKENNY,TASHIMAKENNY19840413.0
7430,GANOTE,STEVE,,,NO KNOWN AFFILIATION,0,19731124.0,"P O BOX 33, #15427-028",TERRE HAUTE,IN,47808,2020,1128a1,STEVEGANOTE,STEVEGANOTE,STEVEGANOTE19731124.0
3299,CAID,DONALD,DAVID,,NURSE/NURSES AIDE,0,19660630.0,24306 RUTHERFORD RD,RAMONA,CA,92065,2020,1128b4,DONALDCAID,DONALDCAID,DONALDCAID19660630.0
11421,KESMAN,MALISSA,N,,HEALTH CARE AIDE,0,19841119.0,197 MOUND STREET,NEWARK,OH,43055,2020,1128a1,MALISSAKESMAN,MALISSAKESMAN,MALISSAKESMAN19841119.0


In [25]:
#Remove the duplicate rows, keeping the most recent exclusion for each identifier.
#sort_values returns a new frame instead of sorting in place, so the sort must be part of this chain
#for keep='first' to mean "most recent"; without it the first row in file order would be kept.
#mergesort is stable, so rows tied on EXCLYear keep their original relative order.
#sort_index then restores the original row order of the surviving rows.
leie_noNPIs_dedupped = (
    leie_NPI_none
    .sort_values(by = 'EXCLYear', ascending = False, kind = 'mergesort')
    .drop_duplicates(subset = 'NameBus_DOB', keep = 'first')
    .sort_index()
)

In [26]:
#Confirm that duplicates appear to be dropped: there were 348 rows in the duplicate df (some identifiers
#appearing three times) out of the original 23,567, so 175 rows should have been removed, leaving 23,392
leie_noNPIs_dedupped.shape

(23392, 16)

In [27]:
#One more visual confirmation that duplicates appear to be removed: every identifier should now occur once
leie_noNPIs_dedupped.NameBus_DOB.value_counts()

KEITHFOREMAN19550920.0               1
CHERIESEMANUS-BOYKINS19660503.0      1
ALICIAROMO19760122.0                 1
JESSICARHODES19830424.0              1
TUANTRAN19600527.0                   1
REBECCAIMBUSH19690705.0              1
TINANEAL19630921.0                   1
SEASONHUGHES19831226.0               1
WILLIAMHARRIS19420930.0              1
ANTONIADELK19830622.0                1
LEONARDOVALLEJO19670118.0            1
ALINAGARCIA19690817.0                1
KYRIESMITH19820926.0                 1
CONRADCASTRO19690801.0               1
CINDYNOBLITT19570807.0               1
JANICEBURCH19690826.0                1
SHAHZADNAWAZ19710411.0               1
JENNIFERMITCHELL19900409.0           1
RICKIWRIGHT19860130.0                1
CHARLESBAYNARD19910326.0             1
WILLIAMBRASHEAR19641231.0            1
NICKHER19710615.0                    1
ANGELAAPPLETON19761212.0             1
YVONNEALLEN19710612.0                1
JACKIEELVINE19740119.0               1
JOHNFESLER19710817.0     

In [28]:
#DOB is not in our provider claims data, so DOB and the DOB-based identifier are not needed for the connection
leie_noNPIs_clean = leie_noNPIs_dedupped.drop(['DOB', 'NameBus_DOB'], axis = 1)

In [29]:
#Check the first five rows before saving this as a clean version
leie_noNPIs_clean.head(n=5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type,Full_Name,Name_OR_Business
0,,,,"#1 MARKETING SERVICE, INC",SOBER HOME,0,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,2020,1128a1,,"#1 MARKETING SERVICE, INC"
1,,,,2001 BATH AVENUE PHARMACY INC,PHARMACY,0,2001 BATH AVENUE,BROOKLYN,NY,11214,2011,1128a1,,2001 BATH AVENUE PHARMACY INC
2,,,,"716 TRANSPORTATION, INC",TRANSPORTATION CO,0,540 VINE LANE,BUFFALO,NY,14228,2019,1128a1,,"716 TRANSPORTATION, INC"
3,,,,A & H VITAMINS SUPPLY CORP,PHARMACY,0,592 PALISADE AVENUE,TEANECK,NJ,7666,2018,1128a1,,A & H VITAMINS SUPPLY CORP
4,,,,"A SERVICE CAB CO, INC",TRANSPORTATION CO,0,3704 AIRLINE DRIVE,METAIRIE,LA,70001,2017,1128a1,,"A SERVICE CAB CO, INC"


In [29]:
#Remove the helper columns that were created only to combine the names and DOB
leie_noNPIs_clean2 = leie_noNPIs_clean.drop(['Full_Name', 'Name_OR_Business'], axis = 'columns')

In [30]:
#Final look at the first five rows before saving as a new csv
leie_noNPIs_clean2.head(n=5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
0,,,,"#1 MARKETING SERVICE, INC",SOBER HOME,0,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,2020,1128a1
1,,,,2001 BATH AVENUE PHARMACY INC,PHARMACY,0,2001 BATH AVENUE,BROOKLYN,NY,11214,2011,1128a1
2,,,,"716 TRANSPORTATION, INC",TRANSPORTATION CO,0,540 VINE LANE,BUFFALO,NY,14228,2019,1128a1
3,,,,A & H VITAMINS SUPPLY CORP,PHARMACY,0,592 PALISADE AVENUE,TEANECK,NJ,7666,2018,1128a1
4,,,,"A SERVICE CAB CO, INC",TRANSPORTATION CO,0,3704 AIRLINE DRIVE,METAIRIE,LA,70001,2017,1128a1


In [31]:
#Save the cleaned no-NPI table as a new csv (index = False keeps the row index out of the file)
leie_noNPIs_clean2.to_csv('LEIE_NoNPI_Clean.csv', index = False)

In [32]:
#Boolean mask for the rows that do have an NPI (anything other than 0), followed by the matching subset
NPI = leie['NPI'].ne(0)
leie_NPI = leie.loc[NPI]

In [33]:
#The value 0 should no longer appear among the NPIs
leie_NPI.NPI.value_counts()

1801839139    3
1518039783    2
1659399129    2
1942288006    2
1033284930    2
1215053665    2
1548787724    2
1578785531    2
1982644415    2
1437161304    2
1477594935    2
1124341060    2
1912011594    2
1609815307    2
1457534984    2
1801080247    2
1891868501    2
1891743654    2
1356339055    2
1396736260    2
1902906092    2
1972592947    2
1356430557    2
1053557314    2
1346481728    2
1922048156    2
1154579191    2
1619269214    2
1114973955    2
1215968847    2
             ..
1346324316    1
1417942376    1
1417956715    1
1255439725    1
1548388820    1
1407972680    1
1407923568    1
1699741041    1
1841371515    1
1538100599    1
1548531064    1
1801919865    1
1235133794    1
1265597793    1
1699746933    1
1114102373    1
1275549016    1
1649241431    1
1326155094    1
1811250517    1
1144493396    1
1609033042    1
1407835473    1
1710065958    1
1184623951    1
1548246291    1
1083612493    1
1417964875    1
1417102666    1
1093779456    1
Name: NPI, Length: 4634,

In [34]:
#Checking the shape of the subset with NPIs
leie_NPI.shape

(4723, 13)

In [35]:
#Check the data is ready to save as a new CSV first; we may need to come back to this earlier
#version and look at the duplicate values again
leie_NPI.head(n=5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
1,,,,184TH STREET PHARMACY CORP,PHARMACY,1922348218,,69 E 184TH ST,BRONX,NY,10468,2018,1128a1
5,,,,"A & Y MEDICAL SUPPLY, INC",DME - GENERAL,1942476080,,"6310 108TH STREET, APT 6J",FOREST HILLS,NY,11375,2017,1128b8
6,,,,"A CARING ALTERNATIVE, INC",HOME HEALTH AGENCY,1275600959,,"1229 HURON RD E, FLR 6TH",CLEVELAND,OH,44115,2013,1128a1
7,,,,"A FAIR DEAL PHARMACY, INC",PHARMACY,1891731758,,"C/O P O BOX 329014, #69709-05",BROOKLYN,NY,11232,2017,1128b8
11,,,,"ACACIA MENTAL HEALTH CLINIC, L",MENTAL/BEHAVIORAL HE,1851631543,,5228 W FOND DU LAC AVENUE,MILWAUKEE,WI,53216,2019,1128b7


In [36]:
#Saving the NPI subset, duplicates included, so this version can be revisited later
leie_NPI.to_csv('LEIE_NPI_with_Duplicates.csv', index=False)

In [37]:
#Reading back in the data with only NPIs; this replaces leie_NPI with a frame loaded from the saved file
leie_NPI = pd.read_csv('LEIE_NPI_with_Duplicates.csv')

In [38]:
#Peek at the first five rows to verify the reload worked
leie_NPI.head(n=5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
0,,,,184TH STREET PHARMACY CORP,PHARMACY,1922348218,,69 E 184TH ST,BRONX,NY,10468,2018,1128a1
1,,,,"A & Y MEDICAL SUPPLY, INC",DME - GENERAL,1942476080,,"6310 108TH STREET, APT 6J",FOREST HILLS,NY,11375,2017,1128b8
2,,,,"A CARING ALTERNATIVE, INC",HOME HEALTH AGENCY,1275600959,,"1229 HURON RD E, FLR 6TH",CLEVELAND,OH,44115,2013,1128a1
3,,,,"A FAIR DEAL PHARMACY, INC",PHARMACY,1891731758,,"C/O P O BOX 329014, #69709-05",BROOKLYN,NY,11232,2017,1128b8
4,,,,"ACACIA MENTAL HEALTH CLINIC, L",MENTAL/BEHAVIORAL HE,1851631543,,5228 W FOND DU LAC AVENUE,MILWAUKEE,WI,53216,2019,1128b7


In [39]:
#Looking at the number of rows and columns after the reload
leie_NPI.shape

(4723, 13)

In [40]:
#Frequency of each NPI after the reload, to confirm all NPIs are still there
leie_NPI.NPI.value_counts()

1801839139    3
1518039783    2
1659399129    2
1942288006    2
1033284930    2
1215053665    2
1548787724    2
1578785531    2
1982644415    2
1437161304    2
1477594935    2
1124341060    2
1912011594    2
1609815307    2
1457534984    2
1801080247    2
1891868501    2
1891743654    2
1356339055    2
1396736260    2
1902906092    2
1972592947    2
1356430557    2
1053557314    2
1346481728    2
1922048156    2
1154579191    2
1619269214    2
1114973955    2
1215968847    2
             ..
1346324316    1
1417942376    1
1417956715    1
1255439725    1
1548388820    1
1407972680    1
1407923568    1
1699741041    1
1841371515    1
1538100599    1
1548531064    1
1801919865    1
1235133794    1
1265597793    1
1699746933    1
1114102373    1
1275549016    1
1649241431    1
1326155094    1
1811250517    1
1144493396    1
1609033042    1
1407835473    1
1710065958    1
1184623951    1
1548246291    1
1083612493    1
1417964875    1
1417102666    1
1093779456    1
Name: NPI, Length: 4634,

In [41]:
#Keep only the rows whose NPI occurs more than once, to confirm they are duplicates
dups = leie_NPI.groupby('NPI').filter(lambda grp: grp.shape[0] > 1)

In [42]:
#Number of rows that share an NPI with at least one other row
dups.shape

(177, 13)

In [43]:
#Display the first sixty rows with a repeated NPI for a visual check
dups.head(n=60)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
21,,,,AMERICAN THERAPEUTIC CORP,CLINIC,1215968847,,1801 NE 2ND AVENUE,MIAMI,FL,33132,2016,1128b7
22,,,,AMERICAN THERAPUTIC CORPORATIO,COMM MNTL HLTH CNTR,1215968847,,1801 NE 2ND AVE,MIAMI,FL,33132,2012,1128a1
96,,,,GAINESVILLE PAIN MANAGEMENT,CLINIC,1154579191,,"3 WASHINGTON AVE, STE C",GAINESVILLE,GA,30501,2015,1128b7
97,,,,GAINESVILLE PAIN MANAGEMENT,CLINIC,1154579191,,"3 WASHINGTON AVENUE, SUITE C",GAINESVILLE,GA,30501,2015,BRCH CI
240,,,,SUNCOAST BRACE AND LIMB INC,DME - PROSTHETICS,1013919307,,1878 59TH STREET WEST,BRADENTON,FL,34209,2017,1128a1
241,,,,"SUNCOAST BRACE AND LIMB, INC",DME - PROSTHETICS,1013919307,,"1878 59TH ST, W",BRADENTON,FL,34209,2015,1128a1
326,AGGARWAL,SHELINDER,,,GENERAL PRACTICE,1891743654,19680312.0,P O BOX 9000,FORREST CITY,AR,72336,2017,1128a1
327,AGGARWAL,SHELINDER,,,GENERAL PRACTICE,1891743654,19680312.0,"808B TURNER STREET, SW",HUNTSVILLE,AL,35801,2013,1128b4
436,ANYAJI,GEORGE,IFEANYICHUKWU,,GENERAL PRACTICE,1437161304,19540202.0,"601 E PALOMAR ST, STE C",CHULA VISTA,CA,91911,2016,1128a1
437,ANYAJI,GEORGE,,,GENERAL PRACTICE,1437161304,19540202.0,807 DOVER COURT,SAN DIEGO,CA,92109,2014,1128b14


In [44]:
#Preview of the rows ordered by exclusion year, newest first.
#The sorted frame is only displayed here; it is not assigned, so leie_NPI keeps its existing row order
leie_NPI.sort_values('EXCLYear', ascending = False)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
4242,TAITT,MICHAEL,R,,INTERNAL MEDICINE,1346389780,19600514.0,"242 E 87TH STREET, APT 3C",NEW YORK,NY,10128,2020,1128a1
3081,MORLAND,MARTINE,,,NURSE/NURSES AIDE,1407197528,19750228.0,440 JEFFERSON STREET,MASSAPEQUA,NY,11758,2020,1128a2
524,BAIRD,CYNTHIA,,,NURSE/NURSES AIDE,1376640193,19651222.0,885 N PEARSON CT,SPRINGFIELD,MO,65802,2020,1128b4
3381,PATEL,DEVENDRA,I.,,CARDIOLOGY,1871658310,19590211.0,P O BOX 5000,SHERIDAN,OR,97378,2020,1128a4
1721,GIBSON,MARGARET,A,,DME - GENERAL,1902299001,19610818.0,"120 ROLLINGWOOD DRIVE, APT 169",NEWPORT,NC,28570,2020,1128b7
4341,TODD,TEIA,M,,NURSE/NURSES AIDE,1265730543,19750826.0,1789 EAST 90TH STREET A,CLEVELAND,OH,44106,2020,1128a1
1946,HANS,ANIA,,,TECHNICIAN,1063997039,19770605.0,"401 N MIAMI AVENUE, #19670-104",MIAMI,FL,33128,2020,1128a1
1937,HANDA,CHRISTOPHER,J,,SUBSTANCE ABUSE REHA,1134464001,19700824.0,239 EAST WILLOCK ROAD,PITTSBURGH,PA,15227,2020,1128a1
4083,SMITH,CARLOS,ALLEN,,PSYCHOLOGY,1811270895,19790730.0,"P O BOX 5010, #56762-039",OAKDALE,LA,71463,2020,1128a1
4078,SMALL,GARY,DAVID,,PODIATRY,1326152810,19640213.0,1879 HIDDEN TRAIL LANE Q,WESTON,FL,33327,2020,1128a3


In [45]:
#Remove the duplicate rows, keeping the most recent exclusion for each NPI.
#sort_values returns a new frame instead of sorting in place, so the sort must be part of this chain
#for keep='first' to mean "most recent"; without it the first row in file order would be kept.
#mergesort is stable, so rows tied on EXCLYear keep their original relative order.
#sort_index then restores the original row order of the surviving rows.
leie_NPI_dedupped = (
    leie_NPI
    .sort_values(by = 'EXCLYear', ascending = False, kind = 'mergesort')
    .drop_duplicates(subset = 'NPI', keep = 'first')
    .sort_index()
)

In [46]:
#Every NPI should now occur exactly once if the duplicates were removed
leie_NPI_dedupped.NPI.value_counts()

1093851164    1
1700395746    1
1689772188    1
1316276868    1
1023042181    1
1235390212    1
1144337031    1
1801937545    1
1003903626    1
1780994699    1
1811909260    1
1386037901    1
1255438990    1
1831447184    1
1639338643    1
1578871448    1
1639299738    1
1427260033    1
1851482749    1
1730210428    1
1235182189    1
1851316830    1
1568592251    1
1841517737    1
1295788644    1
1013009000    1
1760553580    1
1184633455    1
1730278011    1
1083763312    1
             ..
1538100599    1
1548531064    1
1801919865    1
1750324604    1
1477539419    1
1841511805    1
1649511999    1
1184821290    1
1972577666    1
1437209988    1
1225129349    1
1548388820    1
1255439725    1
1407835473    1
1114102373    1
1609033042    1
1144493396    1
1811250517    1
1326155094    1
1649241431    1
1275549016    1
1699746933    1
1417956715    1
1265597793    1
1235133794    1
1417379173    1
1003999145    1
1346324316    1
1417942376    1
1093779456    1
Name: NPI, Length: 4634,

In [47]:
#If the duplicates were dropped correctly, the row count should equal the 4634 distinct NPIs counted above
leie_NPI_dedupped.shape

(4634, 13)

In [48]:
#Same as for the CSV without NPIs: DOB is not needed for future purposes, so it is removed
leie_NPI_dedupped1 = leie_NPI_dedupped.drop(['DOB'], axis = 1)

In [49]:
#Peek at the first five rows to check the remaining columns are still included
leie_NPI_dedupped1.head(n=5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,SPECIALTY,NPI,ADDRESS,CITY,STATE,ZIP,EXCLYear,excl_type
0,,,,184TH STREET PHARMACY CORP,PHARMACY,1922348218,69 E 184TH ST,BRONX,NY,10468,2018,1128a1
1,,,,"A & Y MEDICAL SUPPLY, INC",DME - GENERAL,1942476080,"6310 108TH STREET, APT 6J",FOREST HILLS,NY,11375,2017,1128b8
2,,,,"A CARING ALTERNATIVE, INC",HOME HEALTH AGENCY,1275600959,"1229 HURON RD E, FLR 6TH",CLEVELAND,OH,44115,2013,1128a1
3,,,,"A FAIR DEAL PHARMACY, INC",PHARMACY,1891731758,"C/O P O BOX 329014, #69709-05",BROOKLYN,NY,11232,2017,1128b8
4,,,,"ACACIA MENTAL HEALTH CLINIC, L",MENTAL/BEHAVIORAL HE,1851631543,5228 W FOND DU LAC AVENUE,MILWAUKEE,WI,53216,2019,1128b7


In [50]:
#Saving the de-duplicated NPI table as a clean CSV (index = False keeps the row index out of the file)
leie_NPI_dedupped1.to_csv('LEIE_NPI_Clean.csv', index = False)