# NPI Registry EDA
### MSDS 498 Capstone, Winter 2023, Team 53
### Author: Sarah Rodenbeck

### Import Necessary Libraries

In [363]:
from dask import dataframe as dd
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
pd.set_option('display.max_columns', None)

### Only need a subset of columns

In [364]:
cols = open('columns.txt').read().split(',')
int_cols = ['NPI','Entity Type Code','Provider Business Practice Location Address Postal Code','Provider Business Practice Location Address Telephone Number','Provider Business Practice Location Address Fax Number']
str_cols = list(set(cols)-set(int_cols))

### Read in Data with Dask
Note: File too large to effectively use pandas

In [485]:
res = {}
for i in range(len(str_cols)):
    res[str_cols[i]] = object
res['Provider Business Practice Location Address Postal Code'] = object
res['Provider Business Practice Location Address Telephone Number']= object

df = dd.read_csv("npidata_pfile_20050523-20230108.csv",usecols=cols,dtype=res)

df[str_cols] = df[str_cols].astype(str)
df[str_cols] = df[str_cols].replace('nan', '')
df[str_cols] = df[str_cols].fillna('')


### Total size of the dataset is 7,618,186 providers and organizations
Note: Contains physicians, non-physicians, and organizations; need to use taxonomy to determine type of care provided

In [367]:
# Very slow computation!
df.index.size.compute()

7618186

### Rename Columns

In [486]:
new_columns = ['NPI','EntityType','LastName','FirstName','MiddleName','Prefix','Suffix','Credential','PracticeAddress','PracticeAddress2','PracticeCity','PracticeState','PracticeZip','PracticeCountry','PracticePhoneNum','PracticeFaxNum','ProviderGender','TaxonomyCode1','TaxonomyCode2','TaxonomyCode3','TaxonomyCode4','TaxonomyCode5','TaxonomyCode6','TaxonomyCode7','TaxonomyCode8','TaxonomyCode9','TaxonomyCode10','TaxonomyCode11','TaxonomyCode12','TaxonomyCode13','TaxonomyCode14','TaxonomyCode15']
df = df.rename(columns=dict(zip(df.columns, new_columns)))


### Filter Providers

In [487]:
df = df[(df['EntityType']==1) & (df['PracticeCountry']=='US')] #only look at individuals practicing in the US

def physician_check(codes):
    return any(item.startswith('20') for item in codes)

df['DR_bool']= df.apply(lambda x: True if physician_check(x.tolist()[17:]) else False,axis=1)
df = df[df['DR_bool']==True]

df = df[df['PracticeZip'].str.isdigit()]
int_col_new = ['NPI','EntityType','PracticeZip','PracticePhoneNum','PracticeFaxNum']
df[int_col_new] = df[int_col_new].fillna(-1)
df[int_col_new]=df[int_col_new].astype(int)



You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'bool'))



In [488]:
df.head(10)

Unnamed: 0,NPI,EntityType,LastName,FirstName,MiddleName,Prefix,Suffix,Credential,PracticeAddress,PracticeAddress2,PracticeCity,PracticeState,PracticeZip,PracticeCountry,PracticePhoneNum,PracticeFaxNum,ProviderGender,TaxonomyCode1,TaxonomyCode2,TaxonomyCode3,TaxonomyCode4,TaxonomyCode5,TaxonomyCode6,TaxonomyCode7,TaxonomyCode8,TaxonomyCode9,TaxonomyCode10,TaxonomyCode11,TaxonomyCode12,TaxonomyCode13,TaxonomyCode14,TaxonomyCode15,DR_bool
559,1528061496,1,BRODY,HOWARD,,DR.,,M.D.,138 SERVICE RD,,EAST LANSING,MI,488241376,US,5173533050,5174323742,M,207Q00000X,,,,,,,,,,,,,,,True
560,1235132101,1,SCHENCK,DAVID,,,,MD,700 MEDICAL BLVD,,ENGLEWOOD,FL,342233964,US,8004768646,9193823210,M,207P00000X,,,,,,,,,,,,,,,True
561,1962405837,1,SAYE,WILLIAM,H.,DR.,JR.,MD,2201 FOREST HILLS DR,STE 7,HARRISBURG,PA,171121089,US,7176525063,7176719554,M,207N00000X,,,,,,,,,,,,,,,True
562,1871596742,1,AHLERS,LUISE,KUSE,DR.,,M.D.,1 CLARA BARTON DR,,ALBANY,NY,122083401,US,5182625588,5182625589,F,208000000X,,,,,,,,,,,,,,,True
566,1639172497,1,THOMPSON,IAN,L,DR.,,M.D.,3217 SQUALICUM PKWY,,BELLINGHAM,WA,982251935,US,3607154144,3607154120,M,2085R0203X,,,,,,,,,,,,,,,True
568,1992708754,1,KAVCIC,JOHN,,DR.,,MD,20 LOSSON RD,STE 105,CHEEKTOWAGA,NY,142272379,US,7165587727,-1,M,207Q00000X,,,,,,,,,,,,,,,True
569,1245233006,1,WILLIS,DANIEL,JAMES,,,M.D.,85 BRYANT WOODS S,,AMHERST,NY,142283604,US,7166893333,7166899866,M,2084P0800X,,,,,,,,,,,,,,,True
575,1538162003,1,RODRIGUEZ,EDWIN,J,,,M.D.,4643 S WOODHAVEN WAY,,BILLINGS,MT,591062493,US,6059201965,-1,M,207Q00000X,,,,,,,,,,,,,,,True
577,1427051994,1,CHEN,SANFORD,,DR.,,M.D.,1200 N TUSTIN AVE,STE 140,SANTA ANA,CA,927053501,US,7149728235,-1,M,207W00000X,,,,,,,,,,,,,,,True
580,1326041880,1,YOU,TIMOTHY,T,DR.,,M.D.,1200 N TUSTIN AVE,STE 140,SANTA ANA,CA,927053501,US,7149728235,-1,M,207W00000X,,,,,,,,,,,,,,,True


### Number of Physicians in NPI Registry: 1,135,297
Note: Includes all specialties

In [489]:
#Very slow computation!
df.index.size.compute()

1135297

### Create Database from Filtered Data

In [490]:
engine = create_engine('sqlite:///npi.db')
if not database_exists(engine.url):
    create_database(engine.url)
df.to_sql('npi_registry','sqlite:///npi.db',if_exists='replace',index=False)

### Simple query to find physicians in zip code
Need to filter for specific specialties and do geospatial query to find providers near zip, not just within

In [494]:
query = 'SELECT * FROM npi_registry where PracticeZip like \'60601%\';'
pd.read_sql(query,engine)

Unnamed: 0,NPI,EntityType,LastName,FirstName,MiddleName,Prefix,Suffix,Credential,PracticeAddress,PracticeAddress2,PracticeCity,PracticeState,PracticeZip,PracticeCountry,PracticePhoneNum,PracticeFaxNum,ProviderGender,TaxonomyCode1,TaxonomyCode2,TaxonomyCode3,TaxonomyCode4,TaxonomyCode5,TaxonomyCode6,TaxonomyCode7,TaxonomyCode8,TaxonomyCode9,TaxonomyCode10,TaxonomyCode11,TaxonomyCode12,TaxonomyCode13,TaxonomyCode14,TaxonomyCode15,DR_bool
0,1154329274,1,HERMAN,HOWARD,LAWRENCE,DR.,,M.D.,151 N MICHIGAN AVE,#811,CHICAGO,IL,606017506,US,3129466374,3128610311,M,2084P0800X,,,,,,,,,,,,,,,1
1,1659373165,1,LEVY,FREDRIC,JEROME,DR.,,M.D.,1 E WACKER DR,SUITE 630,CHICAGO,IL,606011802,US,3126702590,3126448183,M,2084P0800X,,,,,,,,,,,,,,,1
2,1407893159,1,GOLDMAN,SAMUEL,EPHRAIM,DR.,,M.D.,333 N MICHIGAN AVE,SUITE 602,CHICAGO,IL,606013901,US,3126416444,-1,M,2084P0800X,,,,,,,,,,,,,,,1
3,1285741512,1,HEFTER,GILBERT,MORRIS,,,MD,333 N MICHIGAN AVE,SUITE 2017,CHICCAGO,IL,60601,US,3127829609,3127828649,M,2084P0800X,,,,,,,,,,,,,,,1
4,1639266158,1,MALEK,GAREY,ANDREW,,,M.D.,151 N MICHIGAN AVE,SUITE 815,CHICAGO,IL,606017506,US,3128560131,3128560134,M,2084P0800X,,,,,,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,1285891002,1,STEINHAUER,ELIZABETH,,DR.,,MD,70 E LAKE ST STE 1018,,CHICAGO,IL,606017637,US,3124589205,3124589205,F,2084P0800X,,,,,,,,,,,,,,,1
84,1851497119,1,MANIQUIS,VINA MARIA,ESTIPONA,DR.,,MD,155 NORTH HARBOR DRIVE,APARTMENT 5102,CHICAGO,IL,60601,US,3128060193,-1,F,207Q00000X,,,,,,,,,,,,,,,1
85,1164841722,1,NIE,DAISY,,,,MD,333 N MICHIGAN AVE STE 1120,,CHICAGO,IL,606014001,US,8477599110,2249852119,F,2084P0800X,,,,,,,,,,,,,,,1
86,1003973413,1,PINSKI,KEVIN,SCOTT,DR.,,M.D.,150 N MICHIGAN AVE STE 1200,,CHICAGO,IL,606017605,US,3122634625,3122635029,M,207NS0135X,207N00000X,,,,,,,,,,,,,,1


### Read taxonomy code dataset into DB table

In [498]:
tc_df = pd.read_csv('nucc_taxonomy_230.csv',encoding='utf-8')
tc_df.to_sql('taxonomy',engine,index=False)

873

In [499]:
query = 'SELECT * FROM taxonomy where Code=\'207V00000X\';'
pd.read_sql(query,engine)

Unnamed: 0,Code,Grouping,Classification,Specialization,Definition,Notes,Display Name,Section
0,207V00000X,Allopathic & Osteopathic Physicians,Obstetrics & Gynecology,,An obstetrician/gynecologist possesses special...,"Source: American Board of Medical Specialties,...",Obstetrics & Gynecology Physician,Individual


### -------------Don't use below cell----------------
For experimentation/testing only; full file too large for pandas


In [447]:
chunk = pd.read_csv("npidata_pfile_20050523-20230108.csv",nrows=100000,low_memory=False,usecols=cols)

chunk[str_cols]= chunk[str_cols].astype(str)
chunk[str_cols] = chunk[str_cols].replace('nan', '')
chunk = chunk[(chunk['Entity Type Code']==1) & (chunk['Provider Business Practice Location Address Country Code (If outside U.S.)']=='US')]
def physician_check(codes):
    return any(item.startswith('20') for item in codes)

chunk['DR_bool']= chunk.apply(lambda x: True if physician_check(x.tolist()[17:]) else False,axis=1)
chunk = chunk[chunk['DR_bool']==True]
chunk = chunk[chunk['Provider Business Practice Location Address Postal Code'].str.isdigit()]

chunk[int_cols] = chunk[int_cols].fillna(-1)
chunk[int_cols]=chunk[int_cols].astype(int)