# NPI Registry
### MSDS 498 Capstone, Winter 2023, Team 53
### Author: Sarah Rodenbeck

### Import Necessary Libraries

In [8]:
from dask import dataframe as dd
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.engine import URL
import psycopg2
pd.set_option('display.max_columns', None)

## Initial Data Exploration and Filtering

### Only need a subset of columns

In [9]:
cols = ['NPI','Entity Type Code','Provider Last Name (Legal Name)','Provider First Name','Provider Middle Name',\
        'Provider Name Prefix Text','Provider Name Suffix Text','Provider Credential Text','Provider First Line Business Practice Location Address',\
        'Provider Second Line Business Practice Location Address','Provider Business Practice Location Address City Name',\
        'Provider Business Practice Location Address State Name','Provider Business Practice Location Address Postal Code',\
        'Provider Business Practice Location Address Country Code (If outside U.S.)',
        'Provider Business Practice Location Address Telephone Number','Provider Business Practice Location Address Fax Number',\
        'Provider Gender Code','Healthcare Provider Taxonomy Code_1','Healthcare Provider Taxonomy Code_2',\
        'Healthcare Provider Taxonomy Code_3','Healthcare Provider Taxonomy Code_4','Healthcare Provider Taxonomy Code_5',\
        'Healthcare Provider Taxonomy Code_6','Healthcare Provider Taxonomy Code_7','Healthcare Provider Taxonomy Code_8',\
        'Healthcare Provider Taxonomy Code_9','Healthcare Provider Taxonomy Code_10','Healthcare Provider Taxonomy Code_11',\
        'Healthcare Provider Taxonomy Code_12','Healthcare Provider Taxonomy Code_13','Healthcare Provider Taxonomy Code_14',\
        'Healthcare Provider Taxonomy Code_15']
int_cols = ['NPI','Entity Type Code','Provider Business Practice Location Address Postal Code','Provider Business Practice Location Address Telephone Number','Provider Business Practice Location Address Fax Number']
str_cols = list(set(cols)-set(int_cols))

### Read in Data with Dask
Note: File too large to effectively use pandas

In [10]:
res = {}
for i in range(len(str_cols)):
    res[str_cols[i]] = object
res['Provider Business Practice Location Address Postal Code'] = object
res['Provider Business Practice Location Address Telephone Number']= object

df = dd.read_csv("npidata_pfile_20050523-20230108.csv",usecols=cols,dtype=res)

df[str_cols] = df[str_cols].astype(str)
df[str_cols] = df[str_cols].replace('nan', '')
df[str_cols] = df[str_cols].fillna('')


### Total size of the dataset is 7,618,186 providers and organizations
Note: Contains physicians, non-physicians, and organizations; need to use taxonomy to determine type of care provided

In [None]:
# Very slow computation!
df.index.size.compute()

### Rename Columns

In [11]:
new_columns = ['NPI','EntityType','LastName','FirstName','MiddleName','Prefix','Suffix','Credential','PracticeAddress','PracticeAddress2','PracticeCity','PracticeState','PracticeZip','PracticeCountry','PracticePhoneNum','PracticeFaxNum','ProviderGender','TaxonomyCode1','TaxonomyCode2','TaxonomyCode3','TaxonomyCode4','TaxonomyCode5','TaxonomyCode6','TaxonomyCode7','TaxonomyCode8','TaxonomyCode9','TaxonomyCode10','TaxonomyCode11','TaxonomyCode12','TaxonomyCode13','TaxonomyCode14','TaxonomyCode15']
df = df.rename(columns=dict(zip(df.columns, new_columns)))


### Filter Providers

In [12]:
df = df[(df['EntityType']==1) & (df['PracticeCountry']=='US')] #only look at individuals practicing in the US

def physician_check(codes):
    return any(item.startswith('20') for item in codes)

df['DR_bool']= df.apply(lambda x: True if physician_check(x.tolist()[17:]) else False,axis=1)
df = df[df['DR_bool']==True]

df = df[df['PracticeZip'].str.isdigit()]
int_col_new = ['NPI','EntityType','PracticeZip','PracticePhoneNum','PracticeFaxNum']
df[int_col_new] = df[int_col_new].fillna(-1)
df[int_col_new]=df[int_col_new].astype(int)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'bool'))



### Number of Physicians in NPI Registry: 1,135,297
Note: Includes all specialties

In [None]:
#Very slow computation!
df.index.size.compute()

### Filter down physicians to those who can provide CC screenings

In [13]:
#Codes correspond to: family medicine, FM adult medicine, general internal medicine, general ob/gyn, gynecologic oncology, gynecologic medicine
relevant_codes = ['207Q00000X','207QA0505X','207R00000X','207V00000X','207VX0201X','207VG0400X']

def relevant_med(codes):
    return any(item in relevant_codes for item in codes)

df['can_screen']= df.apply(lambda x: True if relevant_med(x.tolist()[17:]) else False,axis=1)
df = df[df['can_screen']==True]

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'bool'))



In [14]:
df.head(10)

Unnamed: 0,NPI,EntityType,LastName,FirstName,MiddleName,Prefix,Suffix,Credential,PracticeAddress,PracticeAddress2,PracticeCity,PracticeState,PracticeZip,PracticeCountry,PracticePhoneNum,PracticeFaxNum,ProviderGender,TaxonomyCode1,TaxonomyCode2,TaxonomyCode3,TaxonomyCode4,TaxonomyCode5,TaxonomyCode6,TaxonomyCode7,TaxonomyCode8,TaxonomyCode9,TaxonomyCode10,TaxonomyCode11,TaxonomyCode12,TaxonomyCode13,TaxonomyCode14,TaxonomyCode15,DR_bool,can_screen
559,1528061496,1,BRODY,HOWARD,,DR.,,M.D.,138 SERVICE RD,,EAST LANSING,MI,488241376,US,5173533050,5174323742,M,207Q00000X,,,,,,,,,,,,,,,True,True
568,1992708754,1,KAVCIC,JOHN,,DR.,,MD,20 LOSSON RD,STE 105,CHEEKTOWAGA,NY,142272379,US,7165587727,-1,M,207Q00000X,,,,,,,,,,,,,,,True,True
575,1538162003,1,RODRIGUEZ,EDWIN,J,,,M.D.,4643 S WOODHAVEN WAY,,BILLINGS,MT,591062493,US,6059201965,-1,M,207Q00000X,,,,,,,,,,,,,,,True,True
585,1497758957,1,JACOB,CHRISTINA,N,DR.,,M.D.,2230 LYNN RD,STE 230,THOUSAND OAKS,CA,913601984,US,8053735864,8053742439,F,207R00000X,,,,,,,,,,,,,,,True,True
613,1184627986,1,FLEMING,MICHAEL,O,DR.,,MD,8383 MILLICENT WAY,,SHREVEPORT,LA,711155207,US,3187976661,3187958512,M,207Q00000X,,,,,,,,,,,,,,,True,True
632,1245233295,1,POTTS,JILL,K,DR.,,M.D.,335 PARRISH ST,,CANANDAIGUA,NY,144241728,US,5853932845,5853969275,F,207R00000X,,,,,,,,,,,,,,,True,True
634,1508869553,1,VAN HOUTEN-SAUTER,LEE,ANN,,,D.O.,220 PINE ST,,WILLIAMSTOWN,NJ,80941137,US,8566297436,8568754742,F,207Q00000X,,,,,,,,,,,,,,,True,True
635,1861495814,1,SMITH,FRANCISCO,ALBERTO,DR.,,M.D.,1660 MEDICAL BLVD,STE 302,NAPLES,FL,341101497,US,2395961995,2395961413,M,207VX0201X,,,,,,,,,,,,,,,True,True
636,1285637223,1,LOWRY,STEVEN,MICHAEL,,,D.O.,220 PINE ST,,WILLIAMSTOWN,NJ,80941137,US,8566297436,8568754742,M,207Q00000X,,,,,,,,,,,,,,,True,True
639,1922001957,1,PRESLEY,RICHARD,E,,,M.D.,2011 MURPHY AVE,STE 302,NASHVILLE,TN,372032023,US,6152842929,6152842920,M,207V00000X,,,,,,,,,,,,,,,True,True


### Number of physicians who might reasonably provide cervical cancer screenings: 394,525

In [None]:
#Very slow computation!
df.index.size.compute()

### Convert to regular dataframe for easier parsing

In [15]:
df = df.compute()

In [16]:
df

Unnamed: 0,NPI,EntityType,LastName,FirstName,MiddleName,Prefix,Suffix,Credential,PracticeAddress,PracticeAddress2,PracticeCity,PracticeState,PracticeZip,PracticeCountry,PracticePhoneNum,PracticeFaxNum,ProviderGender,TaxonomyCode1,TaxonomyCode2,TaxonomyCode3,TaxonomyCode4,TaxonomyCode5,TaxonomyCode6,TaxonomyCode7,TaxonomyCode8,TaxonomyCode9,TaxonomyCode10,TaxonomyCode11,TaxonomyCode12,TaxonomyCode13,TaxonomyCode14,TaxonomyCode15,DR_bool,can_screen
559,1528061496,1,BRODY,HOWARD,,DR.,,M.D.,138 SERVICE RD,,EAST LANSING,MI,488241376,US,5173533050,5174323742,M,207Q00000X,,,,,,,,,,,,,,,True,True
568,1992708754,1,KAVCIC,JOHN,,DR.,,MD,20 LOSSON RD,STE 105,CHEEKTOWAGA,NY,142272379,US,7165587727,-1,M,207Q00000X,,,,,,,,,,,,,,,True,True
575,1538162003,1,RODRIGUEZ,EDWIN,J,,,M.D.,4643 S WOODHAVEN WAY,,BILLINGS,MT,591062493,US,6059201965,-1,M,207Q00000X,,,,,,,,,,,,,,,True,True
585,1497758957,1,JACOB,CHRISTINA,N,DR.,,M.D.,2230 LYNN RD,STE 230,THOUSAND OAKS,CA,913601984,US,8053735864,8053742439,F,207R00000X,,,,,,,,,,,,,,,True,True
613,1184627986,1,FLEMING,MICHAEL,O,DR.,,MD,8383 MILLICENT WAY,,SHREVEPORT,LA,711155207,US,3187976661,3187958512,M,207Q00000X,,,,,,,,,,,,,,,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53375,1427786607,1,PICARRA,EMERALD,GRACE,,,NP,1980 LONG BRANCH CT,,KERNERSVILLE,NC,272847398,US,4346040983,-1,F,207Q00000X,,,,,,,,,,,,,,,True,True
53403,1467629550,1,NASREEN,JOHRA,,DR.,,MD,8811 VILLAGE DRIVE,,SAN ANTONIO,TX,78217,US,2102976500,2102972098,F,207Q00000X,390200000X,208M00000X,,,,,,,,,,,,,True,True
53432,1659735827,1,AVILA,ANIBAL,MIKJAIL,,,M.D.,800 N OKLAHOMA AVE APT 1201,,OKLAHOMA CITY,OK,731044407,US,4057637098,-1,M,207VX0000X,207V00000X,,,,,,,,,,,,,,True,True
53434,1598225997,1,DAMUS,FRANCESCA,,,,,1000 MEDICAL CENTER BLVD,,LAWRENCEVILLE,GA,300467694,US,6783124077,-1,F,207Q00000X,390200000X,,,,,,,,,,,,,,True,True


In [17]:
df.to_csv('filtered_npi_registry.csv')

## SQLite DB

### Create Database from Filtered Data

In [None]:
engine = create_engine('sqlite:///npi.db')
if not database_exists(engine.url):
    create_database(engine.url)
df.to_sql('npi_registry','sqlite:///npi.db',if_exists='replace',index=False)

### Simple query to find physicians in zip code
Need to filter for specific specialties and do geospatial query to find providers near zip, not just within

In [None]:
query = 'SELECT * FROM npi_registry where PracticeZip like \'60601%\';'
pd.read_sql(query,engine)

### Read taxonomy code dataset into DB table

In [None]:
tc_df = pd.read_csv('nucc_taxonomy_230.csv',encoding='utf-8')
tc_df.to_sql('taxonomy',engine,index=False)

In [None]:
query = 'SELECT * FROM taxonomy where Code=\'207V00000X\';'
pd.read_sql(query,engine)

### -------------Don't use below cell----------------
For experimentation/testing only; full file too large for pandas


In [None]:
chunk = pd.read_csv("npidata_pfile_20050523-20230108.csv",nrows=100000,low_memory=False,usecols=cols)

chunk[str_cols]= chunk[str_cols].astype(str)
chunk[str_cols] = chunk[str_cols].replace('nan', '')
chunk = chunk[(chunk['Entity Type Code']==1) & (chunk['Provider Business Practice Location Address Country Code (If outside U.S.)']=='US')]
def physician_check(codes):
    return any(item.startswith('20') for item in codes)

chunk['DR_bool']= chunk.apply(lambda x: True if physician_check(x.tolist()[17:]) else False,axis=1)
chunk = chunk[chunk['DR_bool']==True]
chunk = chunk[chunk['Provider Business Practice Location Address Postal Code'].str.isdigit()]

chunk[int_cols] = chunk[int_cols].fillna(-1)
chunk[int_cols]=chunk[int_cols].astype(int)