In [85]:
import pandas as pd
import numpy as np
import time
import glob
import os
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.engine import URL
import psycopg2
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

### Read in filtered provider CSV

In [69]:
# Columns to keep from the filtered NPI registry extract: provider identity,
# practice address/contact fields, gender, and the fifteen taxonomy-code slots.
col = [
    'NPI', 'LastName', 'FirstName', 'MiddleName', 'Prefix',
    'Suffix', 'Credential', 'PracticeAddress', 'PracticeAddress2',
    'PracticeCity', 'PracticeState', 'PracticeZip', 'PracticeCountry',
    'PracticePhoneNum', 'PracticeFaxNum', 'ProviderGender',
] + [f'TaxonomyCode{slot}' for slot in range(1, 16)]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which prevents mixed-type columns (and the
# DtypeWarning that accompanies them) on this wide, sparsely populated file.
# index_col=0 followed by reset_index() is kept as-is so the resulting column
# layout is unchanged for the cells below.
df = pd.read_csv(
    'filtered_npi_registry.csv',
    index_col=0,
    usecols=col,
    low_memory=False,
).reset_index()

  df = pd.read_csv('filtered_npi_registry.csv',index_col=0,usecols=col)


### Split addresses into batch files and generate geocoding commands

In [14]:
# Number of batch files to write, and the directory the printed curl commands
# send their results to.
# NOTE(review): the Census batch geocoder caps the size of each upload
# (10,000 records per file, as far as I know -- confirm), so raise the chunk
# count if the provider table grows.
N_ADDRESS_CHUNKS = 40
GEOCODE_OUTPUT_DIR = 'geocodingResults'

addresses = df[['NPI','PracticeAddress','PracticeCity','PracticeState','PracticeZip']]

# curl --output does not create missing directories, so make sure it exists.
os.makedirs(GEOCODE_OUTPUT_DIR, exist_ok=True)

# Split row *positions* rather than the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which is deprecated in pandas 2.x.
# The resulting chunk boundaries are the same.
row_groups = np.array_split(np.arange(len(addresses)), N_ADDRESS_CHUNKS)
for idx, rows in enumerate(row_groups):
    # No header/index: each batch file holds only the five address columns selected above.
    addresses.iloc[rows].to_csv(f"addresses{idx}.csv",index=False,header=False)
    # Print the command to paste into a notebook cell for this batch.
    print(f"!curl --form addressFile=@addresses{idx}.csv --form benchmark=\"Public_AR_Current\"  https://geocoding.geo.census.gov/geocoder/locations/addressbatch --output {GEOCODE_OUTPUT_DIR}/addresses{idx}out.csv")

### Join all output together

In [44]:
# Directory holding the geocoder responses. Kept relative to the notebook's
# working directory (the same location the printed curl commands write to)
# instead of an absolute path that only exists on one machine.
path = 'geocodingResults'

# The geocoder output files are read without a header row; these are the names assigned to their columns.
cols = ['NPI','Address','isMatch','isExact','AddressFound','LonLat','TigerLineID','Side']

# Sort so the concatenated row order is reproducible across runs/filesystems.
all_files = sorted(glob.glob(os.path.join(path,"*.csv")))
if not all_files:
    # pd.concat on an empty list fails with an unhelpful message; fail early instead.
    raise FileNotFoundError(f"No geocoder output CSVs found in {path!r}")

# Read each batch into the list directly. The per-file frame must NOT be bound
# to `df`: that name holds the provider table that is merged with these
# results further down, and overwriting it breaks a top-to-bottom run.
li = [pd.read_csv(f,header=None,names=cols) for f in all_files]

geocode_df = pd.concat(li,axis=0,ignore_index=True)

In [51]:
# Keep only the addresses the geocoder matched. The explicit .copy() makes
# this an independent frame, so adding columns below neither triggers
# SettingWithCopyWarning nor depends on whether pandas returned a view.
geocode_match = geocode_df[geocode_df['isMatch']=='Match'].copy()

# LonLat is a single "lon,lat" text field; split it once on the comma.
# `n` is passed by keyword because positional use was removed in pandas 2.0.
# The resulting LON/LAT columns are still strings, exactly as before.
geocode_match[['LON', 'LAT']] = geocode_match['LonLat'].str.split(',', n=1, expand=True)

In [53]:
# Display the matched geocoder rows for a visual check (pandas truncates the
# rendering of a large frame to its first and last rows).
geocode_match

Unnamed: 0,NPI,Address,isMatch,isExact,AddressFound,LonLat,TigerLineID,Side,Lon,Lat
0,1952577819,"1170 S EAST AVE, OAK PARK, IL, 603042143",Match,Non_Exact,"1170 S EAST AVE, OAK PARK, IL, 60304","-87.78889390499995,41.86642949700007",605103423.0,L,-87.78889390499995,41.86642949700007
2,1801811716,"275 SEVENTH AVE, NY, NY, 10001",Match,Exact,"275 7TH AVE, NEW YORK, NY, 10001","-73.99429134499997,40.74595244400007",59653388.0,R,-73.99429134499997,40.74595244400007
3,1780684019,"550 GAGE BLVD, RICHLAND, WA, 993529532",Match,Exact,"550 GAGE BLVD, RICHLAND, WA, 99352","-119.26166515499995,46.227124229000026",175008930.0,L,-119.26166515499995,46.227124229000026
4,1003819053,"436 HINSDALE RD, CAMILLUS, NY, 130311648",Match,Exact,"436 HINSDALE RD, CAMILLUS, NY, 13031","-76.26534634899997,43.050668114000075",640456402.0,R,-76.26534634899997,43.050668114000075
5,1871513564,"11100 EUCLID AVE, CLEVELAND, OH, 441061716",Match,Exact,"11100 EUCLID AVE, CLEVELAND, OH, 44106","-81.60805145599994,41.50662616000005",638278041.0,L,-81.60805145599994,41.50662616000005
...,...,...,...,...,...,...,...,...,...,...
394518,1023038155,"3501 N MACARTHUR BLVD, IRVING, TX, 750623636",Match,Non_Exact,"3501 N MACARTHUR BLVD, IRVING, TX, 75062","-96.95904881899997,32.852430351000066",617055072.0,L,-96.95904881899997,32.852430351000066
394520,1912929928,"307 SAINT JOHNS WAY, LEWISTON, ID, 835012435",Match,Exact,"307 ST JOHNS WAY, LEWISTON, ID, 83501","-117.02652766699998,46.41779947900005",173130432.0,R,-117.02652766699998,46.41779947900005
394521,1407960446,"728 PACIFIC AVE STE 611, SAN FRANCISCO, CA, 94...",Match,Exact,"728 PACIFIC AVE, SAN FRANCISCO, CA, 94133","-122.40751166399997,37.79682395400005",639023945.0,R,-122.40751166399997,37.79682395400005
394523,1336184613,"1540 SPRING VALLEY DR, HUNTINGTON, WV, 257049300",Match,Exact,"1540 SPRING VALLEY DR, HUNTINGTON, WV, 25704","-82.51809241699993,38.38406918700008",57203084.0,L,-82.51809241699993,38.38406918700008


### Add lat/lon info to original dataframe

In [74]:
# Attach geocoded coordinates to every provider (left join keeps providers
# without a match; their LON/LAT are NaN).
# Any LON/LAT columns left over from a previous execution of this cell are
# dropped first, so re-running it does not create LON_x/LON_y duplicates and
# break the cells that read df['LON'].
df = pd.merge(
    df.drop(columns=['LON', 'LAT'], errors='ignore'),
    geocode_match[['NPI', 'LON', 'LAT']],
    on='NPI',
    how='left',
)

In [78]:
# Providers for which the geocoder returned coordinates.
has_coordinates = df['LON'].notna()
matches = df[has_coordinates]

In [107]:
# Persist the full provider table (matched and unmatched rows) with its
# LON/LAT columns; the DataFrame index is written as the first CSV column.
geocoded_csv_path = 'npi_registry_census_geocode.csv'
df.to_csv(geocoded_csv_path)

### Load matched providers into the PostgreSQL database

In [80]:
# Build the connection URL from environment variables so credentials do not
# have to live in the notebook. The fallbacks reproduce the previous local
# development settings; override them (at least NPI_DB_PASSWORD) anywhere else.
# URL.create also escapes special characters in the password correctly.
url = URL.create(
    drivername='postgresql+psycopg2',
    username=os.environ.get('NPI_DB_USER', 'postgres'),
    password=os.environ.get('NPI_DB_PASSWORD', 'password'),
    host=os.environ.get('NPI_DB_HOST', 'localhost'),
    database=os.environ.get('NPI_DB_NAME', 'npiProviders'),
)
engine = create_engine(url)

# Write only the geocoded providers, replacing any existing table. Pass the
# engine created above (not the URL) so the write and the validation queries
# below share one connection pool.
matches.to_sql('npi_registry',engine,if_exists='replace',index=False)

912

### Test Queries and Validation

In [81]:
# Sanity check: count the rows now in the npi_registry table.
query = 'SELECT count(*) FROM npi_registry;'
pd.read_sql(sql=query, con=engine)

Unnamed: 0,count
0,310912


In [106]:
# Spot check: fetch providers whose practice zip starts with 46236.
# The zip is cast to TEXT so LIKE can be used for prefix matching. The
# wildcard is written as %% (presumably to escape the DBAPI driver's
# %-style parameter formatting).
query = (
    'SELECT * '
    '        FROM npi_registry '
    '        WHERE CAST("PracticeZip" as TEXT) like \'46236%%\';'
)
pd.read_sql(sql=query, con=engine)

Unnamed: 0,NPI,LastName,FirstName,MiddleName,Prefix,Suffix,Credential,PracticeAddress,PracticeAddress2,PracticeCity,PracticeState,PracticeZip,PracticeCountry,PracticePhoneNum,PracticeFaxNum,ProviderGender,TaxonomyCode1,TaxonomyCode2,TaxonomyCode3,TaxonomyCode4,TaxonomyCode5,TaxonomyCode6,TaxonomyCode7,TaxonomyCode8,TaxonomyCode9,TaxonomyCode10,TaxonomyCode11,TaxonomyCode12,TaxonomyCode13,TaxonomyCode14,TaxonomyCode15,Lon,Lat
0,1770004954,GIFFEN,MICHAEL,,,,DO,8150 OAKLANDON RD STE 130,,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,M,207Q00000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
1,1942253208,BRADY,TIMOTHY,B,,,MD,9443 E. 38TH ST.,,INDIANAPOLIS,IN,462362132,US,3178902100,3178902171,M,208000000X,207R00000X,,,,,,,,,,,,,,-86.00172876599999,39.82614653600007
2,1669017117,RETHERFORD,RACHEL,,,,PA,8150 OAKLANDON RD STE 130,,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,F,363A00000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
3,1477999233,GELATT,MICHAEL,P,,,DO,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369525,US,3176211111,3176211110,M,390200000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
4,1962479923,LISANTI,JOSEPH,P,DR.,,DO,9347 PENDLETON PIKE,,INDIANAPOLIS,IN,462362768,US,3176123193,3176123270,M,207Q00000X,,,,,,,,,,,,,,,-86.00382091599994,39.84748465500007
5,1134765993,VARELDZIS,BASIL,,DR.,,MD,9125 NAUTICAL WATCH DR,,INDIANAPOLIS,IN,462369036,US,3179192846,-1,M,207Q00000X,2083A0300X,2084P0015X,,,,,,,,,,,,,-85.96805080499996,39.91961740500005
6,1407111339,LEE,JASON,DANIEL,,,M.D.,8150 OAKLANDON RD STE 130,,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,M,390200000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
7,1528089844,LOVELESS,DANYELLE,M,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,F,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
8,1669496162,LYNN,WILLIAM,J,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,M,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
9,1144242124,VASILCHEK,DANICA,M,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176211111,3176217110,F,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003


In [98]:
# Pandas-side counterpart of the SQL spot check: nine-digit zips in the
# 46236 area, selected with an inclusive numeric range.
in_46236 = matches['PracticeZip'].between(462360000, 462369999)
matches[in_46236]

Unnamed: 0,NPI,LastName,FirstName,MiddleName,Prefix,Suffix,Credential,PracticeAddress,PracticeAddress2,PracticeCity,PracticeState,PracticeZip,PracticeCountry,PracticePhoneNum,PracticeFaxNum,ProviderGender,TaxonomyCode1,TaxonomyCode2,TaxonomyCode3,TaxonomyCode4,TaxonomyCode5,TaxonomyCode6,TaxonomyCode7,TaxonomyCode8,TaxonomyCode9,TaxonomyCode10,TaxonomyCode11,TaxonomyCode12,TaxonomyCode13,TaxonomyCode14,TaxonomyCode15,Lon,Lat
186097,1962479923,LISANTI,JOSEPH,P,DR.,,DO,9347 PENDLETON PIKE,,INDIANAPOLIS,IN,462362768,US,3176123193,3176123270,M,207Q00000X,,,,,,,,,,,,,,,-86.00382091599994,39.84748465500007
236690,1134765993,VARELDZIS,BASIL,,DR.,,MD,9125 NAUTICAL WATCH DR,,INDIANAPOLIS,IN,462369036,US,3179192846,-1,M,207Q00000X,2083A0300X,2084P0015X,,,,,,,,,,,,,-85.96805080499996,39.91961740500005
260824,1477999233,GELATT,MICHAEL,P,,,DO,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369525,US,3176211111,3176211110,M,390200000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
261590,1407111339,LEE,JASON,DANIEL,,,M.D.,8150 OAKLANDON RD STE 130,,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,M,390200000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
261595,1528089844,LOVELESS,DANYELLE,M,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,F,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
261597,1669496162,LYNN,WILLIAM,J,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,M,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
262102,1760496772,ROBINETTE,JENNIFER,S,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176217111,3176217110,F,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
262381,1396752226,STOOKEY,MICHELE,R,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176217111,3176217110,F,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
262500,1144242124,VASILCHEK,DANICA,M,,,MD,8150 OAKLANDON RD,SUITE 130,INDIANAPOLIS,IN,462369554,US,3176211111,3176217110,F,207Q00000X,,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
270464,1770004954,GIFFEN,MICHAEL,,,,DO,8150 OAKLANDON RD STE 130,,INDIANAPOLIS,IN,462369554,US,3176211111,3176211110,M,207Q00000X,207Q00000X,,,,,,,,,,,,,,-85.95714892899997,39.90479400100003
