# Webscraper for BlackDoctor.org

## Imports

In [1]:
import re
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.cloud import bigquery

## Setup

In [2]:
# Silence the BigQuery warning raised when authenticating with end user credentials.
warnings.filterwarnings(
    action="ignore",
    message="Your application has authenticated using end user credentials",
)

# Let pandas render up to 100 rows when a DataFrame is displayed.
pd.set_option("display.max_rows", 100)

# Create the BigQuery client.
client = bigquery.Client()

## Get list of all standard ZIP codes

In [3]:
# Select the ZIP codes whose type is STANDARD from the reference table.
# (Plain string: the query has no placeholders, so no f-string is needed.)
sql_query = """
SELECT Zipcode
FROM `w2ohcwork.rwd_references.zips`
WHERE ZipCodeType = 'STANDARD'
"""

# Run the query, then materialise the result as a DataFrame.
res = client.query(sql_query)
zips_df = res.to_dataframe()

# Preview the first rows.
zips_df.head()

Unnamed: 0,Zipcode
0,610
1,612
2,601
3,602
4,603


In [4]:
# Plain Python list of the ZIP codes to iterate over when searching.
zipcodes = zips_df.loc[:, 'Zipcode'].tolist()

## Search blackdoctor.org for every ZIP code (one specialty per run: family medicine or pediatrics)

In [None]:
# --- search configuration (loop-invariant, so defined once) -------------------
SPECIALTY = '2413'  # family med - 2411, pediatrics - 2413
SEARCH_URL = 'https://blackdoctor.org/find-a-doctor/'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'}
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled connection hangs the whole run

# "<city>, <state> <zip>": the ZIP may have lost its leading zeros (1-5 digits)
# and may carry a "-NNNN" ZIP+4 suffix, which is discarded.
_CSZ_PATTERN = re.compile(r'^(?P<place>.*?)\s*(?<!\d)(?P<zip>\d{1,5})(?:-\d{4})?\s*$')


def _split_name(full_name):
    """Split a listing name into ``(first_name, middle_name, last_name)``.

    A leading "Dr." / "Dr" title is removed only when it is actually present,
    everything after the first comma (", MD" etc.) is dropped, periods are
    removed, and a trailing stand-alone "MD" token is discarded.  A name with a
    single token is treated as a last name; an empty name gives three empty
    strings.
    """
    without_title = re.sub(r'^\s*Dr(?:\.\s*|\s+)', '', full_name, flags=re.IGNORECASE)
    tokens = without_title.split(',')[0].replace('.', '').split()
    if tokens and tokens[-1].upper() == 'MD':  # "M.D." / "MD" written without a comma
        tokens.pop()
    if not tokens:
        return '', '', ''
    if len(tokens) == 1:
        return '', '', tokens[0]
    return tokens[0], ' '.join(tokens[1:-1]), tokens[-1]


def _split_city_state_zip(csz):
    """Split a "City, State ZIP" line into ``(city, state, zip_code)``.

    The ZIP is left-padded with zeros to five digits, so "610" becomes "00610".
    If the line does not end in a ZIP, ``zip_code`` is ``''``; if it contains
    no comma, the whole place text is returned as the city and ``state`` is
    ``''``.
    """
    match = _CSZ_PATTERN.match(csz.strip())
    if match:
        place = match.group('place')
        zip_code = match.group('zip').zfill(5)
    else:
        place, zip_code = csz, ''
    city, comma, state = place.rpartition(',')
    if not comma:
        city, state = place, ''
    return city.strip(), state.strip(), zip_code


def _parse_result(result):
    """Turn one search-result ``div.info`` element into a doctor dict.

    Returns ``None`` when the element lacks the name link, the specialties
    div, or an address with at least a street line and a city/state/ZIP line.
    """
    name_link = result.h2.a if result.h2 is not None else None
    spec_html = result.find('div', class_='doctor-specialties')
    loc_html = result.find('div', class_='doctor-address')
    if name_link is None or spec_html is None or loc_html is None:
        return None

    full_name = name_link.get_text()
    first_name, middle_name, last_name = _split_name(full_name)

    # Turn every <br> into a newline so each address line can be separated.
    for line_break in loc_html.find_all('br'):
        line_break.replace_with('\n')
    full_loc = loc_html.get_text()
    loc_lines = [line.strip() for line in full_loc.splitlines() if line.strip()]
    if len(loc_lines) < 2:
        return None

    # The last line is "City, State ZIP"; everything before it is the street address.
    address = ', '.join(loc_lines[:-1])
    city, state, zip_code = _split_city_state_zip(loc_lines[-1])

    return {'full_name': full_name,
            'first_name': first_name,
            'middle_name': middle_name,
            'last_name': last_name,
            'specialty': spec_html.get_text(),
            'address': address,
            'city': city,
            'state': state,
            'zipcode': zip_code,
            'full_location': full_loc}


doctors = []       # list of dicts with doc info
seen_records = set()  # tuples of record values, for O(1) duplicate detection
failed_zips = []   # ZIP codes whose search page could not be fetched or read

for zipcode in zipcodes:
    print(zipcode)

    # get html of page
    query = {'doctor-zip': zipcode,
             'doctor-specialty': SPECIALTY,
             'doctor-provider': '',
             'doctor-search': 'Find a Doctor'}
    try:
        res = requests.get(SEARCH_URL, params=query, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as err:
        # Record the failure and keep going instead of losing the whole run.
        print(f'  request failed for {zipcode}: {err}')
        failed_zips.append(zipcode)
        continue
    soup = BeautifulSoup(res.text, 'html.parser')

    # skip if the results section is missing or reports no results
    sr = soup.find('section', class_='doctor-search-results')
    if sr is None:
        print(f'  no results section on page for {zipcode}')
        failed_zips.append(zipcode)
        continue
    if "Sorry" in sr.get_text():
        continue

    # if there are results, parse html and extract information
    for s in soup.find_all('div', class_='info'):
        doc = _parse_result(s)
        if doc is None:
            print(f'  skipped a result with unexpected markup for {zipcode}')
            continue

        record_key = tuple(doc.values())
        if record_key not in seen_records:
            seen_records.add(record_key)
            doctors.append(doc)
            print(f"{doc['full_name']}; {doc['city']}, {doc['state']}, {doc['zipcode']}")

00610
00612
00601
00602
00603
00616
00617
00622
00623
00627
00659
00660
00662
00664
00638
00646
00647
00650
00652
00653
00656
00667
00669
00670
00674
00606
00680
00682
00676
00687
00624
00678
00677
00637
00688
00690
00683
00685
00641
00692
00693
00698
00705
00703
00704
00714
00794
00729
00736
00739
00769
00782
00783
00791
00795
00777
00728
00725
00727
00735
00780
00738
00784
00778
00771
00772
00773
00707
00715
00718
00719
00720
00723
00716
00717
00730
00731
00740
00741
00745
00751
00754
00757
00765
00766
00767
00802
00820
00830
00840
00850
00979
00982
00983
00985
00987
00965
00966
00968
00969
00971
00934
00952
00901
00907
00909
00911
00912
00913
00915
00917
00918
00920
00921
00923
00924
00925
00926
00927
00956
00957
00959
00961
00962
00976
00953
00949
01001
01002
01005
01007
01008
01010
01011
01012
01084
01013
01020
01022
01026
01027
01062
01028
01030
01031
01032
01033
01034
01035
01036
01038
01088
01039
01040
01050
01053
01054
01056
01057
01060
01068
01069
01070
01071
01072
01073
0107

05068
05069
05070
05071
05072
05073
05083
05075
05077
05079
05081
05084
05086
05001
05089
05091
05101
05141
05142
05143
05146
05148
05149
05150
05151
05152
05153
05154
05155
05156
05158
05161
05250
05201
05251
05252
05253
05255
05257
05260
05261
05262
05340
05301
05341
05342
05343
05345
05346
05350
05352
05353
05359
05354
05355
05356
05358
05360
05361
05351
05362
05363
05440
05441
05443
05401
05403
05404
05408
05444
05445
05446
05448
05447
05450
05452
05454
05455
05456
05457
05458
05459
05461
05462
05463
05464
05465
05468
05471
05472
05473
05474
05476
05477
05478
05482
05483
05486
05487
05488
05489
05491
05442
05492
05494
05495
05641
05647
05648
05649
05640
05650
05651
05666
05652
05653
05654
05655
05656
05658
05602
05660
05661
05663
05667
05669
05672
05673
05674
05675
05676
05677
05679
05680
05681
05682
05730
05732
05733
05734
05735
05737
05738
05739
05766
05742
05743
05744
05747
05748
05751
05753
05757
05758
05759
05760
05761
05762
05763
05764
05765
05767
05701
05769
05770
05772
0577

10303
10304
Dr. Belinda Marquis, MD; Brooklyn, NY, 11203
10305
10306
10307
10308
10309
10310
10311
10312
10314
10451
10452
10453
10454
10455
10456
10457
10458
Dr. Christopher Phang M.D.; Bronx, New York, 10461
Dr. Christopher A. Phang MD; Bronx, New York, 10461
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10501
10502
10504
10505
10506
10507
10509
10510
10511
10512
10514
10516
10518
10520
10522
10523
10524
10526
10527
10528
10530
10532
10533
10535
10536
10537
10538
10541
10543
10546
10547
10548
10549
10550
10552
10553
10560
10562
10566
10567
10570
10573
10576
10577
10578
10579
10580
10583
10588
10589
10590
10591
10594
10595
10597
10598
10601
10603
10604
10605
10606
10607
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10801
10803
10804
10805
10911
10913
10916
10917
10918
10919
10920
10921
10923
10924
10925
10926
10927
10928
10930
10931
10940
10941
10950
10952
10954
10956
10958
10960
10962
10963
10964
10965
10968
10969


14062
14063
14065
14066
14067
14068
14069
14070
14072
14075
14080
14081
14082
14083
14085
14086
14091
14092
14094
14098
14028
14101
14102
14103
14105
14108
14111
14113
14120
14125
14127
14129
14131
14132
14134
14136
14138
14139
14141
14143
14145
14150
14167
14170
14171
14172
14174
14201
14202
14203
14204
14206
14207
14208
14209
14210
14211
14212
14213
14214
14215
14216
14217
14218
14219
14220
14221
14222
14223
14224
14225
14226
14227
14228
14280
14301
14303
14304
14305
14411
14479
14414
14415
14416
14420
14422
14423
14424
14425
14427
14428
14432
14433
14435
14437
14441
14445
14450
14454
14456
14462
14464
14466
14467
14468
14469
14470
14471
14472
14475
14476
14477
14418
14478
14480
14481
14482
14485
14487
14489
14486
14545
14502
14504
14505
14506
14507
14510
14512
14513
14514
14516
14517
14519
14521
14522
14525
14526
14527
14530
14532
14533
14534
14536
14541
14543
14544
14546
14548
14550
14551
14555
14559
14560
14561
14564
14568
14569
14571
14572
14580
14585
14586
14589
14590
14591
1460

17601
17602
17603
17611
17724
17728
17729
17737
17723
17727
17740
17742
17744
17745
17747
17751
17752
17754
17756
17758
17764
17765
17768
17771
17772
17774
17776
17777
17778
17701
17702
17738
17810
17812
17843
17813
17814
17878
17815
17820
17821
17823
17836
17824
17827
17830
17834
17837
17841
17842
17844
17845
17846
17847
17851
17853
17855
17857
17859
17860
17864
17867
17870
17866
17872
17876
17801
17881
17856
17886
17888
17889
17921
17922
17929
17931
17934
17935
17938
17941
17948
17954
17957
17959
17960
17961
17963
17964
17965
17901
17967
17968
17970
17972
17976
17978
17980
17981
17983
17985
18011
Dr. Harriette L Starr, MD; Fort Washington, Pennsylvania, 19034
Dr. Pamela Huffman-Devaughn M.D.; Philadelphia, Pennsylvania, 19144
18013
18014
18015
18016
18017
18018
18020
18031
18032
18034
Dr. Patricia Sandiford, MD; Philadelphia, PA, 19132
18035
18036
18037
18038
18041
18040
18042
18045
18049
18051
18053
18054
18055
18056
18058
18059
18062
18064
18066
18067
18069
18070
18071
18072
18074


20886
20874
20876
20812
20895
20832
20837
20850
20851
20852
20853
20854
20855
20860
20868
20901
20902
20903
20904
20905
20906
20910
20912
21001
Dr. Rosemary A. Dayie, MD; Baltimore, Maryland, 21224
21005
21010
21009
21012
21013
21014
21015
21017
21028
21029
21030
21031
21044
21045
21046
21032
21034
21035
21036
21037
21040
21042
21043
21047
21048
21050
21051
21053
21054
21057
21060
Dr. Dawan V King, MD; Baltimore, Maryland, 21202
Dr. Carissa M. Baker-Smith, M.D.; Baltimore, MD, 21201
21061
21074
21075
21076
21077
21078
21082
21084
21085
21087
21090
21093
21114
21102
21108
21111
21113
21117
21120
21122
21128
21131
21132
21133
21136
21140
21144
21146
21152
21154
21155
21156
21157
21158
21160
21161
21162
21104
21163
21201
21202
21204
21205
21206
21207
21208
21209
21210
21211
21212
21213
21214
21215
21216
21217
21218
21219
21220
21221
21222
21223
21224
21225
21226
21227
21228
21229
21230
21231
21233
21234
21236
21237
21239
21240
21244
21286
21260
21261
21401
21403
21405
21409
21402
21520
21

In [83]:
# Build a DataFrame from the scraped records (one row per dict in `doctors`).
doc_df = pd.DataFrame(doctors)

# Preview the first rows.
doc_df.head()

Unnamed: 0,full_name,first_name,middle_name,last_name,specialty,address,city,state,zipcode,full_location
0,"Dr. Faye Holder-Niles, M.D.",Faye,,Holder-Niles,Pediatrics,300 Longwood Ave,Boston,Massachusetts,2115,"300 Longwood Ave\nBoston, Massachusetts 2115"
1,"Dr. Joan Y Reede, MD",Joan,Y,Reede,Pediatrics,45 Greenwich Street,Roxbury Crossing,Massachusetts,2120,"45 Greenwich Street\nRoxbury Crossing, Massach..."
2,"Dr. Lee Pachter, MD",Lee,,Pachter,Pediatrics,114 Woodland Street,Hartford,Connecticut,6105,"114 Woodland Street\nHartford, Connecticut 6105"
3,"Dr. Tamiko Jackson-McArthur, M.D.",Tamiko,,Jackson-McArthur,Pediatrics,30 Quaker Farms Rd,Southbury,CT,6488,"30 Quaker Farms Rd\nSouthbury, CT 6488"
4,Dr. Tamiko Jackson-McArthur M.D.,Tamiko,,Jackson-McArthur,Pediatrics,"1423 Chapel St, Ste. 2B",New Haven,Connecticut,6511,"1423 Chapel St, Ste. 2B\nNew Haven, Connecticu..."


In [84]:
# Write the records to CSV, leaving out the DataFrame index.
doc_df.to_csv(path_or_buf='black_docs_peds.csv', index=False)