In [1]:
# importing libraries

import pandas as pd
from random import randrange
import time

## Single-page Tabular Scrape

- <a href="https://apps.health.ny.gov/pubdoh/professionals/doctors/conduct/factions/AllRecordsAction.action">On this site</a>, scrape all the doctors info on page 292.
- Export the content into a CSV file called ```page_292.csv```.

In [2]:
## create coding cells as needed

url_sp = "https://apps.health.ny.gov/pubdoh/professionals/doctors/conduct/factions/AllRecordsAction.action?d-49653-p="
page_num = 292

pd.read_html(f"{url_sp}{page_num}")

[                                  Physician Last Name  \
 0                                               Ataee   
 1                                              Feiner   
 2                                              Taylor   
 3                                             Donshik   
 4                                             Robbins   
 5                                             Freeman   
 6                                           Markowitz   
 7                                              Puskas   
 8                                               Lopez   
 9                                              Hirsch   
 10                                             Stobie   
 11                                                Leo   
 12                                                 Lu   
 13                                              James   
 14                                             O'Hair   
 15                                           Berselli   
 16           

In [3]:
response_sp = pd.read_html(f"{url_sp}{page_num}")[1]
response_sp

Unnamed: 0,Physician Last Name,Physician First Name,Physician Middle Name,License Number,License Type,Effective Date,Date Updated,Year of Birth
0,Ataee,Shahab,,232919,MD,09/11/2003,10/07/2003,1964
1,Feiner,Marc,Alan,147174,MD,10/10/2003,10/06/2003,1952
2,Taylor,David,Howarth,154079,MD,10/10/2003,10/06/2003,1953
3,Donshik,Jon,David,202926,MD,10/10/2003,10/06/2003,1968
4,Robbins,Richard,Gregg,179378,MD,10/10/2003,10/06/2003,1963
5,Freeman,Douglas,,220267,MD,10/10/2003,10/06/2003,1968
6,Markowitz,Howard,,188933,MD,10/10/2003,10/06/2003,1961
7,Puskas,John,Michael,120273,MD,10/06/2003,10/01/2003,1945
8,Lopez,Jose,A,192852,MD,10/06/2003,10/01/2003,1962
9,Hirsch,Anthony,,98440,MD,10/01/2003,09/24/2003,1940


In [4]:
response_sp.to_csv("page_292.csv", index=False, encoding="UTF-8")

## Multipage Tabular Scrape

- <a href="https://apps.health.ny.gov/pubdoh/professionals/doctors/conduct/factions/AllRecordsAction.action">On this site</a>, scrape all doctors whose last names begin with "P".
- Export the content into a CSV file called ```md_P.csv```.


In [5]:
## create coding cells as needed

url_mp = "https://apps.health.ny.gov/pubdoh/professionals/doctors/conduct/factions/AlphabetSearchAction.action?alpbhabetSearch="
letter_search = "P"
url_page_num = "&d-49653-p="

base_url = f"{url_mp}{letter_search}{url_page_num}"

pd.read_html(base_url)

[                                  Physician Last Name  \
 0                                                Paal   
 1                                                Pace   
 2                                                Pace   
 3                                             Pacetti   
 4                                              Pachas   
 5                                             Pacheco   
 6                                               Pacik   
 7                                               Pacis   
 8                                                Pack   
 9                                        Packianathan   
 10                                       Packianathan   
 11                                              Padeh   
 12                                              Padeh   
 13                                            Padilla   
 14                                            Padilla   
 15                                            Padilla   
 16           

In [6]:
pd.read_html(base_url)[1]

Unnamed: 0,Physician Last Name,Physician First Name,Physician Middle Name,License Number,License Type,Effective Date,Date Updated,Year of Birth
0,Paal,Adam,,,MD,10/30/2000,,1961.0
1,Pace,Enrico,,166026.0,MD,08/21/2001,,1956.0
2,Pace,Leonard,,172870.0,MD,01/15/2002,01/22/2002,1952.0
3,Pacetti,Stephen,J,175021.0,MD,04/14/2016,04/07/2016,1957.0
4,Pachas,Hector,M,95535.0,MD,02/11/1993,,
5,Pacheco,Denny,J.,258600.0,DO,08/27/2020,08/26/2020,1962.0
6,Pacik,Peter,,96944.0,MD,11/15/2012,11/09/2012,1940.0
7,Pacis,Andresito,B.,125213.0,MD,10/22/2021,10/22/2021,1938.0
8,Pack,A,Stephen,183669.0,MD,04/28/2000,07/19/2001,1956.0
9,Packianathan,Emmanuel,,203833.0,MD,07/31/2008,07/25/2008,1945.0


In [7]:
counter = 1 
last_page = 24 
main_df = [] 
error_pages = [] 

for page_num in range(1,last_page):
    scrape_url = f"{base_url}{page_num}"
    print(f"Scraping page {counter} of {last_page} pages...")
    counter = counter + 1
    try:
        main_df.append(pd.read_html(scrape_url)[1]) 
    except:
        error_pages.append(scrape_url)
        print(f"There is an error in this page: {link}")
    finally:
        snoozer = randrange(5,12)
        print(f"Snoozing for {snoozer} seconds before scraping the next page...")
        time.sleep(snoozer)

print("Scraping is finished!")

Scraping page 1 of 24 pages...
Snoozing for 8 seconds before scraping the next page...
Scraping page 2 of 24 pages...
Snoozing for 6 seconds before scraping the next page...
Scraping page 3 of 24 pages...
Snoozing for 6 seconds before scraping the next page...
Scraping page 4 of 24 pages...
Snoozing for 6 seconds before scraping the next page...
Scraping page 5 of 24 pages...
Snoozing for 7 seconds before scraping the next page...
Scraping page 6 of 24 pages...
Snoozing for 9 seconds before scraping the next page...
Scraping page 7 of 24 pages...
Snoozing for 5 seconds before scraping the next page...
Scraping page 8 of 24 pages...
Snoozing for 8 seconds before scraping the next page...
Scraping page 9 of 24 pages...
Snoozing for 9 seconds before scraping the next page...
Scraping page 10 of 24 pages...
Snoozing for 6 seconds before scraping the next page...
Scraping page 11 of 24 pages...
Snoozing for 8 seconds before scraping the next page...
Scraping page 12 of 24 pages...
Snoozing 

In [8]:
len(main_df)

23

In [9]:
len(error_pages)

0

In [10]:
md_p = pd.concat(main_df, ignore_index=True)
md_p

Unnamed: 0,Physician Last Name,Physician First Name,Physician Middle Name,License Number,License Type,Effective Date,Date Updated,Year of Birth
0,Paal,Adam,,,MD,10/30/2000,,1961.0
1,Pace,Enrico,,166026.0,MD,08/21/2001,,1956.0
2,Pace,Leonard,,172870.0,MD,01/15/2002,01/22/2002,1952.0
3,Pacetti,Stephen,J,175021.0,MD,04/14/2016,04/07/2016,1957.0
4,Pachas,Hector,M,95535.0,MD,02/11/1993,,
...,...,...,...,...,...,...,...,...
455,Psaila,Justin,Sciberras,83832.0,MD,07/26/2005,07/20/2005,1928.0
456,Pua,Florence,,197443.0,MD,09/26/2024,09/19/2024,1950.0
457,Puca,Christopher,,147733.0,MD,11/25/2009,11/18/2009,1952.0
458,Puccio,Steven,T.,229825.0,DO,07/01/2021,07/01/2021,1964.0


In [11]:
md_p.to_csv("md_p.csv", index=False, encoding="UTF-8")