In [1]:
import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import re
import psycopg2
import pandas as pd

In [2]:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, validates

## Define DoctorRatings class for scraping `checkbook.org/surgeonratings`

In [3]:
class DoctorRatings:
    """Scrape one surgeon's detail page on checkbook.org/surgeonratings.

    The instance attributes set in ``__init__`` are the complete record for
    one doctor; callers build a database row straight from ``vars(instance)``,
    so no other instance attribute may be added to this class.
    """

    # Character the site uses to draw one confidence star.
    _STAR = '\u2605'
    # Placeholder the site shows when no confidence rating is available.
    _NO_RATING = '---'

    def __init__(self, provider_id, zipcode):
        """Store the provider id (NPI) and zip code; all scraped fields start as None."""
        self.npi = provider_id
        self.zipcode = zipcode
        self.name = None
        self.address1 = None
        self.address2 = None
        self.phone = None
        self.gender = None
        self.recommended_by_doctors = None
        self.confidence = None
        self.relative_volume = None
        self.hospitals = None
        self.board_certification = None
        self.university = None
        self.graduation_year = None
        self.internship_residency = None

    @staticmethod
    def _clean(text):
        """Strip surrounding whitespace and commas from a scraped text fragment."""
        # Whitespace is removed first so a comma followed by blanks is also dropped.
        return text.strip().strip(',').strip()

    @classmethod
    def _confidence_from_stars(cls, stars_text):
        """Translate the star string shown on the page into a confidence value.

        Returns '' for the '---' placeholder and the number of stars (1-5)
        for a run of star characters. Raises ValueError for anything else so
        that an unexpected page layout is treated as a failed scrape.
        """
        if stars_text == cls._NO_RATING:
            return ''
        star_count = len(stars_text)
        if 1 <= star_count <= 5 and set(stars_text) == {cls._STAR}:
            return star_count
        raise ValueError("unrecognised confidence rating: {0!r}".format(stars_text))

    def __create_query(self, zip_radius, procedure_id):
        """Construct URL to scrape checkbook.org/surgeonratings given provider_id and zipcode"""
        base_url = "http://www.checkbook.org/surgeonratings/default.cfm?action=details"
        # str() lets numeric NPIs / zips (e.g. integer DataFrame columns) be used as-is.
        params = ["npi=" + str(self.npi),
                  "myZip=" + str(self.zipcode)[:5],
                  "myZipRadius=" + str(zip_radius),
                  "ProcedureID=" + str(procedure_id)]
        return "&".join([base_url] + params)

    def __parse_pullleft(self, pullleft_list):
        """parse list returned by .pullleft selector"""
        self.address1 = pullleft_list[0].next.strip()
        self.address2 = pullleft_list[0].next.next.string.strip()
        self.phone = pullleft_list[1].string.strip()

    def __parse_td_alignCenter(self, td_alignCenter_list):
        """parse list returned by td.alignCenter selector"""
        self.gender = td_alignCenter_list[1].string.strip()

        # Stays '' unless the cell says "yes" and a leading count can be read.
        self.recommended_by_doctors = ''
        recommended_cell = td_alignCenter_list[2]
        if len(recommended_cell.contents) > 1:
            if recommended_cell.contents[1].string.strip().lower() == 'yes':
                m = re.match(r'\d+', recommended_cell.contents[3].next.strip())
                # A missing count no longer aborts the whole scrape.
                if m:
                    self.recommended_by_doctors = m.group(0)

        stars = td_alignCenter_list[3].next.next.string.strip()
        self.confidence = self._confidence_from_stars(stars)

        self.relative_volume = td_alignCenter_list[4].string.strip()
        self.university = self._clean(td_alignCenter_list[5].next.string)
        self.graduation_year = td_alignCenter_list[5].contents[1].string.strip()

        if td_alignCenter_list[6]:
            internships = [self._clean(item.string)
                           for item in td_alignCenter_list[6].select('div')]
            self.internship_residency = '; '.join(internships)

    def __parse_hospitals(self, alist):
        """parse list returned by .hospitals li selector"""
        # An empty selection yields '' because ''.join of nothing is ''.
        self.hospitals = '; '.join(item.contents[1].strip() for item in alist or [])

    def __parse_certificate(self, alist):
        """parse list returned by .certificate li selector"""
        self.board_certification = '; '.join(item.contents[0].strip() for item in alist or [])

    def scrape(self, zip_radius='500', procedure_id = 'P10', timeout=30):
        """Scrape doctor attributes from checkbook.org/surgeonratings

        Parameters
        ----------
        zip_radius : str
            Search radius passed to the site as ``myZipRadius``.
        procedure_id : str
            Procedure code passed to the site as ``ProcedureID``.
        timeout : float
            Seconds to wait for the HTTP response before giving up.

        Returns
        -------
        bool
            True when the page was fetched and every field parsed; False when
            the request failed or the page did not have the expected layout
            (attributes may then be only partially filled in).
        """
        href = self.__create_query(zip_radius, procedure_id)

        try:
            # Without a timeout a stalled connection would block the caller forever.
            r = requests.get(href, timeout=timeout)
            soup = BeautifulSoup(r.text, "html.parser")

            ## .oddColumn selector
            self.name = soup.select(".oddColumn")[0].contents[0].strip()

            ## .hospitals li selector
            self.__parse_hospitals(soup.select(".hospitals li"))

            ## .certificate li selector
            self.__parse_certificate(soup.select(".certificate li"))

            ## .pullleft selector
            self.__parse_pullleft(soup.select(".pullleft"))

            ## td.alignCenter selector
            self.__parse_td_alignCenter(soup.select("td.alignCenter"))
            return True
        except Exception:
            # Network and parse errors mean "doctor not found"; KeyboardInterrupt
            # and SystemExit deliberately propagate so a long run can be stopped.
            return False

## Declare a mapping for SQLAlchemy

In [4]:
## Declarative base class: ORM models that subclass it register their tables on Base.metadata
Base = declarative_base()

In [5]:
class Rating(Base):
    """ORM row of the ``doctor_ratings`` table, one per scraped doctor (keyed by NPI)."""
    __tablename__ = 'doctor_ratings'

    # Column order is kept as declared because it determines the table layout.
    npi = Column(String(10), primary_key=True)
    name = Column(String(100))
    address1 = Column(String(55))
    address2 = Column(String(70))
    board_certification = Column(String(70))
    confidence = Column(Integer)
    gender = Column(String(6))
    graduation_year = Column(String(4))
    hospitals = Column(String(500))
    internship_residency = Column(String(100))
    phone = Column(String(12))
    recommended_by_doctors = Column(String(3))
    relative_volume = Column(String(15))
    university = Column(String(256))
    zipcode = Column(String(20))

    @validates('name','address1','address2','board_certification','hospitals','internship_residency','university')
    def validate_code(self, key, value):
        """Truncate a free-text value to the declared length of its column.

        Empty or None values are passed through untouched.
        """
        if not value:
            return value
        limit = getattr(type(self), key).prop.columns[0].type.length
        if len(value) <= limit:
            return value
        return value[:limit]

## Read in npi providers from summary database

In [6]:
## connect to database
## (psycopg2 connection to `doctordb` as user `cathy`; used only for the exploratory read below)
con = psycopg2.connect("dbname='doctordb' user='cathy'")

In [7]:
## Read in orthopedic surgeon info from the `summary` table
## (restricted to provider_type 'Orthopedic Surgery' and country 'US')
q= """SELECT npi
            , nppes_provider_last_org_name
            , nppes_provider_first_name
            , nppes_provider_mi
            , nppes_provider_city
            , nppes_provider_zip
            , nppes_provider_state
        FROM summary 
        WHERE provider_type = 'Orthopedic Surgery'
        AND nppes_provider_country = 'US'"""
summary_df = pd.read_sql_query(q, con=con)
## (rows, columns) of the query result
summary_df.shape

(21300, 7)

In [8]:
## Preview the first three providers returned by the query
summary_df.head(3)

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_provider_city,nppes_provider_zip,nppes_provider_state
0,1003001785,JONES,JACLYN,C,CLEARWATER,337653305,FL
1,1003002890,HAMMAN,DANIEL,R,BRIGHTON,806014004,CO
2,1003010919,FREDERICKS,PETER,D,COLORADO SPRINGS,809207835,CO


In [9]:
## are all zips 9 digits long?
## mask holds 1 where the zip does not have exactly 9 characters, else 0
irregular_zip_mask = summary_df['nppes_provider_zip'].map(lambda zipcode: int(len(zipcode) != 9))
print('length: {0}'.format(irregular_zip_mask.shape[0]))
print('number of zips not 9 digits long: {0}'.format(irregular_zip_mask.sum()))

length: 21300
number of zips not 9 digits long: 2218


In [10]:
## Show a few of the providers whose zip is not 9 characters long
summary_df.loc[irregular_zip_mask==1, ['npi','nppes_provider_zip']].head(3)

Unnamed: 0,npi,nppes_provider_zip
17,1003036997,64057
26,1003070327,2114
34,1003112244,11201


In [11]:
## are all the irregular zips 5 digits long?
## (counts the irregular zips whose length is not 5; 0 means they all are)
summary_df.loc[irregular_zip_mask==1, 'nppes_provider_zip'].map(lambda zipcode: int(len(zipcode) != 5)).sum()

0

So we can safely take the first 5 digits of every zip.

In [12]:
## Close the psycopg2 connection; the cells below talk to the database through SQLAlchemy
if con:
    con.close()

## Create table in database

In [13]:
## SQLAlchemy engine for the same database (user `cathy`, database `doctordb`, host localhost)
engine = create_engine('postgresql://{0}@localhost/{1}'.format('cathy', 'doctordb'))

#print(database_exists(engine.url))

In [14]:
## create table
## (issues CREATE TABLE for every model registered on Base.metadata, i.e. doctor_ratings;
##  with SQLAlchemy's default settings, tables that already exist are skipped)
Base.metadata.create_all(engine)

## Create session

In [15]:
## Session factory bound to the engine; calling Session() yields a new ORM session
Session = sessionmaker(bind=engine)

In [16]:
## instantiate session
## (this single session is reused for every insert in the scraping loop)
session = Session()

## Scrape doctors

In [17]:
import time

In [18]:
## Scrape every provider in summary_df and store each doctor that is found.
## Each row is committed on its own, so an interrupted run keeps what was already saved.
count_notfound = 0
count_found = 0

toc = time.time()  # start timestamp (the timing cell reports tic - toc)
try:
    ## Iterate by position so the loop does not depend on summary_df having a 0..n-1 index.
    for i, (npi, zipcode) in enumerate(zip(summary_df['npi'], summary_df['nppes_provider_zip'])):
        if i % 300 == 0:
            ## progress report every 300 providers
            print(i)
            print("count not found: {0}".format(count_notfound))
            print("count found: {0}".format(count_found))

        doc = DoctorRatings(npi, zipcode)
        doc_scraped = doc.scrape()

        if doc_scraped:
            count_found += 1
            ## Add doctor info to table
            ## (the scraper's attribute names match the Rating columns one-to-one)
            arating = Rating(**vars(doc))

            try:
                session.add(arating)
                session.commit()
            except BaseException:
                ## undo the failed insert, then let the error stop the run
                session.rollback()
                raise
        else:
            count_notfound += 1
finally:
    ## release the session even when an insert fails or the run is interrupted
    session.close()

tic = time.time()  # end timestamp

0
count not found: 0
count found: 0
300
count not found: 250
count found: 50
600
count not found: 491
count found: 109
900
count not found: 748
count found: 152
1200
count not found: 987
count found: 213
1500
count not found: 1250
count found: 250
1800
count not found: 1496
count found: 304
2100
count not found: 1733
count found: 367
2400
count not found: 1981
count found: 419
2700
count not found: 2226
count found: 474
3000
count not found: 2484
count found: 516
3300
count not found: 2735
count found: 565
3600
count not found: 2986
count found: 614
3900
count not found: 3233
count found: 667
4200
count not found: 3470
count found: 730
4500
count not found: 3724
count found: 776
4800
count not found: 3970
count found: 830
5100
count not found: 4206
count found: 894
5400
count not found: 4465
count found: 935
5700
count not found: 4706
count found: 994
6000
count not found: 4965
count found: 1035
6300
count not found: 5209
count found: 1091
6600
count not found: 5450
count found: 1150
6

In [19]:
## `toc` was taken before the scraping loop and `tic` after it, so tic - toc is the elapsed time in seconds
print('scraped and added to database in {0}s'.format(tic-toc))

scraped and added to database in 12149.23812007904s


In [21]:
## Release the engine's pooled database connections now that scraping is finished
engine.dispose()