## Scaping Psychology Today for Mental Health Professionals in NYC

### Load libraries

In [1]:
print "importing libraries..."
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import lxml
import pandas as pd
from bs4 import BeautifulSoup

importing libraries...


## Lets get scraping!

In [4]:
# define target url
#nj https://therapists.psychologytoday.com/rms/prof_results.php?search=edgewater,%20new%20jersey
def scrape_me(url):
    url = 'https://therapists.psychologytoday.com/rms/county/NY/New+York.html'
    driver = webdriver.Chrome()  #select selenium web driver
    docs = []                   #generating empty list
    while True:
        time.sleep(1)           #added a 1 second sleep to limit bot detection
        driver.get(url)         #open the url in selenium

        soup = BeautifulSoup(driver.page_source,'lxml') #grab the content with beautifulsoup for parsing
        main_table = soup.findAll('div',{'class':'no-min-height'})[0]  # select the desired html node
        docs_per_page = len(main_table.contents[2].contents[5].contents[1].findAll('div',{'class':'row'},recursive=False)) # outputs number of html node chldren - aka doctors per web page
        docs += [main_table.contents[2].contents[5].contents[1].contents[i].contents[1].contents[1].attrs for i in range(1,docs_per_page*2,2)] #building a list of dictionaries - filled with doctor info
        next_page_txt = main_table.contents[2].contents[7].contents[1].contents[3].contents[1].find_all('a',href=True)[~0].text #next page button should have text == 'Next' except for final page
        next_page_url = main_table.contents[2].contents[7].contents[1].contents[3].contents[1].find_all('a',href=True)[~0]['href'] #outputs url for next page of content
        if next_page_txt == 'Next':  #if a 'next page' exists...
            url = next_page_url       # set url = next page url to prepare for next page of scraping
        else:
            break                    #scraping complete!
            print "Scraping Complete!"

### clean up scrapped data and save it to csv

In [25]:
doc_df = pd.DataFrame(docs).drop(['class','data-new-clients','style'],axis=1).dropna(axis=0) # turn list of dicts to data frame, drop unimportant columns, remove na values
docs_1 = doc_df[['data-phone','data-prof-name','data-profid']] #select columns phone #, name, id # 
docs_1.columns = ['Phone Number', 'Physician Name', 'Psychology Today id'] #give columns more appropriate names
docs_l = docs_1[docs_1['Phone Number'] != ''].sort_values(by='Physician Name', ascending=True).reset_index().drop(['index'],axis=1) # filter out empty phone # individuals, sorts data by Name, resets out of order index

docs_l.to_csv('NYC-mental-health-professionals-2017.csv',encoding='utf8')  #save output to .csv file

In [2]:
# define target url
nj=  'https://therapists.psychologytoday.com/rms/prof_results.php?search=edgewater,%20new%20jersey'
#url = 'https://therapists.psychologytoday.com/rms/county/NY/New+York.html'

def scrape_me(url):
    driver = webdriver.Chrome()  #select selenium web driver
    docs = []                   #generating empty list
    while True:
        time.sleep(1)           #added a 1 second sleep to limit bot detection
        try:
            driver.get(url)         #open the url in selenium
        except:
            print ('bad url!')
        soup = BeautifulSoup(driver.page_source,'html5lib') #grab the content with beautifulsoup for parsing
        main_table = soup.findAll('div',{'class':'no-min-height'})[0]  # select the desired html node
        docs_per_page = len(main_table.contents[2].contents[5].contents[1].findAll('div',{'class':'row'},recursive=False)) # outputs number of html node chldren - aka doctors per web page
        docs += [main_table.contents[2].contents[5].contents[1].contents[i].contents[1].contents[1].attrs for i in range(1,docs_per_page*2,2)] #building a list of dictionaries - filled with doctor info
        next_page_txt = main_table.contents[2].contents[7].contents[1].contents[3].contents[1].find_all('a',href=True)[~0].text #next page button should have text == 'Next' except for final page
        next_page_url = main_table.contents[2].contents[7].contents[1].contents[3].contents[1].find_all('a',href=True)[~0]['href'] #outputs url for next page of content
        if next_page_txt == 'Next':  #if a 'next page' exists...
            url = next_page_url       # set url = next page url to prepare for next page of scraping
        else:
            break                    #scraping complete!
            print "Scraping Complete!"
    return docs

In [3]:
njdocs = scrape_me('https://therapists.psychologytoday.com/rms/prof_results.php?sid=1500768258.3737_28793&city=New+York&state=NY&rec_next=21')

## Now that the base info is scrapped, lets dive deeper!
### the next function scrapes additonal info from the detailed web page <br> because the base info omitts: full address info, professional designations, a summary from the Medical Health Professional, and along with a detailed list of specialties

In [107]:
## note, currently, we need to pass through a 'detailed' link to the function below
def scrape_detailed(url):
    First_Page = True
    driver = webdriver.Chrome()  #select selenium web driver
    docs = []                   #generating empty list
    while True:
        doc = {}
        time.sleep(.5)           #added a 1 second sleep to limit bot detection
        try:
            driver.get(url)         #open the url in selenium
        except:
            print ('bad url!')
        soup = BeautifulSoup(driver.page_source,'lxml') #grab the content with beautifulsoup for parsing
        main_table = soup.findAll('div',{'class':'no-min-height'})[0]  # select the desired html node
        doc = main_table.contents[3].attrs  #doc dict      # for i in range(1,docs_per_page*2,2)] #building a list of dictionaries - filled with doctor info

        ### grabbing professional designations aka certifications
        try:
            cert_suffix = main_table.contents[3].contents[1].contents[1].contents[3].text.replace('\n','').split(',')
            cert_suffix_list = [cert_suffix[i].strip() for i in range(len(cert_suffix))]
            doc['Certifications'] = cert_suffix_list #adding to dict
        except:
            print 'no certification found! probably end of links...'
            break
        ## finds number of locations
        address_quantity = len(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False))
        
        ## need the try/except statment since some individuals omit a phone number!
        try:
            phone_nums = [main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False)[i].contents[1].findAll('a',href=True)[0].text for i in range(address_quantity)]
            doc['Phone_Numbers'] = phone_nums
        except:
            print 'bad or missing phone numbers!'
            
        ## separating addresses by component, list up to 3 locations    
        address_sizes = [len(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False)[i].contents[1].findAll('span')) for i in range(address_quantity)]
        addybase = main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False)
        for addynum in range(address_quantity):
            if addynum == 0:
                for addysize in (range(address_sizes[addynum])):
                    if addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'streetAddress':
                        doc['Street_Addresses_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressLocality':
                        doc['Cities_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressRegrion':
                        doc['States_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'postalcode':
                        doc['Zip_Codes_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
            elif addynum == 1:
                for addysize in (range(address_sizes[addynum])):
                    if addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'streetAddress':
                        doc['Street_Addresses_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressLocality':
                        doc['Cities_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressRegrion':
                        doc['States_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'postalcode':
                        doc['Zip_Codes_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
            elif addynum == 2:
                for addysize in (range(address_sizes[addynum])):
                    if addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'streetAddress':
                        doc['Street_Addresses_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressLocality':
                        doc['Cities_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressRegrion':
                        doc['States_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                    elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'postalcode':
                        doc['Zip_Codes_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
            else:
                print 'Either this doc has more than 3 locations or something has gone wrong!'
                break
                    
        phys_stmt = main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[3].text.replace('\n',' ').strip().replace('                        ','').strip()

        doc['Physician_Statement'] = phys_stmt
        
        ## need the try/except statment since some individuals omit Specialties!
        try:
            Specialties = pd.Series(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].text.strip().replace('\n','').split('  ')).unique()[2:]
            #pd.Series(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].text.strip().replace('\n','').split('  ')).unique().tolist()        
            num_subcat = len(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].findAll('h3', recursive=False))
            subcats = [main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].findAll('h3', recursive=False)[i].text for i in range(num_subcat)]
            #Specialties = list(set(Specialties) - set(subcats))
            doc['Specialties'] = Specialties
            doc['Specialties_Subcategories'] = subcats
        except:
            print 'Specialties missing!'
        
        docs.append(doc)
        try:
            if First_Page == True:
                next_page_text = main_table.contents[1].contents[1].contents[1].contents[1].contents[3].findAll('a',href=True)[0].text
                next_page_url = main_table.contents[1].contents[1].contents[1].contents[1].contents[3].findAll('a',href=True)[0]['href']
            else:
                next_page_text = main_table.contents[1].contents[1].contents[1].contents[1].contents[3].findAll('a',href=True)[1].text
                next_page_url = main_table.contents[1].contents[1].contents[1].contents[1].contents[3].findAll('a',href=True)[1]['href']
            ############ first pass! 
            # Since the first page swaps the node location of the prev and next buttons, i needed this first pass flag
            First_Page = False
        except:
            print 'bad page!... or final page!'
            break
        
        
        if next_page_text == 'Next':  #if a 'next page' exists...
            url = next_page_url       # set url = next page url to prepare for next page of scraping
        else:
            break                    #scraping complete!
            print "Scraping Complete!"
    return docs

In [1]:
#detailed_docs = scrape_detailed('https://therapists.psychologytoday.com/rms/prof_detail.php?profid=311935&ref=1&sid=1500738331.5371_31603&city=Paramus&county=Bergen&state=NJ&tr=ResultsName')
edgewater_psych = scrape_detailed('https://therapists.psychologytoday.com/rms/prof_detail.php?profid=301543&search=07020&ref=1&sid=1500713290.2274_8813&zipcode=07020&tr=ResultsRow')
#nyc_deet = scrape_detailed('https://therapists.psychologytoday.com/rms/prof_detail.php?profid=61162&search=new+york&ref=1&sid=1500710514.4295_8813&city=New+York&state=NY&tr=ResultsName')

NameError: name 'scrape_detailed' is not defined

In [11]:
pd.DataFrame(detailed_docs)

NameError: name 'detailed_docs' is not defined

In [None]:
## adjust previous function, input list of urls
def scrape_detailed(urls):
    First_Page = True
    driver = webdriver.Chrome()  #select selenium web driver
    docs = []                   #generating empty list
    for url in urls:
        try:
            doc = {}
            time.sleep(.5)           #added a 1 second sleep to limit bot detection
            try:
                driver.get(url)         #open the url in selenium
            except:
                print ('bad url!')
            soup = BeautifulSoup(driver.page_source,'lxml') #grab the content with beautifulsoup for parsing
            main_table = soup.findAll('div',{'class':'no-min-height'})[0]  # select the desired html node
            doc = main_table.contents[3].attrs  #doc dict      # for i in range(1,docs_per_page*2,2)] #building a list of dictionaries - filled with doctor info

            ### grabbing professional designations aka certifications
            try:
                cert_suffix = main_table.contents[3].contents[1].contents[1].contents[3].text.replace('\n','').split(',')
                cert_suffix_list = [cert_suffix[i].strip() for i in range(len(cert_suffix))]
                doc['Certifications'] = cert_suffix_list #adding to dict
            except:
                print 'no certification found! probably end of links...'
                break
            ## finds number of locations
            address_quantity = len(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False))

            ## need the try/except statment since some individuals omit a phone number!
            try:
                phone_nums = [main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False)[i].contents[1].findAll('a',href=True)[0].text for i in range(address_quantity)]
                doc['Phone_Numbers'] = phone_nums
            except:
                print 'bad or missing phone numbers!'

            ## separating addresses by component, list up to 3 locations    
            address_sizes = [len(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False)[i].contents[1].findAll('span')) for i in range(address_quantity)]
            addybase = main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[1].contents[1].findAll('div',recursive=False)
            for addynum in range(address_quantity):
                if addynum == 0:
                    for addysize in (range(address_sizes[addynum])):
                        if addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'streetAddress':
                            doc['Street_Addresses_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressLocality':
                            doc['Cities_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressRegrion':
                            doc['States_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'postalcode':
                            doc['Zip_Codes_1'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                elif addynum == 1:
                    for addysize in (range(address_sizes[addynum])):
                        if addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'streetAddress':
                            doc['Street_Addresses_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressLocality':
                            doc['Cities_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressRegrion':
                            doc['States_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'postalcode':
                            doc['Zip_Codes_2'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                elif addynum == 2:
                    for addysize in (range(address_sizes[addynum])):
                        if addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'streetAddress':
                            doc['Street_Addresses_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressLocality':
                            doc['Cities_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'addressRegrion':
                            doc['States_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                        elif addybase[addynum].contents[1].findAll('span')[addysize].attrs['itemprop'] == 'postalcode':
                            doc['Zip_Codes_3'] = addybase[addynum].contents[1].findAll('span')[addysize].text
                else:
                    print 'Either this doc has more than 3 locations or something has gone wrong!'
                    break

            phys_stmt = main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[1].contents[3].text.replace('\n',' ').strip().replace('                        ','').strip()

            doc['Physician_Statement'] = phys_stmt

            ## need the try/except statment since some individuals omit Specialties!
            try:
                Specialties = pd.Series(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].text.strip().replace('\n','').split('  ')).unique()[2:]
                #pd.Series(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].text.strip().replace('\n','').split('  ')).unique().tolist()        
                num_subcat = len(main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].findAll('h3', recursive=False))
                subcats = [main_table.contents[3].contents[3].contents[9].contents[1].contents[1].contents[3].contents[5].findAll('h3', recursive=False)[i].text for i in range(num_subcat)]
                #Specialties = list(set(Specialties) - set(subcats))
                doc['Specialties'] = Specialties
                doc['Specialties_Subcategories'] = subcats
            except:
                print 'Specialties missing!'

            docs.append(doc)
        except:
            print ("something went wrong for %s" %(df['Physician Name'].astype(str)))
    return docs

In [4]:
df = pd.read_csv('NYC-mental-health-professionals-2017.csv')

In [46]:
urls = list('https://therapists.psychologytoday.com/rms/prof_detail.php?profid=' + df['Psychology Today id'].astype(str))


['https://therapists.psychologytoday.com/rms/prof_detail.php?profid=60941',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=124085',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=98767',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=42282',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=221659',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=138736',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=22158',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=237364',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=191647',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=223748',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=333686',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=129389',
 'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=269687',
 'h

In [40]:
urls =['https://therapists.psychologytoday.com/rms/prof_detail.php?profid=' + df['Psychology Today id'].astype(str).tolist()[i] for i in df['Psychology Today id'].astype(str).shape[0]]

TypeError: 'int' object is not iterable

In [38]:
df['Psychology Today id'].shape[0]

5390

In [47]:
urls[0]

'https://therapists.psychologytoday.com/rms/prof_detail.php?profid=60941'