# Distributed scraping - mutiprocessing

In [12]:
import time
import re
import multiprocessing as mp
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup


In [13]:
base_url = 'https://www.yelp.com/search?find_loc=Fremont,+CA&start=10'

In [14]:
def get_yelp_city_names():
    cities_page_url = domain_url + '/city'
    city_names = []

    city_page_html = urlopen(cities_page_url, timeout=1000).read().decode('utf-8')
    
    soup = BeautifulSoup(city_page_html, features='lxml')
    city_href_links = soup.find_all('a', {'href': re.compile('/city/.*?')})
    for link in city_href_links:
        city_names.append(link.get_text())

    return city_names

In [15]:
def get_biz_urls(city_name, page_number):
    base_url_template = 'https://www.yelp.com/search?find_loc=%s&start=%d'
    biz_urls = []

    target_city_url = base_url_template % (city_name.replace(' ', '+'), page_number)

    target_city_page_html = urlopen(target_city_url, 1).read().decode('utf-8')
    if not target_city_page_html:
        return []
    
    soup = BeautifulSoup(target_city_page_html, features='lxml')

    biz_href_elems = soup.find_all('a', {
        'class': 'biz-name',
        'href': re.compile('/biz/.*?')
    })

    for biz_href_elem in biz_href_elems:
        biz_urls.append(biz_href_elem['href'])

    return biz_urls

In [16]:
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)  # slightly delay for downloading
    return response.read().decode()

In [17]:
def parse(html):
    soup = BeautifulSoup(html, features='lxml')

    biz_name = soup.find('h1', {'class': 'biz-page-title'}).get_text().strip()
    biz_info['biz_name'] = biz_name

    review_count = soup.find('span', {'class': 'review-count rating-qualifier'}).get_text()
    review_count = int(review_count.strip().replace(' reviews', ''))
    biz_info['review_count'] = review_count

    tags_elems = soup.find('span', {'class': 'category-str-list'})\
               .find_all('a')
    tags = [tag_elem.get_text() for tag_elem in tags_elems]
    biz_info['tags'] = tags

    address_elem = soup.find('address')
    biz_info['address_street'] = address_elem.find('span', {'itemprop': 'streetAddress'}).get_text()
    biz_info['address_state'] = address_elem.find('span', {'itemprop': 'addressRegion'}).get_text()
    biz_info['address_postalcode'] = address_elem.find('span', {'itemprop': 'postalCode'}).get_text()


    biz_phone = soup.find('span', {'class': 'biz-phone'}).get_text().strip()
    biz_phone = biz_phone.replace('(', '')\
                         .replace(')', '')\
                         .replace('-', '')\
                         .replace(' ', '')
    biz_info['biz_phone'] = biz_phone

    biz_info['more_biz_info'] = {}
    more_biz_info_elems = soup.find('div', {'class': 'short-def-list'}).find_all('dl')
    for biz_info_elem in more_biz_info_elems:
        key = trim(biz_info_elem.find('dt').get_text())
        value = trim(biz_info_elem.find('dd').get_text())
        biz_info['more_biz_info'][key] = value

    return biz_info

In [18]:
def process(biz_info):
    print('[PROCESSED] ', biz_info['biz_name'])

## using multiprocessing to crawl/parse/process the pages

In [19]:
domain_url = 'https://www.yelp.com'
urls = []
for city_name in get_yelp_city_names():
    urls.extend(get_biz_urls(city_name, 1))
    
print('\n'.join(urls))

TypeError: message_body should be a bytes-like object or an iterable, got <class 'int'>