# Distributed scraping - mutiprocessing

In [None]:
import time
import re
import multiprocessing as mp
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup


In [None]:
domain_url = 'https://www.yelp.com'
base_url = 'https://www.yelp.com/search?find_loc=Fremont,+CA&start=10'

In [None]:
def get_yelp_city_names():
    cities_page_url = domain_url + '/city'
    city_names = []

    city_page_html = urlopen(cities_page_url).read().decode('utf-8')
    
    soup = BeautifulSoup(city_page_html, features='lxml')
    city_href_links = soup.find_all('a', {'href': re.compile('/city/.*?')})
    for link in city_href_links:
        city_names.append(link.get_text())

    return city_names

In [None]:
get_yelp_city_names()

In [55]:
def get_biz_urls(city_name, page_number):
    base_url_template = 'https://www.yelp.com/search?find_loc=%s&start=%d'
    biz_urls = []

    target_city_url = base_url_template % (city_name.replace(' ', '+'), page_number)
#     print(target_city_url)

    target_city_page_html = urlopen(target_city_url).read().decode('utf-8')
    if not target_city_page_html:
        return []
    
    soup = BeautifulSoup(target_city_page_html, features='lxml')

    biz_href_elems = soup.find_all('a', {
        'class': 'biz-name',
        'href': re.compile('/biz/.*?')
    })

    for biz_href_elem in biz_href_elems:
        biz_urls.append(biz_href_elem['href'])

    return biz_urls

In [None]:
get_biz_urls('Madison', 1)
# target_city_page_html = urlopen('https://www.yelp.com/search?find_loc=Madison&start=1').read().decode('utf-8')
# target_city_page_html

In [None]:
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)  # slightly delay for downloading
    return response.read().decode()

In [64]:
def parse(html):
    biz_info = {}
    
    soup = BeautifulSoup(html, features='lxml')

    biz_name = soup.find('h1', {'class': 'biz-page-title'}).get_text().strip()
    biz_info['biz_name'] = biz_name

    review_count = soup.find('span', {'class': 'review-count rating-qualifier'}).get_text()
    review_count = int(review_count.strip().replace(' reviews', ''))
    biz_info['review_count'] = review_count

    tags_elems = soup.find('span', {'class': 'category-str-list'})\
               .find_all('a')
    tags = [tag_elem.get_text() for tag_elem in tags_elems]
    biz_info['tags'] = tags

    address_elem = soup.find('address')
    biz_info['address_street'] = address_elem.find('span', {'itemprop': 'streetAddress'}).get_text()
    biz_info['address_state'] = address_elem.find('span', {'itemprop': 'addressRegion'}).get_text()
    biz_info['address_postalcode'] = address_elem.find('span', {'itemprop': 'postalCode'}).get_text()


    biz_phone = soup.find('span', {'class': 'biz-phone'}).get_text().strip()
    biz_phone = biz_phone.replace('(', '')\
                         .replace(')', '')\
                         .replace('-', '')\
                         .replace(' ', '')
    biz_info['biz_phone'] = biz_phone

    biz_info['more_biz_info'] = {}
    more_biz_info_elems = soup.find('div', {'class': 'short-def-list'}).find_all('dl')
    for biz_info_elem in more_biz_info_elems:
        key = trim(biz_info_elem.find('dt').get_text())
        value = trim(biz_info_elem.find('dd').get_text())
        biz_info['more_biz_info'][key] = value

    return biz_info

def trim(string):
    if not string:
        return None
    
    string = string.strip()\
                   .replace('\r', '')\
                   .replace('\r', '')
    
    return string

In [61]:
def process(biz_info):
    print('[PROCESSED] ', biz_info['biz_name'])

In [62]:
urls = set()
urls_size = 10
for city_name in get_yelp_city_names():
    urls = urls.union(get_biz_urls(city_name, 1))
    if len(urls) > urls_size:
        break
    
print('\n'.join(urls))

/biz/babalu-tapas-and-tacos-birmingham-3
/biz/urban-standard-birmingham
/biz/below-the-radar-huntsville
/biz/chez-fonfon-birmingham
/biz/saws-soul-kitchen-birmingham-3
/biz/connors-steak-and-seafood-huntsville
/biz/dolce-pan-bakery-huntsville
/biz/trattoria-centrale-birmingham-2
/biz/taqueria-el-cazador-huntsville-6
/biz/toybox-bistro-huntsville
/biz/hattie-bs-hot-chicken-bham-birmingham
/biz/bamboo-on-2nd-birmingham
/biz/the-poboy-factory-huntsville
/biz/viet-huong-vietnamese-restaurant-huntsville
/biz/carrigans-public-house-birmingham
/biz/yo-mamas-birmingham-2
/biz/betty-maes-restaurant-huntsville
/biz/highlands-bar-and-grill-birmingham
/biz/alchemy-lounge-huntsville-3
/biz/cotton-row-restaurant-huntsville


### using multiprocessing to crawl the pages

In [71]:
pool = mp.Pool(5)

urls = list(urls)
while urls:
    print('Distributed crawling...')
    
    crawl_jobs = []
    for url in urls:
        crawl_jobs.append(pool.apply_async(crawl, args=(domain_url + url, )))
        urls.pop()
    htmls = [j.get() for j in crawl_jobs]
    
    parse_jobs = []
    for html in htmls:
        parse_jobs.append(pool.apply_async(parse, args=(html, )))
    biz_infos = [j.get() for j in parse_jobs]
    
    process_jobs = [pool.apply_async(process, args=(biz_info,)) for biz_info in biz_infos]
    [j.get() for j in process_jobs]
    
    print('[Done]')


Distributed crawling...
[PROCESSED]  Hattie B’s Hot Chicken - Bham
[PROCESSED]  Bamboo on 2nd
[PROCESSED]  Toybox Bistro
[PROCESSED]  Taqueria El Cazador
[PROCESSED]  Trattoria Centrale
[PROCESSED]  The Po’Boy Factory
[PROCESSED]  Viet Huong Vietnamese Restaurant
[Done]
Distributed crawling...
[PROCESSED]  Taqueria El Cazador
[PROCESSED]  Toybox Bistro
[PROCESSED]  Trattoria Centrale
[Done]
Distributed crawling...
[PROCESSED]  Trattoria Centrale
[PROCESSED]  Taqueria El Cazador
[Done]
Distributed crawling...
[PROCESSED]  Trattoria Centrale
[Done]
