In [None]:
# import packages

import concurrent.futures
import csv
import random
import re
import threading
import time

import cloudscraper
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from IPython.display import clear_output

In [None]:
# Base URL of the Yellow Pages PH site. The relative link of each business
# found on a search page is appended to it to build that business's page URL.
soloPageUrl = 'https://www.yellow-pages.ph'

In [None]:
# optional
# Matches one of , . ; : ! ? ) that runs straight into ordinary text.
#   * the trailing lookahead does not consume the next character and refuses
#     another punctuation mark, so a run such as "?!" or "..." is kept together
#     and only its last mark receives the space;
#   * the leading negative lookahead skips a period sitting between two digits
#     so decimal numbers ("4.5") are left intact.
_MISSING_SPACE_RE = re.compile(r'(?!(?<=\d)\.\d)([,.;:!?)])(?=[^\s,.;:!?)])')


def add_spaces_after_punctuation(text):
    """Insert a space after punctuation that is glued to the following text.

    Useful for cleaning scraped text where sentences were joined without a
    separator, e.g. ``"Nice place.Will return"``.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with one space inserted after each ``, . ; : ! ? )``
        (or after the last mark of a run of them) that is immediately followed
        by a non-whitespace, non-punctuation character. Punctuation that is
        already followed by whitespace, punctuation at the end of the string,
        and periods between two digits are left unchanged.
    """
    return _MISSING_SPACE_RE.sub(r'\1 ', text)

In [None]:
# Headers sent with every business-page request to avoid being blocked.
_DETAIL_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.google.com/',
    'DNT': '1',
    'Connection': 'keep-alive',
}

# Seconds to wait for a business page. Without a timeout a stalled connection
# would block its worker thread (and therefore the whole batch) indefinitely.
_DETAIL_TIMEOUT = 30

# scrape_business is submitted to a thread pool and every worker writes to the
# same module-level CSV writer; the lock keeps rows from interleaving.
_CSV_WRITE_LOCK = threading.Lock()


def _fetch_detail_containers(detail_path):
    """Download a business page and return its ``div.yp-container`` elements.

    Returns an empty list when the request fails, so the caller can still
    record the data it already has from the search page.
    """
    page_url = soloPageUrl + detail_path

    # Random delay *before* the request so concurrent workers do not all hit
    # the site at the same instant.
    time.sleep(random.uniform(1, 5))

    scraper = cloudscraper.create_scraper()  # one Cloudscraper instance per call
    try:
        html = scraper.get(
            page_url, headers=_DETAIL_HEADERS, timeout=_DETAIL_TIMEOUT).text
    except requests.RequestException as exc:
        print(f" Could not load {page_url}: {exc}")
        return []
    return BeautifulSoup(html, 'lxml').find_all('div', class_='yp-container')


def scrape_business(business):
    """Scrape one search-result listing and append it to the CSV as one row.

    The name, address and phone number are read from the search-result
    element itself; the review text and the average rating are read from the
    business's own page, which is fetched here.

    Args:
        business: The BeautifulSoup element of one ``div.search-listing``
            taken from a search-result page.

    Returns:
        None. A row ``[name, address, phone, rating, review]`` is written via
        the module-level ``writer``, which must exist when this is called.
        Any field that cannot be found is written as an empty string.
    """
    # Relative link to the business page; tolerate listings without one.
    try:
        detail_path = business.div.h2.a['href']
    except (AttributeError, TypeError, KeyError):
        detail_path = None

    containers = _fetch_detail_containers(detail_path) if detail_path else []

    # First non-empty review, scanning containers in page order. Taking only
    # the first container's entry would lose the review whenever that
    # container happens not to hold it.
    review = ''
    for container in containers:
        review_tag = container.find('div', class_='yp-see-morex text-break')
        if review_tag is not None:
            review = review_tag.text.replace('\n', '')
            if review:
                break

    # Average rating: the last container is checked first (where it was
    # originally looked up), then the earlier ones. An empty container list
    # simply leaves the rating blank.
    star_average = ''
    for container in reversed(containers):
        rating_tag = container.find('div', class_='rating-num')
        if rating_tag is not None:
            star_average = rating_tag.text.replace('\n', '')
            break

    # Extra pause per business, kept to preserve the scraper's overall pacing.
    time.sleep(random.uniform(3, 5))

    # Fields available directly on the search-result element.
    name_tag = business.find('h2', class_='search-tradename')
    name = name_tag.text.replace('\n', '') if name_tag is not None else ''

    address_tag = business.find('span', class_='ellipsis')
    address = address_tag.text if address_tag is not None else ''

    mobile = business.find(
        'a', class_='btn btn-yp-default mr-2 biz-btn-call yp-click')
    mobileNum = mobile.get('data-phone', '') if mobile is not None else ''

    with _CSV_WRITE_LOCK:
        writer.writerow([name, address, mobileNum, star_average, review])

# Last search-result page to scrape. Every step is one whole page of results.
steps = 90

# Page to start from, asked from the user so that a run interrupted by a block
# can be resumed where it stopped. Non-numeric input raises ValueError.
start_page = int(input("Enter start value: "))

# add headers to the request to avoid being blocked
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.google.com/',
    'DNT': '1',
    'Connection': 'keep-alive',
}

# Opens broker.csv to append newly scraped rows. The encoding is explicit so
# that non-ASCII names/addresses cannot fail on platforms whose default
# encoding is not UTF-8.
with open("broker.csv", "a", newline="", encoding="utf-8") as file:
    # `writer` must stay a module-level name: scrape_business writes through it.
    writer = csv.writer(file)

    # Each iteration scrapes one search-result page; i is the page number
    # placed in the URL.
    for i in range(start_page, steps+1):
        htmlText = f'https://www.yellow-pages.ph/category/residential-properties/page-{i}'
        scraper = cloudscraper.create_scraper()

        # Any failure to load the page, or a page without listings, is treated
        # as "IP blocked": report the page reached and stop.
        blocked_reason = None
        try:
            getHtml = scraper.get(htmlText, headers=headers, timeout=30).text
            soup = BeautifulSoup(getHtml, 'lxml')
            # list of businesses on this page
            businesses = soup.find_all('div', class_='search-listing')
        except Exception as e:
            blocked_reason = e
        else:
            if not businesses:
                blocked_reason = "Refresh IP, change start value, and try again."

        if blocked_reason is not None:
            clear_output()
            # prints the blocked IP warning and the final step
            print(f"Blocked IP and the final step was {i}. {blocked_reason}")
            break

        # prints the URL and the current step being scraped to check if the scraping is still running
        print(" ========================================================================")
        print(" URL: ", htmlText)
        print(f" Step {i} scraping...")

        # Scrape each business in parallel; leaving the `with` block waits for
        # every worker to finish.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = [executor.submit(scrape_business, business)
                       for business in businesses]
            # add random delay between requests
            time.sleep(random.uniform(2, 5))

        # An exception raised inside a worker is stored on its future and is
        # otherwise never seen; report it so missing rows are not silent.
        for future in results:
            error = future.exception()
            if error is not None:
                print(f" Warning: a business on page {i} was not saved: {error!r}")

        # Push this page's rows to disk so an interrupted run keeps them.
        file.flush()

        time.sleep(5)
        # prints the finished batch and the name of the businesses scraped
        print(f" Step {i} finished successfully.")
        print(" Displaying results...")
        print(" ========================================================================")
        print(" Page", i, "& step", steps)
        print(" Names of businesses: ")
        for business in businesses:
            name_tag = business.find('h2', class_='search-tradename')
            name = name_tag.text.replace('\n', '') if name_tag is not None else ''
            print(" ", name)
        print(" ========================================================================")
        time.sleep(random.uniform(5, 8))
        clear_output()
