In [5]:
from playwright.sync_api import sync_playwright
from dataclasses import dataclass, asdict, field
import pandas as pd
import os

@dataclass
class Business:
    """Record describing a single business listing.

    Every field defaults to None, so an empty record can be created with
    ``Business()`` and filled in attribute by attribute. Instances are
    converted to dictionaries with ``dataclasses.asdict`` when exported.
    """
    # Display name of the business.
    name: str = None
    # Postal address text.
    address: str = None
    # Website text as displayed for the listing.
    website: str = None
    # Phone number, kept as text.
    phone_number: str = None
    # Price as a number.
    # NOTE(review): the scraping code in this file also assigns "" when no
    # price is found, so the value is not always a float.
    price: float = None
    # Number of reviews.
    reviews_count: int = None
    # Average review rating.
    reviews_average: float = None
    # Geographic coordinates; presumably filled from
    # extract_coordinates_from_url -- TODO confirm against the caller.
    latitude: float = None
    longitude: float = None

@dataclass
class BusinessList:
    """Collect Business records and export them to Excel and CSV files.

    Attributes:
        business_list: the Business objects gathered so far.
        save_at: directory that receives the exported files.
    """
    business_list: list[Business] = field(default_factory=list)
    # No annotation here, so @dataclass leaves this as a plain class attribute
    # (it is not a field and not an __init__ argument). Override it on the
    # class or on an instance to change the output directory.
    save_at = 'result'

    def dataframe(self):
        """Return the collected businesses as a pandas DataFrame.

        Each Business becomes one row; nested keys, if any, are flattened
        with "_" as the separator.
        """
        return pd.json_normalize(
            (asdict(business) for business in self.business_list), sep="_"
        )

    def _output_path(self, filename, extension):
        """Ensure the output directory exists and return the target file path.

        The path is built from ``self.save_at`` so that the directory that is
        created is always the directory that is written to.
        """
        # exist_ok=True avoids the check-then-create race of os.path.exists.
        os.makedirs(self.save_at, exist_ok=True)
        return os.path.join(self.save_at, f"{filename}.{extension}")

    def save_to_excel(self, filename):
        """Write the DataFrame to ``<save_at>/<filename>.xlsx`` without the index.

        Args:
            filename: base name of the output file, without extension.
        """
        self.dataframe().to_excel(self._output_path(filename, "xlsx"), index=False)

    def save_to_csv(self, filename):
        """Write the DataFrame to ``<save_at>/<filename>.csv`` without the index.

        Args:
            filename: base name of the output file, without extension.
        """
        self.dataframe().to_csv(self._output_path(filename, "csv"), index=False)

def extract_coordinates_from_url(url: str) -> tuple[float,float]:
    """Parse the latitude/longitude pair that follows the last '/@' in a URL.

    The text between the last '/@' and the next '/' is split on commas; the
    first two pieces are converted to floats and returned as
    (latitude, longitude). Any further pieces (e.g. a zoom level) are ignored.
    """
    # Everything after the final '/@' (the whole string when '/@' is absent).
    after_marker = url.rpartition('/@')[2]
    # Keep only the path segment that carries the comma-separated values.
    coordinate_segment = after_marker.partition('/')[0]
    pieces = coordinate_segment.split(',')
    latitude = float(pieces[0])
    longitude = float(pieces[1])
    return latitude, longitude

##########
# Input
##########

# Define search queries and total count here.
# Each entry of search_list is typed into the Google Maps search box and
# scraped in its own pass.
search_list = ['New York', 'Los Angeles']  # Example search queries
# Upper bound on listings collected per query: scrolling stops once at least
# this many results are loaded, and only the first `total` are kept.
total = 10  # Example total count

###########
# Scraping
###########
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()

    page.goto("https://www.google.com/maps", timeout=60000)
    page.wait_for_timeout(5000)
    
    for search_for_index, search_for in enumerate(search_list):
        print(f"-----\n{search_for_index} - {search_for}".strip())

        page.locator('//input[@id="searchboxinput"]').fill(search_for)
        page.wait_for_timeout(3000)
        page.keyboard.press("Enter")
        page.wait_for_timeout(5000)

        zoom_in_button = page.locator("#widget-zoom-in")
        for _ in range(7):
            zoom_in_button.click()
            page.wait_for_timeout(500)

        page.hover('//a[contains(@href, "https://www.google.com/maps/place")]')
        previously_counted = 0
        while True:
            page.mouse.wheel(0, 10000)
            page.wait_for_timeout(3000)

            if page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count() >= total:
                listings = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').all()[:total]
                listings = [listing.locator("xpath=..") for listing in listings]
                print(f"Total Scraped: {len(listings)}")
                break
            else:
                if page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count() == previously_counted:
                    listings = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').all()
                    print(f"Arrived at all available\nTotal Scraped: {len(listings)}")
                    break
                else:
                    previously_counted = page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count()
                    print(
                        f"Currently Scraped: ",
                        page.locator('//a[contains(@href, "https://www.google.com/maps/place")]').count(),
                    )

        business_list = BusinessList()
        for listing in listings:
            try:
                listing.click()
                page.wait_for_timeout(5000)

                name_xpath = '//div[contains(@class, "fontHeadlineSmall")]'
                address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
                website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
                phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'
                price_xpath = '//span[@class="fontTitleLarge Cbys4b"]'
                reviews_span_xpath = '//span[@role="img"]'

                business = Business()

                if listing.locator(name_xpath).count() > 0:
                    business.name = listing.locator(name_xpath).all()[0].inner_text()
                else:
                    business.name = ""
                if page.locator(address_xpath).count() > 0:
                    business.address = page.locator(address_xpath).all()[0].inner_text()
                else:
                    business.address = ""
                if page.locator(website_xpath).count() > 0:
                    business.website = page.locator(website_xpath).all()[0].inner_text()
                else:
                    business.website = ""
                if page.locator(phone_number_xpath).count() > 0:
                    business.phone_number = page.locator(phone_number_xpath).all()[0].inner_text()
                else:
                    business.phone_number = ""
                if page.locator(price_xpath).count() > 0:
                    price_text = page.locator(price_xpath).all()[0].inner_text()
                    price_text = price_text.replace("MAD\xa0", "").replace(",", "")
                    business.price = float(price_text)
                else:
                    business.price = ""
                if listing.locator(reviews_span_xpath).count() > 0:
                    business.reviews_average = float(
                        listing.locator(reviews_span_xpath).all()[0]
                        .get_attribute("aria-label")
                        .split()[0]
                        .replace(",", ".")
                        .strip()
                    )
                    business.reviews_count = int(
                        listing.locator(reviews_span_xpath).all()[0]
                        .


SyntaxError: incomplete input (3251982039.py, line 150)

In [2]:
pip install playwright


Collecting playwright
  Downloading playwright-1.43.0-py3-none-win_amd64.whl.metadata (3.5 kB)
Collecting greenlet==3.0.3 (from playwright)
  Downloading greenlet-3.0.3-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Collecting pyee==11.1.0 (from playwright)
  Downloading pyee-11.1.0-py3-none-any.whl.metadata (2.8 kB)
Downloading playwright-1.43.0-py3-none-win_amd64.whl (29.4 MB)
   ---------------------------------------- 0.0/29.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/29.4 MB 660.6 kB/s eta 0:00:45
   ---------------------------------------- 0.2/29.4 MB 2.6 MB/s eta 0:00:12
    --------------------------------------- 0.7/29.4 MB 5.2 MB/s eta 0:00:06
   -- ------------------------------------- 1.5/29.4 MB 8.6 MB/s eta 0:00:04
   -- ------------------------------------- 2.0/29.4 MB 10.8 MB/s eta 0:00:03
   ---- ----------------------------------- 3.1/29.4 MB 11.6 MB/s eta 0:00:03
   ---- ----------------------------------- 3.6/29.4 MB 11.4 MB/s eta 0:00:03
   ----