## Install Libraries

In [None]:
!pip install selenium webdriver-manager beautifulsoup4 pandas openpyxl

## Import Libraries and Install Chromium Browser

In [None]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

# For Google Colab: refresh the apt package index, then install the Chromium
# browser and its ChromeDriver package (note: Chromium, not Google Chrome)
!apt-get update
!apt-get install -y chromium-browser
!apt-get install -y chromium-chromedriver

## Set Up the WebDriver for Colab

In [None]:
def setup_driver_colab():
    """Build a headless Chrome WebDriver configured for Google Colab.

    Returns:
        The ``webdriver.Chrome`` instance created from the options below.
    """
    options = Options()

    # Path of the browser binary the driver should launch
    options.binary_location = '/usr/bin/chromium-browser'

    launch_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--lang=id",  # Indonesian UI language
    )
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

## Extract Review Counts

In [None]:
# Review count wrapped in parentheses, e.g. "(1.234)" or "( 87 )".
_PAREN_COUNT_RE = re.compile(r'\(\s*(\d[\d.,]*)\s*\)')
# A one-decimal rating such as "4,5" or "4.5" that is not part of a longer number.
_RATING_RE = re.compile(r'(?<![\d.,])\d[.,]\d(?![\d.,])')
# A number with thousands groups ("1.234.567", "1,234") or a plain run of digits.
_COUNT_RE = re.compile(r'\d{1,3}(?:[.,]\d{3})+|\d+')


def _parse_review_count(review_text):
    """Return the review count found in ``review_text`` as an int (0 if none).

    A parenthesised number is preferred. Otherwise rating-like tokens (one
    digit, a separator, one digit) are discarded and the first remaining
    number is used. Thousands separators (``.`` or ``,``) are stripped, so
    "4,5(1.234)" -> 1234 and "(1.234.567)" -> 1234567.
    """
    if not review_text:
        return 0

    match = _PAREN_COUNT_RE.search(review_text)
    if match is not None:
        token = match.group(1)
    else:
        # Drop the rating first so its decimal digit is not mistaken for a count.
        match = _COUNT_RE.search(_RATING_RE.sub(' ', review_text))
        if match is None:
            return 0
        token = match.group(0)

    digits = re.sub(r'\D', '', token)
    return int(digits) if digits else 0


def _extract_place(item):
    """Build the ``{'name', 'reviews'}`` record for one parsed listing element."""
    place_data = {}

    try:
        name_element = item.find('div', class_='qBF1Pd')
        if name_element is not None:
            place_data['name'] = name_element.text.strip()
        else:
            place_data['name'] = "Name not found"
    except Exception as e:
        place_data['name'] = f"Error extracting name: {str(e)}"

    try:
        review_element = item.find('span', class_='UY7F9')
        if review_element is not None:
            place_data['reviews'] = _parse_review_count(review_element.text)
        else:
            place_data['reviews'] = 0  # No review element in this listing
    except Exception as e:
        # Keep the column numeric: report the problem instead of storing a
        # string that would break sorting and summing later.
        print(f"Error extracting reviews for {place_data['name']!r}: {str(e)}")
        place_data['reviews'] = 0

    return place_data


def extract_review_counts(url, max_scrolls=50, scroll_pause=5):
    """Extract place names and their review counts from Google Maps search results.

    Args:
        url: Search-results URL to open.
        max_scrolls: Number of times the results feed is scrolled to its bottom.
        scroll_pause: Seconds to sleep after each scroll so new content can load.

    Returns:
        A list of dicts with keys ``'name'`` (str) and ``'reviews'`` (int), one
        per ``div.Nv2PK`` element found in the page. An empty list is returned
        if any error occurs while loading or parsing the page (the error is
        printed). The driver is always quit before returning.
    """
    driver = setup_driver_colab()

    try:
        # Open the URL
        driver.get(url)
        print("Navigating to URL...")

        # Wait (up to 20 s) for the results feed; until() hands back the element
        wait = WebDriverWait(driver, 20)
        results_div = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='feed']"))
        )

        # Scroll the feed to its bottom repeatedly to load more results
        print("Scrolling to load more results...")
        for i in range(max_scrolls):
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", results_div)
            time.sleep(scroll_pause)  # Give time for new content to load
            print(f"Scroll {i+1}/{max_scrolls} completed")

        # Parse the page source as it stands after scrolling
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all('div', class_='Nv2PK')  # Main container for each listing

        print(f"Found {len(items)} places...")

        return [_extract_place(item) for item in items]

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []
    finally:
        driver.quit()

## Run Script

In [None]:
def main(url=None):
    """Scrape review counts, save them to CSV, and print a summary.

    Args:
        url: Google Maps search URL to scrape. When omitted, the built-in
            search URL below is used.

    The results are sorted by review count (descending) and written to
    ``google_maps_reviews.csv`` in the current directory. The top 10 rows and
    the total review count are printed. When ``google.colab`` can be imported,
    a browser download of the CSV is triggered; otherwise that step is skipped.
    """
    if url is None:
        url = "https://www.google.com/maps/search/air+terjun+di+bandung+raya/@-7.0019743,107.2272392,10z/data=!3m1!4b1?hl=id&entry=ttu&g_ep=EgoyMDI1MDYxMS4wIKXMDSoASAFQAw%3D%3D"

    print("Starting to extract review counts...")
    results = extract_review_counts(url)

    if not results:
        print("No results were found or an error occurred.")
        return

    df = pd.DataFrame(results)
    # Force a numeric column: any non-numeric entry becomes 0 so that sorting
    # and summing cannot fail on mixed str/int values.
    df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce').fillna(0).astype(int)
    df = df.sort_values(by='reviews', ascending=False, kind='stable')

    # Save the sorted table
    csv_filename = 'google_maps_reviews.csv'
    df.to_csv(csv_filename, index=False)

    print(f"\nResults saved to {csv_filename}")

    # Display the top 10 places by review count
    print("\nTop 10 Places by Review Count:")
    print(df.head(10).to_string(index=False))

    # Calculate total reviews
    total_reviews = df['reviews'].sum()
    print(f"\nTotal Reviews Across All Places: {total_reviews}")

    # Offer the file as a download only where the Colab helper exists; this
    # runs last so a missing module never hides the summary above.
    try:
        from google.colab import files
    except ImportError:
        print("google.colab is not available; skipping the browser download.")
    else:
        files.download(csv_filename)

# Run the script: executing this cell calls main() immediately
main()