<a href="https://colab.research.google.com/github/dani-fadli/google-maps-reviews-scraper/blob/main/Scrape_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Step 1: Install Chromium and its chromedriver in the Colab VM (system packages via apt)
!apt-get update
!apt-get install -y chromium-chromedriver
# Copy the driver binary into /usr/bin — presumably so Selenium can find it on PATH (TODO confirm)
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

# Step 2: Install Python dependencies
!pip install selenium pandas --quiet

# Step 3: Build the Chrome options used by the scraper.
# NOTE: `chrome_options` is a module-level object; `scrape_google_maps_reviews`
# (defined in a later cell) reads it as a global, so this cell must be run first.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")  # run the browser without a visible window
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Point Selenium at the Chromium binary path expected from the apt install above
chrome_options.binary_location = "/usr/bin/chromium-browser"

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

In [37]:
# Google Colab Notebook
# Title: Scraping Google Maps Reviews for Sentiment Analysis Thesis
# Author: dani-fadli
# Description: Scrapes reviews and ratings from Google Maps for a list of waterfall tourism spots in Bandung Raya.
# Output: CSV file per place, named with place and timestamp, containing 'rating' and 'review' columns.

# --- Install Requirements ---
!pip install selenium webdriver-manager pandas --quiet

# --- Imports ---
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import re

# --- Helper Functions ---
def get_place_name_from_url(url):
    """Derive a filename stem from a Google Maps place URL.

    The path segment that follows ``/place/`` is taken as the place name and
    every ``+`` in it is replaced by ``_``. Percent-encoded sequences such as
    ``%26`` are left untouched.

    Args:
        url: A Google Maps URL, e.g.
            ``https://www.google.com/maps/place/Air+Terjun+X/...``.

    Returns:
        The extracted name (``Air_Terjun_X`` for the example above), or
        ``'unknown_place'`` when the URL has no ``/place/<name>`` segment.
    """
    found = re.search(r'/place/([^/]+)', url)
    if found is None:
        return 'unknown_place'

    raw_segment = found.group(1)
    return '_'.join(raw_segment.split('+'))

def _open_reviews_panel(driver):
    """Open the reviews tab (best effort) and return its scrollable container."""
    # The selectors target the Indonesian UI ("Ulasan" = reviews; the sample
    # URL uses hl=id). Clicking is best effort: the page may already be
    # showing the reviews list, so a missing button is not an error.
    try:
        all_reviews_button = driver.find_element(By.XPATH, '//button[contains(@aria-label, "Ulasan")]')
        all_reviews_button.click()
        time.sleep(3)
    except Exception:
        pass

    # Wait for reviews container to load
    time.sleep(3)

    # Prefer the role-based selector; fall back to the class-based one. If the
    # fallback also fails, the exception propagates to the caller.
    try:
        return driver.find_element(By.XPATH, '//div[@role="region" and @tabindex="0"]')
    except Exception:
        return driver.find_element(By.XPATH, '//div[contains(@class, "m6QErb DxyBCb kA9KIf dS8AEf XiKgde ")]')


def _scroll_until_stable(driver, scrollable_div, max_wait):
    """Scroll the container to the bottom until its height stops growing."""
    last_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_div)
    print(f"Last height: {last_height}")
    scroll_tries = 0
    while True:
        driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", scrollable_div)
        time.sleep(max_wait)
        new_height = driver.execute_script("return arguments[0].scrollHeight", scrollable_div)
        print(f"New height: {new_height}")

        # An unchanged height may just mean the next batch is still loading,
        # so only stop after five consecutive scrolls without growth.
        if new_height == last_height:
            scroll_tries += 1
            if scroll_tries > 4:
                break
        else:
            last_height = new_height
            scroll_tries = 0
        print(f"Scroll tries: {scroll_tries}")


def _expand_truncated_reviews(driver):
    """Click every "Lihat lainnya" (see more) button so full texts are present."""
    more_buttons = driver.find_elements(By.XPATH, '//button[contains(@aria-label, "Lihat lainnya")]')
    for btn in more_buttons:
        try:
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(0.1)
        except Exception:
            continue


def _parse_rating(block):
    """Return the integer star rating of a review block, or None if unavailable."""
    try:
        label = block.find_element(By.XPATH, './/span[@role="img"]').get_attribute("aria-label")
        rating_val = re.search(r'(\d+)\s+bintang', label)
    except Exception:
        return None
    return int(rating_val.group(1)) if rating_val else None


def _collect_reviews(driver):
    """Return parallel lists (ratings, reviews) for all review blocks with text."""
    reviews = []
    ratings = []
    review_blocks = driver.find_elements(By.XPATH, '//div[contains(@class, "jJc9Ad ")]')
    for block in review_blocks:
        rating_num = _parse_rating(block)
        try:
            review_text = block.find_element(By.XPATH, './/span[contains(@class, "wiI7pd")]').text
        except Exception:
            review_text = ""
        print(f"Review: {review_text}")

        cleaned_text = review_text.strip()
        if cleaned_text:  # Only keep reviews with text
            reviews.append(cleaned_text)
            ratings.append(rating_num)
    return ratings, reviews


def scrape_google_maps_reviews(place_url, max_wait=2):
    """Scrape all reviews and star ratings from a Google Maps place page.

    Loads the page in Chrome, opens the reviews list, scrolls until no more
    reviews are loaded, expands truncated texts and extracts each review.

    The browser is configured from the module-level ``chrome_options`` object,
    which must already be defined (it is created in the setup cell).

    Args:
        place_url: URL of the Google Maps place page.
        max_wait: Seconds to wait after each scroll for new reviews to load.

    Returns:
        A DataFrame with columns ``rating`` and ``review``. ``review`` holds
        the stripped review text; reviews without text are skipped.
        ``rating`` is the star count parsed from the ``"<n> bintang"``
        aria-label, or missing when it cannot be read.

    Raises:
        Exception: Whatever Selenium raises if the page cannot be loaded or
            the reviews container cannot be located.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(place_url)
        time.sleep(5)  # Let the page load

        scrollable_div = _open_reviews_panel(driver)
        _scroll_until_stable(driver, scrollable_div, max_wait)
        _expand_truncated_reviews(driver)
        ratings, reviews = _collect_reviews(driver)
    finally:
        # Always shut the browser down, including on failure; otherwise every
        # failed run leaves a headless Chrome process behind.
        driver.quit()

    return pd.DataFrame({'rating': ratings, 'review': reviews})

# --- Main Scraping Logic ---

# List your Google Maps URLs here (one entry per place to scrape).
# Each URL needs a /place/<name>/ segment, since the output filename is derived from it.
place_urls = [
    # "https://www.google.com/maps/place/NAME1/...",
    # "https://www.google.com/maps/place/NAME2/...",
    # ...
    "https://www.google.com/maps/place/Curug+Layung+%26+Camping+Ground/@-6.7770315,107.5799823,15z/data=!4m18!1m9!3m8!1s0x2e68e142f2d3bf0d:0x5765d9ceb0f35ae7!2sCurug+Layung+%26+Camping+Ground!8m2!3d-6.7768125!4d107.5778125!9m1!1b1!16s%2Fg%2F11hs54sggh!3m7!1s0x2e68e142f2d3bf0d:0x5765d9ceb0f35ae7!8m2!3d-6.7768125!4d107.5778125!9m1!1b1!16s%2Fg%2F11hs54sggh?hl=id&entry=ttu"
]

# Scrape each place in turn and write its reviews to a separate CSV file.
# There is no per-URL error handling: an exception from the scraper stops the loop.
for url in place_urls:
    print(f"Processing: {url}")
    place_name = get_place_name_from_url(url)
    # The timestamp is taken before scraping, so it marks when this place's run started.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    df = scrape_google_maps_reviews(url)
    filename = f"{place_name}_reviews_{timestamp}.csv"
    # Relative path: the file is written to the current working directory, without the index column.
    df.to_csv(filename, index=False)
    print(f"Saved: {filename}")

print("DONE. Check the files in your Colab workspace.")

Processing: https://www.google.com/maps/place/Curug+Layung+%26+Camping+Ground/@-6.7770315,107.5799823,15z/data=!4m18!1m9!3m8!1s0x2e68e142f2d3bf0d:0x5765d9ceb0f35ae7!2sCurug+Layung+%26+Camping+Ground!8m2!3d-6.7768125!4d107.5778125!9m1!1b1!16s%2Fg%2F11hs54sggh!3m7!1s0x2e68e142f2d3bf0d:0x5765d9ceb0f35ae7!8m2!3d-6.7768125!4d107.5778125!9m1!1b1!16s%2Fg%2F11hs54sggh?hl=id&entry=ttu
Last height: 5101
New height: 11163
Scroll tries: 0
New height: 17316
Scroll tries: 0
New height: 23261
Scroll tries: 0
New height: 29211
Scroll tries: 0
New height: 35231
Scroll tries: 0
New height: 41177
Scroll tries: 0
New height: 46775
Scroll tries: 0
New height: 52378
Scroll tries: 0
New height: 58286
Scroll tries: 0
New height: 63803
Scroll tries: 0
New height: 69578
Scroll tries: 0
New height: 75446
Scroll tries: 0
New height: 81100
Scroll tries: 0
New height: 86603
Scroll tries: 0
New height: 92554
Scroll tries: 0
New height: 97839
Scroll tries: 0
New height: 103271
Scroll tries: 0
New height: 108773
Scrol