There is in fact a TripAdvisor API, but it only lets you call 10 entries at a time and is far less powerful than just scraping TripAdvisor.com, unfortunately. 
So that's what we're gonna do!

In [4]:
# Install dependencies

!pip install --quiet requests beautifulsoup4 pandas

In [5]:
# Import and setup. Importing time to help pause between requests

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re

In [6]:
# Constants
TRIPADVISOR_BASE_URL = "https://www.tripadvisor.com/FindRestaurants?geo=298184&sort=POPULARITY&establishmentTypes=10591&broadened=false"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

def fetch_page(url):
    """
    Fetches the HTML content of a page with error handling and random delays.

    Args:
        url: The URL to fetch.

    Returns:
        The HTML content of the page, or None if the request fails.
    """
    for attempt in range(3):  # Retry 3 times
        try:
            # Add random delay
            delay = random.uniform(20, 60)  # Random delay between 20-60 seconds
            print(f"Waiting {delay:.2f} seconds before requesting {url}...")
            time.sleep(delay)

            # Make the request
            response = requests.get(url, headers=HEADERS)
            print(f"Status Code: {response.status_code}")

            if response.status_code == 200:
                # Print a snippet of the HTML for debugging
                print(f"Response snippet: {response.text[:500]}")  # Print first 500 characters
                return response.text
            elif response.status_code == 403:
                print(f"Access forbidden (403). Your IP might be blocked: {url}")
            elif response.status_code == 429:
                print(f"Too many requests (429). Rate limit exceeded: {url}")
            else:
                print(f"Unexpected status code {response.status_code} for URL: {url}")
        except requests.exceptions.RequestException as e:
            print(f"Request exception occurred for {url}: {e}. Retrying...")
            time.sleep(2 ** attempt)  # Exponential backoff

    print(f"Failed to fetch the page after 3 attempts: {url}")
    return None


def fetch_restaurant_details(restaurant_url):
    """
    Fetches details of a restaurant from its individual page.

    Args:
        restaurant_url: The URL of the restaurant.

    Returns:
        A dictionary containing restaurant details.
    """
    html_content = fetch_page(restaurant_url)
    if not html_content:
        print(f"Failed to fetch restaurant details from: {restaurant_url}")
        return {"Name": None, "Address": None, "Rating": None, "Review Count": None, "Categories": None}

    soup = BeautifulSoup(html_content, "html.parser")

    try:
        # Extract restaurant details
        name = safe_extract(soup, "h1.biGQs._P.hzzSG.BDkrC")
        address = safe_extract(soup, "span.biGQs._P.pZUbB.AWdfh[data-automation='restaurantsMapLinkOnName']")
        rating = safe_extract(soup, "span[class*='ui_bubble_rating']")
        reviews_text = safe_extract(soup, "a[href*='Reviews']")
        review_count = re.sub(r"[^\d]", "", reviews_text) if reviews_text else None
        categories = safe_extract(soup, "div.CsAqy span.HUMGB.cPbcf span.bTeln")

        if not name:
            print(f"Warning: Missing restaurant name for {restaurant_url}")
        if not address:
            print(f"Warning: Missing address for {restaurant_url}")

        return {
            "Name": name,
            "Address": address,
            "Rating": rating,
            "Review Count": review_count,
            "Categories": categories,
        }
    except Exception as e:
        print(f"Error extracting restaurant details from {restaurant_url}: {e}")
        return {"Name": None, "Address": None, "Rating": None, "Review Count": None, "Categories": None}


def scrape_tripadvisor_page(url, start_rank):
    """
    Scrapes a single page of restaurants from TripAdvisor.

    Args:
        url: The URL of the page to scrape.
        start_rank: The starting rank for the restaurants on this page.

    Returns:
        A list of restaurant data and the next rank to start from.
    """
    html_content = fetch_page(url)
    if not html_content:
        print(f"Stopping at page {url} due to a failure in fetching.")
        return [], start_rank

    soup = BeautifulSoup(html_content, "html.parser")
    restaurant_cards = soup.find_all("div", class_="tbrcR")
    if not restaurant_cards:
        print(f"No restaurant cards found on page: {url}")
        return [], start_rank

    data = []
    rank = start_rank

    for card in restaurant_cards:
        try:
            # Skip sponsored cards
            sponsored_tag = card.select_one("span.biGQs._P.navcl")
            if sponsored_tag and "Sponsored" in sponsored_tag.text:
                print(f"Skipping sponsored card at rank {rank}")
                continue

            # Extract restaurant URL
            restaurant_url = safe_extract(card, "a.BMQDV._F.Gv.wSSLS.SwZTJ.FGwzt.ukgoS", "href")
            if restaurant_url and not restaurant_url.startswith("http"):
                restaurant_url = f"https://www.tripadvisor.com{restaurant_url}"

            if not restaurant_url:
                print(f"Error: Missing URL for restaurant card at rank {rank}. Skipping this card.")
                continue

            # Fetch restaurant details
            details = fetch_restaurant_details(restaurant_url)

            # Append restaurant data
            data.append({
                "Ranking": rank,
                "Name": details["Name"],
                "URL": restaurant_url,
                "Rating": details["Rating"],
                "Price Range": safe_extract(card, "span.YECgr"),
                "Review Count": details["Review Count"],
                "Categories": details["Categories"],
                "Address": details["Address"],
            })

            print(f"Successfully scraped restaurant at rank {rank}: {details['Name']}")
            rank += 1

        except Exception as e:
            print(f"Error processing restaurant card at rank {rank}: {e}")

    return data, rank


def scrape_tripadvisor_pages(num_pages, base_url=TRIPADVISOR_BASE_URL):
    """
    Scrapes multiple pages of restaurants from TripAdvisor.

    Args:
        num_pages: Number of pages to scrape.
        base_url: The URL of the first page to scrape.

    Returns:
        A list of dictionaries containing restaurant data.
    """
    all_data = []
    current_rank = 1

    for page in range(num_pages):
        offset = page * 30
        current_url = f"{base_url}&offset={offset}" if page > 0 else base_url

        print(f"Scraping page {page + 1}: {current_url}")
        page_data, next_rank = scrape_tripadvisor_page(current_url, current_rank)

        if not page_data:
            print(f"Stopping at page {page + 1} due to an error.")
            break

        all_data.extend(page_data)
        current_rank = next_rank

    print(f"Scraping complete. Total restaurants scraped: {len(all_data)}")
    return all_data


def safe_extract(soup, selector, attribute=None):
    """
    Safely extracts text or attribute from a BeautifulSoup element.

    Args:
        soup: The BeautifulSoup object.
        selector: The CSS selector to find the element.
        attribute: The attribute to extract (e.g., 'href'). If None, extracts text.

    Returns:
        The extracted value or None if not found.
    """
    try:
        element = soup.select_one(selector)
        if element:
            return element[attribute] if attribute else element.get_text(strip=True)
    except Exception as e:
        print(f"Error extracting data with selector '{selector}': {e}")
    return None

In [30]:
# Scrape the first 7 pages (30 results per page = 210 total, slightly over 200)
data = scrape_tripadvisor_pages(num_pages=7)

Scraping page 1: https://www.tripadvisor.com/FindRestaurants?geo=298184&sort=POPULARITY&establishmentTypes=10591&broadened=false
Waiting 21.69 seconds before requesting https://www.tripadvisor.com/FindRestaurants?geo=298184&sort=POPULARITY&establishmentTypes=10591&broadened=false...
Status Code: 200
Response snippet: <!DOCTYPE html><html lang="en-US"><head><link rel="icon" id="favicon" type="image/x-icon" href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/favicon_2025.ico"/><link rel="icon" type="image/svg+xml" href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/icon.svg"/><link rel="apple-touch-icon" sizes="180x180" href="https://static.tacdn.com/img2/brand_refresh_2025/application_icons/apple_touch_icon.png"/><link rel="mask-icon" sizes="any" href="https://static.
Skipping sponsored card at rank 1
Skipping sponsored card at rank 1
Waiting 42.98 seconds before requesting https://www.tripadvisor.com/Restaurant_Review-g14129735-d1666071-Reviews

In [31]:
# Slice the data to include only the first 200 restaurants
top_200_data = data[:200]

# Create a DataFrame and export to CSV
df = pd.DataFrame(top_200_data)
df.to_csv("tripadvisor_top_200_restaurants_tokyo.csv", index=False, encoding="utf-8-sig")

print("Top 200 restaurants data scraped and saved to 'tripadvisor_top_200_restaurants_tokyo.csv'")

# Print the first few rows of the DataFrame to verify
df

Top 200 restaurants data scraped and saved to 'tripadvisor_top_200_restaurants_tokyo.csv'


Unnamed: 0,Ranking,Name,URL,Rating,Price Range,Review Count,Categories,Address
0,1,Gyopao Gyoza Roppongi,https://www.tripadvisor.com/Restaurant_Review-...,,,,Chinese,"4-9-8, Roppongi, Minato 106-0032 Tokyo Prefecture"
1,2,Sushi Koshikawa,https://www.tripadvisor.com/Restaurant_Review-...,,,,Japanese,"5-4-14 Trade Akasaka Bldg. 2F, Akasaka, Minato..."
2,3,Teppanyaki Sumiyaki Saito,https://www.tripadvisor.com/Restaurant_Review-...,,,,Japanese,"6 Chome−1−6 Zakusen Building 6F, Roppongi, Min..."
3,4,Rokkasen Otakibashiidori,https://www.tripadvisor.com/Restaurant_Review-...,,,,Seafood,"7 Chome−2−6 Nishishinjuku K-1 Bldg. B1F, Nishi..."
4,5,Premium Sake Pub Gashue,https://www.tripadvisor.com/Restaurant_Review-...,,,,Brew Pub,"2-13-5 Higashiueno, Taito 110-0015 Tokyo Prefe..."
...,...,...,...,...,...,...,...,...
195,196,V2TOKYO,https://www.tripadvisor.com/Restaurant_Review-...,,,,Bar,"7-13-7 Tower Of Vabel B1 to 1F, Roppongi, Mina..."
196,197,Tsujihan,https://www.tripadvisor.com/Restaurant_Review-...,,,,Japanese,"3-1-15, Nihonbashi, Chuo 103-0027 Tokyo Prefec..."
197,198,Lamb Shabu Kinnome,https://www.tripadvisor.com/Restaurant_Review-...,,,,Japanese,"2-9-2 Dogenzaka Shibu専 Bldg. 3F, Shibuya 150-0..."
198,199,Maruka,https://www.tripadvisor.com/Restaurant_Review-...,,,,Japanese,"3-16-1 Kanda Okadacho New Surugadai Bldg 1F, C..."


In [23]:
# Export the DataFrame to a CSV file
import os

# Define the output file
output_file = "tripadvisor_top_200_restaurants_tokyo.csv"

# Check if the file exists and delete it
if os.path.exists(output_file):
    os.remove(output_file)
    print(f"Existing file {output_file} has been deleted.")

# Save the DataFrame to a new CSV file
df.to_csv(output_file, index=False, encoding="utf-8-sig")  # Use utf-8-sig for Japanese characters
print(f"Data successfully exported to {output_file}")

Existing file tripadvisor_top_200_restaurants_tokyo.csv has been deleted.
Data successfully exported to tripadvisor_top_200_restaurants_tokyo.csv


In [3]:
# Great! We have our table. Now we need to get latitude and longitude for each 
# restaurant. We'll use a geocoding API for that.

import pandas as pd
import requests
import time

# Function to geocode an address
def geocode_address(address, api_key):
    """
    Geocodes an address using the Google Maps Geocoding API.
    
    Args:
        address (str): The address or place name to geocode.
        api_key (str): Your Google Maps API key.
    
    Returns:
        tuple: A tuple containing (latitude, longitude), or (None, None) if not found.
    """
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "address": address,
        "key": api_key
    }
    try:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            status = data.get("status")
            if status == "OK":
                location = data["results"][0]["geometry"]["location"]
                return location["lat"], location["lng"]
            else:
                error_message = data.get("error_message", "No additional error message provided.")
                print(f"Geocoding failed for '{address}': {status} - {error_message}")
        else:
            print(f"HTTP error {response.status_code} for '{address}'")
    except Exception as e:
        print(f"Unexpected error during geocoding for '{address}': {e}")
    return None, None

# Load data from a file named 'tabelog_data' (no .csv extension)
input_file = "tripadvisor_top_200_restaurants_tokyo.csv"
try:
    # Load the file, assuming the first row contains column headers
    trip_advisor_data = pd.read_csv(input_file, encoding="utf-8-sig", header=0)
    print(f"Data loaded successfully from {input_file}")
except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found.")
    exit()

# Verify the columns
if "Address" not in trip_advisor_data.columns:
    print("Error: The input file does not contain an 'Address' column.")
    exit()

# API Key
API_KEY = "AIzaSyATPu_4AcpWhRRDXC_B35BnhKg-eoiL0rU"

# Geocode each address and add latitude/longitude to the DataFrame
latitudes = []
longitudes = []

for i, row in trip_advisor_data.iterrows():
    address = row["Address"]
    lat, lon = geocode_address(address, API_KEY)
    latitudes.append(lat)
    longitudes.append(lon)
    print(f"Processed {i + 1}/{len(trip_advisor_data)}: {address} -> Lat: {lat}, Lon: {lon}")
    time.sleep(0.2)  # Add a delay to avoid hitting rate limits

# Add the latitudes and longitudes as new columns
trip_advisor_data["Latitude"] = latitudes
trip_advisor_data["Longitude"] = longitudes

# Export the updated DataFrame to a new file
output_file = "trip_advisor_data_geocoded.csv"
trip_advisor_data.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Geocoded data successfully exported to {output_file}")

Data loaded successfully from tripadvisor_top_200_restaurants_tokyo.csv
Processed 1/200: 4-9-8, Roppongi, Minato 106-0032 Tokyo Prefecture -> Lat: 35.6635875, Lon: 139.7321578
Processed 2/200: 5-4-14 Trade Akasaka Bldg. 2F, Akasaka, Minato 107-0052 Tokyo Prefecture -> Lat: 35.6734093, Lon: 139.7315136
Processed 3/200: 6 Chome−1−6 Zakusen Building 6F, Roppongi, Minato 106-0032 Tokyo Prefecture -> Lat: 35.6624904, Lon: 139.7325036
Processed 4/200: 7 Chome−2−6 Nishishinjuku K-1 Bldg. B1F, Nishishinjuku, Shinjuku 160-0023 Tokyo Prefecture -> Lat: 35.6954537, Lon: 139.6986833
Processed 5/200: 2-13-5 Higashiueno, Taito 110-0015 Tokyo Prefecture -> Lat: 35.7094403, Lon: 139.7779731
Processed 6/200: 1-16-11 Jingumae 539 Bldg. B1, Shibuya 150-0001 Tokyo Prefecture -> Lat: 35.6709562, Lon: 139.7049288
Processed 7/200: 3 Chome−35−7 Sanraku Building B1, Shinjuku 3 Chome, Shinjuku 160-0022 Tokyo Prefecture -> Lat: 35.6897519, Lon: 139.7022634
Processed 8/200: 4-2-15 Mizuno Bldg. 3F, Nishiazabu, Min