There is in fact a TripAdvisor API, but unfortunately it only returns 10 entries per call and is far less powerful than scraping TripAdvisor.com directly.
So that's what we're going to do!

In [3]:
# Install dependencies

!pip install --quiet requests beautifulsoup4 pandas

In [5]:
# Import and setup. Importing time to help pause between requests

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re

In [19]:
# Constants
# First page of the Roppongi restaurant listing. scrape_tripadvisor_pages()
# derives the URLs of later pages from this one by rewriting the ".html" suffix.
TRIPADVISOR_BASE_URL = "https://www.tripadvisor.com/Restaurants-g14129735-Roppongi_Minato_Tokyo_Tokyo_Prefecture_Kanto.html"
# HTTP headers sent with every request to TripAdvisor; the User-Agent
# identifies the client as a desktop Chrome browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Helper function to extract data safely
def safe_extract(element, css_selector, attribute=None, default=None):
    """
    Looks up the first node matching a CSS selector and returns its text or
    one of its attributes, never raising.

    Args:
        element: Object exposing ``select_one`` (e.g. a parsed page or a card).
        css_selector: The CSS selector to look up inside ``element``.
        attribute: If given, return this attribute of the matched node
            instead of its stripped text.
        default: Value returned when nothing matches or any error occurs.

    Returns:
        The stripped text of the match, or the requested attribute value,
        or ``default`` when the lookup fails for any reason.
    """
    try:
        node = element.select_one(css_selector)
        # Nothing matched: fall back to the caller-supplied default.
        if not node:
            return default
        if attribute:
            return node[attribute]
        return node.text.strip()
    except Exception:
        # Best-effort helper: a missing attribute or a bad element yields the default.
        return default

def fetch_restaurant_details(restaurant_url):
    """
    Fetches the details (name, address, etc.) from the restaurant's detailed page.

    The page is requested up to 3 times with a 2-second pause after every
    failed attempt. A non-200 response and a network-level error (timeout,
    connection failure) both count as a failed attempt.

    Args:
        restaurant_url: The URL of the restaurant.

    Returns:
        A dictionary with the keys "Name", "Address", "Rating",
        "Review Count" and "Categories". All values are None when the page
        could not be fetched or parsed; an individual value is None when its
        selector matched nothing on the page.
    """
    field_names = ("Name", "Address", "Rating", "Review Count", "Categories")

    try:
        for attempt in range(3):  # Retry up to 3 times
            try:
                # A timeout keeps a stalled connection from hanging the whole scrape.
                response = requests.get(restaurant_url, headers=HEADERS, timeout=30)
            except requests.RequestException as e:
                # Transient network errors are retried just like bad status codes.
                print(f"Request error for {restaurant_url} (attempt {attempt + 1}/3): {e}")
            else:
                if response.status_code == 200:
                    break
            time.sleep(2)
        else:
            print(f"Failed to fetch details for {restaurant_url}")
            return dict.fromkeys(field_names)

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract restaurant name
        name = safe_extract(soup, "h1.biGQs._P.hzzSG.BDkrC")

        # Extract address
        address = safe_extract(soup, "div.biGQs._P.fiohW.ezezH")

        # Extract rating
        rating = safe_extract(soup, "div.biGQs._P.pZUbB.AWdfh span")

        # Extract review count, keeping only the digits of the matched text
        reviews_text = safe_extract(soup, "div.biGQs._P.pZUbB.AWdfh a.BMQDV._F.Gv.wSSLS.SwZTJ.FGwzt.suezE div.biGQs._P.pZUbB.AWdfh span")
        review_count = re.sub(r"[^\d]", "", reviews_text) if reviews_text else None

        # Extract categories
        categories = safe_extract(soup, "div.CsAqy span.HUMGB.cPbcf span.bTeln")

        return {
            "Name": name,
            "Address": address,
            "Rating": rating,
            "Review Count": review_count,
            "Categories": categories,
        }

    except Exception as e:
        # Best-effort: one broken detail page must not abort the whole scrape.
        print(f"Error fetching details for {restaurant_url}: {e}")
        return dict.fromkeys(field_names)


def scrape_tripadvisor_page(url, start_rank):
    """
    Scrapes a page of restaurants from Tripadvisor, logging missing fields.

    The listing page is requested up to 3 times with a 2-second pause after
    every failed attempt. A non-200 response and a network-level error
    (timeout, connection failure) both count as a failed attempt. Sponsored
    cards and cards without a restaurant link are skipped and do not consume
    a rank.

    Args:
        url: The URL of the page to scrape.
        start_rank: The starting rank for the page.

    Returns:
        A tuple ``(data, next_rank)``: the list of restaurant dictionaries
        found on the page and the rank to assign to the first restaurant of
        the following page. On failure the result is ``([], start_rank)``.
    """
    try:
        for attempt in range(3):  # Retry up to 3 times
            try:
                # A timeout keeps a stalled connection from hanging the whole scrape.
                response = requests.get(url, headers=HEADERS, timeout=30)
            except requests.RequestException as e:
                # Transient network errors are retried just like bad status codes.
                print(f"Request error for {url} (attempt {attempt + 1}/3): {e}")
            else:
                if response.status_code == 200:
                    break
            time.sleep(2)
        else:
            print(f"Failed to fetch the main page: {url}")
            return [], start_rank

        soup = BeautifulSoup(response.text, "html.parser")
        restaurant_cards = soup.find_all("div", class_="tbrcR")
        data = []

        rank = start_rank

        for card in restaurant_cards:
            try:
                # Skip sponsored cards
                sponsored_tag = card.select_one("span.biGQs._P.navcl")
                if sponsored_tag and "Sponsored" in sponsored_tag.text:
                    print("Skipping sponsored card.")
                    continue

                # Extract restaurant URL; relative links are made absolute
                restaurant_url = safe_extract(card, "a.BMQDV._F.Gv.wSSLS.SwZTJ.FGwzt.ukgoS", "href")
                if restaurant_url and not restaurant_url.startswith("http"):
                    restaurant_url = f"https://www.tripadvisor.com{restaurant_url}"

                if not restaurant_url:
                    print(f"Missing URL for restaurant at rank {rank}. Skipping.")
                    continue

                # Fetch details from the restaurant's page
                details = fetch_restaurant_details(restaurant_url)

                # Missing fields are only logged; the row is still recorded.
                if not details["Name"]:
                    print(f"Missing name for restaurant at rank {rank}.")

                if not details["Address"]:
                    print(f"Missing address for restaurant at rank {rank}.")

                # Append the data for this restaurant
                data.append({
                    "Ranking": rank,
                    "Name": details["Name"],
                    "URL": restaurant_url,
                    "Rating": details["Rating"],
                    "Price Range": safe_extract(card, "div.ZvrsW.N.G.biQbm span:nth-of-type(2)"),
                    "Review Count": details["Review Count"],
                    "Categories": details["Categories"],
                    "Address": details["Address"],
                })

                rank += 1  # Increment the rank

            except Exception as e:
                # Best-effort: a single malformed card must not abort the page.
                print(f"Error processing restaurant card at rank {rank}: {e}")

        return data, rank

    except Exception as e:
        print(f"Error scraping the main page: {e}")
        return [], start_rank


def scrape_tripadvisor_pages(num_pages):
    """
    Walks through consecutive Tripadvisor listing pages and gathers their rows.

    Ranks continue across pages. Scraping stops early at the first page
    that yields no rows.

    Args:
        num_pages: The number of pages to scrape.

    Returns:
        A list of all restaurant data.
    """
    current_rank = 1  # Rank assigned to the next restaurant found
    all_data = []

    for page in range(num_pages):
        # Every page after the first is addressed through an "-oa<offset>" suffix.
        offset = page * 30
        url = (
            TRIPADVISOR_BASE_URL
            if page == 0
            else TRIPADVISOR_BASE_URL.replace(".html", f"-oa{offset}-zfp10955.html")
        )

        print(f"Scraping page {page + 1}: {url}")
        # Random pause so requests are not fired back to back
        pause = random.uniform(5, 15)
        time.sleep(pause)

        page_data, current_rank = scrape_tripadvisor_page(url, current_rank)
        if not page_data:
            print(f"Stopping at page {page + 1} due to an error.")
            break
        all_data.extend(page_data)

    return all_data

In [20]:
# Scrape the first 7 pages (30 results per page = 210 total, slightly over 200)
# Fewer rows may come back: sponsored cards are skipped, and scraping stops
# early at the first page that yields no rows.
data = scrape_tripadvisor_pages(num_pages=7)

Scraping page 1: https://www.tripadvisor.com/Restaurants-g14129735-Roppongi_Minato_Tokyo_Tokyo_Prefecture_Kanto.html
Skipping sponsored card.
Skipping sponsored card.
Skipping sponsored card.
Skipping sponsored card.
Skipping sponsored card.
Scraping page 2: https://www.tripadvisor.com/Restaurants-g14129735-Roppongi_Minato_Tokyo_Tokyo_Prefecture_Kanto-oa30-zfp10955.html
Skipping sponsored card.
Skipping sponsored card.
Scraping page 3: https://www.tripadvisor.com/Restaurants-g14129735-Roppongi_Minato_Tokyo_Tokyo_Prefecture_Kanto-oa60-zfp10955.html
Skipping sponsored card.
Skipping sponsored card.
Missing name for restaurant at rank 83.
Missing address for restaurant at rank 83.
Scraping page 4: https://www.tripadvisor.com/Restaurants-g14129735-Roppongi_Minato_Tokyo_Tokyo_Prefecture_Kanto-oa90-zfp10955.html
Skipping sponsored card.
Skipping sponsored card.
Scraping page 5: https://www.tripadvisor.com/Restaurants-g14129735-Roppongi_Minato_Tokyo_Tokyo_Prefecture_Kanto-oa120-zfp10955.html


In [18]:
# Slice the data to include only the first 200 restaurants
# (the slice is simply shorter if fewer than 200 rows were scraped)
top_200_data = data[:200]

# Create a DataFrame and export to CSV
df = pd.DataFrame(top_200_data)
df.to_csv("tripadvisor_top_200_restaurants_tokyo.csv", index=False, encoding="utf-8-sig")

print("Top 200 restaurants data scraped and saved to 'tripadvisor_top_200_restaurants_tokyo.csv'")

# Display the DataFrame as the cell output to verify the result
df

Top 200 restaurants data scraped and saved to 'tripadvisor_top_200_restaurants_tokyo.csv'


Unnamed: 0,Ranking,Name,URL,Rating,Price Range,Review Count,Categories,Address
0,1,Teppanyaki Sumiyaki Saito,https://www.tripadvisor.com/Restaurant_Review-...,4.9,,705,"Japanese, Steakhouse",Teppanyaki Sumiyaki Saito
1,2,Gyopao Gyoza Roppongi,https://www.tripadvisor.com/Restaurant_Review-...,4.9,,3423,"Chinese, Asian",Gyopao Gyoza Roppongi
2,3,Downtown B’s Indian Kitchen,https://www.tripadvisor.com/Restaurant_Review-...,4.9,,377,"Indian, Asian",Downtown B’s Indian Kitchen
3,4,IPPUDO Roppongi,https://www.tripadvisor.com/Restaurant_Review-...,4.5,,578,"Japanese, Asian",IPPUDO Roppongi
4,5,Niigata Yukimuro Jukusei Yakiniku Nikuine Ropp...,https://www.tripadvisor.com/Restaurant_Review-...,4.9,,67,"Japanese, Barbecue",Niigata Yukimuro Jukusei Yakiniku Nikuine Ropp...
...,...,...,...,...,...,...,...,...
195,196,Azabu Tansumachi Tenryoan,https://www.tripadvisor.com/Restaurant_Review-...,4.2,,5,"Japanese, Steakhouse",Azabu Tansumachi Tenryoan
196,197,Ark Hills Cafe,https://www.tripadvisor.com/Restaurant_Review-...,3.4,,42,"Bar, Cafe",Ark Hills Cafe
197,198,Caipirinha Bar,https://www.tripadvisor.com/Restaurant_Review-...,4.3,,6,"Brazilian, Bar",Caipirinha Bar
198,199,Matsuoka,https://www.tripadvisor.com/Restaurant_Review-...,4.3,,4,"Japanese, Asian",Matsuoka


In [23]:
# Export the DataFrame to a CSV file, replacing any previous export.
# Relies on `df` created in an earlier cell.
import os

# Define the output file
output_file = "tripadvisor_top_200_restaurants_tokyo.csv"

# Check if the file exists and delete it, so the export below starts from a clean file
if os.path.exists(output_file):
    os.remove(output_file)
    print(f"Existing file {output_file} has been deleted.")

# Save the DataFrame to a new CSV file
df.to_csv(output_file, index=False, encoding="utf-8-sig")  # Use utf-8-sig for Japanese characters
print(f"Data successfully exported to {output_file}")

Existing file tripadvisor_top_200_restaurants_tokyo.csv has been deleted.
Data successfully exported to tripadvisor_top_200_restaurants_tokyo.csv


In [None]:
# Great! We have our table. Now we need to get latitude and longitude for each 
# restaurant. We'll use a geocoding API for that.

import pandas as pd
import requests
import time

# Function to geocode an address
def geocode_address(address, api_key, timeout=10):
    """
    Geocodes an address using the Google Maps Geocoding API.

    Empty or non-text addresses (for example None, or the NaN that pandas
    uses for a missing cell) are skipped without contacting the API.
    Failures are reported with print() and never raised.

    Args:
        address (str): The address or place name to geocode.
        api_key (str): Your Google Maps API key.
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        tuple: A tuple containing (latitude, longitude), or (None, None) if not found.
    """
    # Do not spend an API call on a value that cannot be a real address.
    if not isinstance(address, str) or not address.strip():
        print(f"Skipping geocoding for empty or non-text address: {address!r}")
        return None, None

    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "address": address,
        "key": api_key
    }
    try:
        # The timeout keeps one stalled request from blocking the whole batch.
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            status = data.get("status")
            if status == "OK":
                # Use the first (best) match returned by the API.
                location = data["results"][0]["geometry"]["location"]
                return location["lat"], location["lng"]
            else:
                error_message = data.get("error_message", "No additional error message provided.")
                print(f"Geocoding failed for '{address}': {status} - {error_message}")
        else:
            print(f"HTTP error {response.status_code} for '{address}'")
    except Exception as e:
        # Best-effort: network errors or an unexpected payload yield (None, None).
        print(f"Unexpected error during geocoding for '{address}': {e}")
    return None, None

# Load the restaurant table to geocode from 'tabelog_data.csv'
import os

input_file = "tabelog_data.csv"
try:
    # Load the file, assuming the first row contains column headers
    tabelog_data = pd.read_csv(input_file, encoding="utf-8-sig", header=0)
except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found.")
    # Raise SystemExit instead of calling exit(): exit() is an
    # interactive-shell helper and is not meant for use in programs.
    raise SystemExit(1) from None
print(f"Data loaded successfully from {input_file}")

# Verify the columns
if "Address" not in tabelog_data.columns:
    print("Error: The input file does not contain an 'Address' column.")
    raise SystemExit(1)

# API key: read from the environment so the secret never lives in the notebook.
# Any key that was previously committed here should be treated as leaked and rotated.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    print("Error: Set the GOOGLE_MAPS_API_KEY environment variable to your Google Maps API key.")
    raise SystemExit(1)

# Geocode each address and add latitude/longitude to the DataFrame
latitudes = []
longitudes = []
total = len(tabelog_data)

# enumerate() gives a positional counter, so the progress line stays correct
# even when the DataFrame index is not 0..n-1.
for count, address in enumerate(tabelog_data["Address"], start=1):
    lat, lon = geocode_address(address, API_KEY)
    latitudes.append(lat)
    longitudes.append(lon)
    print(f"Processed {count}/{total}: {address} -> Lat: {lat}, Lon: {lon}")
    time.sleep(0.2)  # Add a delay to avoid hitting rate limits

# Add the latitudes and longitudes as new columns
tabelog_data["Latitude"] = latitudes
tabelog_data["Longitude"] = longitudes

# Export the updated DataFrame to a new file
output_file = "tabelog_data_geocoded.csv"
tabelog_data.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Geocoded data successfully exported to {output_file}")