In [1]:
# Imports and setup
import time
import random
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Base URL of the Tabelog Tokyo restaurant listing (first page).
# scrape_tabelog_pages appends "<page>/" for later pages and the "?SrtT=rt" query string.
TABELOG_BASE_URL = "https://tabelog.com/en/tokyo/rstLst/"
# HTTP headers passed to every requests.get call made by the scraper (browser-style User-Agent).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


def safe_extract(element, css_selector, attribute=None, default=None):
    """
    Looks up a child node by CSS selector and returns its text or one attribute.

    Any failure along the way (no match, missing attribute, unusable element)
    yields the default instead of raising.

    Args:
        element: The parent BeautifulSoup element to search within.
        css_selector: CSS selector passed to the element's select_one().
        attribute: Name of the attribute to read (e.g., "href"). When falsy,
            the node's stripped text is returned instead.
        default: Value returned whenever nothing can be extracted.

    Returns:
        The attribute value or stripped text of the first match, else default.
    """
    try:
        node = element.select_one(css_selector)
        if not node:
            return default
        if attribute:
            return node[attribute]
        return node.text.strip()
    except Exception:
        # Best-effort helper: a missing attribute or malformed element falls back to default.
        return default

def clean_price(price_text):
    """
    Cleans the price text by removing 'JPY', commas, and all whitespace.

    Args:
        price_text: A string containing the price, e.g., "JPY 50,000 - JPY 59,999".

    Returns:
        A compact string with the cleaned price range, e.g., "50000-59999"
        (no spaces around the dash), or None if the input is empty or nothing
        remains after cleaning.
    """
    if not price_text:
        return None
    # Strip the currency label, thousands separators, and every whitespace character.
    cleaned_text = re.sub(r"JPY|\s|,", "", price_text)
    # Whitespace-only or "JPY"-only input leaves nothing useful; report it as missing.
    return cleaned_text or None

# Helpers and entry point for scraping a single listing page
def _fetch_with_retries(url, attempts=3, retry_delay=2, timeout=30):
    """
    GETs a URL with the shared HEADERS, retrying on request errors and non-200 responses.

    Args:
        url: The URL to request.
        attempts: Maximum number of requests to make.
        retry_delay: Seconds to wait between failed attempts.
        timeout: Per-request timeout in seconds, so a stalled connection cannot hang the scrape.

    Returns:
        The first response whose status code is 200, or None if every attempt failed.
    """
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            # A transient network error should consume one attempt, not abort the retries.
            print(f"Request error for {url} (attempt {attempt}/{attempts}): {e}")
        if attempt < attempts:
            time.sleep(retry_delay)
    return None


def _parse_restaurant_details(rest_soup):
    """
    Extracts station, categories, address and reservation flag from a restaurant detail page.

    Args:
        rest_soup: BeautifulSoup document of the restaurant's own page.

    Returns:
        A dict with the keys "Nearest Station", "Categories", "Address" (None when
        not found) and "Reservation Only" (False unless the page says otherwise).
    """
    details = {
        "Nearest Station": None,
        "Categories": None,
        "Address": None,
        "Reservation Only": False,
    }

    # Header sub-info entries; the titles end with a full-width colon on the site.
    header_fields = {
        "Nearest station：": "Nearest Station",
        "Categories：": "Categories",
    }
    for item in rest_soup.find_all("dl", class_="rdheader-subinfo__item"):
        dt = item.find("dt", class_="rdheader-subinfo__item-title")
        dd = item.find("dd", class_="rdheader-subinfo__item-text")
        if not dt or not dd:
            continue
        field = header_fields.get(dt.text.strip())
        if field is None:
            continue
        value = safe_extract(dd, "span.linktree__parent-target-text")
        if value is not None:
            details[field] = value

    # Address and reservation status live in the restaurant information table.
    table = rest_soup.find("table", class_="c-table c-table--form rstinfo-table__table")
    if table:
        for row in table.find_all("tr"):
            th = row.find("th")
            td = row.find("td")
            if not th or not td:
                continue

            title = th.text.strip()
            if title == "Address":
                address_p = td.find("p", class_="rstinfo-table__address")
                if address_p:
                    details["Address"] = address_p.text.strip()
            elif title == "Reservation availability":
                reservation_status = td.find("p", class_="rstinfo-table__reserve-status")
                if reservation_status:
                    text = reservation_status.text.strip()
                    # Only update the flag when the status text is one of the recognised phrases.
                    if "Reservation only" in text or "Reservations available" in text:
                        details["Reservation Only"] = "Reservation only" in text

    return details


def scrape_tabelog_page(url, starting_rank):
    """
    Scrapes a page of restaurants and fetches additional details from their individual pages.

    Each card's rank is its position on the listing (starting_rank for the first
    card), so a card that fails to parse leaves a gap rather than shifting the
    ranks of the cards after it. A restaurant whose detail page cannot be
    fetched or parsed is still returned, with its detail fields left at their
    defaults.

    Args:
        url: The URL of the page to scrape.
        starting_rank: The ranking value to start from.

    Returns:
        A tuple containing:
            - List of restaurant data on the page (empty if the page could not be scraped).
            - The next starting rank value (unchanged if the page could not be scraped).
    """
    try:
        response = _fetch_with_retries(url)
        if response is None:
            print(f"Failed to fetch the main page: {url}")
            return [], starting_rank  # Return empty data and current rank

        soup = BeautifulSoup(response.text, "html.parser")
        restaurant_cards = soup.find_all("div", class_="list-rst js-bookmark js-rst-cassette-wrap")
        data = []

        for ranking, card in enumerate(restaurant_cards, start=starting_rank):
            try:
                # Extract basic details from the card
                name = safe_extract(card, "a.list-rst__rst-name-target.cpy-rst-name")
                restaurant_url = safe_extract(card, "a.list-rst__rst-name-target.cpy-rst-name", "href")
                rating = safe_extract(card, "span.c-rating__val.c-rating__val--strong.list-rst__rating-val")

                # Extract price range from the aria-label "Average dinner price"
                price_range = None
                try:
                    price_element = card.select_one('li.c-rating-v3 i[aria-label="Average dinner price"] + span.c-rating-v3__val')
                    if price_element:
                        price_range = clean_price(price_element.text.strip())
                except Exception as e:
                    print(f"Error extracting price range: {e}")

                # Extract the number of reviews
                review_count = safe_extract(card, "em.list-rst__rvw-count-num.cpy-review-count")

                # Defaults used when the detail page is unavailable
                details = {
                    "Nearest Station": None,
                    "Categories": None,
                    "Address": None,
                    "Reservation Only": False,
                }

                if restaurant_url:
                    # Add a random sleep to avoid overloading the server
                    time.sleep(random.uniform(2, 5))  # Sleep for 2-5 seconds

                    page_response = _fetch_with_retries(restaurant_url)
                    if page_response is None:
                        print(f"Failed to fetch details for {name}: {restaurant_url}")
                    else:
                        try:
                            rest_soup = BeautifulSoup(page_response.text, "html.parser")
                            details = _parse_restaurant_details(rest_soup)
                        except Exception as e:
                            print(f"Error fetching detailed info for {restaurant_url}: {e}")

                # Append the data for this restaurant
                data.append({
                    "Ranking": ranking,
                    "Name": name,
                    "URL": restaurant_url,
                    "Rating": rating,
                    "Price Range": price_range,
                    "Review Count": review_count,
                    **details,
                })

            except Exception as e:
                print(f"Error processing restaurant card: {e}")

        # The next page continues counting after every card seen on this one.
        return data, starting_rank + len(restaurant_cards)

    except Exception as e:
        print(f"Error scraping the main page: {e}")
        return [], starting_rank  # Return empty data and current rank on error


def scrape_tabelog_pages(num_pages):
    """
    Walks the Tabelog listing page by page and gathers every restaurant record.

    Scraping stops early at the first page that yields no records.

    Args:
        num_pages: How many listing pages to visit, starting from page 1.

    Returns:
        A single list with the restaurant records of all pages scraped.
    """
    collected = []
    next_rank = 1  # Running rank carried across pages

    for page_number in range(1, num_pages + 1):
        print(f"Scraping page {page_number}...")

        # The first page has no page segment in its path; later pages do.
        if page_number > 1:
            page_url = f"{TABELOG_BASE_URL}{page_number}/?SrtT=rt"
        else:
            page_url = TABELOG_BASE_URL + "?SrtT=rt"

        # Randomised pause (5-15 seconds) ahead of each listing request
        pause = random.uniform(5, 15)
        time.sleep(pause)

        rows, next_rank = scrape_tabelog_page(page_url, next_rank)

        if not rows:
            print(f"Stopping at page {page_number} due to an error.")
            break

        collected.extend(rows)

        # Short fixed pause before moving on to the next page
        time.sleep(2)

    return collected

In [3]:
# Scrape the first 10 pages (adjust num_pages as needed)
data = scrape_tabelog_pages(num_pages=10)

# Create a DataFrame from the scraped records (the CSV export below is currently commented out)
df = pd.DataFrame(data)
#df.to_csv("tabelog_top_restaurants_tokyo.csv", index=False)

#print("Data scraped and saved to 'tabelog_top_restaurants_tokyo.csv'")
# Bare expression so the notebook displays the DataFrame as the cell output
df

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...


Unnamed: 0,Ranking,Name,URL,Rating,Price Range,Review Count,Nearest Station,Categories,Address,Reservation Only
0,1,Shimbashi Hoshino,https://tabelog.com/en/tokyo/A1314/A131401/131...,4.65,50000-59999,275,Onarimon Sta.,Japanese Cuisine,東京都港区新橋5-31-3,True
1,2,Nihonbashi Kakigaracho Sugita,https://tabelog.com/en/tokyo/A1302/A130204/130...,4.64,40000-49999,897,Suitengumae Sta.,Sushi,東京都中央区日本橋蛎殻町1-33-6 ビューハイツ日本橋 B1F,True
2,3,aca,https://tabelog.com/en/tokyo/A1302/A130202/132...,4.63,60000-79999,523,Mitsukoshimae Sta.,Spain,東京都中央区日本橋室町2-1-1 三井2号館,True
3,4,Higashiazabu Amamoto,https://tabelog.com/en/tokyo/A1314/A131401/131...,4.62,60000-79999,572,Akabanebashi Sta.,Sushi,東京都港区東麻布1-7-9 ザ・ソノビル 102,True
4,5,Matsukawa,https://tabelog.com/en/tokyo/A1307/A130701/131...,4.61,80000-99999,509,Roppongi Itchome Sta.,Japanese Cuisine,東京都港区赤坂1-11-6 赤坂テラスハウス １階,True
...,...,...,...,...,...,...,...,...,...,...
195,196,Ginza Kitafuku,https://tabelog.com/en/tokyo/A1301/A130101/131...,4.01,40000-49999,265,Ginza Sta.,Crab,東京都中央区銀座7-4-5 銀座745ビル 3F,True
196,197,Onarimon Haru,https://tabelog.com/en/tokyo/A1314/A131401/132...,4.01,40000-49999,206,Onarimon Sta.,Japanese Cuisine,東京都港区芝大門1-2-2 中川ビル 1F,True
197,198,Mori Fuji,https://tabelog.com/en/tokyo/A1309/A130905/132...,4.01,30000-39999,121,Ushigome Kagurazaka Sta.,Japanese Cuisine,東京都新宿区納戸町32-3 ザ・フィガロ市ヶ谷 1F,True
198,199,tens.,https://tabelog.com/en/tokyo/A1306/A130603/133...,4.01,20000-29999,78,Gaiemmae Sta.,Italian,東京都港区南青山2-27-28 グラン青山 B1F,False


In [5]:
# Export the scraped DataFrame to a CSV file; index=False leaves out the row index column
output_file = "tabelog_data.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")  # utf-8-sig writes a UTF-8 BOM; presumably so spreadsheet apps detect the encoding of the Japanese text
print(f"Data successfully exported to {output_file}")

Data successfully exported to tabelog_data.csv


In [1]:
# Great! We have our table. Now we need to get latitude and longitude for each 
# restaurant. We'll use a geocoding API for that.

import pandas as pd
import requests
import time

# Function to geocode an address
def geocode_address(address, api_key, timeout=10):
    """
    Geocodes an address using the Google Maps Geocoding API.

    Failures are reported with print() and never raised, so a caller looping
    over many addresses can keep going.

    Args:
        address (str): The address or place name to geocode. Non-string or
            blank values (e.g. None, or NaN from an empty CSV cell) are skipped
            without calling the API.
        api_key (str): Your Google Maps API key.
        timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
        tuple: A tuple containing (latitude, longitude), or (None, None) if not found.
    """
    # A missing address would otherwise be sent to the API as a junk query.
    if not isinstance(address, str) or not address.strip():
        print(f"Skipping geocoding for missing address: {address!r}")
        return None, None

    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "address": address,
        "key": api_key
    }
    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"HTTP error {response.status_code} for '{address}'")
            return None, None

        data = response.json()
        status = data.get("status")
        if status == "OK":
            # Use the first (best) match returned by the API.
            location = data["results"][0]["geometry"]["location"]
            return location["lat"], location["lng"]

        error_message = data.get("error_message", "No additional error message provided.")
        print(f"Geocoding failed for '{address}': {status} - {error_message}")
    except Exception as e:
        # Best-effort per-address boundary: report the error and fall through to (None, None).
        print(f"Unexpected error during geocoding for '{address}': {e}")
    return None, None

# Load the scraped data from 'tabelog_data.csv' (the file written by the export step)
input_file = "tabelog_data.csv"
try:
    # Load the file, assuming the first row contains column headers
    tabelog_data = pd.read_csv(input_file, encoding="utf-8-sig", header=0)
    print(f"Data loaded successfully from {input_file}")
except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found.")
    exit()

# Verify the columns: geocoding needs the 'Address' column
if "Address" not in tabelog_data.columns:
    print("Error: The input file does not contain an 'Address' column.")
    exit()

# API Key
# NOTE(review): blank in this copy; a valid Google Maps API key must be supplied before running
API_KEY = ""

# Geocode each address and add latitude/longitude to the DataFrame
# The two lists are filled in row order so they line up with the DataFrame rows.
latitudes = []
longitudes = []

for i, row in tabelog_data.iterrows():
    address = row["Address"]
    lat, lon = geocode_address(address, API_KEY)
    # Failed lookups append None so the lists stay the same length as the DataFrame
    latitudes.append(lat)
    longitudes.append(lon)
    # i + 1 is used as a progress counter; assumes the default 0-based integer index from read_csv
    print(f"Processed {i + 1}/{len(tabelog_data)}: {address} -> Lat: {lat}, Lon: {lon}")
    time.sleep(0.2)  # Add a delay to avoid hitting rate limits

# Add the latitudes and longitudes as new columns
tabelog_data["Latitude"] = latitudes
tabelog_data["Longitude"] = longitudes

# Export the updated DataFrame to a new file
output_file = "tabelog_data_geocoded.csv"
tabelog_data.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Geocoded data successfully exported to {output_file}")

Data loaded successfully from tabelog_data.csv
Processed 1/200: 東京都港区新橋5-31-3 -> Lat: 35.6624582, Lon: 139.7553081
Processed 2/200: 東京都中央区日本橋蛎殻町1-33-6 ビューハイツ日本橋 B1F -> Lat: 35.6818515, Lon: 139.7849984
Processed 3/200: 東京都中央区日本橋室町2-1-1 三井2号館 -> Lat: 35.6868636, Lon: 139.7724807
Processed 4/200: 東京都港区東麻布1-7-9 ザ・ソノビル 102 -> Lat: 35.6573486, Lon: 139.7434681
Processed 5/200: 東京都港区赤坂1-11-6 赤坂テラスハウス １階 -> Lat: 35.6674621, Lon: 139.7423903
Processed 6/200: 東京都港区六本木1-4-5 アークヒルズサウスタワー 1Ｆ -> Lat: 35.6656056, Lon: 139.7394165
Processed 7/200: 東京都中央区銀座8-10-2 ルアンビル B1F 2F -> Lat: 35.6680421, Lon: 139.7627122
Processed 8/200: 東京都渋谷区恵比寿2-3-4 -> Lat: 35.648029, Lon: 139.7184741
Processed 9/200: 東京都中央区銀座2-8-17 ハビウル銀座2 B1 -> Lat: 35.673039, Lon: 139.7678421
Processed 10/200: 東京都港区西麻布3-21-3 オリンピアード麻布霞坂 2F -> Lat: 35.6598526, Lon: 139.7251681
Processed 11/200: 東京都港区南麻布4-7-5 -> Lat: 35.6495836, Lon: 139.7283406
Processed 12/200: 東京都中央区銀座5-9-19 ＭＣビル3Ｆ -> Lat: 35.6704606, Lon: 139.7652779
Processed 13/200: 