In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
import pandas as pd
import concurrent.futures
from datetime import datetime

In [None]:
def setup_driver(driver_path):
    """Start a headless Chrome WebDriver using the ChromeDriver at ``driver_path``.

    Args:
        driver_path: Filesystem path of the ChromeDriver executable.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    # Command-line switches handed to Chrome, added in this order.
    # NOTE(review): "--disable-images" may not be a switch Chrome recognises — confirm.
    for switch in (
        "--headless",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--disable-images",
    ):
        options.add_argument(switch)

    # The Service wraps the driver executable; Chrome is launched through it
    return webdriver.Chrome(service=Service(driver_path), options=options)

In [None]:
def _menu_item_row(cat_name, item_text):
    """Split one menu item's visible text into a 5-element row.

    The text is split on newlines:
      * 3 lines  -> [cat_name, line 1, line 2, line 3, item_text]
      * 2 lines  -> [cat_name, line 1, None, line 2, item_text]
      * otherwise -> [None, None, None, None, item_text]
    The raw text is always kept as the last element so it can be re-parsed later.
    """
    parts = item_text.split('\n')
    if len(parts) == 3:
        return [cat_name, parts[0], parts[1], parts[2], item_text]
    if len(parts) == 2:
        return [cat_name, parts[0], None, parts[1], item_text]
    return [None, None, None, None, item_text]


def scrape_menu_item(url, restaurant_id, driver_path=None):
    """Scrape the store name and menu items from one restaurant page.

    Args:
        url: Address of the restaurant page to open.
        restaurant_id: Identifier returned unchanged so the caller can match
            the result to its input row.
        driver_path: Path of the ChromeDriver executable. When omitted it is
            resolved with ``ChromeDriverManager().install()``.

    Returns:
        ``(store_name, item_list, restaurant_id)`` on success, where
        ``item_list`` holds one 5-element row per menu item (see
        ``_menu_item_row``). ``(None, None, restaurant_id)`` when anything
        fails, including starting the browser; the error is printed.
    """
    driver = None
    try:
        if driver_path is None:
            driver_path = ChromeDriverManager().install()
        # Started inside the try so a browser start-up failure is reported like any
        # other scrape failure instead of escaping to the caller
        driver = setup_driver(driver_path)
        driver.get(url)
        # Implicit wait: element lookups below poll for up to 60 seconds before giving up.
        # NOTE(review): Selenium's docs advise against combining implicit and explicit
        # waits; kept as-is because the lookups below rely on it.
        driver.implicitly_wait(60)

        # Block until the store name header is present, then read it
        wait = WebDriverWait(driver, 30)
        store_name_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'StoreAuthHeader__StoreName-ihWYeu')))
        store_name = store_name_element.text

        # Each category chip wraps a clickable div[role="button"]
        cat_sections = driver.find_elements(By.CSS_SELECTOR, ".OrderingMenuCategoryChip__Wrapper-jaTTMW.dfpPQD")
        cat_btn_lst = [cat_section.find_element(By.CSS_SELECTOR, """div[role="button"]""") for cat_section in cat_sections]

        item_list = []
        processed_categories = set()  # category names already harvested
        for btn in cat_btn_lst:
            btn.click()
            time.sleep(1)  # Wait for the page to scroll and load items

            # Categories currently present in the page
            cats = driver.find_elements(By.CSS_SELECTOR, 'div[data-index]')
            for cat in cats:
                cat_name = cat.find_element(By.TAG_NAME, 'h4').text
                # The same category can show up again after clicking another chip
                if cat_name in processed_categories:
                    continue
                processed_categories.add(cat_name)

                menu_items = cat.find_elements(By.CSS_SELECTOR, "li[data-test-id='menuItem']")
                for item in menu_items:
                    # Read .text once per item: every access is a request to the browser
                    item_list.append(_menu_item_row(cat_name, item.text))

        print(store_name, item_list)
        return store_name, item_list, restaurant_id
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None, None, restaurant_id
    finally:
        # driver stays None when the browser never started
        if driver is not None:
            driver.quit()

In [None]:
def dish_dict(text_list):
    """Convert one scraped menu row into a dish record.

    Args:
        text_list: 5-element row as produced by ``scrape_menu_item``. Index 0
            is the section name, index 2 is a description candidate and
            index 4 is the item's raw multi-line text.

    Returns:
        dict with keys ``section``, ``product``, ``price``,
        ``discounted_price`` and ``description``. ``price`` is the highest and
        ``discounted_price`` the lowest ``NT$`` amount found in the raw text;
        both are ``None`` when the text contains no amount.
    """
    section = text_list[0]
    lines = text_list[4].split('\n')
    # Name: first line of the raw text
    item_name = lines[0]

    # Prices: lines that start with "NT$" once any '-' is ignored; the currency
    # marker and thousands separators are stripped before conversion
    prices = [
        int(line.replace("NT$", '').replace(',', ''))
        for line in lines
        if line.replace('-', '').startswith('NT$')
    ]
    # max/min cover every count of price lines; previously zero or more than two
    # price lines left both variables unbound and raised UnboundLocalError
    org_price = max(prices) if prices else None
    disc_price = min(prices) if prices else None

    # Description: index 2 only counts when it is not itself a price line
    candidate = text_list[2]
    descrip = candidate if candidate and 'NT$' not in candidate else None

    return {
        'section': section,
        'product': item_name,
        'price': org_price,
        'discounted_price': disc_price,
        'description': descrip
    }

def new_menu(menu):
    """Convert a store's scraped rows into a list of dish records.

    Args:
        menu: List of 5-element rows for one store, or ``None`` / an empty
            list when nothing was scraped.

    Returns:
        One ``dish_dict`` result per row, in order; ``[]`` for an empty or
        missing menu.
    """
    # Covers both None (failed scrape) and an empty menu
    if not menu:
        return []
    dishes = []
    for dish in menu:
        # Printed before parsing so the offending row is visible if dish_dict raises
        print(dish)
        dishes.append(dish_dict(dish))
    return dishes

# new_menu_list = []
# for i in range(uni_df.shape[0]):
#     new_menu_list.append(new_menu(uni_df['Items'][i]))

# uni_df['new menu'] = new_menu_list

In [None]:
# ===Read Restaurant Link===
# Load the restaurant table from restaurant_final.json. Later cells read its
# 'link' column (the URL handed to the scraper) and its 'id' column (merge key).
df = pd.read_json("restaurant_final.json")

In [None]:
# Resolve the ChromeDriver executable once; every scrape starts its own browser from this path
driver_path = ChromeDriverManager().install()


def process_chunk(start_index, end_index):
    """Scrape the stores at row positions ``start_index`` .. ``end_index - 1`` of ``df``.

    Returns:
        A list with one ``(store_name, items, restaurant_id, started_at)``
        tuple per store, in row order. Results are returned rather than
        appended to shared lists: several workers appending to four parallel
        lists can interleave, pairing one store's name with another's items.
    """
    records = []
    for a_store in range(start_index, end_index):
        # Timestamp taken just before the scrape starts
        current_time = datetime.now()
        print(f"The {a_store} store")
        # a_store is a row position, so index positionally rather than by label
        url = df['link'].iloc[a_store]
        restaurant_id = df['id'].iloc[a_store]
        try:
            store_name, items, scraped_id = scrape_menu_item(url, restaurant_id, driver_path)
        except Exception as e:
            # One bad store must not abort the rest of the chunk; record it as failed
            print(f"Error scraping {url}: {e}")
            store_name, items, scraped_id = None, None, restaurant_id
        records.append((store_name, items, scraped_id, current_time))
    return records


# Divide URLs into chunks and process them in parallel
#=========
chunk_size = 100  # Adjust chunk size as needed
start = 0
num_urls = df.shape[0]
#=========

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    futures = []
    for start_index in range(start, num_urls, chunk_size):
        end_index = min(start_index + chunk_size, num_urls)
        futures.append(executor.submit(process_chunk, start_index, end_index))

    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Collect results in submission order. result() re-raises whatever a worker raised,
# so a failed chunk is reported here instead of disappearing silently.
records = []
for future in futures:
    try:
        records.extend(future.result())
    except Exception as e:
        print(f"Chunk failed: {e}")

# Parallel lists, one entry per scraped store, all in the same order
store_name_list = [record[0] for record in records]
item_list = [record[1] for record in records]
id_list = [record[2] for record in records]
time_list = [record[3] for record in records]

# Create a new DataFrame to store the scraped data
scraped_data = pd.DataFrame({
    'Store_name': store_name_list,
    'Items': item_list,
    'id': id_list,
    'update_time': time_list
})

# Save DataFrame to JSON
scraped_data.to_json('restaurant_menu0425.json', orient='records')

In [None]:
# One row per restaurant id: keep the first row seen for each id, then renumber from 0
uni_df = scraped_data.drop_duplicates(subset='id').reset_index(drop=True)
uni_df

In [None]:
# ===Merge to the original df===
# Left join on 'id' keeps every row of df, whether or not it has a scraped match
merged_df = df.merge(uni_df, how='left', on='id')
# Rows whose Store_name is null, and their index labels (taken before de-duplication)
null_rows = merged_df.loc[merged_df['Store_name'].isnull()]
not_scraped_list = null_rows.index
# Keep a single row per restaurant id and write the result out
merged_df = merged_df.drop_duplicates(subset='id')
merged_df.to_csv('restaurant_menu0418.csv', index=False)

In [None]:
# ===For those with error===
# Restaurants whose Store_name is still null, renumbered from 0 for the re-scrape pass
re_scrape_df = merged_df.loc[merged_df['Store_name'].isna()].reset_index(drop=True)
re_scrape_df

In [None]:
# ===Re Scrape===
# Resolve the ChromeDriver executable once; every scrape starts its own browser from this path
driver_path = ChromeDriverManager().install()


def process_chunk(start_index, end_index):
    """Scrape the stores at row positions ``start_index`` .. ``end_index - 1`` of ``re_scrape_df``.

    Returns:
        A list with one ``(store_name, items, restaurant_id, started_at)``
        tuple per store, in row order. Results are returned rather than
        appended to shared lists: several workers appending to four parallel
        lists can interleave, pairing one store's name with another's items.
    """
    records = []
    for a_store in range(start_index, end_index):
        # Timestamp taken just before the scrape starts
        current_time = datetime.now()
        print(f"The {a_store} store")
        # a_store is a row position, so index positionally rather than by label
        url = re_scrape_df['link'].iloc[a_store]
        restaurant_id = re_scrape_df['id'].iloc[a_store]
        try:
            # driver_path must be passed explicitly: scrape_menu_item requires it
            store_name, items, scraped_id = scrape_menu_item(url, restaurant_id, driver_path)
        except Exception as e:
            # One bad store must not abort the rest of the chunk; record it as failed
            print(f"Error scraping {url}: {e}")
            store_name, items, scraped_id = None, None, restaurant_id
        records.append((store_name, items, scraped_id, current_time))
    return records


# Divide URLs into chunks and process them in parallel
#=========
chunk_size = 50  # Adjust chunk size as needed
start = 0
num_urls = re_scrape_df.shape[0]
#=========

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = []
    for start_index in range(start, num_urls, chunk_size):
        end_index = min(start_index + chunk_size, num_urls)
        futures.append(executor.submit(process_chunk, start_index, end_index))

    # Wait for all tasks to complete
    concurrent.futures.wait(futures)

# Collect results in submission order. result() re-raises whatever a worker raised,
# so a failed chunk is reported here instead of disappearing silently.
records = []
for future in futures:
    try:
        records.extend(future.result())
    except Exception as e:
        print(f"Chunk failed: {e}")

# Parallel lists, one entry per re-scraped store, all in the same order
store_name_list = [record[0] for record in records]
item_list = [record[1] for record in records]
id_list = [record[2] for record in records]
time_list = [record[3] for record in records]

# Create a new DataFrame to store the scraped data
rescraped_data = pd.DataFrame({
    'Store_name': store_name_list,
    'Items': item_list,
    'id': id_list,
    'update_time': time_list
})

# Append the re-scraped rows after the first-pass rows
scraped_data = pd.concat([scraped_data, rescraped_data], axis=0)

# Save DataFrame to JSON
scraped_data.to_json('restaurant_menu0418.json', orient='records')

In [None]:
# A store that failed on the first pass and succeeded on the re-scrape has two rows in
# scraped_data, the failed one (null Store_name) first. drop_duplicates keeps the first
# row per id, so order the successful rows first or the re-scraped result is thrown away.
# The mask is a plain array so it is applied by position; index labels repeat after concat.
has_store_name = scraped_data['Store_name'].notna().to_numpy()
uni_df = (
    pd.concat([scraped_data[has_store_name], scraped_data[~has_store_name]])
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)

# Parse each store's raw item rows into dish dicts ([] when nothing was scraped)
new_menu_list = [new_menu(items) for items in uni_df['Items']]
uni_df['new menu'] = new_menu_list

# ===Merge to the original df===
merged_df = pd.merge(df, uni_df, on = 'id', how = 'left')
# Rows still without a store name, and their index labels (taken before de-duplication)
null_rows = merged_df[merged_df['Store_name'].isnull()]
not_scraped_list = null_rows.index
merged_df = merged_df.drop_duplicates(subset='id').reset_index(drop=True)
merged_df.to_csv('restaurant_menu0418.csv', index = False)
merged_df

In [None]:
def dish_dict(text_list):
    """Convert one scraped menu row into a dish record.

    Args:
        text_list: 5-element row as produced by ``scrape_menu_item``. Index 0
            is the section name, index 2 is a description candidate and
            index 4 is the item's raw multi-line text.

    Returns:
        dict with keys ``section``, ``product``, ``price``,
        ``discounted_price`` and ``description``. ``price`` is the highest and
        ``discounted_price`` the lowest ``NT$`` amount found in the raw text;
        both are ``None`` when the text contains no amount.
    """
    section = text_list[0]
    lines = text_list[4].split('\n')
    # Name: first line of the raw text
    item_name = lines[0]

    # Prices: lines that start with "NT$" once any '-' is ignored; the currency
    # marker and thousands separators are stripped before conversion
    prices = [
        int(line.replace("NT$", '').replace(',', ''))
        for line in lines
        if line.replace('-', '').startswith('NT$')
    ]
    # max/min cover every count of price lines; previously zero or more than two
    # price lines left both variables unbound and raised UnboundLocalError
    org_price = max(prices) if prices else None
    disc_price = min(prices) if prices else None

    # Description: index 2 only counts when it is not itself a price line
    candidate = text_list[2]
    descrip = candidate if candidate and 'NT$' not in candidate else None

    return {
        'section': section,
        'product': item_name,
        'price': org_price,
        'discounted_price': disc_price,
        'description': descrip
    }

def new_menu(menu):
    """Convert a store's scraped rows into a list of dish records.

    Args:
        menu: List of 5-element rows for one store, or ``None`` / an empty
            list when nothing was scraped.

    Returns:
        One ``dish_dict`` result per row, in order; ``[]`` for an empty or
        missing menu.
    """
    # Covers both None (failed scrape) and an empty menu
    if not menu:
        return []
    dishes = []
    for dish in menu:
        # Printed before parsing so the offending row is visible if dish_dict raises
        print(dish)
        dishes.append(dish_dict(dish))
    return dishes

# new_menu_list = []
# for i in range(uni_df.shape[0]):
#     new_menu_list.append(new_menu(uni_df['Items'][i]))

# uni_df['new menu'] = new_menu_list