In [18]:
'''
匯入套件
'''
# 操作 browser 的 API
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# ChromeDriver 的下載管理工具
from webdriver_manager.chrome import ChromeDriverManager

# 處理逾時例外的工具
from selenium.common.exceptions import TimeoutException

# 面對動態網頁，等待某個元素出現的工具，通常與 exptected_conditions 搭配
from selenium.webdriver.support.ui import WebDriverWait

# 搭配 WebDriverWait 使用，對元素狀態的一種期待條件，若條件發生，則等待結束，往下一行執行
from selenium.webdriver.support import expected_conditions as EC

# 期待元素出現要透過什麼方式指定，通常與 EC、WebDriverWait 一起使用
from selenium.webdriver.common.by import By

# 取得系統時間的工具
from datetime import datetime

# 強制停止/強制等待 (程式執行期間休息一下)
from time import sleep

# 處理下拉式選單的工具
from selenium.webdriver.support.ui import Select

import pprint
import os

# 隨機取得 User-Agent
from fake_useragent import UserAgent
ua = UserAgent()

# 啟動瀏覽器工具的選項
my_options = webdriver.ChromeOptions()
# my_options.add_argument("--headless")                # 不開啟實體瀏覽器背景執行
my_options.add_argument("--start-maximized")         # 最大化視窗
my_options.add_argument("--incognito")               # 開啟無痕模式
my_options.add_argument("--disable-popup-blocking")  # 禁用彈出攔截
my_options.add_argument("--disable-notifications")   # 取消通知
my_options.add_argument(f'--user-agent={ua.random}') # (Optional)加入 User-Agent

# 使用 Chrome 的 WebDriver
driver = webdriver.Chrome(
    options = my_options,
)

# 儲存資料的變數
list_data = []

# 走訪網址
url = 'https://consumer.fda.gov.tw/Food/TFND.aspx?nodeID=178'
type_index = 18
keyword = '冷凍'

In [19]:
#visit url page
def visit():
    driver.get(url);

#select drop-down menu:
def setDropDownMenu(type_index):
    try:
        #強制等待
        sleep(1)

        # 選擇食物類別
        type = Select(driver.find_element(By.CSS_SELECTOR, 'li > select'))
        type.select_by_index(type_index)
                      
        # 強制等待
        sleep(2)
    except Exception as err:
        print(str(err))

def key_in_search(keyword):
    try:
        search_box = driver.find_element(
            By.CSS_SELECTOR, 'input#ctl00_content_SelectK'
        )
        search_box.send_keys(keyword)

        # 強制等待
        sleep(1)
        
        # 按下查詢
        driver.find_element(
            By.CSS_SELECTOR, 'input#ctl00_content_searchBut'
        ).click()

        # 強制等待
        sleep(2)
    except Exception as err:
            print(str(err))

visit()
setDropDownMenu(type_index)
key_in_search(keyword)

## 開啟分頁

In [20]:
no_of_handles = int(driver.find_element(
            By.CSS_SELECTOR, '#ctl00_content_ListPanel div a:nth-child(7)'
        ).get_attribute('href')[-1])-1

# 連續開 3 個分頁 (javascript)
for i in range(no_of_handles):
    driver.execute_script(f'window.open("");')

In [21]:
# 切換到初始分頁 (索引為 0)
driver.switch_to.window(driver.window_handles[0])

In [22]:
# 將所有 tabs 轉址，以便取得對應列表; 0 不變
for index in range(1, len(driver.window_handles)):
    try:
        #change tabs
        driver.switch_to.window(driver.window_handles[index])
        # 使分頁自動連結到指定網址 (此時的 drive 變數指向切後的分頁)
        driver.get(f'https://consumer.fda.gov.tw/Food/TFND.aspx?nodeID=178&t={type_index}&k={keyword}&p={index}')  
        
        # Use explicit wait instead of sleep
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'td[data-th="樣品名稱"]>a'))
        )
    except Exception as e:
            print(f"Error loading page at tab {index}: {e}")

In [23]:
# 取得分頁列表
for index in range(1, len(driver.window_handles)):
    try:
        #change tabs
        driver.switch_to.window(driver.window_handles[index])

        #取得列表連結與內文
        items = driver.find_elements(By.CSS_SELECTOR, 'td[data-th="樣品名稱"]>a')
        for a in items:
            list_data.append({
                'Food': a.get_attribute('innerText'),
                'Link': a.get_attribute('href')
            })

    except Exception as e:
        print(f"Error extracting data at tab {index}: {e}")

In [24]:
#view list
list_data

[{'Food': '冷凍火腿炒飯',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1753'},
 {'Food': '冷凍蝦仁炒飯',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1770'},
 {'Food': '冷凍筒仔米糕',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1797'},
 {'Food': '冷凍芝麻湯圓',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1755'},
 {'Food': '冷凍花生湯圓',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1756'},
 {'Food': '冷凍豬肉湯圓',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1774'},
 {'Food': '冷凍烏龍麵',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1972'},
 {'Food': '冷凍牛肉水餃',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1784'},
 {'Food': '冷凍豬肉水餃',
  'Link': 'https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1771'},
 {'Food': '冷凍豬肉韭菜水餃',
  'Link': 'https://consum

## 拿資料

In [25]:
# 儲存食物資料跟營養成分
food_info = []
food_nutr = []
selected_items = []

In [26]:
def extract_food_info():
    #driver.get(url_page)
    info = {
        'Food Category 食品分類': driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbFoodCategoryName').get_attribute('innerText'),
        'Data Type 資料類別': driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbDataType').get_attribute('innerText'),
        #'Int No 整合編號': driver.find_elements(By.CSS_SELECTOR, '#ctl00_content_lbIntNo').get_attribute('innerText'),
        'Food Name 樣品名稱': driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbFoodName').get_attribute('innerText'),
        'Other Name 俗名': driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbTrivialName').get_attribute('innerText'),
        'English Name 樣品英文名稱': driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbFoodEName').get_attribute('innerText'),
        'Content 內容物描述': driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbContent').get_attribute('innerText')
    }

    food_info.append(info)

'''
extract_food_info()
'''

'\nextract_food_info()\n'

In [28]:
# Loop through each row and extract '分析項' and '每100克含量'
def extract_food_nutr():
    """Extract food name and nutrition data for selected items from the TFDA website."""
    try:
        # Define the specific '分析項' you're interested in
        selected_items = ["熱量", "粗蛋白", "粗脂肪", "飽和脂肪", "總碳水化合物", "膳食纖維", "糖質總量", "鈉", "膽固醇", "反式脂肪"]

        # Locate rows in a table
        trs = driver.find_elements(By.CSS_SELECTOR, 'table.rwd-table > tbody > tr')    # append food name
        
        # Append food name
        nutr = []
        food_name = driver.find_element(By.CSS_SELECTOR, '#ctl00_content_lbFoodName').get_attribute('innerText')
        nutr.append(food_name)
        for tr in trs:
            tds = tr.find_elements(By.CSS_SELECTOR, 'td.txt_C')
            if len(tds) >= 2:  # Check if there are at least 2 <td> elements
                analysis_item = tds[1].text.strip()
                # Skip the '修正熱量' column
                if analysis_item == "修正熱量":
                    continue  # Skip this iteration if it's "修正熱量"
                if analysis_item in selected_items:
                    # Extract based on selected items
                    nutr.append(tds[3].text.strip())
                    
        # Append the extracted info
        food_nutr.append(nutr)

    except Exception as e:
        print(f"Error extracting data: {e}")
        return []

#execute function and print the nutrition data dictionary
#extract_food_nutr()
#pprint.pprint(food_nutr)

" \n        #取得列表內文\n        analysis_item = row.find_elements(By.CSS_SELECTOR, 'td:nth-child(2)').get_attribute('innerText')\n        if analysis_item in selected_items:\n            content_per_100g = row.find_element(By.CSS_SELECTOR, 'td:nth-child(4)').text.strip()\n\n            # Append the item and its content per 100g to the dictionary\n            food_nutr[analysis_item] = content_per_100g\n"

In [29]:
max_tabs = len(driver.window_handles)

#iterate over list_data (all of frozen food links)
for index, item in enumerate(list_data):
    url_link = item['Link']
    food_name = item['Food']

    # Switch tabs
    tab_index = (index % max_tabs)  # Cycle through tabs
    driver.switch_to.window(
            driver.window_handles[tab_index]
    )
    
    # 使分頁自動連結到指定網址 (此時的 drive 變數指向切後的分頁)
    driver.get(url_link)
        
    extract_food_info()
    extract_food_nutr()

    # You can also process or store the food_name if needed
    print(f"Processing food: {food_name} at {url_link}")
        
    sleep(3)

#check output
pprint.pprint(food_info, sort_dicts=False)
pprint.pprint(food_nutr)

Processing food: 冷凍火腿炒飯 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1753
Processing food: 冷凍蝦仁炒飯 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1770
Processing food: 冷凍筒仔米糕 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1797
Processing food: 冷凍芝麻湯圓 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1755
Processing food: 冷凍花生湯圓 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1756
Processing food: 冷凍豬肉湯圓 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1774
Processing food: 冷凍烏龍麵 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1972
Processing food: 冷凍牛肉水餃 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1784
Processing food: 冷凍豬肉水餃 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1771
Processing food: 冷凍豬肉韭菜水餃 at https://consumer.fda.gov.tw/Food/tfndDetail.aspx?nodeID=178&f=0&id=1766
Processin

## Convert to Dataframe Graph & Save as CSV

In [30]:
import pandas as pd

selected_items = []
selected_items = ["熱量", "粗蛋白", "粗脂肪", "飽和脂肪", "總碳水化合物", "膳食纖維", "糖質總量", "鈉", "膽固醇", "反式脂肪"]
selected_items.insert(0,"樣品名稱")
selected_items


['樣品名稱',
 '熱量',
 '粗蛋白',
 '粗脂肪',
 '飽和脂肪',
 '總碳水化合物',
 '膳食纖維',
 '糖質總量',
 '鈉',
 '膽固醇',
 '反式脂肪']

In [31]:
df = pd.DataFrame(food_nutr, columns=selected_items)
df

Unnamed: 0,樣品名稱,熱量,粗蛋白,粗脂肪,飽和脂肪,總碳水化合物,膳食纖維,糖質總量,鈉,膽固醇,反式脂肪
0,冷凍火腿炒飯,189,5.0,5.6,1.1,29.7,2.0,,220,37,
1,冷凍蝦仁炒飯,148,4.7,3.4,0.5,24.6,2.7,1.3,222,11,13.41
2,冷凍筒仔米糕,212,6.5,5.8,2.2,33.6,0.7,,331,13,
3,冷凍芝麻湯圓,352,4.8,16.3,6.6,46.7,0.9,9.7,3,0,69.29
4,冷凍花生湯圓,350,5.2,15.6,7.3,47.1,0.6,9.8,6,0,66.28
...,...,...,...,...,...,...,...,...,...,...,...
75,冷凍蟹味棒,118,9.3,0.7,0.2,18.6,,3.1,694,,4.80
76,冷凍魚卵卷,115,12.9,0.2,0.1,15.5,,3.7,830,28,6.32
77,冷凍花枝漿,227,11.2,12.6,4.5,17.1,2.0,1.8,637,188,
78,冷凍花枝羹,126,11.4,4.7,1.8,9.4,,,448,80,


In [32]:
df.shape

(80, 11)

In [33]:
df.to_csv('FrozenFood_ExtractedNutrInfo.csv', index=False)

## 關閉分頁及browser

In [34]:
# 讀最後一頁開始，把所有分頁關掉 (初始頁要保留)
while len(driver.window_handles) > 1:
    #change tabs
    driver.switch_to.window(
        driver.window_handles[ len(driver.window_handles) - 1 ]
    )

    #close tabs
    driver.close()

In [35]:
#close browser
driver.quit()