In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep

def save_data_to_excel(df, output_path):
    # 将数据保存到 Excel 文件
    df.to_excel(output_path, index=False)
    print(f'数据已保存到 {output_path}')

def initialize_driver():
    # 设置 ChromeDriver 的路径
    driver_path = '/Users/zhuyenan/Documents/jupyter/geckodriver'
    # 创建 Service 对象
    service = Service(driver_path)
    # 创建 Chrome 浏览器对象
    driver = webdriver.Firefox(service=service)
    driver.implicitly_wait(3)
    return driver

def get_user_inputs():
    additional_info = "GDP"
    target_text = "中国统计年鉴"
    return additional_info, target_text


def build_url(base_url, additional_info):
    # 构建完整的 URL
    return f"{base_url}{additional_info}"

def setup_initial_page(driver, url):
    # 打开初始页面
    driver.get(url)
    driver.switch_to.window(driver.window_handles[-1])
    sleep(5)

def configure_search(driver):
    # 选择精确搜索
    model = driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[1]/div[1]/div[1]/div')
    model.click()
    model_select = driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[1]/div[1]/div[1]/div/div')
    model_select.find_element(By.XPATH, "//li/a[text()='模糊']").click()

    # 设置起始年份为2011年
    start = driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[1]/div[1]/div[3]/div[1]/div')
    start.click()
    driver.execute_script("window.scrollBy(0, 80);")
    sleep(2)
    start_select = driver.find_element(By.CSS_SELECTOR, 'div.valueSearch_primary-select__1Be9L:nth-child(2) > ul:nth-child(2)')
    start_select.find_element(By.XPATH, "//li/a[text()='2011']").click()

    # 设置终止年份为2021年 76=2023，
    driver.execute_script("window.scrollBy(0, 80);")
    sleep(2)
    end = driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[1]/div[1]/div[3]/div[2]/div')
    end.click()
    end_select = driver.find_element(By.CSS_SELECTOR, 'div.valueSearch_primary-select__1Be9L:nth-child(4) > ul:nth-child(2)')
    end_select.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[1]/div[1]/div[3]/div[2]/ul/li[76]').click()

    # 设置每页显示50条记录
    content_number = driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[1]/div[2]/span/span[3]')
    content_number.click()
    sleep(3)

def get_info(driver, target_text):
    df_all = pd.DataFrame()
    
    try:
        driver.execute_script("window.scrollBy(0, 300);")
        # 定位到包含<li>的<ul>
        resources = driver.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[1]/div/div[2]/div[3]/div')
        resources.click()
        
        # 定位包含<li>标签的<ul>
        ul_element = driver.find_element(By.CSS_SELECTOR, "ul.valueSearch_panel-body__1Mejz.valueSearch_panel-all__27BTp")
        li_elements = ul_element.find_elements(By.TAG_NAME, "li")  # 找到所有的<li>元素
        found = False

        # 遍历<li>元素，匹配文本
        for li in li_elements:
            li_text = li.text.strip()  # 获取文本并去掉空白字符
            if target_text == li_text:
                li.click()  # 点击匹配的<li>元素
                sleep(2)
                found = True
                break

        if not found:
            print(f"未找到与 '{target_text}' 完全匹配的<li>节点。")

    except Exception as e:
        print(f"选择节点时出错: {e}")


    # 循环获取所有页面数据
    while True:
        try:
            sleep(2)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'tbody.valueSearch_s-tab-tbody__1QPXw')))
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            tbody = soup.find('tbody', class_='valueSearch_s-tab-tbody__1QPXw')

            rows = []
            for tr in tbody.find_all('tr'):
                tds = tr.find_all('td')
                try:
                    row = {
                        '序号': tds[0].get_text(),
                        '时间': tds[1].get_text(),
                        '地区': tds[2].get_text(),
                        '指标': tds[3].get_text(),
                        '数值': tds[4].get_text(),
                        '单位': tds[5].get_text(),
                        '来源': tds[6].get_text(),
                        '页码': tds[7].get_text(),
                    }
                    rows.append(row)
                except:
                    pass
            df = pd.DataFrame(rows)
            print(df)

            df_all = pd.concat([df_all, df], ignore_index=True)
            sleep(2)

            try:
                next_button = driver.find_element(By.CLASS_NAME, "btn-next")
                next_button.click()
                sleep(7)
            except:
                break

        except Exception as e:
            print(f"数据提取时出错: {e}")
            break

    return df_all

def main():
    base_url = "https://data.cnki.net/valueSearch/index?ky="
    additional_info, target_text = get_user_inputs()
    url = build_url(base_url, additional_info)
    cities = [
        '陕西', '山西', '广西', '山东'
    ]

    driver = initialize_driver()
    setup_initial_page(driver, url)
    configure_search(driver)

    all_dfs = []
    for count, city in enumerate(cities, 1):
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        search_box = driver.find_element(By.XPATH, '//*[@id="root"]/div[2]/div[1]/div/div[1]/div[1]/div[2]/input')
        search_box.clear()
        sleep(1)
        search_box.send_keys(city)
        sleep(1)
        driver.find_element(By.XPATH, '//*[@id="root"]/div[2]/div[1]/div/div[1]/div[1]/div[5]/span').click()
        sleep(2)

        df_all = get_info(driver, target_text)
        all_dfs.append(df_all)

        print(f'{city} 数据获取完成。({count}/{len(cities)})')

    final_df = pd.concat(all_dfs, ignore_index=True)
    final_df['数值'] = pd.to_numeric(final_df['数值'], errors='coerce')
    final_df.dropna(inplace=True)

    output_file_path = f"/Users/zhuyenan/Downloads/{additional_info}.xlsx"
    save_data_to_excel(final_df, output_file_path)

    driver.quit()

if __name__ == "__main__":
    main()


未找到与 '中国统计年鉴' 完全匹配的<li>节点。
数据提取时出错: Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

陕西 数据获取完成。(1/4)
选择节点时出错: Message: Unable to locate element: ul.valueSearch_panel-body__1Mejz.valueSearch_panel-all__27BTp; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

数据提取时出错: Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteErro

KeyError: '数值'