In [5]:
import requests
from bs4 import BeautifulSoup

def translate_tag(source):
    """Map a Chinese section heading to the English key used in the output.

    Rules are tried in this order:

    1. Headings containing "原文" or "爻辞" map to ``"description"``.
    2. A fixed set of headings, compared for exact equality, map to their
       dedicated keys.
    3. Headings containing "哲学含义" map to ``"philosophy"``.
    4. Headings containing "变卦" map to ``None``.
    5. Anything else is returned unchanged.
    """
    # (heading, key) pairs that must match the whole heading exactly.
    exact_matches = (
        ("白话文解释", "interpretation"),
        ("北宋易学家邵雍解", "scholar_interpretation"),
        ("台湾国学大儒傅佩荣解", "scholar_interpretation"),
        ("《断易天机》解", "book_DuanYi"),
        ("传统解卦", "traditional"),
    )

    if any(marker in source for marker in ("原文", "爻辞")):
        return "description"

    for heading, key in exact_matches:
        if source == heading:
            return key

    if "哲学含义" in source:
        return "philosophy"

    return None if "变卦" in source else source

def parse_gua(url, timeout=30):
    """Download one hexagram page and extract its sections.

    Args:
        url: Absolute URL of the hexagram page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        A tuple ``(gua_id, sections)``.

        ``gua_id`` is the part of the page title before the first ``_`` with
        "周易第" and "卦" removed.

        ``sections`` maps ``"general"`` (first block) and ``"line_1"``,
        ``"line_2"``, ... (following blocks) to ``{key: text}`` dicts. Each
        key is ``translate_tag`` applied to a ``<strong>`` heading; the text
        is everything between that heading and the next ``<strong>`` sibling,
        joined with newlines. A key that repeats within one block has its
        texts concatenated.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the page has no title text or no ``div.gua_wp``
            container.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here instead of trying to scrape an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'  # force UTF-8 decoding of the body
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.title.string if soup.title else None
    if not title:
        raise ValueError(f"No <title> text found in {url}")
    gua_id = title.split('_')[0].replace("周易第", "").replace("卦", "")

    gua_wp = soup.find('div', class_='gua_wp')
    if gua_wp is None:
        raise ValueError(f"No <div class='gua_wp'> found in {url}")
    guatts = gua_wp.find_all('div', class_='guatt')
    gua_lss = gua_wp.find_all('div', class_='gualist')

    dic = {}
    # zip() pairs each 'guatt' block with a 'gualist' block and stops at the
    # shorter list; only the 'gualist' content is actually read.
    for section_index, (_guatt, gua_ls) in enumerate(zip(guatts, gua_lss)):
        result = {}
        for strong_tag in gua_ls.find_all('strong'):
            # The heading text (whitespace-stripped) decides the output key.
            key = translate_tag(strong_tag.get_text(strip=True))
            if key is None:
                continue

            value_parts = []
            if key in result:
                # Same key seen earlier in this block: extend its text.
                value_parts.append(result[key])

            # Collect everything after this heading up to the next <strong>.
            for sibling in strong_tag.next_siblings:
                if sibling.name == 'strong':
                    break
                if hasattr(sibling, 'get_text'):
                    text = sibling.get_text(strip=True)
                elif isinstance(sibling, str):
                    text = sibling.strip()
                else:
                    text = ''
                if text:
                    value_parts.append(text)

            result[key] = '\n'.join(value_parts)

        section_key = "general" if section_index == 0 else f"line_{section_index}"
        dic[section_key] = result
    return gua_id, dic



In [2]:
# Collect the site-relative links to the individual hexagram pages from the index page.
url_parent = "https://www.zhouyi.cc/zhouyi/yijing64/"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url_parent, timeout=30)
# Fail loudly on an HTTP error instead of harvesting links from an error page.
response.raise_for_status()
response.encoding = 'utf-8'  # force UTF-8 decoding of the body
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
urls = set()  # a set drops pages that are linked more than once
for link in links:
    url = link.get('href')
    # Keep only links under /zhouyi/yijing64/ that end in .html.
    if url and url.startswith("/zhouyi/yijing64/") and url.endswith(".html"):
        urls.add(url)
print(urls)

{'/zhouyi/yijing64/4127.html', '/zhouyi/yijing64/4168.html', '/zhouyi/yijing64/4183.html', '/zhouyi/yijing64/4190.html', '/zhouyi/yijing64/4170.html', '/zhouyi/yijing64/4153.html', '/zhouyi/yijing64/4182.html', '/zhouyi/yijing64/4184.html', '/zhouyi/yijing64/4113.html', '/zhouyi/yijing64/4110.html', '/zhouyi/yijing64/4197.html', '/zhouyi/yijing64/4149.html', '/zhouyi/yijing64/4188.html', '/zhouyi/yijing64/4109.html', '/zhouyi/yijing64/4141.html', '/zhouyi/yijing64/4194.html', '/zhouyi/yijing64/4179.html', '/zhouyi/yijing64/4145.html', '/zhouyi/yijing64/4189.html', '/zhouyi/yijing64/4148.html', '/zhouyi/yijing64/4186.html', '/zhouyi/yijing64/4193.html', '/zhouyi/yijing64/4173.html', '/zhouyi/yijing64/4164.html', '/zhouyi/yijing64/4174.html', '/zhouyi/yijing64/4167.html', '/zhouyi/yijing64/4180.html', '/zhouyi/yijing64/4143.html', '/zhouyi/yijing64/4103.html', '/zhouyi/yijing64/4255.html', '/zhouyi/yijing64/4140.html', '/zhouyi/yijing64/4144.html', '/zhouyi/yijing64/4106.html', '/zhouyi/

In [3]:
from tqdm.auto import tqdm
import json

# Sort so pages are crawled, and keys written to the JSON file, in a
# reproducible order; iterating a set of strings can differ between sessions.
urls = sorted(urls)
all_gua = {}
for url in tqdm(urls):
    try:
        gua_id, gua_content = parse_gua(f"https://www.zhouyi.cc{url}")
        all_gua[gua_id] = gua_content
        # The file is rewritten after every page (presumably as a checkpoint).
        # ensure_ascii=False emits raw Chinese text, so the file encoding is
        # pinned to UTF-8 rather than left to the platform default.
        with open("yijing.json", "w", encoding="utf-8") as f:
            json.dump(all_gua, f, ensure_ascii=False, indent=2)
    except Exception as e:
        # Best effort: report the failing page and move on to the next one.
        print(f"Error when parsing {url}: {e}")
        continue

  from .autonotebook import tqdm as notebook_tqdm


 14%|█▍        | 9/64 [00:09<01:00,  1.11s/it]

Error when parsing /zhouyi/yijing64/4113.html: Unknown source: 第十卦


 45%|████▌     | 29/64 [00:28<00:32,  1.07it/s]

Error when parsing /zhouyi/yijing64/4103.html: Unknown source: 乾卦所包含的范围是：


100%|██████████| 64/64 [01:01<00:00,  1.04it/s]


In [11]:
# Fetch page 4113.html on its own and merge the result into all_gua.
retry_url = "https://www.zhouyi.cc/zhouyi/yijing64/4113.html"
gua_id, gua_content = parse_gua(retry_url)
all_gua[gua_id] = gua_content

In [12]:
# Persist the collected results. ensure_ascii=False writes raw Chinese
# characters, so the file encoding is pinned to UTF-8 instead of relying on
# the platform default.
with open("yijing.json", "w", encoding="utf-8") as f:
    json.dump(all_gua, f, ensure_ascii=False, indent=2)