# 大唐杯题库提取

## 必要的库

In [None]:
import os
from bs4 import BeautifulSoup
import pandas as pd

## 读取

In [54]:
# Load the exported question-bank HTML page and parse it.
# The path is built with os.path.join so the cell works on Windows (where it
# still yields '.\Qbank.html', exactly as before) and on POSIX systems, where
# a literal backslash would be treated as part of the file name.
html_file = os.path.join('.', 'Qbank.html')
with open(html_file, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Accumulates one dict per extracted question (appended to by the extraction loop).
data = []

## 特征数据蒸馏

In [55]:

# Markup constants used while walking the parsed page.
_RICH_TEXT_ATTRS = {'data-rich-text': '1'}
_OPTION_LETTERS = ['A', 'B', 'C', 'D']


def _rich_text(node):
    """Return the stripped text of the first rich-text span under *node*.

    Returns None when *node* contains no ``<span data-rich-text="1">``, so the
    caller can tell "span missing" apart from "span present but empty".
    """
    span = node.find('span', _RICH_TEXT_ATTRS)
    return span.get_text(strip=True) if span else None


def _extract_answer_and_key_point(question):
    """Return ``(answer_text, key_point)`` for a question container.

    Both values default to '无' when the review section, the labelled row, or
    the value span is missing, or when the value text is empty.
    """
    answer_text = '无'
    key_point = '无'

    # NOTE(review): find_next searches forward through the rest of the
    # document, so if a question has no review section of its own this may
    # pick up the one belonging to a later question -- confirm against the
    # page structure.
    answer_section = question.find_next('div', class_='yxtulcdsdk-review-tag')
    if not answer_section:
        return answer_text, key_point

    # Rows are identified by the text of their marking tag, not by position.
    for item in answer_section.select('div.mv16'):
        tag = item.find('span', class_='yxtulcdsdk-pc-marking-tag')
        if not tag:
            continue
        value_span = item.find('span', class_='color-gray-9')
        if not value_span:
            # Labelled row without a value: keep the default instead of
            # crashing on a None dereference.
            continue

        label = tag.get_text(strip=True)
        value = value_span.get_text(strip=True) or '无'
        if label == '答案':
            answer_text = value
        elif label == '考点':
            key_point = value

    return answer_text, key_point


def _build_answer_lookup(question_type, answer_text, option_texts):
    """Return the "answer quick lookup" text for one question.

    * True/false questions: the answer itself.
    * Single choice: the text of the option named by the answer letter.
    * Multiple choice: '全选' when all of A-D are selected, otherwise the
      selected option texts joined with ';'.

    '无' is returned whenever no valid A-D letter can be resolved.
    """
    if question_type == '判断':
        return answer_text

    try:
        if question_type == '单选':
            if answer_text in _OPTION_LETTERS:
                return option_texts[_OPTION_LETTERS.index(answer_text)]
            return '无'

        # Multiple choice: the answer letters are separated by '、'.
        selected_answers = [
            ans.strip()
            for ans in answer_text.split('、')
            if ans.strip() in _OPTION_LETTERS
        ]
        if not selected_answers:
            # No recognisable letter (e.g. answer missing): report '无'
            # rather than an empty string, matching the single-choice case.
            return '无'
        if set(selected_answers) == set(_OPTION_LETTERS):
            return '全选'
        return ';'.join(
            option_texts[_OPTION_LETTERS.index(ans)] for ans in selected_answers
        )
    except IndexError:
        # Fewer option texts than the answer letter implies.
        return '无'


# Collect every question container.
question_blocks = soup.select('div.pr.pos_fl12.yxt-col.yxt-col-23')

# Collect every option container (expected to pair one-to-one with the questions).
option_blocks = soup.select('div[role="radiogroup"], div[role="group"]')

# Questions and options are paired purely by position, so a count mismatch
# means some pairs may be wrong; warn and trim both lists to the same length.
if len(question_blocks) != len(option_blocks):
    print("警告：问题数量与选项数量不匹配！请检查HTML结构。")
    min_length = min(len(question_blocks), len(option_blocks))
    question_blocks = question_blocks[:min_length]
    option_blocks = option_blocks[:min_length]

for i, (question, options) in enumerate(zip(question_blocks, option_blocks), start=1):
    # Question stem; a container without one cannot produce a useful row.
    question_text = _rich_text(question)
    if question_text is None:
        print(f"警告：第{i}个问题容器缺少题干文本，已跳过。")
        continue

    # One entry per <label>; a label without a rich-text span yields ''.
    option_texts = [_rich_text(label) or '' for label in options.select('label')]

    answer_text, key_point = _extract_answer_and_key_point(question)

    # Question type: true/false is detected from the answer text, the other
    # two from the ARIA role of the option container.
    if answer_text in ('正确', '错误'):
        question_type = '判断'
    elif options.get('role') == 'radiogroup':
        question_type = '单选'
    else:
        question_type = '多选'

    # Pad to four options (single/multiple choice only) so A-D can be indexed.
    if question_type != '判断':
        option_texts += [''] * (4 - len(option_texts))

    answer_lookup = _build_answer_lookup(question_type, answer_text, option_texts)

    data.append({
        '考点': key_point,
        '题型': question_type,
        '问题': question_text,
        'A选项': option_texts[0] if len(option_texts) > 0 else '',
        'B选项': option_texts[1] if len(option_texts) > 1 else '',
        'C选项': option_texts[2] if len(option_texts) > 2 else '',
        'D选项': option_texts[3] if len(option_texts) > 3 else '',
        '答案': answer_text,
        '答案速查': answer_lookup,
    })

## 去重

In [56]:
# Build a table from the extracted rows.
df = pd.DataFrame(data)

if df.empty:
    # Nothing was extracted: the frame has no '问题' column, so calling
    # duplicated()/drop_duplicates() on it would raise KeyError, and the rate
    # would divide by zero. Report zero duplicates instead.
    duplicates = pd.Series([], dtype=bool)
    duplicate_count = 0
    duplicate_rate = 0.0
else:
    # A row counts as a duplicate when its question text has already appeared;
    # options and answers are not compared.
    duplicates = df.duplicated(subset=['问题'], keep='first')
    duplicate_count = int(duplicates.sum())
    # Rate is relative to the row count BEFORE de-duplication.
    duplicate_rate = duplicate_count / len(df) * 100

    # Keep only the first occurrence of each question.
    df = df.drop_duplicates(subset=['问题'], keep='first')

# Report what was removed.
print(f"发现重复问题数量：{duplicate_count}")
print(f"重复率：{duplicate_rate:.2f}%")

发现重复问题数量：486
重复率：80.33%


## 导出

In [57]:
# Export the de-duplicated table to an Excel workbook (no index column).
# os.path.join keeps the Windows result identical ('.\output.xlsx') while
# producing a real relative path on POSIX systems, where the backslash would
# otherwise become part of the file name.
output_file = os.path.join('.', 'output.xlsx')
df.to_excel(output_file, index=False, engine='openpyxl')
print(f"Process completed! Exported to {output_file}")

Process completed! Exported to .\output.xlsx
