In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# -------------------------- Global configuration --------------------------
# Absolute Windows paths of the parquet inputs read below and of the
# directory the candidate-pool CSV is written to.
WIDETABLE_PATH = "D:\\workspace\\xiaoyao\\data\\widetable.parquet"
INDUSTRY_INDEX_PATH = "D:\\workspace\\xiaoyao\\data\\sw_all_levels_index_v2.parquet"
CONCEPT_INDEX_PATH = "D:\\workspace\\xiaoyao\\data\\concept_index_v2.parquet"
OUTPUT_DIR = "D:\\workspace\\xiaoyao\\stock_pool"
# Side effect when this code runs: create the output directory (no error if
# it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Stock-selection parameters
TOP_N_INDUSTRY = 10  # keep the top 10 industries
TOP_N_CONCEPT = 10   # keep the top 10 concepts
TARGET_LEVEL = "L2"  # industry classification level to rank

# -------------------------- Utility: concept-field normalisation --------------------------
def _is_missing_scalar(value):
    """Return True when *value* is a scalar null (None, NaN, pd.NA, NaT)."""
    # pd.isna() returns an element-wise array for list-likes, which cannot be
    # used in a boolean context, so only scalars are passed to it.
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def process_concept_field(concept_value):
    """Normalise a raw concept field into a list of non-empty, stripped strings.

    Accepted inputs:

    * ``numpy.ndarray``, ``list`` or ``tuple``: every element is converted
      with ``str()`` and stripped; null elements (None/NaN) and elements that
      are empty after stripping are dropped.
    * ``str``: split on any of ``, | ; ， 、 ；`` (mixed separators are all
      honoured). A plain space is used as the separator only when none of
      those punctuation separators occurs, so names containing spaces
      survive inside a punctuation-delimited string.
    * scalar null (None, NaN, ...): returns ``[]``.
    * anything else: ``[str(value).strip()]``, or ``[]`` if that is empty.

    Returns:
        list[str]: the cleaned concept names, in their original order.
    """
    if concept_value is None:
        return []

    # Sequence inputs are handled before any pd.isna() call: pd.isna() on a
    # multi-element list yields an array whose truth value is ambiguous.
    if isinstance(concept_value, np.ndarray):
        concept_value = concept_value.tolist()
    if isinstance(concept_value, (list, tuple)):
        cleaned = []
        for item in concept_value:
            if _is_missing_scalar(item):
                continue
            text = str(item).strip()
            if text:
                cleaned.append(text)
        return cleaned

    if isinstance(concept_value, str):
        # Fold every punctuation separator into "," so that strings mixing
        # several separators are split completely.
        normalised = concept_value
        for sep in ("|", ";", "，", "、", "；"):
            normalised = normalised.replace(sep, ",")
        delimiter = "," if "," in normalised else " "
        return [part.strip() for part in normalised.split(delimiter) if part.strip()]

    if _is_missing_scalar(concept_value):
        return []

    # Any other type: fall back to its string representation.
    text = str(concept_value).strip()
    return [text] if text else []


# -------------------------- 1. Industry module: find the hot industries --------------------------
def load_industry_data():
    """Load the industry index parquet file as a sorted DataFrame.

    The ``trade_date`` column is converted with ``pd.to_datetime`` and the
    rows are ordered by level, industry name and trade date.
    """
    sort_keys = ["level", "industry_name", "trade_date"]
    industry_index = pd.read_parquet(INDUSTRY_INDEX_PATH).assign(
        trade_date=lambda frame: pd.to_datetime(frame["trade_date"])
    )
    return industry_index.sort_values(by=sort_keys)

def get_top_industries(industry_df, top_n=10, level="L2"):
    """Return the highest-``std_pct`` industries on the latest trade date.

    Args:
        industry_df: frame with ``trade_date``, ``level``, ``industry_name``
            and ``std_pct`` columns.
        top_n: number of industry names to return.
        level: value of the ``level`` column to restrict the ranking to.

    Returns:
        tuple: ``(industry_names, latest_date)`` where ``latest_date`` is the
        maximum ``trade_date`` of the whole frame (all levels).
    """
    latest_date = industry_df["trade_date"].max()

    on_latest_date = industry_df["trade_date"] == latest_date
    at_level = industry_df["level"] == level
    ranked = industry_df.loc[on_latest_date & at_level].sort_values(
        by="std_pct", ascending=False
    )

    top_names = ranked["industry_name"].iloc[:top_n].tolist()
    return top_names, latest_date


# -------------------------- 2. Concept module: find the hot concepts --------------------------
def load_concept_data():
    """Load the concept index parquet file as a sorted DataFrame.

    The ``trade_date`` column is converted with ``pd.to_datetime`` and the
    rows are ordered by concept name and trade date.
    """
    sort_keys = ["concept_name", "trade_date"]
    concept_index = pd.read_parquet(CONCEPT_INDEX_PATH).assign(
        trade_date=lambda frame: pd.to_datetime(frame["trade_date"])
    )
    return concept_index.sort_values(by=sort_keys)

def get_top_concepts(concept_df, top_n=10):
    """Return the best-scoring concepts on the latest trade date.

    Each concept row of the latest date gets a ``total_score`` made of three
    weighted parts: ``std_pct`` (heat, weight 0.5),
    ``1 - max_weight_ratio / 100`` (dispersion, weight 0.3) and
    ``log1p(component_count)`` (component count, weight 0.2).

    Args:
        concept_df: frame with ``trade_date``, ``concept_name``, ``std_pct``,
            ``max_weight_ratio`` and ``component_count`` columns. It is not
            modified.
        top_n: number of concept names to return.

    Returns:
        tuple: ``(concept_names, latest_date)``.
    """
    latest_date = concept_df["trade_date"].max()
    snapshot = concept_df.loc[concept_df["trade_date"] == latest_date].copy()

    heat_part = snapshot["std_pct"] * 0.5
    dispersion_part = (1 - snapshot["max_weight_ratio"] / 100) * 0.3
    size_part = np.log1p(snapshot["component_count"]) * 0.2
    # Summed left to right in the same order as the weights are listed.
    snapshot["total_score"] = heat_part + dispersion_part + size_part

    ranked = snapshot.sort_values(by="total_score", ascending=False)
    top_names = ranked["concept_name"].iloc[:top_n].tolist()
    return top_names, latest_date


# -------------------------- 3. Core module: candidate mining and hot-spot match counting --------------------------
def 挖掘_candidate_stocks(target_date, top_industries, top_concepts):
    """Find stocks sitting in both a hot industry and a hot concept.

    Reads the wide table from ``WIDETABLE_PATH``, keeps the rows of
    ``target_date``, normalises each stock's concept list with
    ``process_concept_field`` and matches every (industry, concept) pair of
    the two hot lists. Each stock is then summarised with the number of pairs
    it matched.

    Args:
        target_date: date compared for equality against the ``date`` column
            (after ``pd.to_datetime``).
        top_industries: iterable of industry names, compared against
            ``sw_l2_industry_name``.
        top_concepts: iterable of concept names looked up in each stock's
            cleaned concept list.

    Returns:
        pandas.DataFrame | None: one row per matched stock with the columns
        排名, 股票代码, 股票名称, 匹配次数, 匹配组合, 所属行业, 换手率(%),
        收盘价, 全部概念, ordered by match count (descending) and then stock
        code so the ranking is reproducible. ``None`` (after printing a
        diagnostic) when the date has no rows, no stock has a usable concept
        list, or no pair matches.
    """
    # Load the raw data (only the columns that are needed).
    df = pd.read_parquet(
        WIDETABLE_PATH,
        columns=["date", "stock_code", "stock_name",
                 "sw_l2_industry_name", "concept_name_list", "close",
                 "pre_close", "volume", "turnover_ratio"]
    )
    df["date"] = pd.to_datetime(df["date"])
    target_data = df[df["date"] == target_date].copy()
    if target_data.empty:
        print(f"❌ 无{target_date}的股票数据")
        return None

    # Normalise the concept field.
    print("\n=== 概念字段处理中 ===")
    target_data["clean_concepts"] = target_data["concept_name_list"].apply(process_concept_field)

    # Drop stocks without any usable concept.
    has_concepts = target_data["clean_concepts"].apply(lambda x: len(x) > 0)
    if not has_concepts.any():
        print("❌ 无有效概念的股票数据")
        # target_data still holds every row of the target date at this point,
        # so the diagnostics do not need to filter the full table again.
        total = len(target_data)
        null_count = target_data["concept_name_list"].isna().sum()
        print(f"统计：目标日期共{total}只股票，其中{null_count}只为空值")
        return None
    target_data = target_data[has_concepts].copy()

    # Match every industry/concept pair and remember which pair matched.
    stock_matches = []
    for industry in top_industries:
        # The industry filter does not depend on the concept: apply it once.
        industry_rows = target_data[target_data["sw_l2_industry_name"] == industry]
        if industry_rows.empty:
            continue
        for concept in top_concepts:
            in_concept = industry_rows["clean_concepts"].apply(lambda x: concept in x)
            stocks = industry_rows[in_concept].copy()
            if len(stocks) > 0:
                stocks["匹配组合"] = f"{industry}+{concept}"
                stock_matches.append(stocks)
                print(f"匹配 {industry}+{concept}：{len(stocks)} 只股票")

    if not stock_matches:
        print("❌ 无交集股票")
        print(f"\n热门行业：{top_industries}")
        print(f"热门概念：{top_concepts}")
        print("\n部分股票的有效概念示例：")
        sample = target_data[["stock_code", "stock_name", "clean_concepts"]].head(5)
        for _, row in sample.iterrows():
            print(f"{row['stock_code']}({row['stock_name']})：{row['clean_concepts']}")
        return None

    # Merge all matches.
    all_matches = pd.concat(stock_matches, ignore_index=True)

    # Per-stock summary. Named aggregation binds every output column to its
    # source column and function explicitly, instead of renaming a flattened
    # MultiIndex by position.
    stock_stats = (
        all_matches.groupby(["stock_code", "stock_name"])
        .agg(**{
            "匹配次数": ("匹配组合", "count"),
            # De-duplicated list of all matched pairs.
            "匹配组合": ("匹配组合", lambda x: "|".join(x.unique())),
            "收盘价": ("close", "first"),
            "换手率(%)": ("turnover_ratio", "first"),
            "所属行业": ("sw_l2_industry_name", "first"),
            "全部概念": ("clean_concepts", "first"),
        })
        .reset_index()
        .rename(columns={"stock_code": "股票代码", "stock_name": "股票名称"})
    )

    # Highest match count first; the stock code breaks ties so that equal
    # counts always come out in the same order.
    stock_stats = stock_stats.sort_values(
        ["匹配次数", "股票代码"], ascending=[False, True]
    ).reset_index(drop=True)
    stock_stats["排名"] = range(1, len(stock_stats) + 1)

    # Final column order.
    stock_stats = stock_stats[
        ["排名", "股票代码", "股票名称", "匹配次数", "匹配组合",
         "所属行业", "换手率(%)", "收盘价", "全部概念"]
    ]

    return stock_stats


# -------------------------- 4. Main: build the candidate pool and save it --------------------------
def generate_candidate_pool():
    """Build the candidate stock pool, write it to CSV and print a summary.

    Steps: rank the hot industries and hot concepts, mine the stocks that
    match both via ``挖掘_candidate_stocks`` (using the latest industry trade
    date), save the result under ``OUTPUT_DIR`` and print the top 10 rows and
    match-count statistics. Returns ``None``; returns early, without writing
    a file, when no candidate is found.
    """
    # 1. Load the data and rank hot industries / concepts.
    industry_df = load_industry_data()
    top_industries, trade_date = get_top_industries(
        industry_df,
        top_n=TOP_N_INDUSTRY,
        level=TARGET_LEVEL
    )
    print(f"热门行业TOP{TOP_N_INDUSTRY}：{top_industries}")

    concept_df = load_concept_data()
    top_concepts, concept_date = get_top_concepts(concept_df, top_n=TOP_N_CONCEPT)
    print(f"热门概念TOP{TOP_N_CONCEPT}：{top_concepts}")

    # The two index files are ranked independently, each on its own latest
    # date. Stocks are selected on the industry date, so make a mismatch
    # visible instead of silently mixing two trading days.
    if concept_date != trade_date:
        print(f"⚠️ 概念数据最新日期（{concept_date}）与行业数据最新日期（{trade_date}）不一致，候选股按行业数据日期筛选")

    # 2. Mine the candidates and count their hot-spot matches.
    candidate_pool = 挖掘_candidate_stocks(trade_date, top_industries, top_concepts)
    if candidate_pool is None:
        return

    # 3. Save the candidate pool (utf-8-sig: UTF-8 with a byte-order mark).
    output_path = os.path.join(OUTPUT_DIR, f"candidate_pool_with_hot_count_{trade_date.strftime('%Y%m%d')}.csv")
    candidate_pool.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"\n✅ 候选股池生成完成（共{len(candidate_pool)}只股票），路径：{output_path}")
    print("\n候选股池前10名（按热点匹配次数排序）：")
    print(candidate_pool.head(10)[["排名", "股票代码", "股票名称", "匹配次数", "匹配组合"]].to_string(index=False))

    # 4. Print summary statistics.
    print("\n=== 统计信息 ===")
    print(f"平均每只股票匹配热点次数：{candidate_pool['匹配次数'].mean():.2f}")
    print(f"匹配次数最多：{candidate_pool['匹配次数'].max()}次")
    print("匹配次数分布：")
    print(candidate_pool["匹配次数"].value_counts().sort_index(ascending=False))


# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_candidate_pool()

SyntaxError: invalid syntax (3694342513.py, line 86)