In [6]:
import pandas as pd

df = pd.read_parquet(r'D:\workspace\xiaoyao\data\factortable.parquet')
df.head()


Unnamed: 0,date,stock_code,open,close,low,high,volume,money,factor,high_limit,...,order_book_volume_ratio,obv_ratio_vs_yesterday,obv_ratio_vs_5d_avg,turnover_ratio_vs_yesterday,turnover_ratio_vs_5d_avg,money_ratio_vs_yesterday,money_ratio_vs_5d_avg,amplitude,amplitude_vs_yesterday,amplitude_vs_5d_avg
0,2025-01-02,000001.XSHE,1630.12,1588.43,1582.87,1635.68,1309344.0,2102923000.0,138.970157,1788.55,...,1.106891,,,,,,,3.247947,,
1,2025-01-03,000001.XSHE,1589.82,1581.48,1578.7,1603.72,830884.0,1320521000.0,138.970157,1746.85,...,2.388428,2.15778,2.15778,0.634531,0.634531,0.627945,0.627945,1.57514,0.484965,0.484965
2,2025-01-06,000001.XSHE,1581.48,1589.82,1559.25,1595.38,781129.0,1234306000.0,138.970157,1739.91,...,6.639049,2.779673,3.798823,0.940168,0.729954,0.934711,0.72109,2.284569,1.450391,0.947347
3,2025-01-07,000001.XSHE,1587.04,1599.55,1580.09,1602.33,538146.0,858329000.0,138.970157,1748.24,...,0.444144,0.066899,0.131476,0.688952,0.55265,0.695394,0.552839,1.398901,0.612326,0.590448
4,2025-01-08,000001.XSHE,1598.16,1598.16,1584.26,1616.22,764471.0,1223599000.0,138.970157,1759.36,...,1.405289,3.164042,0.531375,1.420602,0.883956,1.425559,0.887296,1.998062,1.428309,0.93954


In [7]:
df.columns

Index(['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money',
       'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused',
       'zjw_industry_code', 'zjw_industry_name', 'jq_l1_industry_code',
       'jq_l1_industry_name', 'jq_l2_industry_code', 'jq_l2_industry_name',
       'sw_l1_industry_code', 'sw_l1_industry_name', 'sw_l2_industry_code',
       'sw_l2_industry_name', 'sw_l3_industry_code', 'sw_l3_industry_name',
       'capitalization', 'circulating_cap', 'market_cap',
       'circulating_market_cap', 'turnover_ratio', 'pe_ratio', 'pe_ratio_lyr',
       'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current', 'auc_volume',
       'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p',
       'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v',
       'b4_p', 'b4_v', 'b5_p', 'b5_v', 'return_0d', 'return_1d', 'return_2d',
       'return_3d', 'return_4d', 'return_5d', 'ma5', 'ma10', 'ma20', 'ma60',
       'rsi14', 'macd_line', '

In [9]:
import pandas as pd
import numpy as np

def score_industries_daily(df, industry_type="sw_l1"):
    """
    按每日为每个行业计算得分（无未来数据）
    
    参数：
    df: 原始DataFrame（需包含所有字段，且有date列）
    industry_type: 行业分类类型（sw_l1/zjw/jq_l1）
    
    返回：
    daily_industry_scores: 每日各行业的评分表（含分项分和总分）
    """
    # --------------------------
    # 1. 确定行业分类字段
    # --------------------------
    if industry_type == "sw_l1":
        industry_code = "sw_l1_industry_code"
        industry_name = "sw_l1_industry_name"
    elif industry_type == "zjw":
        industry_code = "zjw_industry_code"
        industry_name = "zjw_industry_name"
    elif industry_type == "jq_l1":
        industry_code = "jq_l1_industry_code"
        industry_name = "jq_l1_industry_name"
    else:
        raise ValueError("industry_type仅支持'sw_l1'/'zjw'/'jq_l1'")

    # 排除停牌股票，确保数据有效性
    df_valid = df[df["paused"] == 0].copy()
    # 确保日期格式正确
    df_valid["date"] = pd.to_datetime(df_valid["date"])

    # --------------------------
    # 2. 按【日期+行业】分组计算每日指标
    # --------------------------
    # 分组键：日期+行业代码+行业名称
    group_keys = ["date", industry_code, industry_name]
    
    # 计算每日行业指标（全部基于当日数据，无未来数据）
    daily_metrics = df_valid.groupby(group_keys).agg({
        # 趋势类指标（当日行业整体趋势）
        "close": "mean",                # 当日行业平均收盘价
        "ma5": "mean",                  # 当日行业平均5日均线
        "ma10": "mean",                 # 当日行业平均10日均线
        "macd_line": "mean",            # 当日行业平均MACD线
        "signal_line": "mean",          # 当日行业平均MACD信号线
        "macd_hist": "mean",            # 当日行业平均MACD柱状图
        "momentum14": "mean",           # 当日行业平均14日动量
        
        # 量能类指标（当日资金流入强度）
        "volume_ratio_vs_yesterday": "mean",  # 当日行业平均成交量比昨日
        "turnover_ratio_vs_yesterday": "mean",# 当日行业平均换手率比昨日
        "auc_volume_ratio_vs_yesterday": "mean",# 当日行业平均竞价量比昨日
        
        # 盘口类指标（当日买盘强度）
        "order_book_volume_ratio": "mean",    # 当日行业平均盘口量比（买/卖）
        "obv_ratio_vs_yesterday": "mean",     # 当日行业平均盘口量比与昨日比
        
        # 估值类指标（当日估值安全性）
        "rsi14": "mean",                # 当日行业平均RSI14
        "pe_ratio": "mean"              # 当日行业平均动态市盈率
    }).reset_index()

    # 重命名字段便于理解
    daily_metrics.columns = [
        "date", "ind_code", "ind_name",
        # 趋势类
        "avg_close", "avg_ma5", "avg_ma10",
        "avg_macd_line", "avg_signal_line", "avg_macd_hist",
        "avg_momentum14",
        # 量能类
        "avg_volume_ratio", "avg_turnover_ratio", "avg_auc_ratio",
        # 盘口类
        "avg_order_book_ratio", "avg_obv_ratio",
        # 估值类
        "avg_rsi14", "avg_pe"
    ]

    # --------------------------
    # 3. 按日计算行业得分（每日独立评分）
    # --------------------------
    # 3.1 趋势得分（100分）：当日行业趋势强度
    daily_metrics["trend_score"] = (
        (daily_metrics["avg_close"] > daily_metrics["avg_ma5"]).astype(int) * 20 +
        (daily_metrics["avg_close"] > daily_metrics["avg_ma10"]).astype(int) * 20 +
        (daily_metrics["avg_macd_line"] > daily_metrics["avg_signal_line"]).astype(int) * 30 +
        (daily_metrics["avg_macd_hist"] > 0).astype(int) * 15 +
        (daily_metrics["avg_momentum14"] > 0).astype(int) * 15
    )

    # 3.2 量能得分（100分）：当日资金流入强度
    daily_metrics["volume_score"] = (
        np.clip((daily_metrics["avg_volume_ratio"] - 1) * 100, 0, 30) +
        np.clip((daily_metrics["avg_turnover_ratio"] - 1) * 100, 0, 30) +
        np.clip((daily_metrics["avg_auc_ratio"] - 1) * 100, 0, 20) +
        np.clip((daily_metrics["avg_obv_ratio"] - 1) * 100, 0, 20)
    )

    # 3.3 盘口得分（100分）：当日买盘强度
    daily_metrics["order_score"] = (
        np.clip((daily_metrics["avg_order_book_ratio"] - 1) * 200, 0, 60) +
        (daily_metrics["avg_order_book_ratio"] > 1.2).astype(int) * 40
    )

    # 3.4 估值得分（100分）：当日估值安全性
    daily_metrics["valuation_score"] = (
        np.clip(70 - daily_metrics["avg_rsi14"], 0, 50) +
        np.clip(1 / daily_metrics["avg_pe"] * 1000, 0, 50)  # 市盈率越低得分越高
    )

    # 3.5 当日总得分（加权求和）
    daily_metrics["total_score"] = (
        daily_metrics["trend_score"] * 0.3 +
        daily_metrics["volume_score"] * 0.3 +
        daily_metrics["order_score"] * 0.2 +
        daily_metrics["valuation_score"] * 0.2
    )

    return daily_metrics


def get_top_industries_by_date(daily_scores, target_date, top_n=5):
    """
    获取指定日期的前N个潜力行业
    
    参数：
    daily_scores: 由score_industries_daily生成的每日评分表
    target_date: 目标日期（格式："YYYY-MM-DD" 或 datetime对象）
    top_n: 取前N个行业
    
    返回：
    指定日期的前N个行业及其得分
    """
    # 统一日期格式
    if isinstance(target_date, str):
        target_date = pd.to_datetime(target_date)
    
    # 筛选目标日期的数据
    date_data = daily_scores[daily_scores["date"] == target_date].copy()
    if date_data.empty:
        raise ValueError(f"目标日期 {target_date.strftime('%Y-%m-%d')} 无数据或非交易日")
    
    # 按总得分降序排序，取前N个
    top_industries = date_data.sort_values("total_score", ascending=False).head(top_n)
    
    # 整理输出格式
    result = top_industries[
        ["ind_name", "trend_score", "volume_score", 
         "order_score", "valuation_score", "total_score"]
    ].rename(columns={
        "ind_name": "行业名称",
        "trend_score": "趋势得分",
        "volume_score": "量能得分",
        "order_score": "盘口得分",
        "valuation_score": "估值得分",
        "total_score": "总得分"
    }).reset_index(drop=True)
    
    return result


In [None]:

# --------------------------
# 调用示例
# --------------------------
# 1. 读取数据
df = pd.read_parquet(r'D:\workspace\xiaoyao\data\factortable.parquet')

# 2. 计算每日行业得分（全量日期）
daily_scores = score_industries_daily(df, industry_type="sw_l1")

# 3. 获取指定日期的前5个潜力行业（例如2023-10-10）
target_date = "2025-10-10"
top5 = get_top_industries_by_date(daily_scores, target_date, top_n=5)

# 4. 打印结果
print(f"{target_date} 最可能大涨的前5个行业：")
print(top5)
    

ValueError: 目标日期 2025-10-10 无数据或非交易日