In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import r2_score
import matplotlib.font_manager as fm

# ---------------------- 中文显示配置 ----------------------
def setup_chinese_font():
    """配置matplotlib以正确显示中文"""
    try:
        font_names = set()
        font_files = fm.findSystemFonts()
        
        for font_file in font_files:
            try:
                font_prop = fm.FontProperties(fname=font_file)
                font_names.add(font_prop.get_name())
            except:
                continue
        
        chinese_fonts = ["SimHei", "Microsoft YaHei", "Heiti TC", "WenQuanYi Micro Hei", "Arial Unicode MS"]
        available_fonts = [f for f in chinese_fonts if f in font_names]
        
        if available_fonts:
            plt.rcParams["font.family"] = available_fonts
            print(f"使用中文字体: {available_fonts}")
        else:
            print("警告：未找到中文字体，可能会显示乱码。")
            plt.rcParams["font.family"] = ["Arial Unicode MS", "sans-serif"]
    
    except Exception as e:
        print(f"字体配置警告: {str(e)}")
        plt.rcParams["font.family"] = ["SimHei", "Microsoft YaHei", "sans-serif"]
    
    plt.rcParams["axes.unicode_minus"] = False

setup_chinese_font()

# ---------------------- 数据预处理与特征计算 ----------------------
def calculate_sentiment_features(daily_data: pd.DataFrame) -> pd.DataFrame:
    """计算情绪指标的原始特征（用于后续机器学习优化权重）"""
    df = daily_data.copy()
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['date', 'stock_code'])
    
    # 个股级指标
    df['daily_return'] = (df['close'] - df['open']) / df['open'] * 100
    df['is_limit_up'] = (df['close'] >= df['high_limit']).astype(int)
    df['is_limit_down'] = (df['close'] <= df['low_limit']).astype(int)
    df['is_up'] = (df['daily_return'] > 0).astype(int)
    
    # 市场级特征（原始指标，未标准化）
    daily_features = df.groupby('date').agg(
        total_stocks=('stock_code', 'nunique'),
        up_stocks=('is_up', 'sum'),
        down_stocks=('is_up', lambda x: sum(1 - x)),
        limit_up_stocks=('is_limit_up', 'sum'),
        limit_down_stocks=('is_limit_down', 'sum'),
        avg_return=('daily_return', 'mean'),
        total_volume=('volume', 'sum'),
        up_volume=('volume', lambda x: x[df.loc[x.index, 'is_up'] == 1].mean() if sum(df.loc[x.index, 'is_up']) > 0 else 0),
        down_volume=('volume', lambda x: x[df.loc[x.index, 'is_up'] == 0].mean() if sum(1 - df.loc[x.index, 'is_up']) > 0 else 0)
    ).reset_index()
    
    # 计算核心特征（与原逻辑一致，但保留原始值用于后续标准化）
    daily_features['up_down_ratio'] = np.where(
        daily_features['down_stocks'] > 0,
        daily_features['up_stocks'] / daily_features['down_stocks'],
        10
    )
    
    daily_features['limit_spread_ratio'] = (
        daily_features['limit_up_stocks'] + daily_features['limit_down_stocks']
    ) / daily_features['total_stocks']
    
    daily_features['volume_price_fit'] = (
        daily_features['up_volume'] / daily_features['down_volume']
    ).rolling(window=5, min_periods=1).mean()
    
    daily_features['volume_expansion'] = (
        daily_features['total_volume'] / 
        daily_features['total_volume'].rolling(window=20, min_periods=5).mean()
    )
    
    daily_features['price_volatility'] = daily_features['avg_return'].abs()
    
    # 保留原始特征（后续会基于训练集标准化）
    return daily_features

def calculate_rise_ratio(daily_data: pd.DataFrame) -> pd.DataFrame:
    """计算每日上涨股票比例（预测目标）"""
    df = daily_data.copy()
    df['date'] = pd.to_datetime(df['date'])
    df['prev_close'] = df.groupby('stock_code')['close'].shift(1)
    df['is_rise'] = (df['close'] > df['prev_close']).astype(int)
    
    rise_ratio_df = df.groupby('date').agg(
        total_stocks=('stock_code', 'nunique'),
        rise_stocks=('is_rise', 'sum')
    ).reset_index()
    
    rise_ratio_df['rise_ratio'] = rise_ratio_df['rise_stocks'] / rise_ratio_df['total_stocks']
    rise_ratio_df = rise_ratio_df.dropna(subset=['rise_ratio'])
    rise_ratio_df = rise_ratio_df[(rise_ratio_df['rise_ratio'] >= 0) & (rise_ratio_df['rise_ratio'] <= 1)]
    
    return rise_ratio_df[['date', 'rise_ratio']]

# ---------------------- 特征标准化与标签生成 ----------------------
def prepare_training_data(features_df, rise_ratio_df, lag_days=1):
    """
    准备训练数据：
    1. 特征标准化（基于训练集的均值和标准差）
    2. 生成滞后的目标变量（次日上涨比例）
    """
    # 合并特征与目标变量（滞后N天）
    rise_ratio_shifted = rise_ratio_df.copy()
    rise_ratio_shifted['date'] = rise_ratio_shifted['date'] - pd.Timedelta(days=lag_days)
    
    merged_df = pd.merge(
        features_df, 
        rise_ratio_shifted[['date', 'rise_ratio']], 
        on='date', 
        how='inner'
    ).dropna(subset=['rise_ratio'])
    
    # 定义用于预测的特征列（5个核心指标）
    feature_cols = [
        'up_down_ratio', 
        'volume_price_fit', 
        'volume_expansion', 
        'price_volatility', 
        'limit_spread_ratio'
    ]
    
    # 对特征进行特殊标准化（根据指标特性）
    def normalize_feature(series, higher_better=True, train_mean=None, train_std=None):
        """
        标准化特征：
        - 对正向指标（越高越好）：(x - min)/(max - min)
        - 对负向指标（越低越好）：1 - (x - min)/(max - min)
        - 对中性指标（适度最好）：特殊处理
        """
        if train_mean is None:  # 训练集标准化
            min_val = series.min()
            max_val = series.max()
            if max_val == min_val:
                return series.transform(lambda x: 0.5), (min_val, max_val)
            
            if higher_better:
                normalized = (series - min_val) / (max_val - min_val)
            else:
                normalized = 1 - (series - min_val) / (max_val - min_val)
            return normalized, (min_val, max_val)
        else:  # 测试集标准化（使用训练集的min/max）
            min_val, max_val = train_mean
            if max_val == min_val:
                return series.transform(lambda x: 0.5)
            
            if higher_better:
                normalized = (series - min_val) / (max_val - min_val)
            else:
                normalized = 1 - (series - min_val) / (max_val - min_val)
            # 处理测试集超出训练集范围的情况
            return normalized.clip(0, 1)
    
    # 为每个特征定义标准化方式（根据指标特性）
    feature_params = {
        'up_down_ratio': {'higher_better': True},         # 正向指标
        'volume_price_fit': {'higher_better': True},      # 正向指标
        'volume_expansion': {'higher_better': True},      # 正向指标
        'price_volatility': {'higher_better': False},     # 中性指标（特殊处理）
        'limit_spread_ratio': {'higher_better': False}    # 负向指标
    }
    
    # 存储每个特征的标准化参数（用于测试集）
    normalization_params = {}
    
    # 标准化特征
    for col in feature_cols:
        if col == 'price_volatility':
            # 价格波动：适度波动最好（0.3-0.7区间）
            if train_mean is None:
                series = merged_df[col]
                min_val = series.min()
                max_val = series.max()
                normalized = 1 - (series - 0.5).abs() * 2  # 转换为0-1
                merged_df[f'normalized_{col}'] = normalized.clip(0, 1)
                normalization_params[col] = (min_val, max_val)
            else:
                merged_df[f'normalized_{col}'] = 1 - (merged_df[col] - 0.5).abs() * 2
                merged_df[f'normalized_{col}'] = merged_df[f'normalized_{col}'].clip(0, 1)
        else:
            if train_mean is None:
                normalized, params = normalize_feature(
                    merged_df[col], 
                    higher_better=feature_params[col]['higher_better']
                )
                merged_df[f'normalized_{col}'] = normalized
                normalization_params[col] = params
            else:
                normalized = normalize_feature(
                    merged_df[col], 
                    higher_better=feature_params[col]['higher_better'],
                    train_mean=train_mean[col]
                )
                merged_df[f'normalized_{col}'] = normalized
    
    # 提取标准化后的特征列
    normalized_feature_cols = [f'normalized_{col}' for col in feature_cols]
    
    return merged_df, normalized_feature_cols, normalization_params

# ---------------------- 机器学习优化权重 ----------------------
def optimize_sentiment_weights(train_data, normalized_feature_cols):
    """
    使用线性回归（带正则化）优化特征权重，目标是最大化对次日上涨比例的预测能力
    """
    # 特征矩阵（X）和目标变量（y）
    X_train = train_data[normalized_feature_cols]
    y_train = train_data['rise_ratio']
    
    # 使用带L2正则化的Ridge回归（避免过拟合）
    # 时间序列交叉验证确定最优正则化参数
    best_alpha = 0
    best_score = -np.inf
    alphas = [0.001, 0.01, 0.1, 1, 10, 100]
    
    tscv = TimeSeriesSplit(n_splits=5)  # 时间序列交叉验证（避免数据泄露）
    
    for alpha in alphas:
        model = Ridge(alpha=alpha, random_state=42)
        scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
        mean_score = np.mean(scores)
        
        if mean_score > best_score:
            best_score = mean_score
            best_alpha = alpha
    
    # 用最优参数训练模型
    final_model = Ridge(alpha=best_alpha, random_state=42)
    final_model.fit(X_train, y_train)
    
    # 提取权重（并归一化到总和为1）
    weights = final_model.coef_
    weights = np.abs(weights)  # 取绝对值（确保权重为正）
    weights = weights / np.sum(weights)  # 归一化
    
    # 输出权重
    feature_names = [col.replace('normalized_', '') for col in normalized_feature_cols]
    weight_df = pd.DataFrame({
        '特征名称': feature_names,
        '优化后权重': weights.round(4)
    })
    
    print("\n===== 机器学习优化后的特征权重 =====")
    print(weight_df)
    
    # 训练集上的R²分数（解释力）
    y_pred = final_model.predict(X_train)
    train_r2 = r2_score(y_train, y_pred)
    print(f"\n训练集R²分数（解释力）：{train_r2:.4f}")
    
    return weights, final_model, weight_df

# ---------------------- 验证优化后的情绪指标 ----------------------
def validate_optimized_sentiment(test_data, normalized_feature_cols, weights, normalization_params):
    """使用优化后的权重计算情绪得分，并验证其与上涨比例的相关性"""
    # 计算优化后的情绪得分（加权求和）
    feature_names = [col.replace('normalized_', '') for col in normalized_feature_cols]
    test_data['optimized_sentiment'] = 0
    
    for i, col in enumerate(normalized_feature_cols):
        test_data['optimized_sentiment'] += test_data[col] * weights[i]
    
    # 转换为0-100分
    test_data['optimized_sentiment'] = test_data['optimized_sentiment'] * 100
    
    # 计算相关性
    pearson_corr, pearson_pvalue = stats.pearsonr(
        test_data['optimized_sentiment'], 
        test_data['rise_ratio']
    )
    
    spearman_corr, spearman_pvalue = stats.spearmanr(
        test_data['optimized_sentiment'], 
        test_data['rise_ratio']
    )
    
    print("\n===== 优化后情绪指标与次日上涨比例的相关性 =====")
    print(f"皮尔逊相关系数：{pearson_corr:.4f}，p值：{pearson_pvalue:.4f}")
    print(f"斯皮尔曼相关系数：{spearman_corr:.4f}，p值：{spearman_pvalue:.4f}")
    
    # 按情绪区间分组（0-25,25-50,50-75,75-100）
    test_data['sentiment_bin'] = pd.cut(
        test_data['optimized_sentiment'],
        bins=[0, 25, 50, 75, 100],
        labels=['极度悲观', '谨慎', '乐观', '狂热']
    )
    
    group_stats = test_data.groupby('sentiment_bin')['rise_ratio'].agg(
        平均上涨比例='mean',
        样本数量='count',
        标准差='std'
    ).reset_index()
    
    print("\n===== 不同情绪区间对应的次日上涨比例统计 =====")
    print(group_stats.round(4))
    
    # 可视化对比（优化前后）
    plt.figure(figsize=(15, 10))
    
    # 1. 优化后情绪得分与次日上涨比例的散点图
    plt.subplot(2, 1, 1)
    sns.scatterplot(x='optimized_sentiment', y='rise_ratio', data=test_data, alpha=0.6)
    plt.title(f'优化后情绪得分与次日上涨比例的关系 (r={pearson_corr:.2f})')
    plt.xlabel('优化后情绪得分（0-100）')
    plt.ylabel('次日上涨股票比例')
    plt.grid(alpha=0.3)
    
    # 2. 不同情绪区间的上涨比例箱线图
    plt.subplot(2, 1, 2)
    sns.boxplot(x='sentiment_bin', y='rise_ratio', data=test_data, 
                order=['极度悲观', '谨慎', '乐观', '狂热'])
    plt.title('优化后不同情绪区间对应的次日上涨比例分布')
    plt.xlabel('情绪区间')
    plt.ylabel('次日上涨股票比例')
    plt.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('optimized_sentiment_vs_rise_ratio.png', dpi=300, bbox_inches='tight')
    print("\n可视化图表已保存至 optimized_sentiment_vs_rise_ratio.png")
    plt.show()
    
    return test_data, pearson_corr, spearman_corr

# ---------------------- 主函数 ----------------------
if __name__ == "__main__":
    # 1. 加载全量数据（包含2024和2025年）
    print("加载全量数据中...")
    try:
        # 替换为你的数据路径（确保包含2024和2025年数据）
        df = pd.read_parquet(r"D:\workspace\xiaoyao\data\stock_daily_price.parquet")
        df['date'] = pd.to_datetime(df['date'])
        
        # 检查数据时间范围
        print(f"数据时间范围: {df['date'].min().date()} 至 {df['date'].max().date()}")
        print(f"总数据量: {len(df):,} 条记录")
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        exit(1)
    
    # 2. 划分训练集（2024年）和测试集（2025年）
    train_df = df[df['date'].dt.year == 2024].copy()
    test_df = df[df['date'].dt.year == 2025].copy()
    
    print(f"\n训练集（2024年）数据量: {len(train_df):,} 条")
    print(f"测试集（2025年）数据量: {len(test_df):,} 条")
    
    # 3. 计算特征和目标变量
    print("\n计算情绪特征...")
    train_features = calculate_sentiment_features(train_df)
    test_features = calculate_sentiment_features(test_df)
    
    print("计算上涨比例...")
    train_rise_ratio = calculate_rise_ratio(train_df)
    test_rise_ratio = calculate_rise_ratio(test_df)
    
    # 4. 准备训练数据（标准化特征+生成目标变量）
    print("\n准备训练数据...")
    train_data, normalized_feature_cols, norm_params = prepare_training_data(
        train_features, 
        train_rise_ratio, 
        lag_days=1
    )
    
    # 5. 准备测试数据（使用训练集的标准化参数）
    print("准备测试数据...")
    test_data, _, _ = prepare_training_data(
        test_features, 
        test_rise_ratio, 
        lag_days=1,
        train_mean=norm_params  # 关键：使用训练集的标准化参数
    )
    
    print(f"训练样本量: {len(train_data)} 个交易日")
    print(f"测试样本量: {len(test_data)} 个交易日")
    
    # 6. 机器学习优化权重（基于2024年数据）
    print("\n开始优化情绪指标权重...")
    optimized_weights, model, weight_df = optimize_sentiment_weights(
        train_data, 
        normalized_feature_cols
    )
    
    # 7. 在2025年测试集上验证效果
    print("\n在2025年测试集上验证效果...")
    result_data, pearson_corr, spearman_corr = validate_optimized_sentiment(
        test_data, 
        normalized_feature_cols, 
        optimized_weights, 
        norm_params
    )
    
    # 8. 保存结果
    result_data.to_parquet("optimized_sentiment_results.parquet", index=False)
    weight_df.to_csv("optimized_sentiment_weights.csv", index=False, encoding='utf-8-sig')
    print("\n优化结果已保存至 optimized_sentiment_results.parquet")
    print("优化后的权重已保存至 optimized_sentiment_weights.csv")
    

使用中文字体: ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS']
加载全量数据中...
数据时间范围: 2005-01-04 至 2025-09-25
总数据量: 15,039,052 条记录

训练集（2024年）数据量: 1,236,214 条
测试集（2025年）数据量: 925,900 条

计算情绪特征...
计算上涨比例...

准备训练数据...


NameError: name 'train_mean' is not defined