In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.font_manager as fm

# ---------------------- 解决中文显示问题 ----------------------
def setup_chinese_font():
    """Configure matplotlib so Chinese text and minus signs render correctly.

    The font families already registered with matplotlib's font manager are
    compared against a short list of fonts known to contain Chinese glyphs.
    Every match is installed as ``font.family`` (in preference order). When
    none is found a warning is printed and a generic fallback is set. Any
    unexpected failure during the lookup is reported and a hard-coded font
    list is used instead, so this function never raises.

    Side effects: mutates ``plt.rcParams`` and prints a status line.
    """
    # Candidate font families, in order of preference.
    chinese_fonts = ["SimHei", "Microsoft YaHei", "Heiti TC", "WenQuanYi Micro Hei", "Arial Unicode MS"]

    try:
        # The font manager's entries already carry the family name, so there
        # is no need to open every font file on disk. Using the manager's own
        # list also guarantees that any name we select can be resolved later.
        font_names = {entry.name for entry in fm.fontManager.ttflist}
        available_fonts = [f for f in chinese_fonts if f in font_names]

        if available_fonts:
            plt.rcParams["font.family"] = available_fonts
            print(f"使用中文字体: {available_fonts}")
        else:
            # No Chinese font found: fall back to defaults and warn the user.
            print("警告：未找到中文字体，可能会显示乱码。请安装SimHei或Microsoft YaHei字体。")
            plt.rcParams["font.family"] = ["Arial Unicode MS", "sans-serif"]

    except Exception as e:
        # Best-effort boundary: font discovery must never abort the analysis.
        print(f"字体配置警告: {str(e)}")
        # Fallback: set well-known Chinese fonts directly.
        plt.rcParams["font.family"] = ["SimHei", "Microsoft YaHei", "Arial Unicode MS", "sans-serif"]

    # Render the minus sign as ASCII '-' so it does not show up as a box.
    plt.rcParams["axes.unicode_minus"] = False

# Initialise Chinese text rendering once, at module load, before any figure is drawn.
setup_chinese_font()

# ---------------------- 情绪指标计算 ----------------------
def calculate_market_sentiment(daily_data: pd.DataFrame) -> pd.DataFrame:
    """Compute daily market-sentiment indicators from per-stock daily bars.

    Args:
        daily_data: One row per stock per day. The columns read are ``date``,
            ``stock_code``, ``open``, ``close``, ``high_limit``, ``low_limit``
            and ``volume``. The input frame is not modified.

    Returns:
        One row per date with the columns ``date``, ``sentiment_score``,
        ``sentiment_label``, ``up_down_ratio``, ``volume_price_fit`` and
        ``limit_spread_ratio``. ``sentiment_score`` lies in [0, 100]; it is
        NaN (and the label is missing) on days where an input indicator is
        undefined, e.g. the first four days, for which the volume-expansion
        baseline does not yet have its minimum of five observations.

    Notes:
        Min-max scaling uses the whole sample passed in, so scores of the
        same day can differ between calls made with different date ranges.
    """
    # Work on a copy and normalise ordering.
    df = daily_data.copy()
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(['date', 'stock_code'])

    # Per-stock indicators. The return is measured open-to-close, in percent.
    df['daily_return'] = (df['close'] - df['open']) / df['open'] * 100
    df['is_limit_up'] = (df['close'] >= df['high_limit']).astype(int)
    df['is_limit_down'] = (df['close'] <= df['low_limit']).astype(int)
    df['is_up'] = (df['daily_return'] > 0).astype(int)

    # Split volume into "rising" / "not rising" columns up front so the
    # aggregation below is fully vectorised and independent of index labels.
    rising = (df['is_up'] == 1).to_numpy()
    df['_up_volume'] = df['volume'].where(rising)
    df['_down_volume'] = df['volume'].where(~rising)

    # Aggregate to one row per date.
    daily_sentiment = df.groupby('date').agg(
        total_stocks=('stock_code', 'nunique'),
        row_count=('is_up', 'count'),
        up_stocks=('is_up', 'sum'),
        limit_up_stocks=('is_limit_up', 'sum'),
        limit_down_stocks=('is_limit_down', 'sum'),
        avg_return=('daily_return', 'mean'),
        total_volume=('volume', 'sum'),
        up_volume=('_up_volume', 'mean'),
        down_volume=('_down_volume', 'mean'),
    ).reset_index()

    # Every row that is not "up" (flat rows included) counts as "down".
    daily_sentiment['down_stocks'] = daily_sentiment['row_count'] - daily_sentiment['up_stocks']
    # Average volume is defined as 0 for a side that has no stocks that day.
    daily_sentiment['up_volume'] = daily_sentiment['up_volume'].where(
        daily_sentiment['up_stocks'] > 0, 0
    )
    daily_sentiment['down_volume'] = daily_sentiment['down_volume'].where(
        daily_sentiment['down_stocks'] > 0, 0
    )

    # Core indicators.
    # Up/down ratio, capped at 10 when no stock is down.
    daily_sentiment['up_down_ratio'] = np.where(
        daily_sentiment['down_stocks'] > 0,
        daily_sentiment['up_stocks'] / daily_sentiment['down_stocks'],
        10
    )

    daily_sentiment['limit_spread_ratio'] = (
        daily_sentiment['limit_up_stocks'] + daily_sentiment['limit_down_stocks']
    ) / daily_sentiment['total_stocks']

    # A zero denominator would yield inf and poison both the rolling mean and
    # the min-max scaling; treat such days as missing so the 5-day mean skips
    # them instead.
    volume_ratio = daily_sentiment['up_volume'] / daily_sentiment['down_volume'].replace(0, np.nan)
    daily_sentiment['volume_price_fit'] = volume_ratio.rolling(window=5, min_periods=1).mean()

    # Today's volume relative to the trailing 20-day average (needs >= 5 days).
    baseline_volume = daily_sentiment['total_volume'].rolling(window=20, min_periods=5).mean()
    daily_sentiment['volume_expansion'] = (
        daily_sentiment['total_volume'] / baseline_volume.replace(0, np.nan)
    )

    daily_sentiment['price_volatility'] = daily_sentiment['avg_return'].abs()

    # Composite score (0-100).
    def normalize(series, higher_better=True):
        """Min-max scale ``series`` to [0, 1]; a constant series maps to 0.5."""
        min_val = series.min()
        max_val = series.max()
        if max_val == min_val:
            return 0.5
        scaled = (series - min_val) / (max_val - min_val)
        return scaled if higher_better else 1 - scaled

    weights = {
        'up_down_ratio': 0.3,
        'volume_price_fit': 0.2,
        'volume_expansion': 0.15,
        'price_volatility': 0.1,
        'limit_spread_ratio': 0.25
    }

    daily_sentiment['score_up_down'] = normalize(daily_sentiment['up_down_ratio'])
    daily_sentiment['score_volume_fit'] = normalize(daily_sentiment['volume_price_fit'])
    daily_sentiment['score_volume_expand'] = normalize(daily_sentiment['volume_expansion'])
    # Peaks at an average move of 0.5% and decays linearly; clipped so a large
    # move cannot push the composite score outside the 0-100 range.
    daily_sentiment['score_volatility'] = (
        1 - (daily_sentiment['price_volatility'] - 0.5).abs() * 2
    ).clip(lower=0, upper=1)
    # More than half the market at a price limit gets a fixed 0.3; otherwise
    # fewer limit hits score higher.
    daily_sentiment['score_limit_spread'] = np.where(
        daily_sentiment['limit_spread_ratio'] > 0.5,
        0.3,
        normalize(daily_sentiment['limit_spread_ratio'], higher_better=False)
    )

    daily_sentiment['sentiment_score'] = (
        daily_sentiment['score_up_down'] * weights['up_down_ratio'] +
        daily_sentiment['score_volume_fit'] * weights['volume_price_fit'] +
        daily_sentiment['score_volume_expand'] * weights['volume_expansion'] +
        daily_sentiment['score_volatility'] * weights['price_volatility'] +
        daily_sentiment['score_limit_spread'] * weights['limit_spread_ratio']
    ) * 100

    def label_sentiment(score):
        """Map a score to its label; an undefined score gets no label."""
        if pd.isna(score):
            # NaN fails every '<' comparison and would otherwise be labelled
            # as the top bucket.
            return None
        if score < 30:
            return '极度悲观'
        elif score < 50:
            return '谨慎'
        elif score < 70:
            return '乐观'
        else:
            return '狂热'

    daily_sentiment['sentiment_label'] = daily_sentiment['sentiment_score'].apply(label_sentiment)

    return daily_sentiment[['date', 'sentiment_score', 'sentiment_label',
                           'up_down_ratio', 'volume_price_fit', 'limit_spread_ratio']]

# ---------------------- 上涨比例计算 ----------------------
def calculate_rise_ratio(daily_data: pd.DataFrame) -> pd.DataFrame:
    """Compute, for each date, the share of stocks that closed above their previous close.

    Args:
        daily_data: One row per stock per day with at least the columns
            ``date``, ``stock_code`` and ``close``. Row order does not matter
            and the input frame is not modified.

    Returns:
        A frame with the columns ``date`` and ``rise_ratio`` (a value in
        [0, 1]), ordered by date. Only stocks that have both a close and a
        previous close on a date are counted, so dates on which no stock is
        comparable (such as the first date in the data) are absent.
    """
    df = daily_data.copy()
    df['date'] = pd.to_datetime(df['date'])

    # shift(1) only means "previous trading day" when each stock's rows are
    # in chronological order, so sort explicitly instead of trusting the input.
    df = df.sort_values(['stock_code', 'date'])
    df['prev_close'] = df.groupby('stock_code')['close'].shift(1)

    # A stock with no previous close (first appearance) or no close cannot be
    # classified; keeping it in the denominator would bias the ratio downward.
    comparable_mask = (df['close'].notna() & df['prev_close'].notna()).to_numpy()
    comparable = df.loc[comparable_mask].copy()
    comparable['is_rise'] = (comparable['close'] > comparable['prev_close']).astype(int)

    # Aggregate by date.
    rise_ratio_df = comparable.groupby('date').agg(
        total_stocks=('stock_code', 'nunique'),
        rise_stocks=('is_rise', 'sum')
    ).reset_index()

    # Compute the ratio and drop undefined values.
    rise_ratio_df['rise_ratio'] = rise_ratio_df['rise_stocks'] / rise_ratio_df['total_stocks']
    rise_ratio_df = rise_ratio_df.dropna(subset=['rise_ratio'])

    # Guard against malformed input (e.g. duplicated rows for one stock on one
    # day) that would push the ratio outside [0, 1].
    rise_ratio_df = rise_ratio_df[rise_ratio_df['rise_ratio'].between(0, 1)]

    return rise_ratio_df[['date', 'rise_ratio']]

# ---------------------- 相关性分析 ----------------------
def analyze_correlation(sentiment_df: pd.DataFrame, rise_ratio_df: pd.DataFrame, lag_days: int = 1):
    """Relate each day's sentiment score to the rise ratio ``lag_days`` trading days later.

    Args:
        sentiment_df: Needs the columns ``date``, ``sentiment_score`` and
            ``sentiment_label``. Its dates are used as the trading calendar,
            so it is assumed to hold one row per trading day.
        rise_ratio_df: Needs the columns ``date`` and ``rise_ratio``.
        lag_days: Number of trading days (rows of the calendar, not calendar
            days) between the sentiment reading and the rise ratio it is
            paired with. Defaults to 1, i.e. the next trading day.

    Returns:
        ``(merged_df, group_stats)``. ``merged_df`` holds the sentiment rows
        with ``rise_ratio`` replaced by the lagged value. ``group_stats``
        holds mean / count / std of the lagged rise ratio per sentiment label,
        or ``None`` when fewer than two paired rows exist (nothing is plotted
        in that case).

    Side effects: prints the statistics, saves the figure to
    ``sentiment_vs_rise_ratio_lag{lag_days}.png`` and shows it.
    """
    # Attach the same-day rise ratio, then pull the value from `lag_days` rows
    # ahead. Shifting rows rather than subtracting calendar days keeps pairs
    # that straddle weekends and holidays (e.g. Friday -> Monday).
    merged_df = pd.merge(
        sentiment_df,
        rise_ratio_df[['date', 'rise_ratio']],
        on='date',
        how='left'
    )
    merged_df = merged_df.sort_values('date').reset_index(drop=True)
    merged_df['rise_ratio'] = merged_df['rise_ratio'].shift(-lag_days)

    # Rows without a future rise ratio (the tail of the sample, or gaps in
    # rise_ratio_df) have nothing to pair with; drop them before counting.
    merged_df = merged_df.dropna(subset=['rise_ratio'])

    # Drop remaining rows with missing values and report how many were removed.
    initial_count = len(merged_df)
    merged_df = merged_df.dropna(subset=['sentiment_score', 'rise_ratio'])
    cleaned_count = len(merged_df)

    if initial_count > cleaned_count:
        print(f"清理了 {initial_count - cleaned_count} 条含缺失值的记录，保留 {cleaned_count} 条有效记录")

    # A correlation needs at least two observations.
    if len(merged_df) < 2:
        print("数据量不足，无法计算相关性")
        return merged_df, None

    pearson_corr, pearson_pvalue = stats.pearsonr(merged_df['sentiment_score'], merged_df['rise_ratio'])
    spearman_corr, spearman_pvalue = stats.spearmanr(merged_df['sentiment_score'], merged_df['rise_ratio'])

    print(f"\n情绪得分与{lag_days}日后上涨股票比例的相关性：")
    print(f"皮尔逊相关系数：{pearson_corr:.4f}，p值：{pearson_pvalue:.4f}（p<0.05表示显著相关）")
    print(f"斯皮尔曼相关系数：{spearman_corr:.4f}，p值：{spearman_pvalue:.4f}（p<0.05表示显著相关）")

    # Summary statistics per sentiment label.
    group_stats = merged_df.groupby('sentiment_label')['rise_ratio'].agg(
        平均上涨比例='mean',
        样本数量='count',
        标准差='std'
    ).reset_index()

    # Order the labels from most pessimistic to most euphoric.
    label_order = ['极度悲观', '谨慎', '乐观', '狂热']
    group_stats['sentiment_label'] = pd.Categorical(group_stats['sentiment_label'], categories=label_order)
    group_stats = group_stats.sort_values('sentiment_label')

    print("\n不同情绪标签对应的{}日后上涨股票比例统计：".format(lag_days))
    print(group_stats.round(4))

    # Visualisation.
    plt.figure(figsize=(15, 10))

    # 1. Scatter: sentiment score vs. lagged rise ratio.
    plt.subplot(2, 2, 1)
    sns.scatterplot(x='sentiment_score', y='rise_ratio', data=merged_df, alpha=0.6)
    plt.title(f'情绪得分与{lag_days}日后上涨股票比例的关系 (r={pearson_corr:.2f})')
    plt.xlabel('情绪得分')
    plt.ylabel(f'{lag_days}日后上涨股票比例')
    plt.grid(alpha=0.3)

    # 2. Box plot: distribution of the lagged rise ratio per label.
    plt.subplot(2, 2, 2)
    sns.boxplot(x='sentiment_label', y='rise_ratio', data=merged_df, order=label_order)
    plt.title(f'不同情绪标签对应的{lag_days}日后上涨股票比例分布')
    plt.xlabel('情绪标签')
    plt.ylabel(f'{lag_days}日后上涨股票比例')
    plt.grid(alpha=0.3)

    # 3. Line chart: smoothed score and smoothed rise ratio over time.
    score_ax = plt.subplot(2, 1, 2)
    merged_df_sorted = merged_df.sort_values('date')
    # Adaptive smoothing window: at most 5, shrinking for short samples.
    window_size = min(5, len(merged_df_sorted) // 5)
    if window_size >= 2:
        merged_df_sorted['sentiment_smoothed'] = merged_df_sorted['sentiment_score'].rolling(window=window_size).mean()
        merged_df_sorted['rise_ratio_smoothed'] = merged_df_sorted['rise_ratio'].rolling(window=window_size).mean()

        score_ax.plot(merged_df_sorted['date'], merged_df_sorted['sentiment_smoothed'],
                      label='情绪得分（平滑）', color='blue', alpha=0.7)
        ratio_ax = score_ax.twinx()
        ratio_ax.plot(merged_df_sorted['date'], merged_df_sorted['rise_ratio_smoothed'],
                      label=f'{lag_days}日上涨比例（平滑）', color='red', alpha=0.7)

        # Collect handles from BOTH axes; after twinx() the "current axes" is
        # the twin, so querying plt.gca() twice would list only the red line.
        lines1, labels1 = score_ax.get_legend_handles_labels()
        lines2, labels2 = ratio_ax.get_legend_handles_labels()
        ratio_ax.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

    score_ax.set_title(f'情绪得分与{lag_days}日后上涨比例的走势对比')
    score_ax.set_xlabel('日期')
    score_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(f'sentiment_vs_rise_ratio_lag{lag_days}.png', dpi=300, bbox_inches='tight')
    print(f"\n可视化图表已保存至 sentiment_vs_rise_ratio_lag{lag_days}.png")
    plt.show()

    return merged_df, group_stats

# ---------------------- 主函数 ----------------------
if __name__ == "__main__":
    # Script entry point: load prices, compute sentiment and rise ratio, then
    # correlate them. Each stage reports its own failure and stops the run with
    # exit status 1. SystemExit is raised directly instead of calling exit():
    # exit() is an interactive-shell helper injected by the `site` module, and
    # under IPython/Jupyter it is an autocall object that can shut the kernel down.

    # 1. Load data.
    print("加载数据中...")
    try:
        # Replace with the path to your own data file.
        df = pd.read_parquet(r"D:\workspace\xiaoyao\data\stock_daily_price.parquet")

        # Keep only rows on or after the analysis start date.
        df['date'] = pd.to_datetime(df['date'])
        start_date = '2025-01-01'
        df = df[df['date'] >= start_date]

        print(f"数据加载完成，时间范围: {df['date'].min().date()} 至 {df['date'].max().date()}")
        print(f"数据量: {len(df):,} 条记录")
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        raise SystemExit(1)

    # 2. Compute market-sentiment indicators.
    print("\n计算市场情绪指标...")
    try:
        sentiment_df = calculate_market_sentiment(df)
        print(f"情绪指标计算完成，共 {len(sentiment_df)} 个交易日")
    except Exception as e:
        print(f"情绪指标计算失败: {str(e)}")
        raise SystemExit(1)

    # 3. Compute the daily share of rising stocks.
    print("\n计算每日上涨股票比例...")
    try:
        rise_ratio_df = calculate_rise_ratio(df)
        print(f"上涨比例计算完成，共 {len(rise_ratio_df)} 个交易日")
    except Exception as e:
        print(f"上涨比例计算失败: {str(e)}")
        raise SystemExit(1)

    # 4. Correlation analysis (the lag is adjustable).
    print("\n开始相关性分析...")
    try:
        # lag_days=1 relates sentiment to the following day; use 2, 3, ...
        # to look further ahead.
        merged_data, group_stats = analyze_correlation(sentiment_df, rise_ratio_df, lag_days=1)

        # Persist the paired data set.
        merged_data.to_parquet("sentiment_vs_rise_ratio_analysis.parquet", index=False)
        print("\n分析结果已保存至 sentiment_vs_rise_ratio_analysis.parquet")
    except Exception as e:
        print(f"相关性分析失败: {str(e)}")
        raise SystemExit(1)
    

AttributeError: 'str' object has no attribute 'name'