In [None]:
import pandas as pd
import numpy as np

def process_daily_data(raw_data):
    """
    处理日K数据，返回清洗后的涨跌幅数据
    
    参数：
        raw_data: DataFrame，原始日K数据（包含date, stock_code, close, pre_close, paused等字段）
    返回：
        return_df: DataFrame，索引为date，列为stock_code，值为日涨跌幅
    """
    # ----------------------
    # 1. 筛选核心字段并初步清洗
    # ----------------------
    # 保留需要的字段
    df = raw_data[['date', 'stock_code', 'close', 'pre_close', 'paused']].copy()
    
    # 转换date为 datetime 类型（确保时间格式正确）
    df['date'] = pd.to_datetime(df['date'])
    
    # 剔除停牌数据（paused=1表示停牌）
    df = df[df['paused'] != 1].drop(columns='paused')
    
    # 只需要2024年以前的数据
    df = df[df['date'].dt.year <= 2025]

    # 剔除缺失值（close或pre_close为空的行）
    df = df.dropna(subset=['close', 'pre_close'])
    
    # ----------------------
    # 2. 计算日涨跌幅
    # ----------------------
    # 涨跌幅 = (当日收盘价 / 前一日收盘价) - 1
    df['daily_return'] = (df['close'] / df['pre_close']) - 1
    
    # ----------------------
    # 3. 整理为透视表（方便后续相关性计算）
    # ----------------------
    # 转换为：行=date，列=stock_code，值=daily_return
    return_df = df.pivot(index='date', columns='stock_code', values='daily_return')
    
    # 补充说明：透视后可能存在NaN（如某股票在该日无交易或被过滤），后续计算相关性时会自动忽略这些NaN
    print(f"数据清洗完成：时间范围 {return_df.index.min()} 至 {return_df.index.max()}，包含 {return_df.shape[1]} 只股票")
    return return_df

# ----------------------
# 示例使用（假设你的原始数据已读入为DataFrame）
# ----------------------
# 假设raw_daily是从文件读取的原始日K数据（例如从CSV读取）
raw_daily = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_price.parquet')
cleaned_returns = process_daily_data(raw_daily)
cleaned_returns


数据清洗完成：时间范围 2005-01-04 00:00:00 至 2024-12-31 00:00:00，包含 5383 只股票


stock_code,000001.XSHE,000002.XSHE,000004.XSHE,000005.XSHE,000006.XSHE,000007.XSHE,000008.XSHE,000009.XSHE,000010.XSHE,000011.XSHE,...,688787.XSHG,688788.XSHG,688789.XSHG,688793.XSHG,688798.XSHG,688799.XSHG,688800.XSHG,688819.XSHG,688981.XSHG,689009.XSHG
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-04,-0.010584,0.001856,-0.016150,-0.031548,0.016986,0.011752,-0.002353,-0.018083,-0.010847,-0.010883,...,,,,,,,,,,
2005-01-05,-0.009233,0.036048,0.020844,0.023485,0.014133,0.034845,0.026415,0.018416,0.041124,0.034230,...,,,,,,,,,,
2005-01-06,0.009319,-0.005501,0.004339,-0.004441,-0.004645,-0.005345,0.000000,-0.006329,0.010533,-0.011820,...,,,,,,,,,,
2005-01-07,-0.001520,0.009265,-0.004320,0.050558,0.098854,0.039764,0.028033,0.020928,0.009121,0.017943,...,,,,,,,,,,
2005-01-10,0.012236,-0.005481,0.005870,0.030432,0.049421,0.100775,0.049620,0.025847,0.040671,0.012926,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-25,0.005054,-0.024582,-0.099752,,-0.061902,-0.026434,0.015824,-0.015776,-0.035354,-0.001217,...,0.025211,-0.026344,-0.020412,-0.031502,-0.013695,-0.008995,-0.038669,-0.027469,0.011666,-0.008530
2024-12-26,-0.005029,-0.003974,0.061599,,0.001303,0.025651,-0.034294,-0.007487,0.014605,0.001218,...,0.020193,0.009819,-0.015611,0.010842,0.010486,0.009298,0.151276,0.025555,-0.012858,0.001765
2024-12-27,-0.002530,0.005322,-0.067624,,0.036178,0.044162,0.003250,0.015087,0.010864,0.012409,...,-0.023194,0.005402,0.001770,-0.035272,-0.003363,0.000000,-0.074289,-0.011803,0.008064,0.038758
2024-12-30,0.010140,-0.025161,-0.050454,,-0.067343,0.011216,-0.022678,-0.003140,-0.010747,-0.015621,...,-0.023620,-0.023426,0.002106,-0.071413,-0.010220,-0.000658,-0.019937,-0.005309,0.018255,0.027560


In [3]:
import pandas as pd
import torch

# 1. 验证PyTorch GPU是否正常工作
print("PyTorch是否检测到GPU：", torch.cuda.is_available())  # 应输出True
print("当前GPU型号：", torch.cuda.get_device_name(0))  # 应输出Quadro T1000

# 2. 定义用PyTorch GPU计算相关性矩阵的函数
def pytorch_gpu_corr(return_df):
    """
    用PyTorch在GPU上计算全量股票相关性矩阵
    参数：return_df - 清洗后的涨跌幅DataFrame（index=date, columns=stock_code）
    返回：corr_matrix - 股票间的相关性矩阵（DataFrame）
    """
    # 预处理：填充NaN为0（避免计算误差），转为float32节省GPU显存
    filled_returns = return_df.fillna(0).astype('float32')
    
    # 将数据转移到GPU
    device = torch.device('cuda')  # 指定使用GPU
    gpu_returns = torch.tensor(filled_returns.values, device=device)  # 形状：(n_days, n_stocks)
    
    # 计算相关性矩阵（皮尔逊系数）
    # 步骤1：计算每只股票的涨跌幅均值（沿时间轴，即axis=0）
    mean = torch.mean(gpu_returns, dim=0, keepdim=True)  # 形状：(1, n_stocks)
    # 步骤2：涨跌幅去均值（消除绝对波动影响）
    centered = gpu_returns - mean  # 形状：(n_days, n_stocks)
    # 步骤3：计算协方差矩阵（cov = (centered^T · centered) / (n_days - 1)）
    cov = torch.matmul(centered.T, centered) / (centered.shape[0] - 1)  # 形状：(n_stocks, n_stocks)
    # 步骤4：计算标准差（协方差矩阵对角线开平方）
    std = torch.sqrt(torch.diag(cov)).reshape(-1, 1)  # 形状：(n_stocks, 1)
    # 步骤5：计算相关性矩阵（corr = cov / (std · std^T)）
    corr_matrix = cov / torch.matmul(std, std.T)  # 形状：(n_stocks, n_stocks)
    
    # 将结果从GPU转回CPU，并转为DataFrame（保留股票代码索引）
    corr_df = pd.DataFrame(
        corr_matrix.cpu().numpy(),  # GPU tensor → CPU numpy数组
        index=return_df.columns,    # 行索引：股票代码
        columns=return_df.columns   # 列索引：股票代码
    )
    return corr_df

# 3. 调用函数计算相关性矩阵（用你的cleaned_returns数据）
# 注意：如果数据量较大，首次运行会有1-2秒的GPU初始化时间



corr_matrix = pytorch_gpu_corr(cleaned_returns)

# 4. 验证结果（查看前5行5列）
print("\n相关性矩阵前5行5列：")
print(corr_matrix.iloc[:5, :5])

PyTorch是否检测到GPU： True
当前GPU型号： Quadro T1000

相关性矩阵前5行5列：
stock_code   000001.XSHE  000002.XSHE  000004.XSHE  000005.XSHE  000006.XSHE
stock_code                                                                  
000001.XSHE     1.000000     0.531361     0.205605     0.276980     0.391140
000002.XSHE     0.531361     1.000000     0.179688     0.305854     0.523389
000004.XSHE     0.205605     0.179688     1.000000     0.313687     0.302872
000005.XSHE     0.276980     0.305854     0.313687     1.000000     0.443187
000006.XSHE     0.391140     0.523389     0.302872     0.443187     1.000000


In [4]:
import torch
import pandas as pd
import numpy as np

def torch_find_high_corr_groups(corr_matrix, min_corr=0.7, min_group_size=3):
    """
    用PyTorch向量化运算加速筛选高关联股票组
    参数同前：corr_matrix=相关性矩阵，min_corr=最小相关系数，min_group_size=每组最小股票数
    返回：high_corr_groups=高关联股票组列表
    """
    # 1. 相关性矩阵转为GPU张量（复用GPU算力）
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corr_tensor = torch.tensor(corr_matrix.values, dtype=torch.float32, device=device)
    stock_codes = corr_matrix.columns.tolist()  # 保留股票代码映射
    n_stocks = len(stock_codes)

    # 2. 生成“高关联矩阵”：相关性≥min_corr记为True，否则False（排除自身关联）
    high_corr_mask = (corr_tensor >= min_corr) & (torch.eye(n_stocks, device=device) == 0)
    # 注：torch.eye生成对角矩阵，排除股票与自身的关联（避免干扰分组）

    # 3. 批量寻找连通组（核心加速步骤）
    visited = torch.zeros(n_stocks, dtype=torch.bool, device=device)  # 标记是否已分组
    high_corr_groups = []

    for i in range(n_stocks):
        if not visited[i]:
            # 步骤1：找到与当前股票直接高关联的股票（初始组）
            current_group = torch.nonzero(high_corr_mask[i], as_tuple=True)[0]  # 直接关联的股票索引
            current_group = torch.cat([torch.tensor([i], device=device), current_group])  # 加入当前股票

            # 步骤2：扩展组到“间接高关联”（用布尔运算批量判断，替代循环）
            while True:
                # 找到与当前组中所有股票高关联的股票（批量计算）
                group_mask = high_corr_mask[current_group].any(dim=0)  # 只要与组内任一股票关联，就纳入
                new_group = torch.nonzero(group_mask, as_tuple=True)[0]  # 新扩展的组

                if new_group.shape[0] == current_group.shape[0]:  # 组不再扩展，停止
                    break
                current_group = new_group

            # 步骤3：筛选有效组（排除重复、满足最小规模）
            current_group = current_group[~visited[current_group]]  # 排除已分组的股票
            if current_group.shape[0] >= min_group_size:
                # 股票索引转代码，加入结果列表
                group_codes = [stock_codes[idx] for idx in current_group.cpu().numpy()]
                high_corr_groups.append(group_codes)

            # 标记当前组的股票为“已访问”
            visited[current_group] = True

    # 4. 按组内股票数量排序（从大到小）
    high_corr_groups.sort(key=lambda x: len(x), reverse=True)
    return high_corr_groups

# 二、调用并验证结果（复用你之前的corr_matrix）
# 1. 筛选高关联组（30秒内完成，取决于股票数量）
high_corr_groups = torch_find_high_corr_groups(corr_matrix, min_corr=0.7, min_group_size=3)

# 2. 输出结果（同前，便于对比）
print(f"共找到 {len(high_corr_groups)} 个高关联股票组（相关性≥0.7，每组≥3只股票）")
print("\n前5个最大的股票组：")
for i, group in enumerate(high_corr_groups[:5], 1):
    print(f"\n组{i}（{len(group)}只股票）：")
    print("股票代码：", group[:10] + ["..." if len(group) > 10 else ""][:len(group)-10])
    # 计算组内平均相关性（验证关联性）
    group_corr = corr_matrix.loc[group, group]
    avg_corr = group_corr.values[np.triu_indices_from(group_corr.values, k=1)].mean()
    print(f"组内平均相关性：{avg_corr:.3f}")

共找到 60 个高关联股票组（相关性≥0.7，每组≥3只股票）

前5个最大的股票组：

组1（24只股票）：
股票代码： ['688001.XSHG', '688002.XSHG', '688003.XSHG', '688005.XSHG', '688006.XSHG', '688007.XSHG', '688008.XSHG', '688009.XSHG', '688010.XSHG', '688011.XSHG', '...']
组内平均相关性：0.697

组2（16只股票）：
股票代码： ['300862.XSHE', '300863.XSHE', '300864.XSHE', '300865.XSHE', '300866.XSHE', '300867.XSHE', '300868.XSHE', '300869.XSHE', '300870.XSHE', '300871.XSHE', '...']
组内平均相关性：0.727

组3（15只股票）：
股票代码： ['000552.XSHE', '000933.XSHE', '000937.XSHE', '000983.XSHE', '600123.XSHG', '600188.XSHG', '600348.XSHG', '600395.XSHG', '600508.XSHG', '600971.XSHG', '...']
组内平均相关性：0.697

组4（10只股票）：
股票代码： ['001286.XSHE', '001287.XSHE', '001328.XSHE', '001360.XSHE', '001367.XSHE', '601061.XSHG', '601065.XSHG', '601133.XSHG', '603125.XSHG', '603135.XSHG']
组内平均相关性：0.793

组5（6只股票）：
股票代码： ['000001.XSHE', '600000.XSHG', '600015.XSHG', '600016.XSHG', '600036.XSHG', '601166.XSHG']
组内平均相关性：0.729


In [None]:
import pandas as pd
import numpy as np

# 1. 整理高关联组的结构化数据
group_result = []
for group_id, stock_group in enumerate(high_corr_groups, 1):
    # 计算当前组的平均相关性（排除自身关联的对角线元素）
    group_corr_matrix = corr_matrix.loc[stock_group, stock_group]
    # 提取上三角元素（避免重复计算i-j和j-i的相关性）
    upper_triangle = group_corr_matrix.values[np.triu_indices_from(group_corr_matrix.values, k=1)]
    avg_corr = round(upper_triangle.mean(), 4)  # 保留4位小数，更精准
    
    # 整理单组数据：股票代码用逗号分隔，方便后续拆分
    group_result.append({
        "组号": group_id,
        "股票数量": len(stock_group),
        "组内平均相关性": avg_corr,
        "组内股票代码": ",".join(stock_group)  # 如"688001.XSHG,688002.XSHG,..."
    })

# 2. 转为DataFrame（便于查看和保存）
result_df = pd.DataFrame(group_result)

# 3. 保存为CSV文件（指定保存路径，可根据你的需求修改）
save_path = r"D:\workspace\xiaoyao\high_corr_stock_groups_2005_2025.csv"
result_df.to_csv(save_path, index=False, encoding="utf-8-sig")  # utf-8-sig支持中文显示

# 4. 打印保存结果，确认成功
print(f"高关联股票组结果已保存至：{save_path}")
print(f"\nCSV文件预览（前5组）：")
print(result_df.head())

高关联股票组结果已保存至：D:\workspace\xiaoyao\high_corr_stock_groups_2005_2024.csv

CSV文件预览（前5组）：
   组号  股票数量  组内平均相关性                                             组内股票代码
0   1    24   0.6968  688001.XSHG,688002.XSHG,688003.XSHG,688005.XSH...
1   2    16   0.7274  300862.XSHE,300863.XSHE,300864.XSHE,300865.XSH...
2   3    15   0.6971  000552.XSHE,000933.XSHE,000937.XSHE,000983.XSH...
3   4    10   0.7927  001286.XSHE,001287.XSHE,001328.XSHE,001360.XSH...
4   5     6   0.7286  000001.XSHE,600000.XSHG,600015.XSHG,600016.XSH...
