In [1]:
# 依次读取项目data目录下的parquet文件

import pandas as pd

# Load the daily K-line (price), industry, call-auction and market-cap tables.
price_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_price.parquet')
industry_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_industry.parquet')
auction_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_auction.parquet')
marketcap_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_marketcap.parquet')

# Cast `date` to str in the three daily tables so they share one key dtype
# for the later merges on ['date', 'stock_code'].
for daily_df in (price_df, industry_df, marketcap_df):
    daily_df['date'] = daily_df['date'].astype(str)

# The auction table keeps only the first 10 characters of the stringified date
# (presumably the YYYY-MM-DD part of a timestamp -- verify against the raw data).
auction_df['date'] = auction_df['date'].astype(str).str.slice(0, 10)

In [2]:
# Keep only price rows dated 2025-01-01 or later. `date` is a string here, so this
# is a lexicographic comparison (assumes a zero-padded YYYY-MM-DD format -- TODO confirm).
recent_mask = price_df['date'].ge('2025-01-01')
price_df = price_df.loc[recent_mask]

In [3]:
# Left-join the industry, market-cap and auction tables onto the price table,
# matching rows on (date, stock_code); every price row is kept.
join_keys = ['date', 'stock_code']
merged_df = (
    price_df
    .merge(industry_df, on=join_keys, how='left')
    .merge(marketcap_df, on=join_keys, how='left')
    .merge(auction_df, on=join_keys, how='left')
)

In [4]:
# Resolve the `_x` / `_y` suffixes pandas added to the overlapping column names:
# the left-hand (`_x`) columns get their plain names back, the right-hand (`_y`)
# columns get an `auc_` prefix (presumably they come from the auction table).
merged_df = merged_df.rename(columns={
    'volume_x': 'volume',
    'money_x': 'money',
    'volume_y': 'auc_volume',
    'money_y': 'auc_money',
})

In [5]:
# Display the column index of merged_df to check the merged/renamed schema.
merged_df.columns

Index(['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money',
       'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused',
       'zjw_industry_code', 'zjw_industry_name', 'jq_l1_industry_code',
       'jq_l1_industry_name', 'jq_l2_industry_code', 'jq_l2_industry_name',
       'sw_l1_industry_code', 'sw_l1_industry_name', 'sw_l2_industry_code',
       'sw_l2_industry_name', 'sw_l3_industry_code', 'sw_l3_industry_name',
       'capitalization', 'circulating_cap', 'market_cap',
       'circulating_market_cap', 'turnover_ratio', 'pe_ratio', 'pe_ratio_lyr',
       'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current', 'auc_volume',
       'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p',
       'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v',
       'b4_p', 'b4_v', 'b5_p', 'b5_v'],
      dtype='object')

In [None]:
# 对于每只股票，计算当日收益率

import numpy as np

def calculate_multi_day_returns(df, base_col='close', max_horizon=5):
    """Compute per-stock forward returns for horizons 0..max_horizon rows ahead.

    For every row (day T of one stock) and every n in 0..max_horizon:

        return_{n}d = (close[T+n] - base[T]) / base[T]

    where T+n is the n-th following row of the same stock after sorting by
    `date`. Horizons are counted in rows, not calendar days.

    Note: with the default ``base_col='close'``, ``return_0d`` compares the
    close with itself and is therefore 0 wherever close is a non-zero number.
    Pass ``base_col='open'`` to measure returns from the day's open instead
    (``return_0d`` then becomes the open-to-close return).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'stock_code', 'open' and 'close'.
        'date' must sort chronologically (e.g. YYYY-MM-DD or YYYYMMDD strings).
        The input frame is not modified.
    base_col : {'close', 'open'}, default 'close'
        Price column of day T used as the base of the return.
    max_horizon : int, default 5
        Largest horizon n; columns return_0d .. return_{max_horizon}d are produced.

    Returns
    -------
    pandas.DataFrame
        Columns ['date', 'stock_code', 'open', 'close', 'return_0d', ...],
        sorted by stock_code then date, with a fresh 0..N-1 index. The last n
        rows of each stock have NaN in return_{n}d. Rows whose stock_code is
        missing are dropped. An empty input yields an empty frame with the
        same columns instead of raising.

    Raises
    ------
    ValueError
        If `base_col` is not 'open' or 'close', or `max_horizon` is negative.
    """
    price_cols = ['date', 'stock_code', 'open', 'close']
    if base_col not in ('open', 'close'):
        raise ValueError(f"base_col must be 'open' or 'close', got {base_col!r}")
    if max_horizon < 0:
        raise ValueError(f"max_horizon must be >= 0, got {max_horizon!r}")

    horizons = range(0, max_horizon + 1)
    return_cols = [f'return_{n}d' for n in horizons]

    # Selecting columns yields a new frame, so the caller's df is never mutated.
    # Rows without a stock_code cannot be assigned to any stock and are dropped.
    ordered = (
        df[price_cols]
        .dropna(subset=['stock_code'])
        .sort_values(['stock_code', 'date'])  # string dates sort chronologically
        .reset_index(drop=True)
    )

    # Nothing to compute: return the expected schema rather than failing.
    if ordered.empty:
        return ordered.reindex(columns=price_cols + return_cols)

    base_price = ordered[base_col]
    close_by_stock = ordered.groupby('stock_code')['close']
    for n, col in zip(horizons, return_cols):
        # Shifting inside each stock's group keeps one stock's prices from
        # leaking into the tail rows of the previous stock.
        future_close = close_by_stock.shift(-n)
        ordered[col] = (future_close - base_price) / base_price

    return ordered[price_cols + return_cols]
# Usage example: compute the return columns from merged_df and preview the first rows.
# NOTE(review): the stored output of this cell has non-zero return_0d values that match
# (close - open) / open (first row: (1588.43 - 1630.12) / 1630.12 = -0.025575), whereas
# the function as written measures returns from `close`. The output appears to come
# from an earlier revision of this cell -- re-run and confirm which base is intended.
returns_df = calculate_multi_day_returns(merged_df)
print(returns_df.head())



         date   stock_code     open    close  return_0d  return_1d  return_2d  \
0  2025-01-02  000001.XSHE  1630.12  1588.43  -0.025575  -0.029838  -0.024722   
1  2025-01-03  000001.XSHE  1589.82  1581.48  -0.005246   0.000000   0.006120   
2  2025-01-06  000001.XSHE  1581.48  1589.82   0.005274   0.011426   0.010547   
3  2025-01-07  000001.XSHE  1587.04  1599.55   0.007883   0.007007  -0.001752   
4  2025-01-08  000001.XSHE  1598.16  1598.16   0.000000  -0.008698  -0.017395   

   return_3d  return_4d  return_5d  
0  -0.018753  -0.019606  -0.028133  
1   0.005246  -0.003497  -0.012240  
2   0.001758  -0.007031  -0.015814  
3  -0.010510  -0.019262  -0.003503  
4  -0.026086  -0.010437  -0.001740  


In [7]:
# Spot-check one randomly drawn row of returns_df (no random_state is passed,
# so a different row is shown on each run).
returns_df.sample()

Unnamed: 0,date,stock_code,open,close,return_0d,return_1d,return_2d,return_3d,return_4d,return_5d
628357,2025-09-05,600702.XSHG,240.93,243.01,0.008633,0.031378,0.027892,0.010459,0.034408,0.002449


In [8]:
# Attach the return_0d ... return_5d columns from returns_df to merged_df with a
# left join on (date, stock_code).
# Note: running this cell a second time would join onto a frame that already has
# these columns, and pandas would then emit `_x` / `_y` suffixed duplicates.
returns_subset = returns_df[
    ['date', 'stock_code', 'return_0d', 'return_1d', 'return_2d', 'return_3d', 'return_4d', 'return_5d']
]
merged_df = merged_df.merge(returns_subset, on=['date', 'stock_code'], how='left')


In [9]:
# Write merged_df (the wide table) to widetable.parquet under D:\workspace\xiaoyao\data,
# without storing the DataFrame index.
merged_df.to_parquet(r'D:\workspace\xiaoyao\data\widetable.parquet', index=False)