In [2]:
# 依次读取项目data目录下的parquet文件

import pandas as pd

# Load the daily price, industry, call-auction and market-cap tables.
price_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_price.parquet')
industry_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_industry.parquet')
auction_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_auction.parquet')
marketcap_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_marketcap.parquet')

# Cast `date` to str on the three daily tables so that all tables share a
# string join key.
for daily_frame in (price_df, industry_df, marketcap_df):
    daily_frame['date'] = daily_frame['date'].astype(str)

# The auction table keeps only the first 10 characters of the stringified
# date (the YYYY-MM-DD part).
# NOTE(review): presumably this column carries a time-of-day component — confirm.
auction_df['date'] = auction_df['date'].astype(str).str[:10]

In [3]:
# Keep only price rows dated 2025-01-01 or later. `date` is a string column
# at this point, so the comparison is lexicographic.
on_or_after_start = price_df['date'] >= '2025-01-01'
price_df = price_df[on_or_after_start]

In [9]:
# Left-join the industry, market-cap and auction tables onto the price table,
# matching on (date, stock_code). Every price row is kept; rows without a
# match in a joined table get NaN for that table's columns.
merged_df = (
    price_df
    .merge(industry_df, how='left', on=['date', 'stock_code'])
    .merge(marketcap_df, how='left', on=['date', 'stock_code'])
    .merge(auction_df, how='left', on=['date', 'stock_code'])
)

In [10]:
# Resolve the `_x` / `_y` suffixes left by the merges: the `_x` columns get
# their plain names back (`volume`, `money`) and the `_y` columns are
# prefixed with `auc_` (`auc_volume`, `auc_money`).
suffix_renames = {
    'volume_x': 'volume',
    'volume_y': 'auc_volume',
    'money_x': 'money',
    'money_y': 'auc_money',
}
merged_df = merged_df.rename(columns=suffix_renames)

In [11]:
# Display the column index of the merged table to check the renames.
merged_df.columns

Index(['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money',
       'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused',
       'zjw_industry_code', 'zjw_industry_name', 'jq_l1_industry_code',
       'jq_l1_industry_name', 'jq_l2_industry_code', 'jq_l2_industry_name',
       'sw_l1_industry_code', 'sw_l1_industry_name', 'sw_l2_industry_code',
       'sw_l2_industry_name', 'sw_l3_industry_code', 'sw_l3_industry_name',
       'capitalization', 'circulating_cap', 'market_cap',
       'circulating_market_cap', 'turnover_ratio', 'pe_ratio', 'pe_ratio_lyr',
       'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current', 'auc_volume',
       'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p',
       'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v',
       'b4_p', 'b4_v', 'b5_p', 'b5_v'],
      dtype='object')

In [None]:
# For each stock, compute the return from day T's open to the close n rows later, for n = 0..5
import pandas as pd

def calculate_multi_day_returns(df, max_days=5):
    """Compute open-to-close returns over horizons of 0..max_days rows per stock.

    For every row (day T of a stock) and every horizon ``n`` in
    ``0..max_days`` the return is::

        (close[T + n] - open[T]) / open[T]

    where ``T + n`` is the n-th following row of the same stock after
    sorting by ``date``. It is a positional offset, so missing dates in the
    input are not accounted for.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``stock_code``, ``date``, ``open`` and
        ``close``. The input frame is not modified.
    max_days : int, default 5
        Largest horizon to compute; horizon 0 (same-day open to close) is
        always included.

    Returns
    -------
    pandas.DataFrame
        One row per input row, sorted by ``stock_code`` then ``date``, with
        the columns ``date``, ``stock_code``, ``open`` followed by one
        ``'{n}日收益率'`` column per horizon. The last ``n`` rows of each stock
        have NaN in the n-day column because no future close exists.
    """
    # Sort so that, within each stock, "n rows later" means "n records later
    # in time"; reset the index so the shifted series align row-for-row.
    ordered = (
        df[['date', 'stock_code', 'open', 'close']]
        .sort_values(['stock_code', 'date'])
        .reset_index(drop=True)
    )

    # Grouping keeps a shift from pulling in the close of a different stock.
    close_by_stock = ordered.groupby('stock_code')['close']

    result = ordered[['date', 'stock_code', 'open']].copy()
    for n in range(max_days + 1):
        future_close = close_by_stock.shift(-n)
        # All horizons are written as columns of the same frame, giving one
        # row per (stock, date) rather than one row per (stock, date, n).
        result[f'{n}日收益率'] = (future_close - ordered['open']) / ordered['open']

    return result

# Example usage: build the returns table from the merged wide table
# (`merged_df`) and print its first five rows.
returns_df = calculate_multi_day_returns(df=merged_df)
print(returns_df.head(5))



In [15]:
# Write the merged wide table to the project's data directory as parquet,
# without storing the DataFrame index.
widetable_path = r'D:\workspace\xiaoyao\data\widetable.parquet'
merged_df.to_parquet(widetable_path, index=False)