In [7]:
# Load the parquet inputs from the project's data directory, one file after another.

import pandas as pd

# Daily K-line prices, industry classification, call-auction data and market-cap data.
price_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_price.parquet')
industry_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_industry.parquet')
auction_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_auction.parquet')
marketcap_df = pd.read_parquet(r'D:\workspace\xiaoyao\data\stock_daily_marketcap.parquet')

# Cast `date` to str on the price, industry and market-cap frames (same order as loaded above).
for daily_frame in (price_df, industry_df, marketcap_df):
    daily_frame['date'] = daily_frame['date'].astype(str)

# For the auction frame, keep only the first 10 characters of the stringified `date`
# (the date part).
auction_date_text = auction_df['date'].astype(str)
auction_df['date'] = auction_date_text.str[:10]

In [8]:
# Keep only the price_df rows whose `date` compares >= the string '2025-01-01'.
price_df = price_df.loc[lambda frame: frame['date'] >= '2025-01-01']

In [9]:
# Left-join the industry, market-cap and auction frames onto price_df, in that order,
# matching rows on (date, stock_code). Non-key columns that exist on both sides of a
# merge receive pandas' default `_x` / `_y` suffixes.
join_keys = ['date', 'stock_code']
merged_df = price_df
for extra_df in (industry_df, marketcap_df, auction_df):
    merged_df = merged_df.merge(extra_df, on=join_keys, how='left')

In [10]:
# Rename the suffixed volume/money columns of merged_df in a single pass:
# the `_x` columns get their plain names back (`volume`, `money`) and the `_y`
# columns become `auc_volume` / `auc_money`.
merged_df = merged_df.rename(
    columns={
        'volume_x': 'volume',
        'money_x': 'money',
        'volume_y': 'auc_volume',
        'money_y': 'auc_money',
    }
)

In [11]:
# Show the column labels of merged_df as the cell output (nothing is assigned).
merged_df.columns

Index(['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money',
       'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused',
       'zjw_industry_code', 'zjw_industry_name', 'jq_l1_industry_code',
       'jq_l1_industry_name', 'jq_l2_industry_code', 'jq_l2_industry_name',
       'sw_l1_industry_code', 'sw_l1_industry_name', 'sw_l2_industry_code',
       'sw_l2_industry_name', 'sw_l3_industry_code', 'sw_l3_industry_name',
       'capitalization', 'circulating_cap', 'market_cap',
       'circulating_market_cap', 'turnover_ratio', 'pe_ratio', 'pe_ratio_lyr',
       'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current', 'auc_volume',
       'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p',
       'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v',
       'b4_p', 'b4_v', 'b5_p', 'b5_v'],
      dtype='object')

In [12]:
# Add a `stock_name` column: read stocks_info.csv, keep only stock_code and display_name,
# and expose display_name under the name stock_name.
# The rename is chained instead of using inplace=True: renaming in place on a column
# subset of another frame can raise SettingWithCopyWarning on pandas versions without
# copy-on-write.
stocks_info = (
    pd.read_csv(r"D:\workspace\xiaoyao\data\stocks_info.csv")[["stock_code", "display_name"]]
    .rename(columns={"display_name": "stock_name"})
)

# Left-join the names onto merged_df by stock_code. validate="many_to_one" makes pandas
# raise MergeError if stocks_info lists a stock_code more than once, rather than silently
# duplicating the matching rows of merged_df.
merged_df = pd.merge(
    merged_df,
    stocks_info,
    on="stock_code",
    how="left",
    validate="many_to_one",
)

In [13]:
# Write merged_df to widetable.parquet under D:\workspace\xiaoyao\data;
# index=False leaves the DataFrame index out of the file.
merged_df.to_parquet(
    path=r'D:\workspace\xiaoyao\data\widetable.parquet',
    index=False,
)