In [1]:
# Load the daily parquet datasets from the project's data directory.
import pandas as pd

# Only rows dated on or after this day are kept from every dataset.
START_DATE = '2025-01-01'


def _load_daily_frame(path, start_date=START_DATE, date_only=False):
    """Read one parquet file, keep rows from ``start_date`` on, and store ``date`` as text.

    Args:
        path: Location of the parquet file; the file must provide a ``date`` column.
        start_date: Inclusive lower bound compared against the ``date`` column.
        date_only: When True, keep only the first 10 characters of the
            stringified date.

    Returns:
        A new DataFrame, independent of the frame that was read, whose ``date``
        column holds strings.
    """
    frame = pd.read_parquet(path)
    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below writes to a real frame rather than to a slice
    # (assigning on the slice triggers SettingWithCopyWarning on pandas
    # versions without copy-on-write).
    frame = frame.loc[frame['date'] >= start_date].copy()
    date_text = frame['date'].astype(str)
    frame['date'] = date_text.str[:10] if date_only else date_text
    return frame


# Daily K-line prices, industry classification, market cap and concept membership.
price_df = _load_daily_frame(r'D:\workspace\xiaoyao\data\stock_daily_price.parquet')
industry_df = _load_daily_frame(r'D:\workspace\xiaoyao\data\stock_daily_industry.parquet')
marketcap_df = _load_daily_frame(r'D:\workspace\xiaoyao\data\stock_daily_marketcap.parquet')
concept_df = _load_daily_frame(r'D:\workspace\xiaoyao\data\stock_daily_concept.parquet')

# Call-auction data: only the date part (first 10 characters) of `date` is kept,
# presumably because its values carry a time component - TODO confirm.
auction_df = _load_daily_frame(
    r'D:\workspace\xiaoyao\data\stock_daily_auction.parquet',
    date_only=True,
)

In [2]:
# Build the stock_code -> stock_name lookup from stocks_info.csv: keep only the
# stock_code and display_name columns and expose display_name as stock_name.
stocks_info = (
    pd.read_csv(r"D:\workspace\xiaoyao\data\stocks_info.csv")
    .loc[:, ["stock_code", "display_name"]]
    .rename(columns={"display_name": "stock_name"})
)

In [3]:
# Collapse the concept table to one row per (date, stock_code); each row carries
# the list of concept names recorded for that stock on that day.
concept_groups = (
    concept_df
    # Sorting first fixes the order of the names inside each list (by concept_code).
    .sort_values(by=['date', 'stock_code', 'concept_code'])
    .groupby(['date', 'stock_code'])['concept_name']
)

# Aggregate each group into a list and publish it as the concept_name_list column.
sorted_concept_df = (
    concept_groups
    .agg(list)
    .rename('concept_name_list')
    .reset_index()
)

sorted_concept_df.head(3)

Unnamed: 0,date,stock_code,concept_name_list
0,2025-01-02,000001.XSHE,"[深股通, 证金持股, 融资融券, MSCI概念]"
1,2025-01-02,000002.XSHE,"[装配式建筑, 深股通, 证金持股, 融资融券, 超级品牌, 物业管理, MSCI概念]"
2,2025-01-02,000004.XSHE,"[物联网, 电力物联网, 5G, 网络安全, 信创, 华为概念, 车联网(车路协同)]"


In [4]:
# Left-join the five auxiliary frames onto price_df so every price row is kept.
# Stock names are static, so they join on stock_code alone.
merged_df = price_df.merge(stocks_info, on=['stock_code'], how='left')

# The remaining frames are daily data and join on (date, stock_code); the order
# matters because it decides the column order and any _x/_y suffixes.
for daily_frame in (industry_df, marketcap_df, auction_df, sorted_concept_df):
    merged_df = merged_df.merge(daily_frame, on=['date', 'stock_code'], how='left')

In [5]:
# Resolve the _x/_y suffixes left by the merges: the _x columns get their plain
# names back and the _y columns (presumably from the auction frame) get an
# auc_ prefix.
merged_df = merged_df.rename(columns={
    'volume_x': 'volume',
    'volume_y': 'auc_volume',
    'money_x': 'money',
    'money_y': 'auc_money',
})

In [6]:
# Display the column index of the merged wide table to check the final schema.
merged_df.columns

Index(['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money',
       'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused',
       'stock_name', 'zjw_industry_code', 'zjw_industry_name',
       'jq_l1_industry_code', 'jq_l1_industry_name', 'jq_l2_industry_code',
       'jq_l2_industry_name', 'sw_l1_industry_code', 'sw_l1_industry_name',
       'sw_l2_industry_code', 'sw_l2_industry_name', 'sw_l3_industry_code',
       'sw_l3_industry_name', 'capitalization', 'circulating_cap',
       'market_cap', 'circulating_market_cap', 'turnover_ratio', 'pe_ratio',
       'pe_ratio_lyr', 'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current',
       'auc_volume', 'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p',
       'a3_v', 'a4_p', 'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v',
       'b3_p', 'b3_v', 'b4_p', 'b4_v', 'b5_p', 'b5_v', 'concept_name_list'],
      dtype='object')

In [7]:
# NOTE(review): this whole cell is disabled (commented out); kept for reference only.
# NOTE(review): merged_df is produced by column-keyed merges, so its index is
# presumably a plain integer index rather than dates - in that case the
# `x.index >= ref_date` test below would never match. Confirm before re-enabling.
# import pandas as pd
# import numpy as np

# # 1. Make sure the index is datetime-typed (required precondition)
# merged_df.index = pd.to_datetime(merged_df.index, errors='coerce')
# ref_date = pd.to_datetime('2025-01-01')

# # 2. Final corrected version: tell scalars from non-scalars first, then handle NaN
# merged_df['concept_name_list'] = merged_df.groupby('stock_code')['concept_name_list'].transform(
#     lambda x: 
#         # Branch 1: when valid values exist, fill scalar NaN with a sampled value
#         x.fillna(x.loc[x.index >= ref_date].dropna().sample(1, random_state=42).iloc[0]) 
#         if not x.loc[x.index >= ref_date].dropna().empty 
#         # Branch 2: when no valid value exists, only replace "scalar NaN" with an empty list
#         else x.apply(lambda val: [] if (np.isscalar(val) and pd.isna(val)) else val)
# )

In [8]:
# Persist the merged wide table as parquet in the project's data directory;
# the DataFrame index is not written.
merged_df.to_parquet(
    path=r'D:\workspace\xiaoyao\data\widetable.parquet',
    index=False,
)

In [9]:
# Read the saved wide table back and export a small random sample as CSV in the
# current working directory for quick manual inspection.
import pandas as pd

df = pd.read_parquet(r'D:\workspace\xiaoyao\data\widetable.parquet')
# Cap the sample size at the row count (sample() raises when asked for more rows
# than exist without replacement) and fix the seed so re-runs export the same rows.
df.sample(n=min(5, len(df)), random_state=42).to_csv('./widetable_sample.csv', index=False)
