In [42]:
import os
import alphalens as al
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import baostock as bs
%matplotlib inline

from pylab import mpl
mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False

IC回测

In [43]:
# 转换股票代码格式的函数
def convert_code(code):
    if code.endswith('.XSHG'):
        return 'sh.' + code[:6]
    elif code.endswith('.XSHE'):
        return 'sz.' + code[:6]
    else:
        return code

In [44]:
# 读取因子数据
factor_data = pd.read_csv('factor.csv')

# 合并日期和时间列为一个新的时间戳列
factor_data['timestamp'] = pd.to_datetime(factor_data['date'] + ' ' + factor_data['time'])

# 设置新的时间戳列为索引
factor_data.set_index('timestamp', inplace=True)

# 删除原始的日期和时间列
factor_data.drop(['date', 'time'], axis=1, inplace=True)

# 转换股票代码格式并转换因子数据格式为长格式
factor_data.columns = [convert_code(col) for col in factor_data.columns]
factor_data = factor_data.stack()
factor_data.index = factor_data.index.set_names(['timestamp', 'asset'])
factor_data = factor_data.reset_index()
factor_data.columns = ['timestamp', 'asset', 'factor']

# 确保索引是MultiIndex
factor_data.set_index(['timestamp', 'asset'], inplace=True)

print(factor_data)

                                 factor
timestamp           asset              
2023-06-07 10:00:00 sz.000001 -0.262344
                    sz.000002  0.297081
                    sz.000004  0.530078
                    sz.000005  0.034260
                    sz.000006 -0.613651
...                                 ...
2023-08-17 13:30:00 sz.000028 -0.051398
                    sz.000029 -1.083397
                    sz.000030  1.767867
                    sz.000031  0.623799
                    sz.000032 -2.315532

[2600 rows x 1 columns]


下载更新数据

In [45]:
# 读取因子数据
factor_data = pd.read_csv('factor.csv')
factor_data['timestamp'] = pd.to_datetime(factor_data['date'] + ' ' + factor_data['time'])
factor_data.set_index('timestamp', inplace=True)

# 提取所有股票代码并转换格式
stock_codes = [convert_code(code) for code in factor_data.columns[2:]]

# 提取日期范围
start_date = factor_data.index.min().strftime('%Y-%m-%d')
end_date = factor_data.index.max().strftime('%Y-%m-%d')

# 登录baostock
bs.login()

# 遍历所有股票代码并下载数据
for stock_code in stock_codes:
    data_list = []
    rs = bs.query_history_k_data_plus(stock_code,
                                      "date,time,code,open,high,low,close,volume,amount,adjustflag",
                                      start_date=start_date, end_date=end_date,
                                      frequency="30", adjustflag="3")
    while (rs.error_code == '0') & rs.next():
        row_data=rs.get_row_data()
        if row_data[1].endswith('100000000') or row_data[1].endswith('133000000'):
            data_list.append(row_data)

    # 转换为DataFrame并保存
    data_df = pd.DataFrame(data_list, columns=rs.fields)
    data_df.to_csv(f'C:/LIFT/price_data/{stock_code}.csv', index=False)

# 登出系统
bs.logout()

login success!
logout success!


<baostock.data.resultset.ResultData at 0x28b5630e130>

合并价格数据

In [46]:
# 假设所有价格数据文件都存储在这个目录下
price_data_directory = 'C:/LIFT/price_data/'

# 初始化空的DataFrame来存储所有股票的价格数据
all_prices = pd.DataFrame()

# 遍历目录中的每个文件
for filename in os.listdir(price_data_directory):
    if filename.endswith('.csv'):
        # 读取股票价格数据
        stock_price_data = pd.read_csv(os.path.join(price_data_directory, filename))

        # 处理日期和时间格式
        stock_price_data['date'] = pd.to_datetime(stock_price_data['date'], format='%Y-%m-%d')
        stock_price_data['time'] = stock_price_data['time'].astype(str).str.zfill(17)
        stock_price_data['time'] = pd.to_datetime(stock_price_data['time'], format='%Y%m%d%H%M%S%f').dt.time

        # 合并日期和时间为一个完整的timestamp
        stock_price_data['timestamp'] = pd.to_datetime(stock_price_data['date'].astype(str) + ' ' + stock_price_data['time'].astype(str))

        # 设置timestamp为索引
        stock_price_data.set_index('timestamp', inplace=True)

        # 提取收盘价并添加到主DataFrame
        all_prices[filename.rstrip('.csv')] = stock_price_data['close']

# 确保索引是日期时间戳
all_prices.index = pd.to_datetime(all_prices.index)


In [47]:
# 使用 stack 方法将 all_prices 转换为 MultiIndex Series
stacked_prices = all_prices.stack()

# 重新设置索引名称
stacked_prices.index.set_names(['timestamp', 'asset'], inplace=True)

# 给Series命名
stacked_prices.name = 'price'


In [48]:
print(stacked_prices)

timestamp            asset    
2023-06-07 10:00:00  sz.000001    11.98
                     sz.000002    14.59
                     sz.000004    15.28
                     sz.000005     1.17
                     sz.000006     4.75
                                  ...  
2023-08-17 13:30:00  sz.000028     33.8
                     sz.000029    12.58
                     sz.000030     5.27
                     sz.000031     4.23
                     sz.000032    29.42
Name: price, Length: 2598, dtype: object


In [49]:
print(all_prices.head())
print(factor_data.head())

                     sz.000001  sz.000002  sz.000004  sz.000005  sz.000006  \
timestamp                                                                    
2023-06-07 10:00:00      11.98      14.59      15.28       1.17       4.75   
2023-06-07 13:30:00      11.96      14.66      15.96       1.17       4.77   
2023-06-08 10:00:00      11.92      14.49      15.42       1.19       4.76   
2023-06-08 13:30:00      12.08      14.83      15.09       1.21       4.85   
2023-06-09 10:00:00      11.93      14.63      15.26       1.18       4.78   

                     sz.000007  sz.000008  sz.000009  sz.000010  sz.000011  \
timestamp                                                                    
2023-06-07 10:00:00       3.64       2.36      11.86       2.25       9.69   
2023-06-07 13:30:00       3.76       2.38      11.73       2.27       9.69   
2023-06-08 10:00:00       3.76       2.36      11.76       2.25       9.75   
2023-06-08 13:30:00       3.84       2.36      11.70       2.28

In [50]:
# 为 factor_data 提取时间
timestamps_factor = factor_data.index.get_level_values('timestamp')
times_factor = pd.to_datetime(timestamps_factor).time

# 为 stacked_prices 提取时间
timestamps_prices = stacked_prices.index.get_level_values('timestamp')
times_prices = pd.to_datetime(timestamps_prices).time

# 提取上午10:00的因子数据
factor_data_am = factor_data[times_factor == pd.Timestamp('10:00:00').time()]

# 提取上午10:00的价格数据
stacked_prices_am = stacked_prices[times_prices == pd.Timestamp('10:00:00').time()]

# 提取下午13:30的因子数据
factor_data_pm = factor_data[times_factor == pd.Timestamp('13:30:00').time()]

# 提取下午13:30的价格数据
stacked_prices_pm = stacked_prices[times_prices == pd.Timestamp('13:30:00').time()]


AttributeError: 'MultiIndex' object has no attribute 'tz'