In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd

import devkit.api as dk
import finkit.api as fk
import DataAPI
import Factor.api as factor

Populating the interactive namespace from numpy and matplotlib


# 设置宏观参数

In [2]:
# Indicator schema: one row per indicator after transposing; the `type`
# column distinguishes time-series indicators from financial-report ones.
schema = pd.DataFrame(DataAPI.schema.get_schema("indicator")).transpose()
inds_ts = schema.loc[schema['type'] == '时间序列'].index.tolist()
inds_fr = schema.loc[schema['type'] == '财报数据'].index.tolist()

In [4]:
# Dates returned by fk.get_monthly_last_trading_days for 2006-01-01 .. 2013-12-31.
# NOTE(review): the stock-pool cell further down assigns trading_days_ts again
# with end="2010-12-31", so on a top-to-bottom run this value is replaced
# before any visible use — confirm which end date is intended.
trading_days_ts = fk.get_monthly_last_trading_days(start="2006-01-01", end="2013-12-31")

# 确定股票池

In [4]:
# 1. Features: indicator names taken from the schema, split by indicator type.
schema = pd.DataFrame(DataAPI.schema.get_schema("indicator")).T
inds_ts = schema[schema.type == '时间序列'].index.tolist()
inds_fr = schema[schema.type == '财报数据'].index.tolist()
# 2. Time windows.
# The financial-report window starts one year earlier than the time-series window.
trading_days_ts = fk.get_monthly_last_trading_days(start="2006-01-01", end="2010-12-31")
trading_days_fr = fk.get_report_days(start="2005-01-01", end="2010-12-31")
# 3. Stock pool.
# From the CSI 1000 universe file, keep every stock whose industry
# (industry_sw) has more than MIN_STOCKS_PER_INDUSTRY listed companies.
# NOTE(review): the original comment said "more than 20" while the code has
# always filtered on > 15; the executed value (15) is kept — confirm intent.
MIN_STOCKS_PER_INDUSTRY = 15
zz1000 = pd.read_csv(r"E:\07_data\02_factor\temp_data\zz1000.csv")
# count() gives the number of non-null entries per column for each industry.
industry_num = zz1000.groupby(['industry_sw']).count()
industry_selected = industry_num[industry_num.sec_id > MIN_STOCKS_PER_INDUSTRY].index.tolist()
stocks_pool = zz1000[zz1000.industry_sw.isin(industry_selected)].sec_id.tolist()

# 获取指标

In [5]:
# Time-series indicators: fetch every indicator in inds_ts for the stock pool
# on the dates in trading_days_ts and outer-join them on (sec_id, date).
df_inds = pd.DataFrame()
for ind in inds_ts:
    a = DataAPI.read.get_secs_indicator_on_multidays(indicator=ind, sec_ids=stocks_pool, trading_days=trading_days_ts)
    # Tag each per-date frame with its date and stack them with one concat.
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration.
    frames = [a[date].assign(date=date) for date in a]
    if not frames:
        # No data came back for this indicator; merging an empty, column-less
        # frame on ['sec_id', 'date'] would fail, so skip it.
        continue
    b = pd.concat(frames)
    if len(df_inds) == 0:
        df_inds = b.copy()
    else:
        df_inds = df_inds.merge(b, how='outer', on=['sec_id', 'date'])

In [17]:
# Stack the per-date frames of `a` into one long frame with a `date` column.
# NOTE(review): `a` is not defined in this cell; it is whatever the preceding
# fetch loop left bound (the last indicator), so this cell only works after
# that loop has run.
# A single concat replaces the removed DataFrame.append-in-a-loop pattern.
frames = [a[date].assign(date=date) for date in a]
output = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Financial-report indicators, fetched without a sec_ids filter and
# outer-joined on (sec_id, date).
# NOTE(review): this cell originally passed `trading_days`, a name that is not
# defined anywhere visible (its only mention is a commented-out line), so it
# raised NameError. trading_days_fr is used instead because the loop is over
# inds_fr — confirm. `output` is not read by any later visible cell; the df_fr
# cell below appears to supersede this one.
output = pd.DataFrame()
for ind in inds_fr:
    a = DataAPI.read.get_secs_indicator_on_multidays(indicator=ind, trading_days=trading_days_fr)
    # One concat per indicator instead of DataFrame.append in a loop
    # (removed in pandas 2.0, quadratic copying).
    frames = [a[date].assign(date=date) for date in a]
    if not frames:
        # Nothing returned for this indicator; there is nothing to join.
        continue
    df_ind = pd.concat(frames)
    if len(output) == 0:
        output = df_ind.copy()
    else:
        output = output.merge(df_ind, how='outer', on=['sec_id', 'date'])

In [6]:
# For every observation date, look up the report date via
# dk.get_available_report_day (presumably the latest report usable on that
# date — confirm against devkit); it is the join key for the report data.
df_inds['date_available'] = df_inds['date'].map(dk.get_available_report_day)

In [7]:
# Financial-report indicators: fetch every indicator in inds_fr for the stock
# pool on the dates in trading_days_fr and outer-join them on (sec_id, date).
df_fr = pd.DataFrame()
for ind in inds_fr:
    a = DataAPI.read.get_secs_indicator_on_multidays(indicator=ind, sec_ids=stocks_pool, trading_days=trading_days_fr)
    # Tag each per-date frame with its date and stack them with one concat.
    # DataFrame.append was removed in pandas 2.0 and re-copied the growing
    # frame on every iteration.
    frames = [a[date].assign(date=date) for date in a]
    if not frames:
        # No data came back for this indicator; merging an empty, column-less
        # frame on ['sec_id', 'date'] would fail, so skip it.
        continue
    b = pd.concat(frames)
    if len(df_fr) == 0:
        df_fr = b.copy()
    else:
        df_fr = df_fr.merge(b, how='outer', on=['sec_id', 'date'])


In [8]:
# Rename the report frame's date column to the join key used on the
# time-series side, then attach the report indicators to each observation.
# A left join keeps every time-series row even when no report row matches.
df_fr = df_fr.rename(columns={'date': 'date_available'})
df_inds = pd.merge(df_inds, df_fr, how='left', on=['sec_id', 'date_available'])

In [18]:
# Persist the merged indicator frame (index included) to CSV.
# NOTE(review): the file name is spelled "featurs.csv"; the read_csv cell that
# reloads it uses the same spelling, so rename both together or not at all.
df_inds.to_csv(r"E:\07_data\02_factor\temp_data\featurs.csv")

In [19]:
# Reload the saved features, using the first CSV column as the index.
df_feats = pd.read_csv(r"E:\07_data\02_factor\temp_data\featurs.csv", index_col=0)
# Shift each observation date forward by one month and keep the first seven
# characters of the result as the join key `yearmonth`.
df_feats['yearmonth'] = df_feats['date'].map(
    lambda day: dk.date2char(dk.char2datetime(day) + dk.timedelta({'months': 1}))[:7]
)

# 获取下个月收益率

In [20]:
# Build the list of first and last trading days of every month.
tds = pd.read_csv(r"E:\07_data\02_factor\temp_data\trading_days.csv")
# Month key: the first seven characters of the date string.
tds['group'] = tds['date'].apply(lambda x: x[:7])
tds = tds.sort_values(['date'])
first = tds.groupby('group').head(1)
last = tds.groupby('group').tail(1)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# A month with a single trading day contributes the same row twice, as before.
tds = pd.concat([first, last])
tds = tds.sort_values('date')

# Closing prices of the stock pool on the month-start / month-end trading
# days, ordered by security and date, with a month key taken from the first
# seven characters of the date string.
close = factor.get_secs_index(
    index="close",
    trading_days=tds['date'].tolist(),
    sec_ids=stocks_pool,
).sort_values(['sec_id', 'date'])
close['yearmonth'] = close['date'].map(lambda day: day[:7])

# Monthly return helper
def cal_ret(x):
    """Return ``last close / first close - 1`` for the rows of ``x``.

    ``x`` is a DataFrame with a ``close`` column; rows are used in their
    existing order, so the caller is responsible for sorting by date.
    Raises ValueError when there is no ``close`` column.
    """
    close_pos = list(x.columns).index('close')
    first_close = x.iloc[0, close_pos]
    last_close = x.iloc[-1, close_pos]
    return last_close / first_close - 1
# One row per (sec_id, yearmonth) with that month's return in `month_ret`.
# Grouping with the default as_index=True makes apply() return a Series
# indexed by the two keys, so the value column is named explicitly here.
# The previous as_index=False + rename(columns={0: ...}) form only produced
# `month_ret` when the result happened to expose a column labelled 0, which
# depends on the pandas version.
df_target = (
    close.groupby(['sec_id', 'yearmonth'])
    .apply(cal_ret)
    .reset_index(name='month_ret')
)

# 合成原始数据集

In [21]:
# Pair every feature row with the return of its target month; rows without a
# matching (sec_id, yearmonth) return are dropped by the inner join.
df_raw = pd.merge(df_feats, df_target, how='inner', on=['sec_id', 'yearmonth'])

# Number the months 1..N in ascending order and attach that number to each row.
yearmonth = sorted(set(df_raw['yearmonth']))
group_id = pd.DataFrame(
    {'group_id': list(range(1, len(yearmonth) + 1)), 'yearmonth': yearmonth},
    columns=['group_id', 'yearmonth'],
)
df_raw = df_raw.merge(group_id, how='left', on=['yearmonth'])

# The date, sec_id and yearmonth columns are kept in the exported data set.
df_raw.to_csv(r"E:\07_data\02_factor\temp_data\df_raw.csv")

In [22]:
# Preview only the first rows instead of rendering the whole frame.
df_raw.head(10)

Unnamed: 0,sec_id,ADJFACTOR,date,AMT,ANNUALSTDEVR_100W,ANNUALYEILD_100W,AVG_TURN_ND,BETA_100W,BIAS,CLOSE,...,OR_TTM2,ROE_TTM3,VAL_EVTOEBITDA2,YOYEPS_BASIC,YOYEPS_DILUTED,YOYOCF,YOYROE,yearmonth,month_ret,group_id
0,000010.SZ,2.693637,2006-01-25,4.364675e+06,46.843319,-32.952400,2.977181,1.1919,-8.256881e+00,2.70,...,3.080643e+08,,-11.184724,,,-28.7477,-8864.1063,2006-02,0.028571,1
1,000018.SZ,1.282588,2006-01-25,1.358069e+06,50.678188,-17.160000,1.720053,1.5987,-6.126687e+00,4.52,...,2.357775e+08,,11.713248,,,-74.4242,-3979.6048,2006-02,0.000000,1
2,000022.SZ,2.334744,2006-01-25,6.055144e+07,30.878664,12.875200,5.061361,0.6385,2.768806e+00,14.16,...,1.891717e+09,,11.095338,,,38.1145,10.3858,2006-02,0.000000,1
3,000029.SZ,1.447462,2006-01-25,0.000000e+00,48.229294,-4.908800,7.879793,1.5283,2.968617e-01,4.73,...,2.845373e+08,,36.421974,,,194.7292,109.1604,2006-02,0.000000,1
4,000034.SZ,1.906248,2006-01-25,1.472515e+06,55.901188,-58.890003,1.923672,1.2344,-2.114804e+00,1.62,...,3.882864e+08,,20.282944,,,202.7708,,2006-02,0.084906,1
5,000035.SZ,1.280996,2006-01-25,2.037291e+06,61.047031,-57.189598,3.475731,1.3494,-1.069072e+01,2.03,...,2.800027e+08,,19.815156,,,98.2684,,2006-02,0.068966,1
6,000038.SZ,1.371282,2006-01-25,1.423429e+06,54.149612,-14.071199,1.736177,1.4871,-2.325179e+00,5.65,...,3.996776e+07,,19.160621,,,85.1273,-288.1595,2006-02,0.069343,1
7,000040.SZ,3.262954,2006-01-25,0.000000e+00,40.954014,-25.974001,1.250129,0.9503,6.479571e+00,2.58,...,5.368975e+08,,12.840418,,,-14.2793,-229.1141,2006-02,0.000000,1
8,000042.SZ,4.209509,2006-01-25,1.136267e+07,38.337105,11.772800,1.514022,1.2504,5.653146e+00,8.27,...,3.057493e+09,,14.140398,,,-119.1237,1117.5913,2006-02,-0.068592,1
9,000043.SZ,1.751271,2006-01-25,0.000000e+00,40.393711,10.644400,4.323170,1.2011,3.697526e+00,7.46,...,8.915229e+08,,-355.287256,,,-544.1779,-31.8605,2006-02,0.000000,1
