In [19]:
import pandas as pd 
import datetime
import numpy as np 
from Alpha101_code_1 import get_alpha
from trade_date import *
from scipy.stats import pearsonr

class FactorBase:
    """
    一次计算 只能筛选一次factor，利用start_date 和 end_date来计算start_date~endate之间的持有收益与factor之间的相关性
    若需要多次筛选factors，则要重复创建不同时间窗口的FactorBase或其子类
    要注意提前保证 self.start_date 和 self.end_date 是交易日
    """

    def __init__(self, df:pd.DataFrame, start_date, end_date):
        self.init_data = df
        self.data4cal = pd.DataFrame()
        self.hs300 = pd.DataFrame()

        self.port_stk_num = 2
        self.ports = []

        self.original_factors = []
        self.raw_factors = [] # corr 初步筛选有效
        self.refined_factors = [] # port1 excess/below hs300

        self.factor_corr_dict = {}
        self.adjust_frequency = None
        self.start_date = start_date
        self.end_date = end_date
    
    def init(self):
        i = 0
        for code, data in self.init_data.groupby("code"):
            if i > 9:
                break
            self.data4cal = pd.concat([self.data4cal, self.getFactors(data)], axis=0)
            i += 1

    def getFactors(self, data):
        return get_alpha(data)
        #raise NotImplementedError

    # 在date日，按照factor_col 排序的 股票列表
    def codeList(self, date, factor_col):
        return self.data4cal.reset_index().set_index("trade_date").loc[date].sort_values(by=factor_col, ascending=False)["code"]

    # 将排好序的股票列表分割为指定大小的ports
    def splitIntoPorts(self, port_stk_num, code_list):
        ports = []
        code_list = list(code_list)
        split_num = len(code_list)//port_stk_num
        for i in range(split_num):
            ports.append(code_list[port_stk_num * i : port_stk_num * (i + 1)])
        if split_num * port_stk_num != len(code_list):
            # 如果非整除，则存在剩余的股票，一并算在最后
            ports.append(code_list[port_stk_num*(i+1) : ])
        return ports

    # 在date日，按照factor_col 加权的，相对“上一个”交易日的股票平均收益
    def weightedReturnSameDay(self, port, date, factor_col):
        df = self.data4cal.reset_index().set_index(["code", "trade_date"])
        return_list = []
        alpha_list = []
        for code in port:
            return_list.append(df.loc[(code, date), "S_DQ_PCTCHANGE"][0]/100)
            alpha_list.append(df.loc[(code, date), factor_col][0])
        return_array = np.array(return_list)
        alpha_array = abs(np.array(alpha_list))
        return ((return_array * alpha_array)/alpha_array.sum()).sum()

    def weightedReturnBetweenDays(self):
        # 是用未来的收益率来计算相关系数吗？
        # 然后假定因子仍有效，从下个月开始持有，直到下下个月？
        pass

    def factors_ports_corrCal(self, port_stk_num, date, next_date, factor_col):

    # 计算在next_date/date - 1 对应的ports的月收益，与ports的序号的相关性
    # 这里要注意和 prob_excess_hs300的日期区分，corr这里是计算date当日的相关系数， prob_excess_hs300是遍历了start_date, end_date的所有日期

        df = self.data4cal.reset_index().set_index("trade_date")
        if df[factor_col].isnull().all():
            return 0,0
        if df[factor_col].dtype == "bool":
            return 0,0
        
        code_list = df.loc[date].sort_values(by=factor_col, ascending=False)["code"]
        ports = self.splitIntoPorts(port_stk_num, code_list)
        ports_weighted_return_list = []
        df = df.reset_index().set_index(["code", "trade_date"])
        # next_date = get_next_trade_date(date, 20)
        for port in ports:
            port_return_list = []
            port_alpha_list = []
            for code in port:
                rtn = df.loc[(code, next_date), "S_DQ_CLOSE"][0]/df.loc[(code, date), "S_DQ_CLOSE"][0] - 1
                port_return_list.append(rtn) #某一个port中所有code的收益列表，可能存在Nan
                port_alpha_list.append(df.loc[(code, date), factor_col][0]) # 某一个port中所有code的factor列表，可能存在NaN
            # 存在NaN后，如何处理
            
            port_return_array = np.array(port_return_list)
            port_alpha_array = np.array(port_alpha_list)
            port_weighted_monthly_return = (port_return_array * port_alpha_array / port_alpha_array.sum()).sum()
            ports_weighted_return_list.append(port_weighted_monthly_return)
        
        # 无需理会ports_weighted_return_list 中的Nan，因为[6,5,4] 与 [return1, return2, return3,]的corr和[3,2,1]与[return1, return2, return3]的corr应该相等
        index = [len(ports_weighted_return_list) - i for i in range(len(ports_weighted_return_list))]
        corr = pearsonr(np.array(index), np.array(ports_weighted_return_list))[0]
        self.factor_corr_dict[factor_col] = corr
        return pearsonr(np.array(index), np.array(ports_weighted_return_list))

    def readHs300(self, df:pd.DataFrame):
        self.hs300 = df

    # 需要确保 start_date, end_date是交易日，没有做非交易日的处理
    def probExcessHs300(self, port, start_date, end_date, factor_col):
        """
        start_date ~ end_date 之间的每一天(date)，若因子的当日的加权收益超过hs300，则记录当天为超过hs300，否则为没超过
        概率 = 超过的天数/总交易天数
        """

        hs300_df = self.hs300.reset_index().set_index("trade_date").loc[start_date:end_date]
        port_return_list = []
        for date in trade_date_range(start_date, end_date):
            port_return_list.append(self.weightedReturnSameDay(port, date, factor_col))
        if len(port_return_list) != len(hs300_df):
            raise ValueError

        tmp_array = np.array(port_return_list) - np.array(hs300_df["return_daily"])
        gt_num = len(tmp_array[tmp_array > 0])
        return gt_num/len(port_return_list)
    
        
    # 保留了start_date, end_date作为函数形参，没有用self写死在函数内部，这样可读性会稍微好一些
    def probBelowHs300(self, port, start_date, end_date, factor_col):
        hs300_df = self.hs300.reset_index().set_index("trade_date").loc[start_date:end_date]
        port_return_list = []
        for date in trade_date_range(start_date, end_date):
            port_return_list.append(self.weightedReturnSameDay(port, date, factor_col))
        if len(port_return_list) != len(hs300_df):
            raise ValueError

        tmp_array = np.array(port_return_list) - np.array(hs300_df["return_daily"])
        lt_num = len(tmp_array[tmp_array < 0])
        return lt_num/len(port_return_list)


    def setFactors(self):
        # raise NotImplementedError
        self.original_factors =  ['alpha001', 'alpha002',
       'alpha003', 'alpha004', 'alpha005', 'alpha006', 'alpha007', 'alpha008',
       'alpha009', 'alpha010', 'alpha011', 'alpha012', 'alpha013', 'alpha014',
       'alpha015', 'alpha016', 'alpha017', 'alpha018', 'alpha019', 'alpha020',
       'alpha021', 'alpha022', 'alpha023', 'alpha024', 'alpha025', 'alpha026',
       'alpha027', 'alpha028', 'alpha029', 'alpha030', 'alpha031', 'alpha032',
       'alpha033', 'alpha034', 'alpha035', 'alpha036', 'alpha037', 'alpha038',
       'alpha039', 'alpha040', 'alpha041', 'alpha042', 'alpha043', 'alpha044',
       'alpha045', 'alpha046', 'alpha047', 'alpha049', 'alpha050', 'alpha051',
       'alpha052', 'alpha053', 'alpha054', 'alpha055', 'alpha057', 'alpha060',
       'alpha061', 'alpha062', 'alpha064', 'alpha065', 'alpha066', 'alpha068',
       'alpha071', 'alpha072', 'alpha073', 'alpha074', 'alpha075', 'alpha077',
       'alpha078', 'alpha081', 'alpha083', 'alpha084', 'alpha085', 'alpha086',
       'alpha088', 'alpha092', 'alpha094', 'alpha095', 'alpha096', 'alpha098',
       'alpha099', 'alpha101']

    # 通过next_date/date - 1的收益率来计算rawFactors
    def rawFactors(self):
        self.raw_factors = []
        for factor in self.original_factors:
            corr = self.factors_ports_corrCal(self.port_stk_num, self.start_date, self.end_date, factor)
            self.factor_corr_dict[factor] = corr
            if abs(corr[0]) > 0.5:
                self.raw_factors.append(corr[0])
        return self.raw_factors


    def refineFactors(self):
        for factor in self.raw_factors:
            # 我们必须在start_date 计算codelist啊，不然拿什么在未来的日子里计算持有期收益(不在start_date，计算就没有办法持仓了)
            code_list = self.codeList(self.start_date, factor)
            ports = self.splitIntoPorts(self.port_stk_num, code_list)
            if self.factor_corr_dict[factor] > 0:
                # start_date计算出后续需要持有的股票后，在持有期间计算大因子port跑赢hs300的概率(若正相关)
                if self.probExcessHs300(ports[0], self.start_date, self.end_date, factor) > 0.5:
                    self.refined_factors.append(factor)
            else:
                # start_date 计算出后续需要持有的股票后，在持有期间计算大因子port跑输hs300的概率(若负相关)
                if self.probBelowHs300(ports[0], self.start_date, self.end_date, factor) > 0.5:
                    self.refined_factors.append(factor)
            # 不考虑小因子的股票跑赢/跑输了，因为小因子可能会出现Nan效果不好

    def getHolding(self):
        pass

    def changeHolding(self):
        pass
    
def modify4alpha101(df):
    df.rename(columns={"代码":"code","简称":"short_name","日期":"trade_date","前收盘价(元)":"pre_close",
                            "开盘价(元)":"S_DQ_OPEN","最高价(元)":"S_DQ_HIGH","最低价(元)":"S_DQ_LOW",
                            "收盘价(元)":"S_DQ_CLOSE","成交量(股)":"S_DQ_VOLUME","成交金额(元)":"S_DQ_AMOUNT"
                            ,"涨跌(元)":"change","涨跌幅(%)":"S_DQ_PCTCHANGE","均价(元)":"S_DQ_AVEPRICE"}, inplace=True)
    df["trade_date"] = pd.to_datetime(df["trade_date"])
    df.set_index("trade_date", inplace=True)
    # 对于全市场数据就不要返回 tuple
    return df.loc["2018-05-01":"2019-04-30", ["code", "S_DQ_OPEN", "S_DQ_HIGH", "S_DQ_LOW", "S_DQ_CLOSE", "S_DQ_VOLUME", "S_DQ_AMOUNT", "S_DQ_PCTCHANGE"]]



In [6]:
raw_df = pd.read_csv("E:/data_all/A.csv")

In [7]:
df = modify4alpha101(raw_df)

分界

In [20]:
fac = FactorBase(df, "2018-05-08", get_next_trade_date("2018-05-18", 20))

In [21]:
fac.init()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inner[self.returns < 0] = stddev(self.returns, 20)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha001']=stock.alpha001()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stabl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha035']=stock.alpha035()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha036']=stock.alpha036()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha037']=stock.alpha037()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha071']=stock.alpha071()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha072']=stock.alpha072()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['alpha073']=stock.alpha073()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [22]:
fac.setFactors()

In [23]:
fac.rawFactors()

ValueError: array must not contain infs or NaNs

In [None]:
fac.factors_ports_corrCal(2, )