In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from tqdm import tqdm
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [9]:
#neutralization函数构建
def neutralization(factor_df,tar_col,barra_col,mode):
    """Winsorize, rescale and neutralize factor columns against risk exposures.

    For every column named in ``tar_col`` the function:

    1. clips values to ``mean +/- 3 * std`` of that column;
    2. rescales the clipped column according to ``mode``;
    3. regresses it on ``factor_df[barra_col]`` with ``sm.OLS`` (no constant
       term is added here) and keeps the residuals as the neutralized factor.

    Parameters
    ----------
    factor_df : pandas.DataFrame
        Frame holding both the factor columns and the regressor column(s).
        It is read only; the caller's frame is not modified.
    tar_col : iterable of column labels
        Factor columns to process.
    barra_col : column label or list of column labels
        Regressor column(s) passed to ``sm.OLS`` as the design matrix.
    mode : int or int-like
        1 = min-max scaling, 2 = z-score, 3 = division by the power of ten
        just above the largest absolute value. Any other value skips the
        rescaling step.

    Returns
    -------
    pandas.DataFrame
        One column of OLS residuals per entry of ``tar_col``.
    """
    # Normalize once so that 1, 1.0 and '1' all select the same branch.
    mode = int(mode)
    # The regressors do not depend on the factor being processed.
    exog = factor_df[barra_col]

    residuals = {}
    for col in tar_col:
        # Work on a local Series so the caller's frame is left untouched.
        series = factor_df[col]

        # (1) Winsorize: clamp everything outside mean +/- 3 std.
        center = series.mean()
        spread = series.std()
        series = series.clip(center - 3*spread, center + 3*spread)

        # (2) Rescale.
        if mode == 1:
            low = series.min()
            shifted = series - low
            scale = series.max() - low
        elif mode == 2:
            # Use this column's own scalar mean; subtracting the per-column
            # mean Series of several columns aligns on labels and yields NaN.
            shifted = series - series.mean()
            scale = series.std()
        elif mode == 3:
            peak = series.abs().max()
            shifted = series
            # log10(0) is -inf, so an all-zero column has no usable scale.
            scale = 10**np.ceil(np.log10(peak)) if peak > 0 else 0
        else:
            shifted = series
            scale = 1

        # A zero or undefined scale means the column carries no spread;
        # skip the division instead of producing NaN/inf for the regression.
        if np.isfinite(scale) and scale != 0:
            series = shifted / scale
        else:
            series = shifted

        # (3) Neutralize: the residual is the part of the factor that the
        # regressors do not explain linearly.
        fitted = sm.OLS(series, exog).fit()
        residuals[col] = fitted.resid
    return pd.DataFrame(residuals)

In [11]:
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('./data/date.pkl','rb') as date_fh:    # close the handle deterministically
    date=pickle.load(date_fh)

output_folder = './new_factor_neutralization/factor_final1_1'   # where results are saved
os.makedirs(output_folder,exist_ok=True)    # no check-then-create race
output_file=os.listdir(output_folder)

barra_path='./data/data_barra'     # Barra exposure files
barra_file=sorted(os.listdir(barra_path))

factor_path='./new_feature/factor_test'       # stock factor files
factor_file=sorted(os.listdir(factor_path))
factor_file_set=set(factor_file)    # constant-time membership test inside the loop

mode = 3    # standardization mode passed to neutralization()

for file in tqdm(barra_file):
    # Only process files present in both folders.
    if file not in factor_file_set:
        continue
    factor=pd.read_csv(f'{factor_path}/{file}',index_col=0,header=0)   # read factors
    barra_tmp=pd.read_csv(f'{barra_path}/{file}',index_col=0,header=0)   # read Barra exposures

    target_list=barra_tmp.index.intersection(factor.index)    # stock codes present in both frames
    if len(target_list)==0:
        # Nothing to regress on for this file.
        continue

    # Restrict BOTH frames to the common stock codes. Filtering only the Barra
    # frame turns the concat into an outer join, and stocks without a Barra row
    # would get a fabricated exposure of 0 from the fillna below.
    factor=factor.loc[target_list,:]
    barra_tmp=barra_tmp.loc[target_list,:]
    final=pd.concat([factor,barra_tmp],axis=1)

    # Treat +/-inf as missing, then fill missing values with 0
    # (could be replaced by e.g. the column mean).
    final=final.replace([np.inf,-np.inf],np.nan).fillna(0)

    # NOTE(review): the full Barra column list is collected here but only
    # 'size' is passed as the regressor below - confirm this is intended.
    final_col=barra_tmp.columns.tolist()
    col_name=factor.columns.tolist()      # factor names

    data=neutralization(final,col_name,'size',mode)     # run the neutralization
    data.to_csv(f'{output_folder}/{file}')     # save the result

100%|█████████████████████████████████████████| 949/949 [00:17<00:00, 55.02it/s]


In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from tqdm import tqdm
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

class NeutralizationProcessor:
    """Batch neutralization of per-date factor files against Barra exposures.

    Files with the same name in ``barra_path`` and ``factor_path`` are paired,
    the factor columns are winsorized, rescaled and regressed on the chosen
    Barra column(s), and the residuals are written to ``output_folder`` under
    the same file name.
    """

    def __init__(self, barra_path, factor_path, output_folder, mode, barra_col='size'):
        """Store the configuration.

        Parameters
        ----------
        barra_path : str
            Folder containing the Barra exposure CSV files.
        factor_path : str
            Folder containing the factor CSV files.
        output_folder : str
            Folder receiving the neutralized CSV files.
        mode : int
            Rescaling mode forwarded to :meth:`neutralize`.
        barra_col : column label or list of column labels, default 'size'
            Regressor column(s) used by :meth:`process_data`.
        """
        self.barra_path = barra_path
        self.factor_path = factor_path
        self.output_folder = output_folder
        self.mode = mode
        self.barra_col = barra_col

    def neutralize(self, factor_df, tar_col, barra_col, mode):
        """Winsorize, rescale and neutralize the columns listed in ``tar_col``.

        Each column is clipped to ``mean +/- 3 * std``, rescaled according to
        ``mode`` (1 = min-max, 2 = z-score, 3 = division by the power of ten
        just above the largest absolute value, anything else = no rescaling)
        and regressed on ``factor_df[barra_col]`` with ``sm.OLS``; no constant
        term is added here. The caller's frame is not modified.

        Returns
        -------
        pandas.DataFrame
            One column of OLS residuals per entry of ``tar_col``.
        """
        # The regressors are the same for every factor column.
        exog = factor_df[barra_col]

        residuals = {}
        for col in tar_col:
            # Local Series: never write back into the caller's frame.
            series = factor_df[col]

            center = series.mean()
            spread = series.std()
            series = series.clip(center - 3 * spread, center + 3 * spread)

            if mode == 1:
                low = series.min()
                shifted = series - low
                scale = series.max() - low
            elif mode == 2:
                # Scalar mean of this column; a per-column mean Series would
                # align on labels and turn the whole column into NaN.
                shifted = series - series.mean()
                scale = series.std()
            elif mode == 3:
                peak = series.abs().max()
                shifted = series
                # log10(0) is -inf, so an all-zero column has no usable scale.
                scale = 10**np.ceil(np.log10(peak)) if peak > 0 else 0
            else:
                shifted = series
                scale = 1

            # Skip the division when the scale is zero or undefined so that
            # constant columns do not feed NaN/inf into the regression.
            if np.isfinite(scale) and scale != 0:
                series = shifted / scale
            else:
                series = shifted

            fitted = sm.OLS(series, exog).fit()
            residuals[col] = fitted.resid
        return pd.DataFrame(residuals)

    def process_data(self):
        """Neutralize every file present in both input folders and save it."""
        os.makedirs(self.output_folder, exist_ok=True)

        barra_file = sorted(os.listdir(self.barra_path))
        factor_file = set(os.listdir(self.factor_path))

        for file in tqdm(barra_file):
            if file not in factor_file:
                continue
            factor = pd.read_csv(f'{self.factor_path}/{file}', index_col=0, header=0)
            barra_tmp = pd.read_csv(f'{self.barra_path}/{file}', index_col=0, header=0)

            target_list = barra_tmp.index.intersection(factor.index)
            if len(target_list) == 0:
                # No stock appears in both files; nothing to regress.
                continue

            # Restrict both frames to the shared stock codes. Filtering only
            # one side would make the concat an outer join and give unmatched
            # stocks a fabricated exposure of 0 after fillna.
            factor = factor.loc[target_list, :]
            barra_tmp = barra_tmp.loc[target_list, :]
            final = pd.concat([factor, barra_tmp], axis=1)

            # Treat +/-inf as missing, then fill missing values with 0.
            final = final.replace([np.inf, -np.inf], np.nan).fillna(0)

            col_name = factor.columns.tolist()
            data = self.neutralize(final, col_name, self.barra_col, self.mode)
            data.to_csv(f'{self.output_folder}/{file}')

# Configuration: input folders, output folder and rescaling mode.
barra_path = './data/data_barra'
factor_path = './new_feature/factor_test'
output_folder = './new_factor_neutralization/factor_final1_1'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_folder, exist_ok=True)
mode = 3

# Build the processor from the configuration above and run it over every
# file present in both input folders.
processor = NeutralizationProcessor(barra_path, factor_path, output_folder, mode)
processor.process_data()


100%|█████████████████████████████████████████| 949/949 [00:16<00:00, 56.07it/s]
