In [1]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from utils.utils import normalize, func_list
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Raw close-price file and the sample window handed to get_factor_data.
FILE_PATH = "../data/raw/zz800_close.CSV"
START_DATE, END_DATE = "2010-01-01", "2020-12-31"

# Look-back used for the return factor: return over the past 20 days.
N_RETURN = 20
# Largest share of missing values a column may have before it is dropped.
NAN_DROP_RATIO = 0.1

In [3]:
def get_factor_data(
        file_path,
        start_date,
        end_date,
        n_return = 20,
        nan_drop_ratio=0.1,
    ):
    """Build a table of trailing ``n_return``-row returns from a close-price CSV.

    The CSV is read with GBK encoding. Its first (unnamed) column holds dates
    stored as ``YYYYMMDD`` numbers, the remaining columns hold one price series
    each, and the first two data rows are discarded.
    NOTE(review): those two rows are presumably metadata rows of the vendor
    export -- confirm against the raw file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the close-price CSV file.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the returned sample; parsed with ``pd.to_datetime``.
    n_return : int, default 20
        Number of rows (observations) over which the percentage change is
        computed.
    nan_drop_ratio : float, default 0.1
        Columns whose share of missing prices inside the loaded window exceeds
        this ratio are dropped.

    Returns
    -------
    pandas.DataFrame
        A ``date`` column followed by one return column per retained price
        series, restricted to ``[start_date, end_date]`` and free of NaN.

    Raises
    ------
    AssertionError
        If any missing value survives the final forward/backward fill (for
        example a column that is entirely NaN inside the window).
    """
    start_date, end_date = pd.to_datetime(start_date), pd.to_datetime(end_date)

    # Read the file the caller asked for; the path used to be hard-coded here,
    # which silently ignored the `file_path` argument.
    df = pd.read_csv(file_path, encoding='gbk')
    df = df.rename(columns={'Unnamed: 0': 'date'})
    # The source file carries an empty trailing column; tolerate files without it.
    df = df.drop(columns=['Unnamed: 801'], errors='ignore')
    df = df.drop([0, 1]).reset_index(drop=True)
    df['date'] = pd.to_datetime(df['date'].astype(int).astype(str))

    # Load extra history before start_date (5 calendar days per return row) so
    # the first rows of the sample already have a full look-back window.
    lookback_start = start_date - pd.Timedelta(days=n_return * 5)
    df = df[(df['date'] >= lookback_start) & (df['date'] <= end_date)]

    # Keep only columns that are sufficiently complete inside the window.
    cols = (df.isnull().sum(0) / len(df) <= nan_drop_ratio).values
    df = df.iloc[:, cols]
    df = df.set_index('date')

    # Forward-fill price gaps explicitly, then take the n_return-row change.
    # Equivalent to the deprecated pct_change(..., fill_method='ffill').
    df = df.ffill().pct_change(n_return, fill_method=None)
    df = df[(df.index >= start_date) & (df.index <= end_date)]

    # Fill returns that are still missing, first from the past, then from the
    # future. NOTE(review): the backward fill uses later observations -- confirm
    # this look-ahead is acceptable for the downstream use.
    df = df.ffill().bfill()
    assert df.isnull().sum().sum() == 0
    df = df.reset_index()
    return df

In [4]:
# Build the return-factor table from the configured file and sample window.
df_factor = get_factor_data(
    file_path=FILE_PATH,
    start_date=START_DATE,
    end_date=END_DATE,
    n_return=N_RETURN,
    nan_drop_ratio=NAN_DROP_RATIO,
)
df_factor

Unnamed: 0,date,000001.SZ,000002.SZ,000009.SZ,000012.SZ,000021.SZ,000027.SZ,000031.SZ,000039.SZ,000050.SZ,...,601888.SH,601898.SH,601899.SH,601919.SH,601939.SH,601958.SH,601988.SH,601989.SH,601991.SH,601998.SH
0,2010-01-04,-0.062846,-0.114453,-0.140364,-0.020192,-0.018726,-0.042144,-0.177044,0.076480,0.040063,...,-0.002405,-0.077449,-0.104287,-0.102752,-0.012908,-0.117316,-0.002324,-0.018072,-0.058823,0.110039
1,2010-01-05,-0.081230,-0.157038,-0.176929,-0.041206,0.000000,-0.044223,-0.204444,0.071025,0.056315,...,-0.003835,-0.041842,-0.076203,-0.091674,0.001615,-0.081804,0.011648,-0.018072,-0.037735,0.087885
2,2010-01-06,-0.075494,-0.148026,-0.171250,-0.011558,-0.042446,-0.035328,-0.195815,0.101957,0.028918,...,-0.002468,-0.011910,-0.079847,-0.060621,0.004926,-0.079942,0.014198,-0.018072,-0.027629,0.051660
3,2010-01-07,-0.081136,-0.145470,-0.141375,-0.035215,-0.002958,-0.028232,-0.164110,0.111111,0.020410,...,0.046687,-0.011528,-0.075800,-0.061087,0.003339,-0.059481,0.016897,-0.018072,-0.024019,0.053576
4,2010-01-08,-0.092005,-0.130982,-0.091128,-0.058733,0.015083,-0.019274,-0.149381,0.102649,0.030912,...,0.032729,-0.009329,-0.091524,-0.074732,0.000000,-0.085270,0.014379,-0.018072,-0.029478,0.030630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2669,2020-12-25,-0.084264,-0.111041,0.012001,0.130935,-0.076049,0.105719,-0.090721,-0.014744,-0.035363,...,0.340579,0.020272,0.035754,0.146860,-0.108111,0.001599,-0.045054,-0.061790,0.004215,-0.071433
2670,2020-12-28,-0.045086,-0.074267,-0.012096,0.109489,-0.119105,0.148149,-0.084033,0.054990,-0.071755,...,0.333333,0.022475,0.016148,0.208672,-0.127271,0.040990,-0.039387,-0.062204,0.042195,-0.032441
2671,2020-12-29,-0.043890,-0.072520,-0.018742,0.020202,-0.110681,0.068542,-0.098361,-0.041722,-0.102775,...,0.301786,-0.020181,-0.030888,0.243551,-0.143836,-0.008149,-0.056542,-0.070769,0.008449,-0.059377
2672,2020-12-30,-0.021905,-0.088339,0.051317,0.071533,-0.126010,0.095744,-0.113361,-0.012113,-0.116829,...,0.345745,-0.019652,-0.008556,0.197528,-0.149111,0.006467,-0.048049,-0.068492,0.012719,-0.048684


In [5]:
# Factor standardization: apply `normalize` (imported in the first cell) to each
# row, i.e. across all factor columns of one date; column 0 (`date`) is untouched.
# NOTE(review): the exact scaling is defined in utils.utils.normalize -- confirm there.
# df_factor is overwritten in place, so re-running this cell normalizes values
# that were already normalized.
df_factor.iloc[:, 1:] = df_factor.iloc[:, 1:].apply(normalize, axis=1)

In [9]:
# Uncomment to write the standardized factor table to disk:
# df_factor.to_csv('../data/processed/factor_data.csv', index=False)