In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd

import devkit.api as dk
import finkit.api as fk
import DataAPI
import Factor.api as factor

Populating the interactive namespace from numpy and matplotlib


# 获取原始数据集

In [3]:
# Load the raw dataset from a local CSV export; the file's first column becomes the row index.
# NOTE(review): absolute Windows path -- the notebook only runs on a machine with this layout.
df_raw = pd.read_csv(r"E:\07_data\02_factor\temp_data\df_raw.csv", index_col=0)

In [5]:
df_raw.head(3)

Unnamed: 0,sec_id,ADJFACTOR,date,AMT,ANNUALSTDEVR_100W,ANNUALYEILD_100W,AVG_TURN_ND,BETA_100W,BIAS,CLOSE,...,OR_TTM2,ROE_TTM3,VAL_EVTOEBITDA2,YOYEPS_BASIC,YOYEPS_DILUTED,YOYOCF,YOYROE,yearmonth,month_ret,group_id
0,000010.SZ,2.693637,2006-01-25,4364675.2,46.843319,-32.9524,2.977181,1.1919,-8.256881,2.7,...,308064300.0,,-11.184724,,,-28.7477,-8864.1063,2006-02,0.028571,1
1,000018.SZ,1.282588,2006-01-25,1358068.6,50.678188,-17.16,1.720053,1.5987,-6.126687,4.52,...,235777500.0,,11.713248,,,-74.4242,-3979.6048,2006-02,0.0,1
2,000022.SZ,2.334744,2006-01-25,60551439.6,30.878664,12.8752,5.061361,0.6385,2.768806,14.16,...,1891717000.0,,11.095338,,,38.1145,10.3858,2006-02,0.0,1


# 删除极端收益率样本：只取收益率介于 5%-95% 分位之间的样本

In [6]:
# Trim extreme-return samples: within each month keep only the rows whose
# monthly return lies strictly between that month's 5% and 95% quantiles.
monthly_ret = df_raw.groupby(['yearmonth']).month_ret
df_lp = monthly_ret.quantile(0.05).reset_index().rename(columns={'month_ret': 'lp'})
df_up = monthly_ret.quantile(0.95).reset_index().rename(columns={'month_ret': 'up'})
# Attach both per-month bounds to every row.
df_raw = (
    df_raw
    .merge(df_lp, how='left', on='yearmonth')
    .merge(df_up, how='left', on='yearmonth')
)
inside_band = df_raw.month_ret.gt(df_raw.lp) & df_raw.month_ret.lt(df_raw.up)
# The helper bound columns are only needed for the filter, so drop them again.
df_raw = df_raw[inside_band].drop(['lp', 'up'], axis=1)

# 获得标签：每月收益率处于同期前30%的为1（强势股），位于后30%的为0（弱势股）

In [7]:
# Build the binary label `good_yn` per month:
#   month_ret below the month's 30% quantile  -> 0 (weak stock)
#   month_ret above the month's 70% quantile  -> 1 (strong stock)
#   anything in between (or a NaN return)     -> NaN, and the row is dropped.
monthly_ret = df_raw.groupby(['yearmonth']).month_ret
df_lp = monthly_ret.quantile(0.3).reset_index().rename(columns={'month_ret': 'lp'})
df_up = monthly_ret.quantile(0.7).reset_index().rename(columns={'month_ret': 'up'})
df_raw = df_raw.merge(df_lp, how='left', on='yearmonth')
df_raw = df_raw.merge(df_up, how='left', on='yearmonth')

# Vectorised replacement for the former row-by-row `.loc` loop: same labels,
# but computed in three column operations, and the column exists even when the
# frame is empty (the loop never created it in that case).
df_raw['good_yn'] = np.nan
df_raw.loc[df_raw.month_ret > df_raw.up, 'good_yn'] = 1
# Assigned last so the "weak" branch keeps the precedence it had in the if/elif chain.
df_raw.loc[df_raw.month_ret < df_raw.lp, 'good_yn'] = 0

# The bounds and the raw return are dropped once the label has been derived.
df_raw = df_raw.drop(['lp', 'up', 'month_ret'], axis=1)
df_raw = df_raw[df_raw.good_yn.notnull()]

# 删除缺失过多的特征

In [8]:
# Keep only the columns in which fewer than 10% of the values are missing,
# then drop every row that still contains a NaN and renumber the rows from 0.
missing_share = df_raw.isnull().sum() / len(df_raw)
feats_to_retain = missing_share[missing_share < 0.1].index
df_raw = df_raw[feats_to_retain].dropna().reset_index(drop=True)
# Remove the date column from the retained set.
df_raw = df_raw.drop('date', axis=1)

In [12]:
df_raw.columns

Index(['sec_id', 'ADJFACTOR', 'AMT', 'AVG_TURN_ND', 'BIAS', 'CLOSE',
       'DOWN_DAYS', 'MKT_CAP_FLOAT', 'PB_LF', 'PCF_OCF_TTM', 'PS_TTM',
       'RISK_GAINVARIANCE120', 'RISK_GAINVARIANCE20', 'RISK_GAINVARIANCE60',
       'RISK_LOSSVARIANCE120', 'RISK_LOSSVARIANCE20', 'RISK_LOSSVARIANCE60',
       'RISK_VARIANCE120', 'RISK_VARIANCE20', 'RISK_VARIANCE60', 'RSI',
       'UP_DAYS', 'VOLUME', 'date_available', 'APTURN', 'ARTURN', 'CURRENT',
       'DEBTTOASSETS', 'DIVIDENDYIELD2', 'EBIT2_TTM', 'EV2_TO_EBITDA',
       'OPTOEBT', 'OR_TTM2', 'VAL_EVTOEBITDA2', 'YOYOCF', 'yearmonth',
       'group_id', 'good_yn'],
      dtype='object')

In [337]:
# Names of the feature columns used downstream.
# NOTE(review): these lower-case names do not appear in the `df_raw.columns`
# output displayed earlier in the notebook (which lists upper-case names such
# as 'PS_TTM'), and the execution counter jumps from 12 to 337 -- confirm which
# dataset this cell is meant to run against.
X = ['val_pe_deducted_ttm', 'ps_ttm', 'yoy_or', 'yoy_assets',
       'debttoassets', 'current', 'quick', 'cashtocurrentdebt', 'turnover_ttm',
       'invturn', 'netprofitmargin', 'grossprofitmargin', 'roe_diluted', 'roa',
       'vol_180days', 'vol_90days', 'vol_30days', 'vol_5days', 'turn_360days',
       'turn_180days', 'turn_90days', 'turn_30days', 'turn_5days',
       'rr_180days', 'rr_90days', 'rr_30days', 'rr_5days', 'boll_15',
       'boll_30', 'boll_45', 'boll_60', 'boll_100']
# Identifier columns plus the label; used below to carve out df_target.
y = ['sec_id', 'yearmonth','good_yn']
# Features that are neutralised by industry.
feats1 = ['val_pe_deducted_ttm', 'ps_ttm', 'yoy_or', 'yoy_assets',
       'debttoassets', 'current', 'quick', 'cashtocurrentdebt', 'turnover_ttm',
       'invturn', 'netprofitmargin', 'grossprofitmargin', 'roe_diluted', 'roa']
# Features that get plain standardisation: everything in X that is not in feats1.
# Built with a comprehension rather than a set difference so the order follows X
# and is the same on every run (iteration order of a set of strings can differ
# between interpreter sessions, which would reshuffle the saved columns).
feats2 = [name for name in X if name not in feats1]
df_target = df_raw[y]

########################下面的步骤不影响特征个数和样本个数 如需变动 请在上面更改########################


# 行业中性化

In [338]:
# Read the industry mapping and left-join it onto the samples by security id,
# so rows without a match are kept.
industry = pd.read_csv(r"E:\07_data\02_factor\temp_data\industry_sw.csv")
df_raw = pd.merge(df_raw, industry, on=['sec_id'], how='left')

In [339]:
# Working table for industry neutralisation: identifiers, the industry label and the feats1 columns.
df_feats1 = df_raw[['sec_id', 'yearmonth', 'industry_sw']+(feats1)]

In [340]:
# Per (month, industry) mean and standard deviation of every feats1 column.
# The feature columns are selected explicitly before aggregating: df_feats1
# also carries the string column `sec_id`, and relying on pandas to silently
# skip it fails on recent pandas versions and would otherwise misalign the
# positional column rename below.
group_keys = ['yearmonth', 'industry_sw']

industry_mean = df_feats1.groupby(group_keys)[feats1].mean().reset_index()
industry_mean.columns = group_keys + [i + '_mean' for i in feats1]
industry_mean = industry_mean.dropna()

industry_std = df_feats1.groupby(group_keys)[feats1].std().reset_index()
industry_std.columns = group_keys + [i + '_std' for i in feats1]
# Groups whose std is NaN for any feature are removed here, so their rows
# disappear in the later inner merge.
industry_std = industry_std.dropna()


In [341]:
# Attach the per-(month, industry) mean/std columns to each sample (inner
# joins: samples without group statistics are discarded), then turn every 0
# into NaN and drop all rows containing a NaN.
# NOTE(review): replace(0, np.nan) is applied to the whole frame, so a feature
# value of exactly 0 removes the row as well, not only a zero std -- confirm
# this is intended.
df_feats1 = (
    df_feats1
    .merge(industry_mean, how='inner', on=['yearmonth', 'industry_sw'])
    .merge(industry_std, how='inner', on=['yearmonth', 'industry_sw'])
    .replace(0, np.nan)
    .dropna()
)

In [342]:
# Industry-neutral z-score: (value - group mean) / group std for every feats1
# column. Done as one array operation instead of a Python loop over every
# (row, feature) cell with `.loc`, which scales very poorly with sample count.
mean_cols = [f + '_mean' for f in feats1]
std_cols = [f + '_std' for f in feats1]
# `.values` strips the column labels so the three blocks combine positionally
# (feature k with its own k-th mean and std column).
df_feats1[feats1] = (
    df_feats1[feats1].values - df_feats1[mean_cols].values
) / df_feats1[std_cols].values
# Keep only the identifiers and the standardised features; this also discards
# the industry label and the helper *_mean / *_std columns.
df_feats1 = df_feats1[['sec_id', 'yearmonth'] + feats1]

# 统计标准化

In [343]:
# Working table for plain standardisation: the feats2 columns, indexed by (sec_id, yearmonth).
id_cols = ['sec_id', 'yearmonth']
df_feats2 = df_raw[id_cols + feats2].set_index(id_cols)

In [344]:
# Column-wise z-score computed over all rows of the table at once.
col_mean = df_feats2.mean()
col_std = df_feats2.std()
df_feats2 = (df_feats2 - col_mean) / col_std

In [345]:
# Turn the (sec_id, yearmonth) index back into ordinary columns so they can serve as merge keys.
df_feats2 = df_feats2.reset_index()

In [347]:
# Combine both standardised feature tables; the inner join keeps only the
# (sec_id, yearmonth) pairs present in both. The keys then become the index so
# that only feature columns remain as data.
merge_keys = ['sec_id', 'yearmonth']
df_all = (
    df_feats1
    .merge(df_feats2, how='inner', on=merge_keys)
    .set_index(merge_keys)
)

# iForest剔除极端样本

In [348]:
from sklearn.ensemble import IsolationForest
# A fixed random_state makes the set of removed samples reproducible; without
# it every run builds different random trees and drops different rows.
clf = IsolationForest(random_state=42)
# Isolation Forest is unsupervised, but it still has to be fitted on the
# unlabeled feature matrix before predict() can be called on it.
clf.fit(df_all)
y_pred = clf.predict(df_all)  # 1: inlier, -1: outlier
# Keep inliers only, then move (sec_id, yearmonth) from the index back into columns.
df_all = df_all[y_pred == 1].reset_index()

In [349]:
# Join the labels back onto the cleaned features by (sec_id, yearmonth) and
# persist the resulting training table to disk.
df_all = pd.merge(df_all, df_target, on=['sec_id', 'yearmonth'], how='inner')
df_all.to_csv(r"E:\07_data\02_factor\temp_data\train_for_class.csv")

In [2]:
# Reload the saved training table; the first CSV column (the index written by to_csv) is used as the row index.
df_all = pd.read_csv(r"E:\07_data\02_factor\temp_data\train_for_class.csv", index_col=0)

In [4]:
# Remove the identifier columns from the training table.
df_all = df_all.drop(['sec_id', 'yearmonth'], axis=1)

# 确定特征向量和目标变量

In [5]:
# Reload the training table, strip the identifier columns and split it into
# the feature matrix X (every column but the last) and the target y (the last
# column, which the upstream merge leaves as `good_yn`).
df_all = (
    pd.read_csv(r"E:\07_data\02_factor\temp_data\train_for_class.csv", index_col=0)
    .drop(['sec_id', 'yearmonth'], axis=1)
)
X, y = df_all.iloc[:, :-1], df_all.iloc[:, -1]

In [6]:
from sklearn.decomposition import PCA

In [7]:
# PCA with default settings; the number of components to keep is chosen afterwards from the explained-variance ratios.
pca = PCA()

In [8]:
# Fit the decomposition on the full feature matrix (the estimator repr is shown as the cell output).
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [9]:
# `a` = number of leading principal components whose cumulative
# explained-variance ratio is still <= 0.95; `b` = the share of variance those
# `a` components explain together (so b never exceeds 0.95).
cum_ratio = pca.explained_variance_ratio_.cumsum()
within_budget = cum_ratio <= 0.95
a = within_budget.sum()
b = pca.explained_variance_ratio_[:a].sum()
print("前{}个主成分可以解释{}的变差".format(a, b))

前18个主成分可以解释0.9433991757327469的变差


In [10]:
# Project X onto the principal components and keep only the first `a` of them.
X_pca = pca.transform(X)[:, :a]

##################################################建模部分##################################################

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.datasets i

# 逻辑回归LR

In [21]:
# Logistic-regression classifier with library-default hyperparameters.
lr = LogisticRegression()

In [22]:
# 10-fold cross-validation on the PCA-reduced features, scored by precision.
# NOTE(review): the tree and SVC cells call cross_val_score without `scoring`,
# so their numbers are not on the same metric as this one.
lr_cv = cross_val_score(lr, X_pca, y, cv=10, scoring="precision")

In [23]:
lr_cv

array([ 0.52667646,  0.53629417,  0.53833049,  0.53889304,  0.51725572,
        0.52588556,  0.52021403,  0.57348901,  0.53236308,  0.5286031 ])

In [380]:
# `tree` is not defined in any visible cell, so this cell raises NameError on a
# fresh kernel. It is defined here from the already-imported
# DecisionTreeClassifier with default settings.
# NOTE(review): the hyperparameters of the original `tree` object are not
# recoverable from the notebook -- confirm defaults are what was intended.
tree = DecisionTreeClassifier()
# 10-fold cross-validation on the un-reduced feature matrix X with the
# estimator's default scorer (no `scoring` argument is passed).
tree_cv = cross_val_score(tree, X, y, cv=10)

In [375]:
lr_cv.mean()

0.53391574643288586

In [381]:
tree_cv.mean()

0.50137089443784144

In [12]:
# RBF-kernel support-vector classifier with hand-set C and gamma.
svc= SVC(kernel='rbf', C=0.1, gamma=0.003)

In [15]:
# 10-fold cross-validation of the SVC on the PCA-reduced features, using the estimator's default scorer.
svc_cv = cross_val_score(svc, X_pca, y, cv=10)

In [17]:
svc_cv

array([ 0.53286177,  0.52814618,  0.53020925,  0.52637784,  0.51105217,
        0.52519894,  0.50619104,  0.54556178,  0.52875258,  0.52816278])