In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd

import devkit.api as dk
import finkit.api as fk
import DataAPI
import Factor.api as factor

Populating the interactive namespace from numpy and matplotlib


# 获取特征列表

In [2]:
feats = list(dk.json2dict(r"E:\07_data\02_factor\index.json").keys())

# 获取月末交易日列表

In [3]:
start = "2005-01-01"
end = "2018-03-01"
trading_days = fk.get_monthly_last_trading_days(start=start, end=end)
#trading_days = list(set(fk.get_trading_days(start=start, end=end)).union(set(fk.get_report_days(start=start, end=end))))
trading_days = sorted(trading_days)

# 获取hs300成分股

In [4]:
hs300=DataAPI.read.get_index_contents(index_code='000300.SH', date='2018-01-01', approx=True)

# 获取指标

In [None]:
df_feats = factor.get_secs_multiple_indexs(indexs=feats, sec_ids=hs300, trading_days=trading_days)

In [13]:
df_feats.to_csv(r"E:\07_data\02_factor\temp_data\featurs.csv")

In [10]:
df_feats = pd.read_csv(r"E:\07_data\02_factor\temp_data\featurs.csv", index_col=0)

In [12]:
df_feats['yearmonth'] = df_feats['date'].apply(lambda x: dk.date2char(dk.char2datetime(x) +  dk.timedelta({'months': 1}))[:7])

# 获取下个月收益率

In [14]:
# 取月初月末的交易日列表
tds = pd.read_csv(r"E:\07_data\02_factor\temp_data\trading_days.csv")
tds['group'] = tds['date'].apply(lambda x: x[:7])
tds = tds.sort_values(['date'])
first = tds.groupby('group').head(1)
last = tds.groupby('group').tail(1)
tds=first.append(last)
tds = tds.sort_values('date')

In [21]:
close = factor.get_secs_index(index="close", trading_days=tds.date.tolist(),sec_ids=hs300)
close = close.sort_values(['sec_id', 'date'])
close['yearmonth'] = close['date'].apply(lambda x: x[:7])

In [24]:
def cal_ret(x):
    close_id = x.columns.tolist().index('close')
    return x.iloc[-1, close_id]/x.iloc[0, close_id] - 1
df_target=close.groupby(['sec_id', 'yearmonth'], as_index=False).apply(lambda x: cal_ret(x))
df_target=df_target.reset_index().rename(columns={0: 'month_ret'})

# 合并特征和目标变量

In [384]:
df_all=df_feats.merge(df_target, how='inner', on=['sec_id', 'yearmonth'])
yearmonth = sorted(list(set(df_all.yearmonth)))
group_id = pd.DataFrame(yearmonth, columns=['yearmonth'], index=range(1, len(yearmonth)+1))
group_id = group_id.sort_values(['yearmonth'])
group_id = group_id.reset_index().rename(columns={'index': 'group_id'})
df_all = df_all.merge(group_id, how='left', on=['yearmonth'])
# del df_all['date']
# del df_all['sec_id']
# del df_all['yearmonth']
df_all.to_csv(r"E:\07_data\02_factor\temp_data\rawdata.csv")

In [380]:
lp = df_all.month_ret.quantile(0.3)
up = df_all.month_ret.quantile(0.7)

In [393]:
def derive_yn(df, raw_var, target_var, lp, up):
    """
    根据收益率和阈值获得好股票标签 强势股票：1 弱势股票：0
    @df: 数据集
    @raw_var: 基于的数值变量
    @target_var: 要生成的分类变量
    @lp: 上限
    @up: 下限
    """
    df[target_var] = np.nan
    for i in df.index:
        if df.loc[i, raw_var] < lp:
            df.loc[i, target_var] = 0
        elif df.loc[i, raw_var] > up:
            df.loc[i, target_var] = 1
        else:
            df.loc[i, target_var] = np.nan
    df = df.dropna()
    return df

# 选择训练集进行预处理

In [557]:
# 比如当前是第120期  则以120期之前的12期为训练数据 训练出模型后来预测第120期 对训练集和预测集采取相同的数据预处理
train = df_all[df_all.group_id.between(110-5,110)]

In [None]:
def 

## 缺失值处理(删除)

In [558]:
a = train.isnull().sum() / len(train) < 0.1
feats_to_retain = a[a].index
train = train[feats_to_retain].dropna()
train = train.reset_index()
del train['index']
del train['date']
del train['sec_id']
del train['yearmonth']
# del train['group_id']

In [559]:
# 创建本期数据
df_new = train[train.group_id==train.group_id.max()]
df_new = derive_yn(df_new.copy(), 'month_ret', 'good_yn', df_new.month_ret.quantile(0.3), df_new.month_ret.quantile(0.7))
del df_new['group_id']
del df_new['month_ret']
X_new = df_new.iloc[:, :-1]
X_new = (X_new - X_new.mean()) / X_new.std()
y_new = df_new.iloc[:, -1]
del df_new

In [560]:
# 创建训练期数据
df_train = train[train.group_id != train.group_id.max()]
df_train = derive_yn(df_train.copy(), 'month_ret', 'good_yn', df_train.month_ret.quantile(0.3), df_train.month_ret.quantile(0.7))
del df_train['group_id']
del df_train['month_ret']
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]
del df_train

## iForest剔除极端样本

In [561]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest()
clf.fit(X_train)##　训练出一个iForest，iForest为无监督的方法，但是也不能直接对无标记样本集预测，可以先fit无标记样本集，然后在predict
y_pred = clf.predict(X_train) # 1：正常值 -1：异常样本
X_train['is_outlier'] = y_pred
train = X_train[X_train.is_outlier==1]
del X_train['is_outlier'] 

## 统计标准化 

In [562]:
X_train = (X_train - X_train.mean()) / X_train.std()

## PCA

In [563]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train)
n_components = (pca.explained_variance_ratio_.cumsum() < 0.95).sum()
print("前{}个主成分可以解释{}的变差".format(n_components, pca.explained_variance_ratio_[:n_components].sum()))
X_train_pca = pd.DataFrame(pca.transform(X_train)[:, :n_components], index=X_train.index)
X_new_pca = pd.DataFrame(pca.transform(X_new)[:, :n_components], index=X_new.index)

前15个主成分可以解释0.9396821966293483的变差


## 将处理好的特征与目标变量合并

In [564]:
train_final = X_train_pca.merge(y_train.to_frame(), how='inner', left_index=True, right_index=True).reset_index()
del train_final['index']
X_train = train_final.iloc[:, :-1]
y_train = train_final.iloc[:, -1]

In [566]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [567]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [565]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-2.924867,-0.040118,-0.054731,0.425846,-1.097040,-0.631608,0.398960,0.367248,-0.655866,0.660493,0.671406,-0.630352,-0.059406,-0.350379,-0.509723
1,2.655384,-0.175161,1.274893,0.831088,0.527545,-0.958315,-0.576778,0.172297,-0.566281,2.346605,-2.650316,0.795267,-1.229285,-0.173816,1.199574
2,-1.651215,1.964257,0.146416,-1.245868,0.458792,1.036816,0.064755,-0.174673,-0.120441,0.262308,-0.462446,0.016420,-0.246662,0.135707,0.550636
3,-1.652126,1.109663,0.078220,-0.743913,-0.022934,0.484250,0.122561,0.093382,-0.395708,0.252143,0.174854,-0.265686,-0.046305,-0.300414,-0.153213
4,-1.464844,1.314947,1.329783,-0.899186,-0.676848,0.155005,-0.523612,0.197130,-0.493499,0.896992,0.350630,-0.232930,0.292264,-0.693870,-0.067581
5,-2.387962,0.764457,0.537941,-0.146600,-1.228650,-0.717906,-0.076716,-0.065811,0.051198,0.634026,0.391852,-0.095789,0.269853,0.957003,0.439826
6,-0.787076,1.356490,1.851643,-1.443969,-0.797969,0.395121,-0.747017,-0.125879,0.015222,0.749042,0.123806,-0.466283,0.495774,0.017049,0.083892
7,-2.040981,0.422643,0.194234,-0.005110,-0.467335,-0.090259,0.450785,0.281442,-0.795167,1.344139,0.074177,-0.314281,-0.642296,-0.523904,0.059825
8,-0.373951,1.402669,-1.489537,0.462333,-1.214812,0.145444,-1.889737,-0.058410,0.500168,0.246589,-0.310584,-0.151290,-0.022708,-0.377675,-0.162573
9,2.405756,4.740882,-2.836641,-0.916741,-1.010506,3.684480,-0.298537,-0.221977,-1.312955,-0.982331,-3.583015,-0.492330,-0.354205,-0.680569,0.071676


In [568]:
svc.score(X_train, y_train)

0.88324175824175821

In [570]:
svc.score(X_new_pca, y_new)

0.55333333333333334

In [504]:
svc.score(X_train_pca, y_train)

0.78719008264462809

In [505]:
svc.score(X_new_pca, y_new)

0.30666666666666664

In [463]:
X_new_pca = (X_new_pca - X_new_pca.mean()) / X_new_pca.std()

In [507]:
y_new.mean()

0.5