# 行业中性

## 安装软件包

In [None]:
import sys

In [None]:
!{sys.executable} -m pip install -r requirements.txt

In [None]:
import cvxpy as cvx
import numpy as np
import pandas as pd
import time
import os
import quiz_helper
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)

## 阅读 zipline 数据包文档

http://www.zipline.io/bundles.html#ingesting-data-from-csv-files

### 数据包

In [None]:
import os
import quiz_helper
from zipline.data import bundles

In [None]:
# Point ZIPLINE_ROOT at the data directory located two levels above the
# current working directory.
os.environ['ZIPLINE_ROOT'] = os.path.join(os.getcwd(), '..', '..','data','module_4_quizzes_eod')
# Build a csvdir ingest function for the 'daily' data and register it under
# the bundle name defined in quiz_helper so it can be loaded by name later.
ingest_func = bundles.csvdir.csvdir_equities(['daily'], quiz_helper.EOD_BUNDLE_NAME)
bundles.register(quiz_helper.EOD_BUNDLE_NAME, ingest_func)
print('Data Registered')

### 构建管道引擎

In [None]:
from zipline.pipeline import Pipeline
from zipline.pipeline.factors import AverageDollarVolume
from zipline.utils.calendars import get_calendar

# Universe filter: the top 500 assets ranked by AverageDollarVolume computed
# with window_length=120.
universe = AverageDollarVolume(window_length=120).top(500) 
# Trading calendar looked up by the name 'NYSE'.
trading_calendar = get_calendar('NYSE') 
# Load the bundle registered under quiz_helper.EOD_BUNDLE_NAME and pass it,
# together with the calendar, to the quiz_helper engine builder.
bundle_data = bundles.load(quiz_helper.EOD_BUNDLE_NAME)
engine = quiz_helper.build_pipeline_engine(bundle_data, trading_calendar)

### 查看数据
构建管道引擎后，我们获取时段结束时股票池中的股票。我们将使用这些 ticker 生成风险模型的收益率数据。

In [None]:
# Last date of the period for which the universe is evaluated.
universe_end_date = pd.Timestamp('2016-01-05', tz='UTC')

# Run a screen-only pipeline for that single date, then pull level 1 of the
# resulting index (presumably the asset level) out as a plain Python list.
universe_tickers = (
    engine
    .run_pipeline(Pipeline(screen=universe), universe_end_date, universe_end_date)
    .index
    .get_level_values(1)
    .values
    .tolist()
)

universe_tickers

# 获取收益率数据

In [None]:
from zipline.data.data_portal import DataPortal

# DataPortal wired to the bundle's asset finder, daily bar reader and
# adjustment reader. No minute-bar reader is supplied
# (equity_minute_reader=None), and the first trading day is taken from the
# daily bar reader.
data_portal = DataPortal(
    bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=bundle_data.equity_daily_bar_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=bundle_data.equity_daily_bar_reader,
    adjustment_reader=bundle_data.adjustment_reader)

## 获取股价数据的辅助函数

In [None]:
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    """Fetch a daily history window of `field` for `assets`.

    Both dates are reduced to their calendar day (time of day is discarded)
    and re-labelled as UTC midnight before being looked up in
    ``trading_calendar.closes.index``.

    Parameters
    ----------
    data_portal : object exposing ``get_history_window``
    trading_calendar : object whose ``closes.index`` contains the sessions
    assets : passed through unchanged to ``get_history_window``
    start_date, end_date : datetime-like objects supporting ``strftime``
    field : str, price field to request (default ``'close'``)

    Returns
    -------
    Whatever ``data_portal.get_history_window`` returns for the request.

    Raises
    ------
    KeyError
        If either normalized date is not present in the calendar index.
    """
    # NOTE: the former `offset='C'` keyword is intentionally dropped. pandas
    # deprecated and then removed that argument, and it only attached
    # frequency metadata to the Timestamp -- it never influenced the index
    # lookup below.
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC')

    sessions = trading_calendar.closes.index
    end_loc = sessions.get_loc(end_dt)
    start_loc = sessions.get_loc(start_dt)

    # The bar count is the number of sessions after start_date up to and
    # including end_date (assuming the window returned ends at end_dt --
    # confirm against the data portal implementation).
    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')

## 将股价数据放入 dataframe 中

In [None]:
# Five years of prices ending on universe_end_date, turned into
# period-over-period returns. The first row is dropped and any remaining
# NaNs are replaced with 0.
returns_df = (
    get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_end_date - pd.DateOffset(years=5),
        universe_end_date,
    )
    .pct_change()[1:]
    .fillna(0)
)

returns_df

## 行业数据辅助函数
我们将为你创建一个对象，它会针对每支股票定义一个行业。行业由整数表示。我们继承了 Classifier 类。[Classifier 文档](https://www.quantopian.com/posts/pipeline-classifiers-are-here)以及 [Classifier 的源代码](https://github.com/quantopian/zipline/blob/master/zipline/pipeline/classifiers/classifier.py)

In [None]:
from zipline.pipeline.classifiers import Classifier
from zipline.utils.numpy_utils import int64_dtype
class Sector(Classifier):
    """Classifier that labels each asset with an integer sector code.

    The codes are read from a NumPy file when the object is constructed and
    are looked up by the ``assets`` index array inside ``_compute``.
    """

    # Classifier configuration: int64 labels, no inputs, zero-length window,
    # and -1 as the label for positions that are masked out.
    missing_value = -1
    inputs = ()
    window_length = 0
    dtype = int64_dtype

    def __init__(self):
        # Sector codes, indexed by the values passed as `assets` to _compute.
        self.data = np.load('../../data/project_4_sector/data.npy')

    def _compute(self, arrays, dates, assets, mask):
        """Return sector codes where `mask` is true and `missing_value` elsewhere."""
        sector_codes = self.data[assets]
        return np.where(mask, sector_codes, self.missing_value)

In [None]:
sector = Sector()
sector

In [None]:
len(sector.data)

In [None]:
sector.data

## 小测验 1
sector 变量中有多少个唯一行业？

## 答案 1


## 根据动量创建一个 alpha 因子

我们想要计算一年收益率。 
也就是用今天的收盘价减去 252 个交易日之前的收盘价，再除以 252 个交易日之前的收盘价。

$1YearReturn_t = \frac{price_{t} - price_{t-252}}{price_{t-252}}$

In [None]:
from zipline.pipeline.factors import Returns

## 我们将使用 2 年的数据计算因子

**注意：**恰好 2 年前的那一天是休市日。管道（pipeline）软件包无法处理开始日期或结束日期为休市日的情况。为了解决这个问题，我们再往前多推 2 天，使开始日期落在一个交易日上。

In [None]:
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
factor_start_date

In [None]:
# One-year returns (window_length=252), restricted to the universe; this raw
# signal can be the basis for an alpha factor.
rets1 = Returns(window_length=252, mask=universe)

# Pipeline screened to the same universe, exposing the factor as a column.
p1 = Pipeline(screen=universe)
p1.add(rets1, "1YearReturns")

# Evaluate the pipeline from factor_start_date through universe_end_date.
df1 = engine.run_pipeline(p1, factor_start_date, universe_end_date)

In [None]:
#graphviz lets us visualize the pipeline
import graphviz

In [None]:
p1.show_graph(format='png')

## 查看因子数据

In [None]:
df1.head()

## 查看 demean 函数

Returns 类继承自 `zipline.pipeline.factors.Factor`，因此可以直接调用 `Factor` 的 `demean` 方法。  
[这是 demean 文档](https://www.zipline.io/appendix.html#zipline.pipeline.factors.Factor.demean)，并且摘抄在下面：

In [None]:
demean(mask=sentinel('NotSpecified'), groupby=sentinel('NotSpecified'))[source]
Construct a Factor that computes self and subtracts the mean from row of the result.

If mask is supplied, ignore values where mask returns False when computing row means, and output NaN anywhere the mask is False.

If groupby is supplied, compute by partitioning each row based on the values produced by groupby, de-meaning the partitioned arrays, and stitching the sub-results back together.

Parameters:	
mask (zipline.pipeline.Filter, optional) – A Filter defining values to ignore when computing means.
groupby (zipline.pipeline.Classifier, optional) – A classifier defining partitions over which to compute means.

## 小测验 2

阅读文档并查看 `demean` 的源代码后，你认为此函数的两个参数分别有什么作用？如果你想按行业去均值，并且希望对所选股票池中的所有股票都去均值，你会使用哪个（或哪些）参数？

[源代码](https://www.zipline.io/_modules/zipline/pipeline/factors/factor.html#Factor.demean)包含一些有用的注释，可以帮助你解答这个问题。

## 答案 2：

## 小测验 3
将一年收益率变成 alpha 因子

我们可以通过一些处理步骤将信号（一年收益率）变成 alpha 因子。其中一个步骤是按行业去均值。

* 去均值
对于每支股票，我们将计算同一行业中股票的平均收益率，然后用每支股票的收益率减去该均值。

## 答案 3：

In [None]:
#TODO
# create a pipeline called p2

# create a factor of one year returns, demean by sector

# add the factor to the p2 pipeline


## 可视化第二个管道

In [None]:
p2.show_graph(format='png')

## 小测验 4
这个管道与我们之前创建的第一个管道相比，有什么不同？

## 答案 4：


## 运行管道并查看因子数据

In [None]:
df2 = engine.run_pipeline(p2, factor_start_date, universe_end_date)

In [None]:
df2.head()

## 解答
[解答 notebook](sector_neutral_solution.ipynb)