# 检查一对股票是否具有协整关系

## 导入

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import quiz_tests

In [None]:
# Set plotting options
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Default figure size (width, height) for figures that do not pass their own figsize
plt.rc('figure', figsize=(16, 9))

In [None]:
# Seed NumPy's global random number generator so the simulated data is reproducible
np.random.seed(2018)
# Build a synthetic price series: draw 1000 standard-normal increments and
# take their cumulative sum (a random walk).
# NOTE: despite its name, `drift` is a constant added to every point of the
# walk, so it sets the overall price level rather than a per-step trend.
drift = 100
r1 = np.random.normal(0, 1, 1000) 
s1 = pd.Series(np.cumsum(r1), name='s1') + drift
s1.plot(figsize=(14,6))
plt.show()

In [None]:
# Construct a second series that tracks s1: shift it up by a constant offset
# and add a fresh draw of standard-normal noise, so s2 - s1 fluctuates around `offset`.
offset = 10
noise = np.random.normal(0, 1, 1000)
s2 = s1 + offset + noise
# Give the new series its own name so the two columns/legend entries are distinguishable
s2.name = 's2'
# Plot both series on the same axes
pd.concat([s1, s2], axis=1).plot(figsize=(15,6))
plt.show()

In [None]:
# Element-wise ratio of the two price series
price_ratio = s2/s1
price_ratio.plot(figsize=(15,7)) 
# Horizontal reference line at the mean of the ratio
plt.axhline(price_ratio.mean(), color='black') 
plt.xlabel('Days')
plt.legend(['s2/s1 price ratio', 'average price ratio'])
plt.show()
# Report the mean ratio, formatted to 4 decimal places
print(f"average price ratio {price_ratio.mean():.4f}")

## 通过回归计算对冲比率

## 线性回归

注意，LinearRegression().fit() 要求输入是二维数组。因为 s1 和 s2 是 pandas 序列，所以我们可以使用 Series.values 将其中的值转换为 numpy 数组。因为这些数组是一维的，我们可以调用数组的 reshape(-1, 1) 方法，将它们变成 1000 行 x 1 列的二维数组（其中 -1 表示让 numpy 根据数组长度自动推断行数）。

In [None]:
# Inspect the container type of s1 (it was created with pd.Series)
type(s1)

In [None]:
# Inspect the type returned by Series.values (the underlying array of data)
type(s1.values)

In [None]:
# reshape(-1, 1) produces a single-column 2-D array; -1 lets NumPy infer the row count
s1.values.reshape(-1,1).shape

In [None]:
# Least-squares fit of s2 on s1: s1 is the regressor, s2 is the target.
# Both are passed as single-column 2-D arrays.
lr = LinearRegression()
lr.fit(s1.values.reshape(-1,1),s2.values.reshape(-1,1))

In [None]:
# The fitted slope is used as the hedge ratio. coef_ is indexed twice because
# the target was passed as a 2-D column, which makes coef_ two-dimensional.
hedge_ratio = lr.coef_[0][0]
hedge_ratio

In [None]:
# Fitted intercept; intercept_ is a length-1 array here because the target was 2-D
intercept = lr.intercept_[0]
intercept

In [None]:
# Report both regression parameters, formatted to 4 decimal places
print(f"hedge ratio from regression is {hedge_ratio:.4f}, intercept is {intercept:.4f}")

 ## 问题
 在计算价差时，你认为我们需要截距吗？为何呢？

## 计算价差

In [None]:
# Spread of the hedged pair: s2 minus hedge_ratio units of s1.
# The regression intercept is intentionally not subtracted here.
spread = s2 - s1 * hedge_ratio

In [None]:
# Report the mean level of the spread (unformatted, full float precision)
print(f"Average spread is {spread.mean()}")

In [None]:
# Plot the spread over time with a horizontal reference line at its mean
spread.plot(figsize=(15,7)) 
plt.axhline(spread.mean(), color='black') 
plt.xlabel('Days')
plt.legend(['Spread: s2 - hedge_ratio * s1', 'average spread'])
plt.show()

## 如果包含截距，会获得什么结果

In [None]:
# Subtract the full regression prediction (hedge_ratio * s1 + intercept) from s2,
# i.e. the regression residual. It differs from `spread` only by the constant intercept.
spread_with_intercept = s2 - (s1 * hedge_ratio + intercept)
print(f"Average spread with intercept included is {spread_with_intercept.mean()}")

In [None]:
# Plot the intercept-adjusted spread with a horizontal reference line at its mean
spread_with_intercept.plot(figsize=(15,7)) 
plt.axhline(spread_with_intercept.mean(), color='black') 
plt.xlabel('Days')
plt.legend(['Spread: s2 - (hedge_ratio * s1 + intercept)', 'average spread'])
plt.show()

## 小测验
### 使用 Augmented Dickey-Fuller 检验检查价差是否平稳

[adfuller](https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html) 函数属于 statsmodels 库。

```
adfuller(x, maxlag=None, regression='c', autolag='AIC', store=False, regresults=False)

adf (float) – Test statistic
pvalue (float) – p-value
...
```

In [None]:
def is_spread_stationary(spread, p_level=0.05):
    """
    Decide whether a spread is stationary using the Augmented Dickey-Fuller test.

    The null hypothesis of the test is that the series is non-stationary
    (has a unit root); a p-value at or below p_level rejects it.

    spread: obtained from linear combination of two series with a hedge ratio

    p_level: level of significance required to reject null hypothesis of non-stationarity

    returns:
        True if spread can be considered stationary
        False otherwise

    Side effect: prints the p-value and the resulting decision.
    """
    # adfuller returns a tuple; element 0 is the test statistic and
    # element 1 is the p-value
    adf_result = adfuller(spread)

    # get the p-value
    pvalue = adf_result[1]

    print(f"pvalue {pvalue:.4f}")
    if pvalue <= p_level:
        print(f"pvalue is <= {p_level}, assume spread is stationary")
        return True
    else:
        print(f"pvalue is > {p_level}, assume spread is not stationary")
        return False
    
# Hand the function to the course-provided checker in the quiz_tests module
quiz_tests.test_is_spread_stationary(is_spread_stationary)

In [None]:
# Try out your function on the spread computed without the intercept.
# is_spread_stationary prints its own diagnostics before this summary line is printed.
print(f"Are the two series candidates for pairs trading? {is_spread_stationary(spread)}")

如果你遇到问题，请在[此处](pairs_candidates_solution.ipynb)查看解答。