# 分组变换和分析

In [None]:
# statsmodels api api
import pandas as pd
import numpy as np
import random
import string
import statsmodels.api as sm
import pandas_datareader.data as web

In [None]:
'''
随机生成1000个股票代码
python random r.seed()
python string s.ascii_uppercase
python random r.choice()
python str s.join()
numpy Array creation routines np.array()
''' 
# Seed Python's stdlib `random` module so the generated ticker strings are
# reproducible.  This does not seed `np.random`, which the later cells use.
random.seed(0)
# Number of ticker symbols to generate.
N=1000
def rands(n):
    """Return a string of `n` random uppercase ASCII letters.

    One `random.choice` call is made per character, so the result is
    reproducible after `random.seed(...)`.
    """
    alphabet = string.ascii_uppercase
    letters = []
    for _ in range(n):
        letters.append(random.choice(alphabet))
    return ''.join(letters)
# N random 5-letter ticker symbols, stored as a NumPy array so they can be
# sliced and fancy-indexed later.
tickers = np.array([rands(5) for _ in range(N)])

'''
创建一个含有3列的DataFrame来承载这些假想数据，不过只选择部分股票组成该投资组合
numpy indexing Other indexing options
numpy Broadcasting 标量
'''
# Only the first M tickers are given factor data.
M=500
# Three columns of standard-normal draws, scaled by 1/200 and shifted by a
# per-column constant (0.03, 0.08, -0.02); rows are labelled by ticker.
df = pd.DataFrame({'Momentum':np.random.randn(M)/200 +0.03,
                  'Value':np.random.randn(M)/200 +0.08,
                  'ShortInterest':np.random.randn(M)/200 -0.02},
                  index = tickers[:M])
'''
为这些股票随机创建一个行业分类，为了简单起见，只选用两个行业，并将映射关系保存在Series中
numpy Random sampling (numpy.random) np.random.randint() low/high 
pandas series pd.Series() name
numpy Indexing 整数索引
'''
# The two industry labels to choose from.
ind_names = np.array(['FINANCIAL','TECH'])
# One random integer position into ind_names per ticker (high is exclusive).
sampler = np.random.randint(low=0,high=len(ind_names),size=N)
# Ticker -> industry mapping for all N tickers (df only holds the first M).
industries = pd.Series(ind_names[sampler],index=tickers,
                      name='industry')
# Group df's rows by industry so grouped aggregations and transforms can follow.
by_industry = df.groupby(industries)
# Per-industry mean of every column.
by_industry.mean()

In [None]:
# pandas dataframe d.describe()
# Summary statistics of every column within each industry group.
by_industry.describe()

In [None]:
'''
行业内标准化处理，广泛用于股票资产投资组合的构建过程 
pandas dataframe d.sub()  
pandas dataframe d.mean()
pandas dataframe d.div()
pandas dataframe d.std()
pandas GroupBy g.apply() [func]
'''
def zscore(group):
    """Standardize `group` by subtracting its mean and dividing by its std.

    Works on a Series (scalar mean/std) or a DataFrame (per-column mean/std).
    """
    centered = group - group.mean()
    spread = group.std()
    return centered / spread

# Standardize every column within its industry.  transform() is used instead
# of apply() because it always returns a frame indexed like df (by ticker);
# on pandas >= 2.0, apply() prepends the group key to the index, which would
# break the ticker-aligned groupby on the next line.
df_stand = by_industry.transform(zscore)
# Check: the per-industry mean and std of the standardized columns.
df_stand.groupby(industries).agg(['mean','std'])

In [None]:
'''
行业内降序排名
内置变换函数的用法会更简洁一些
pandas GroupBy g.rank() 
pandas dataframe d.min()
pandas dataframe d.max()
'''
# Rank each column within its industry in descending order (largest value -> rank 1).
ind_rank = by_industry.rank(ascending=False)
# Smallest and largest rank per industry and column.
ind_rank.groupby(industries).agg(['min','max'])

In [None]:
# Rank within each industry, then standardize the ranks.  transform() keeps
# the result indexed by ticker on every pandas version; apply() on
# pandas >= 2.0 would prepend the industry key to the index.
rank_industry = by_industry.transform(lambda x: zscore(x.rank()))
rank_industry.info()

## 分组因子暴露
因子分析是投资组合定量管理中的一种技术  
投资组合的持仓和业绩（盈亏）可以分解到一个或多个因子上，每个因子都以一组投资组合权重来表示  
numpy Random sampling (numpy.random) np.random.rand()  
numpy Random sampling (numpy.random) np.random.permutation()  
numpy Broadcasting 同型矩阵  
numpy Indexing routines np.take()

In [None]:
# A (3, 1000) array of uniform [0, 1) draws, unpacked into one row per factor.
fac1,fac2,fac3=np.random.rand(3,1000)
# Shuffle the tickers; with N = 1000 the [:1000] slice keeps all of them.
ticker_subset = tickers.take(np.random.permutation(N)[:1000])

# Weighted sum of the factors plus noise.  The noise comes from
# np.random.rand, i.e. uniform on [0, 1), so it is not zero-mean.
port = pd.Series(0.7*fac1 -1.2*fac2+0.3*fac3+np.random.rand(1000),index=ticker_subset)
factors = pd.DataFrame({'f1':fac1,'f2':fac2,'f3':fac3},
                      index=ticker_subset)
# Correlation of each factor column with the portfolio series.
factors.corrwith(port)

In [None]:
'''
由于没有给投资组合添加过多的随机噪音，所以原始的因子权重基本上可算是恢复出来了
statsmodels OLS sm.OLS()  
statsmodels OLS OLS.fit()  
statsmodels RegressionResults RegressionResults.params
'''
# Regress the portfolio on the three factors.  exog is passed as-is: no
# constant column is added here, so the model has no intercept term.
model=sm.OLS(endog=port,exog=factors)
results=model.fit()
# Estimated coefficient for each factor.
results.params

In [None]:
'''
通过groupby计算各行业的暴露量
statsmodels ERROR ValueError: The indices for endog and exog are not aligned
pandas series s.groupby() 
pandas GroupBy g.apply() func,param 
pandas dataframe d.reindex()
'''
def beta_exposure(chunk,factors=None):
    """Return the OLS coefficients of `chunk` regressed on its factor rows.

    `factors` is reindexed to `chunk.index` first, so the regressors line up
    with the observations of this group.  `factors` must be supplied; the
    `None` default only makes it a keyword that groupby.apply can forward.
    """
    aligned = factors.reindex(labels=chunk.index)
    fitted = sm.OLS(endog=chunk, exog=aligned).fit()
    return fitted.params

# Group the portfolio values by industry.
by_ind = port.groupby(industries)
# Run the per-group regression; `factors` is forwarded to beta_exposure as a keyword.
exposures = by_ind.apply(beta_exposure,factors=factors)
# Pivot the inner index level into columns: one row per industry.
exposures.unstack()

## 十分位和四分位分析
pandas pandas_datareader.data pdr.data.get_data_yahoo()

In [None]:
# Download SPY price history for 2006-01-01 through 2012-07-27 through
# pandas_datareader's Yahoo reader (requires network access).
# NOTE(review): the Yahoo endpoint may be unreliable in recent
# pandas_datareader releases -- confirm this call still works.
data=web.get_data_yahoo('SPY','2006-01-01','2012-07-27')
data.info()

In [None]:
# pandas series s.pct_change()
# Period-over-period percentage change of the adjusted close; the first
# value is NaN because it has no predecessor.
px = data['Adj Close']
returns = px.pct_change()
returns.head()

In [None]:
'''
计算日收益率，并编写一个用于将收益率变换为趋势信号（通过滞后移动形成）的函数
pandas series s.rolling() min_periods
pandas Window Rolling.sum()
pandas series s.shift()  
pandas Resampling r.mean()
pandas Resampling r.ffill()
pandas series s.mul()
pandas series s.add()
pandas series s.cumprod()
pandas series s.notna()
pandas series s.idxmax()
pandas Date offsets tseries.offsets.Day()  
pandas Index slice[]
pandas Indexing and Selecting Data series[label]
pandas series s.plot.line()
''' 
def to_index(rets):
    """Turn a series of periodic returns into a cumulative index starting at 1.

    The index is the running product of ``1 + rets``.  The observation just
    before the first non-missing value is set to 1 so the curve has a base
    point.  If the first non-missing value is the first observation (or the
    series is all-NaN), the first observation itself is set to 1.

    Parameters
    ----------
    rets : pandas.Series
        Periodic simple returns; may start with NaN values.

    Returns
    -------
    pandas.Series
        Cumulative index with the same labels as ``rets``.
    """
    index = rets.add(1).cumprod()
    if index.empty:
        # Nothing to anchor; argmax on an empty array would raise.
        return index
    # Work by position, not by label: subtracting a calendar day from the
    # first valid label can give a date that is not in the index (weekend or
    # holiday), and assigning to it would append a stray row.
    first_valid_pos = int(index.notna().to_numpy().argmax())
    first_loc = max(first_valid_pos - 1, 0)
    index.iloc[first_loc] = 1
    return index

def trend_signal(rets,lookback,lag,slack=5):
    """Build a lagged momentum signal from a return series.

    The signal is the rolling sum of ``rets`` over ``lookback`` periods,
    shifted forward by ``lag`` periods.

    Parameters
    ----------
    rets : pandas.Series
        Periodic returns.
    lookback : int
        Rolling window length.
    lag : int
        Number of periods to shift the signal by.
    slack : int, default 5
        How many observations may be missing from a window before its sum
        becomes NaN; ``min_periods`` is ``lookback - slack``.

    Returns
    -------
    pandas.Series
        The lagged rolling-sum signal, indexed like ``rets``.
    """
    # Clamp at 0: pandas rejects a negative min_periods, which the fixed
    # ``lookback - 5`` produced for any lookback below 5.
    min_periods = max(lookback - slack, 0)
    signal = rets.rolling(lookback, min_periods=min_periods).sum()
    return signal.shift(lag)

# With these helpers, build and test a strategy that trades on the momentum
# signal each Friday.
# 100-period rolling-sum signal, lagged by 3 periods.
signal=trend_signal(returns,100,3)
# Average the signal over each week ending Friday, then forward-fill that
# weekly value onto business days.
trade_friday = signal.resample(rule='W-FRI').mean().resample(rule='B').ffill()
# Scale each day's return by the previous business day's signal value.
trade_rets=trade_friday.shift(1).mul(returns)
# Plot the cumulative index of the strategy returns.
to_index(trade_rets).plot.line()

# 更多示例应用