# 分组变换和分析

In [None]:
# statsmodels api api
import pandas as pd
import numpy as np
import random
import string
import statsmodels.api as sm

In [None]:
# Generate 1000 random ticker symbols.
# API notes:
#   random.seed()            (python random)
#   string.ascii_uppercase   (python string)
#   random.choice()          (python random)
#   str.join()               (python str)
#   np.array()               (numpy array creation routines)
#
# Seed BOTH generators. The tickers are drawn with the stdlib `random`
# module, but the later np.random.* draws use NumPy's own global generator,
# which random.seed() does not touch; without the second call the notebook
# output changes on every run.
random.seed(0)
np.random.seed(0)
N = 1000
def rands(n):
    """Return a random string of ``n`` uppercase ASCII letters.

    Letters are drawn one at a time with ``random.choice``, so the result
    depends on the current state of the stdlib ``random`` generator.
    """
    alphabet = string.ascii_uppercase
    letters = []
    for _ in range(n):
        letters.append(random.choice(alphabet))
    return ''.join(letters)
# Draw N random five-letter ticker symbols.
tickers = np.array([rands(5) for _ in range(N)])

# Build a three-column DataFrame to hold the made-up data; only the first M
# tickers are selected to form the portfolio.
# API notes: numpy indexing (other indexing options), numpy broadcasting
# with a scalar.
M = 500
df = pd.DataFrame(
    {
        'Momentum': 0.03 + np.random.randn(M) / 200,
        'Value': 0.08 + np.random.randn(M) / 200,
        'ShortInterest': -0.02 + np.random.randn(M) / 200,
    },
    index=tickers[:M]
)
# Assign every ticker a random industry. To keep things simple only two
# industries are used, and the ticker -> industry mapping is stored in a
# Series.
# API notes: np.random.randint() low/high, pd.Series() name, numpy integer
# (array) indexing.
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), size=N)
industries = pd.Series(ind_names.take(sampler), index=tickers, name='industry')

# With the mapping in place, group by industry and run group-wise
# aggregations and transformations.
by_industry = df.groupby(industries)
by_industry.mean()

In [None]:
# Descriptive statistics of each column, computed separately per industry
# group (pandas API note: d.describe()).
by_industry.describe()

In [None]:
# Within-industry standardization, widely used when constructing equity
# portfolios.
# API notes: DataFrame.sub(), DataFrame.mean(), DataFrame.div(),
# DataFrame.std(), GroupBy.apply(func).
def zscore(group):
    """Standardize ``group``: subtract its mean, then divide by its std.

    ``group`` is any pandas object supporting ``mean()``/``std()`` and
    arithmetic (a Series, or a DataFrame standardized column by column).
    """
    centered = group - group.mean()
    return centered / group.std()

# Standardize every column within its industry. transform() is used rather
# than apply() because the result must keep the original ticker index:
# on pandas >= 2.0 apply() prepends the group key to the index of a
# like-indexed result, and the groupby(industries) below would then no
# longer line up with the ticker-indexed `industries` Series.
df_stand = by_industry.transform(zscore)
# Sanity check: within each industry the mean should be ~0 and the std ~1.
df_stand.groupby(industries).agg(['mean', 'std'])

In [None]:
# Rank within each industry in descending order.
# Using the built-in GroupBy transformation is more concise than a custom
# function.
# API notes: GroupBy.rank(), DataFrame.min(), DataFrame.max().
ind_rank = by_industry.rank(ascending=False)
# Per industry, show the smallest and largest rank of every column.
ind_rank.groupby(industries).aggregate(['min', 'max'])

In [None]:
# Rank within each industry, then standardize the ranks.
# transform() keeps the original ticker index; apply() would prepend the
# industry key to the index on pandas >= 2.0.
rank_industry = by_industry.transform(lambda x: zscore(x.rank()))
rank_industry.info()

## 分组因子暴露
因子分析是投资组合定量管理中的一种技术  
投资组合的持仓和业绩（收益与亏损）可以被分解为一个或多个以投资组合权重表示的因子  
numpy Random sampling (numpy.random) np.random.rand()  
numpy Random sampling (numpy.random) np.random.permutation()  
numpy Broadcasting 同型矩阵  
numpy Indexing routines np.take()

In [None]:
# Three random factors, one value per ticker. Every size below is tied to N
# (instead of a repeated literal 1000) so the factor arrays, the noise and
# the ticker index always have matching lengths.
fac1, fac2, fac3 = np.random.rand(3, N)
# All N tickers in shuffled order.
ticker_subset = tickers.take(np.random.permutation(N))

# Portfolio values: a weighted sum of the factors plus noise.
port = pd.Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(N),
                 index=ticker_subset)
factors = pd.DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                       index=ticker_subset)
# Correlation of each factor with the portfolio.
factors.corrwith(port)

In [None]:
# API notes: sm.OLS(), OLS.fit(), RegressionResults.params, sm.add_constant()
#
# Regress the portfolio on the factors to recover the factor exposures.
# sm.OLS does not add an intercept by itself; the noise term in `port` is
# drawn with np.random.rand (mean 0.5, not 0), so a constant column is added
# to absorb it; otherwise the factor coefficients are biased.
model = sm.OLS(endog=port, exog=sm.add_constant(factors))
results = model.fit()
# Estimated coefficients: the 'const' intercept followed by f1, f2, f3.
results.params

# 更多示例应用