In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [None]:
# To add a column holding each group's mean to a DataFrame, one approach is
# to aggregate first and then merge the aggregate back in.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)

df

In [None]:
# key2 holds strings, so restrict the mean to the numeric columns: pandas >= 2.0
# raises TypeError here instead of silently dropping non-numeric columns.
df.groupby('key1').mean(numeric_only=True)

In [None]:
# pandas dataframe d.add_prefix()
# numeric_only=True skips the string column key2, which would otherwise make
# mean() raise TypeError on pandas >= 2.0.
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
k1_means

In [None]:
# pandas dataframe d.merge()
# Match each row's key1 value against the index of the per-group means.
df.merge(k1_means, left_on='key1', right_index=True)

In [None]:
# The aggregate-then-merge approach is not very flexible, so this time we use
# the transform method instead.
key = ['one', 'two', 'one', 'two', 'one']
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)
people.groupby(by=key).mean()

In [None]:
# pandas GroupBy g.transform()
# Use the 'mean' alias rather than np.mean: passing the NumPy callable is
# deprecated in recent pandas (FutureWarning), while the alias gives the same
# group-wise mean broadcast back to the original shape.
people.groupby(key).transform('mean')

In [None]:
def demean(arr):
    """Return ``arr`` with its own mean subtracted."""
    center = arr.mean()
    return arr - center

# Apply demean within each group defined by `key`.
demeaned = people.groupby(by=key).transform(demean)
demeaned

In [None]:
# Sanity check: group means of the demeaned data (expected to be ~0 up to
# floating-point error).
demeaned.groupby(by=key).mean()

# apply: 一般性的“拆分-应用-合并”

In [None]:
# Load the tips data and add the tip as a fraction of the total bill.
tips = pd.read_csv('../pydata-book-master/ch08/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']

# pandas dataframe d.sort_values()
# pandas dataframe d.tail()
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep.
    column : label, default 'tip_pct'
        Column to sort by.

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted ascending by ``column``.
    """
    # tail(n) rather than iloc[-n:]: with n == 0 the slice -0: is 0: and would
    # return every row instead of none.
    return df.sort_values(by=column).tail(n)

# The six rows with the highest tip percentage.
top(tips, n=6, column='tip_pct')

In [None]:
# pandas GroupBy g.apply()
# Call top() once per smoker group and combine the pieces.
tips.groupby(by='smoker').apply(top)

In [None]:
# pandas GroupBy g.apply(top, n=1, column='total_bill')
# Extra keyword arguments given to apply() are forwarded to top().
tips.groupby(by=['smoker', 'day']).apply(top, n=1, column='total_bill')

In [None]:
# pandas GroupBy g.describe()
# Summary statistics of tip_pct for each smoker group.
result = tips.groupby(by='smoker')['tip_pct'].describe()
result

In [None]:
# pandas dataframe d.transpose()
result.transpose()

## 禁止分组键

In [None]:
# Default behaviour, shown for comparison with group_keys=False.
tips.groupby(by='smoker').apply(top)

In [None]:
# pandas dataframe d.groupby()
# group_keys=False keeps the group labels out of the result index.
tips.groupby(by='smoker', group_keys=False).apply(top)

# 分位数和桶分析

In [None]:
# pandas General functions pd.cut()
# pandas Pandas arrays Categorical[]
frame = pd.DataFrame(
    {
        'data1': np.random.randn(1000),
        'data2': np.random.randn(1000),
    }
)
frame.head(10)

In [None]:
# Cut data1 into four equal-width bins.
factor = pd.cut(x=frame['data1'], bins=4)
factor.head(10)

In [None]:
# pandas GroupBy g.max()
# pandas GroupBy g.min()
# pandas GroupBy g.count()
# pandas series s.unstack()
def get_stats(group):
    """Return a dict holding the min, max, count and mean of ``group``."""
    # Each statistic name is also the name of the method that computes it.
    return {stat: getattr(group, stat)() for stat in ('min', 'max', 'count', 'mean')}
# factor is categorical, so state `observed` explicitly: relying on the
# implicit default is deprecated in recent pandas, and observed=False keeps
# a row for every bin even if a bin happens to be empty.
grouped = frame.loc[:, 'data2'].groupby(factor, observed=False)
# get_stats returns a dict per group; unstack() spreads the statistic names
# into columns.
grouped.apply(get_stats).unstack()

In [None]:
# pandas General functions pd.qcut()
# Ten equal-size buckets; labels=False yields integer bucket numbers.
grouping = pd.qcut(x=frame['data1'], q=10, labels=False)
grouped = frame['data2'].groupby(by=grouping)
grouped.apply(get_stats).unstack()

# 示例：用特定于分组的值填充缺失值
pandas series s.iloc[]  
numpy Constants np.nan

In [None]:
# Six random values with every other one (positions 0, 2, 4) set to NaN.
s = pd.Series(data=np.random.randn(6))
s.iloc[0::2] = np.nan
s

In [None]:
# pandas series s.fillna()
# pandas series s.mean()
s.fillna(value=s.mean())

In [None]:
# Eight states: the first four are labelled East and the last four West.
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
group_key = ['East'] * 4 + ['West'] * 4

data = pd.Series(data=np.random.randn(8), index=states)
# Blank out a few entries so there is something to fill in.
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

In [None]:
# Mean of `data` within each group_key label.
data.groupby(by=group_key).mean()

In [None]:
# python More on Defining Functions def f(a, b): return a + b
def fill_mean(g):
    """Return ``g`` with its missing values replaced by the mean of ``g``."""
    group_mean = g.mean()
    return g.fillna(group_mean)
# Fill each group's missing values with that group's own mean.
data.groupby(by=group_key).apply(fill_mean)

In [None]:
# pandas series s.name
# Fill value to use for each group label.
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill missing values in ``g`` with the ``fill_values`` entry for ``g.name``."""
    replacement = fill_values[g.name]
    return g.fillna(replacement)
# Fill each group's missing values with the value chosen for that group.
data.groupby(by=group_key).apply(fill_func)

# 示例：随机采样和排列
python Built-in Function list()  
python Built-in Function range()  
python Sequence Types — list, tuple, range s.extend()

In [None]:
# Hearts H, Spades S, Clubs C, Diamonds D: build a standard 52-card deck.
suits = ['H', 'S', 'C', 'D']
# Values for one suit (ace = 1, 2-10 at face value, three face cards = 10),
# repeated once per suit.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
# Iterate over `suits` itself rather than a second copy of the literal, so the
# card names and card_val cannot drift apart.
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
deck = pd.Series(card_val, index=cards)
deck.iloc[:13]

In [None]:
# python Built-in Function len()
# numpy Random sampling (numpy.random) np.random.permutation()
# pandas series s.take()
def draw(deck, n=5):
    """Return ``n`` entries of ``deck`` picked via a random permutation of positions."""
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(indices=chosen)

# Draw a hand of five cards (the default n).
draw(deck, n=5)

In [None]:
# Draw two random cards from each suit; the suit is the last character of the
# card name.
def get_suit(card):
    """Return the last character of ``card``."""
    return card[-1]  # keep only the final letter
# Group cards by suit and draw two from each group.
deck.groupby(by=get_suit).apply(draw, n=2)

In [None]:
# pandas GroupBy g.apply()
# Same draw, but group_keys=False leaves the suit labels out of the result index.
deck.groupby(by=get_suit, group_keys=False).apply(draw, n=2)

# 示例：分组加权平均数和相关系数

In [None]:
# Two categories of four rows each, with random data and random weights.
df = pd.DataFrame(
    {
        'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
        'data': np.random.randn(8),
        'weights': np.random.rand(8),
    }
)
df

In [None]:
# numpy statistics np.average()
grouped = df.groupby(by='category')


def get_wavg(g):
    """Return the average of ``g['data']`` weighted by ``g['weights']``."""
    return np.average(a=g['data'], weights=g['weights'])


grouped.apply(get_wavg)

In [None]:
# pandas dataframe d.info()
# The first CSV column becomes the index and is parsed as dates.
close_px = pd.read_csv(
    '../pydata-book-master/ch09/stock_px.csv',
    index_col=0,
    parse_dates=True,
)
close_px.info()

In [None]:
# pandas dataframe d.tail()
# Last five rows (the default n).
close_px.tail(n=5)

In [None]:
# Compute a DataFrame of the yearly correlations between daily returns and SPX.
# pandas dataframe d.pct_change()
# pandas dataframe d.dropna()
# pandas Index DatetimeIndex.year
# pandas dataframe d.corrwith()
rets = close_px.pct_change().dropna()


def spx_corr(x):
    """Correlate every column of ``x`` with its 'SPX' column."""
    return x.corrwith(x['SPX'])


# Group rows by the year of their index label.
by_year = rets.groupby(lambda stamp: stamp.year)
by_year.apply(spx_corr)

In [None]:
# pandas series s.corr()
# Yearly correlation between the AAPL and MSFT columns.
by_year.apply(lambda yearly: yearly['AAPL'].corr(yearly['MSFT']))

# 示例：面向分组的线性回归
statsmodels OLS sm.OLS()  
statsmodels OLS OLS.fit()  
statsmodels RegressionResults RegressionResults.params

In [None]:
def regress(data, yvar, xvars):
    """Fit an OLS regression of ``yvar`` on ``xvars`` plus a constant term.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the dependent and explanatory columns.
    yvar : label
        Column used as the dependent variable.
    xvars : label or list of labels
        Column(s) used as explanatory variables. A single string is treated
        as a one-element list.

    Returns
    -------
    The ``params`` attribute of the fitted statsmodels results object.
    """
    # A bare string would select a Series, which cannot take an extra column.
    if isinstance(xvars, str):
        xvars = [xvars]
    Y = data.loc[:, yvar]
    # assign() returns a new frame, so the intercept column is never written
    # in place into something derived from the caller's data.
    X = data.loc[:, xvars].assign(intercept=1.)
    result = sm.OLS(endog=Y, exog=X).fit()
    return result.params

# Regress AAPL on SPX separately for each year.
by_year.apply(regress, yvar='AAPL', xvars=['SPX'])