In [2]:
import pandas as pd
import numpy as np

In [27]:
# Seed NumPy's global generator so the random columns below (and the later
# cells that also draw from np.random) come out the same on every
# Restart & Run All.
np.random.seed(0)

# Demo frame: two label columns (key1, key2) to group by and two numeric
# columns (data1, data2) to aggregate.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})

In [6]:
# Display the demo frame.
df

Unnamed: 0,data1,data2,key1,key2
0,-1.034465,-0.523656,a,one
1,0.581535,-0.586051,a,two
2,-0.545002,-0.647088,b,one
3,-1.425541,0.360132,b,two
4,-0.093638,0.681281,a,one


In [7]:
# Group the data1 column by the labels in key1; nothing is computed until an
# aggregation is called on the result.
grouped = df['data1'].groupby(df['key1'])

In [8]:
# Echo the object itself: it is a GroupBy object, not a table of results.
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x000001DF486C75F8>

In [9]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a   -0.182189
b   -0.985272
Name: data1, dtype: float64

In [11]:
# Group by two keys at once by passing a list of Series; the result is
# indexed by the (key1, key2) pairs that occur in the data.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()

In [12]:
# Show the two-level result.
means

key1  key2
a     one    -0.564052
      two     0.581535
b     one    -0.545002
      two    -1.425541
Name: data1, dtype: float64

In [13]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.564052,0.581535
b,-0.545002,-1.425541


In [14]:
# Grouping keys held outside the frame: five state labels and five years.
states = np.array('Ohio California California Ohio Ohio'.split())
years = np.array((2005, 2005, 2006, 2005, 2006))

In [15]:
# The keys do not have to be columns of df: here the standalone states and
# years arrays are used as the two grouping keys.
df['data1'].groupby([states,years]).mean()

California  2005    0.581535
            2006   -0.545002
Ohio        2005   -1.230003
            2006   -0.093638
Name: data1, dtype: float64

In [17]:
# Number of rows in each (key1, key2) group, with key2 spread over columns.
df.groupby(['key1','key2']).size().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,1
b,1,1


In [20]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each pair
# followed by a dashed rule.
for name, group in df.groupby('key1'):
    print(name, group, '-' * 32, sep='\n')

a
      data1     data2 key1 key2
0 -1.034465 -0.523656    a  one
1  0.581535 -0.586051    a  two
4 -0.093638  0.681281    a  one
--------------------------------
b
      data1     data2 key1 key2
2 -0.545002 -0.647088    b  one
3 -1.425541  0.360132    b  two
--------------------------------


In [25]:
# With two grouping keys each group key is a tuple, unpacked here into
# k1 and k2 before the sub-frame is printed.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('{} {}'.format(k1, k2))
    print(group, '-' * 30, sep='\n')

a one
      data1     data2 key1 key2
0 -1.034465 -0.523656    a  one
4 -0.093638  0.681281    a  one
------------------------------
a two
      data1     data2 key1 key2
1  0.581535 -0.586051    a  two
------------------------------
b one
      data1     data2 key1 key2
2 -0.545002 -0.647088    b  one
------------------------------
b two
      data1     data2 key1 key2
3 -1.425541  0.360132    b  two
------------------------------


In [26]:
# Materialise the groups as a plain dict: key1 value -> sub-frame.
pieces = {key: frame for key, frame in df.groupby('key1')}
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,-0.545002,-0.647088,b,one
3,-1.425541,0.360132,b,two


In [27]:
# The sub-frame stored for group 'a'.
pieces['a']

Unnamed: 0,data1,data2,key1,key2
0,-1.034465,-0.523656,a,one
1,0.581535,-0.586051,a,two
4,-0.093638,0.681281,a,one


In [29]:
# Per-column dtypes; the next cell uses them as grouping keys.
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [35]:
# Group the columns (axis=1) by their dtype instead of grouping rows.
# NOTE(review): the `axis` keyword of groupby is reported as deprecated in
# newer pandas releases — confirm against the pandas version this notebook
# targets.
grouped = df.groupby(df.dtypes,axis=1)

In [36]:
# Collect the column groups into a dict: dtype -> sub-frame of those columns.
{dtype: columns for dtype, columns in grouped}

{dtype('float64'):       data1     data2
 0 -1.034465 -0.523656
 1  0.581535 -0.586051
 2 -0.545002 -0.647088
 3 -1.425541  0.360132
 4 -0.093638  0.681281, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [39]:
# Column selection on a grouped frame: first with a single column name,
# then with a list of names.
# Only the last expression of a cell is echoed, so just the second object
# appears in the output.
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x000001DF487E0F98>

In [41]:
# The long-hand spelling: select the column(s) first, then group.
# Only the last expression is echoed.
# NOTE(review): the second line groups by key2, whereas the parallel
# selection `df.groupby('key1')[['data2']]` groups by key1 — confirm whether
# the two spellings were meant to be equivalent.
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key2'])

<pandas.core.groupby.DataFrameGroupBy object at 0x000001DF487D6C18>

In [42]:
# Selecting with a list of names ([['data2']]) before aggregating.
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.078812
a,two,-0.586051
b,one,-0.647088
b,two,0.360132


In [45]:
# Check the type of the list-selected aggregation result.
type(df.groupby(['key1','key2'])[['data2']].mean())

pandas.core.frame.DataFrame

In [44]:
# Selecting with a bare column name ('data2') before aggregating.
df.groupby(['key1','key2'])['data2'].mean()

key1  key2
a     one     0.078812
      two    -0.586051
b     one    -0.647088
      two     0.360132
Name: data2, dtype: float64

In [46]:
# Check the type of the single-name aggregation result.
type(df.groupby(['key1','key2'])['data2'].mean())

pandas.core.series.Series

In [3]:
# 5x5 frame of random normals: rows labelled by person, columns 'a'..'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)

In [4]:
# Set columns 'b' and 'c' to NaN for the row at position 2 ('Wes').
# `.ix` has been removed from pandas; slicing the index by position and
# passing the resulting labels to `.loc` addresses the same cells.
people.loc[people.index[2:3], ['b', 'c']] = np.nan

In [6]:
# Display the frame, now with the two missing values.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.046905,0.029855,-0.642612,-0.202181,1.470523
Steve,0.883423,1.027835,-0.644811,-0.941187,-0.511082
Wes,1.031117,,,-0.81578,0.000948
Jim,-1.624179,-1.065998,1.33461,0.401919,1.368776
Travis,1.433021,-0.130372,0.19824,-1.331323,-2.253611


In [7]:
# Column label -> colour group. 'f' has no matching column in `people`.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [8]:
# Group the columns of `people` (axis=1) using the dict as the key lookup.
# NOTE(review): the `axis` keyword of groupby is reported as deprecated in
# newer pandas releases — confirm against the pandas version this notebook
# targets.
by_column = people.groupby(mapping,axis=1)

In [11]:
# Row-wise sum of the columns within each colour group. The unused mapping
# key 'f' ('orange') produces no column in the result.
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.844794,1.453472
Steve,-1.585998,1.400176
Wes,-0.81578,1.032065
Jim,1.73653,-1.321401
Travis,-1.133083,-0.950962


In [14]:
# The same label -> group lookup expressed as a Series.
map_series = pd.Series(mapping)

In [15]:
# Display the lookup Series.
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [16]:
# Count the non-NaN values per person within each colour group of columns.
# Transposing, grouping the (now row) labels and transposing back yields the
# same table as groupby(..., axis=1) without the `axis` keyword, which newer
# pandas releases deprecate.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [17]:
# Number of columns that fall into each colour group. Grouping the
# transposed frame avoids groupby's `axis` keyword, which newer pandas
# releases deprecate; the group sizes are the same.
people.T.groupby(map_series).size()

blue    2
red     3
dtype: int64

In [18]:
# A function passed as the key is applied to each index label; here rows are
# grouped by the length of the person's name.
people.groupby(len).sum() # default axis=0.

Unnamed: 0,a,b,c,d,e
3,-0.639967,-1.036143,0.691998,-0.616043,2.840247
5,0.883423,1.027835,-0.644811,-0.941187,-0.511082
6,1.433021,-0.130372,0.19824,-1.331323,-2.253611


In [19]:
# Display the frame again for reference.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.046905,0.029855,-0.642612,-0.202181,1.470523
Steve,0.883423,1.027835,-0.644811,-0.941187,-0.511082
Wes,1.031117,,,-0.81578,0.000948
Jim,-1.624179,-1.065998,1.33461,0.401919,1.368776
Travis,1.433021,-0.130372,0.19824,-1.331323,-2.253611


In [20]:
# One extra grouping label per row of `people`.
key_list = ['one'] * 3 + ['two'] * 2

In [22]:
# Functions and plain lists can be mixed as keys: group by name length and
# by the labels in key_list, then take the per-group minimum.
people.groupby([len,key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.046905,0.029855,-0.642612,-0.81578,0.000948
3,two,-1.624179,-1.065998,1.33461,0.401919,1.368776
5,one,0.883423,1.027835,-0.644811,-0.941187,-0.511082
6,two,1.433021,-0.130372,0.19824,-1.331323,-2.253611


In [23]:
# Two-level column index: country code ('cty') on top, tenor underneath.
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)

In [24]:
# 4x5 frame of random normals carrying the two-level column index.
hier_df = pd.DataFrame(np.random.randn(4,5),columns=columns)

In [25]:
# Display the hierarchically indexed frame.
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.858663,0.746287,-1.349356,0.907939,-0.694087
1,-1.647699,-1.716077,0.528955,-0.451548,-1.804115
2,0.080326,0.780081,0.050682,2.191598,-2.413592
3,-0.320877,1.649554,-0.924763,0.349195,1.372203


In [26]:
# Count the columns per row within each 'cty' level of the column index.
# Transposing, grouping by index level and transposing back yields the same
# table as groupby(level='cty', axis=1) without the `axis` keyword, which
# newer pandas releases deprecate.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [28]:
# Display df again.
# NOTE(review): the saved values differ from the first display of df; the
# cell that builds df carries execution count 27, so it appears to have been
# re-run (drawing new random numbers) after the earlier cells were executed.
df

Unnamed: 0,data1,data2,key1,key2
0,0.256316,0.285833,a,one
1,0.363354,-0.597338,a,two
2,-0.187892,-0.017069,b,one
3,0.962315,-0.642714,b,two
4,-0.40996,-1.561845,a,one


In [29]:
# Rebind `grouped` to a grouping of the whole frame by key1 (it previously
# held the dtype grouping of the columns).
grouped = df.groupby('key1')

In [39]:
# 0.5 quantile (the median) of data1 within each key1 group.
grouped['data1'].quantile(0.5)

key1
a    0.256316
b    0.387212
Name: data1, dtype: float64

In [34]:
# Small integer frame used to illustrate DataFrame.quantile.
df_1 = pd.DataFrame(
    data=np.array([(1, 1), (2, 10), (3, 100), (4, 100)]),
    columns=list('ab'),
)

In [36]:
# Display the quantile demo frame.
df_1

Unnamed: 0,a,b
0,1,1
1,2,10
2,3,100
3,4,100


In [37]:
# A single quantile returns one value per column.
df_1.quantile(.1)

a    1.3
b    3.7
Name: 0.1, dtype: float64

In [38]:
# A list of quantiles returns one row per requested quantile.
df_1.quantile([.1, .5])

Unnamed: 0,a,b
0.1,1.3,3.7
0.5,2.5,55.0


In [40]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [41]:
# Use the custom function as an aggregation: it is applied to each numeric
# column within each key1 group.
grouped.agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.773315,1.847677
b,1.150207,0.625645


In [42]:
# Display df for comparison with the aggregated values.
df

Unnamed: 0,data1,data2,key1,key2
0,0.256316,0.285833,a,one
1,0.363354,-0.597338,a,two
2,-0.187892,-0.017069,b,one
3,0.962315,-0.642714,b,two
4,-0.40996,-1.561845,a,one


In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) per key1 group.
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,count,3.0,3.0
a,mean,0.069903,-0.62445
a,std,0.419006,0.924137
a,min,-0.40996,-1.561845
a,25%,-0.076822,-1.079591
a,50%,0.256316,-0.597338
a,75%,0.309835,-0.155753
a,max,0.363354,0.285833
b,count,2.0,2.0
b,mean,0.387212,-0.329892
