In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randn(5),
                  'data2': np.random.randn(5)})

In [6]:
pieces = dict(list(df.groupby('key1')))

In [7]:
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,1.724564,-0.728537
3,b,two,-0.425396,0.061024


In [8]:
grouped = df.groupby(df.dtypes, axis=1)
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0  1.307040 -1.226481
1  1.359279 -0.813086
2  1.724564 -0.728537
3 -0.425396  0.061024
4  0.998623  0.583523
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [9]:
df.groupby('key1')['data1']
df.groupby('key1')['data2']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001B6BD149748>

In [10]:
df['data1'].groupby(df['key1'])
df['data2'].groupby(df['key1'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001B6BD1497F0>

### 함수로 그룹핑

In [11]:
people = pd.DataFrame(np.random.randn(5,5), columns = ['a','b','c','d','e'], index = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

In [12]:
people.iloc[2:3, [1,2]]

Unnamed: 0,b,c
Wes,0.579247,-0.312597


In [13]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.576848,-0.488531,-0.285957,-0.948341,1.572988
5,-0.233509,0.364816,0.467765,0.318362,-0.167942
6,0.504632,0.63153,0.04422,0.600264,0.527286


In [14]:
key_list = ['one', 'one', 'one', 'two', 'two']

In [15]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.327622,-1.636066,-0.312597,-1.991915,0.054084
3,two,0.123226,0.568288,-1.176287,2.192818,-0.067217
5,one,-0.233509,0.364816,0.467765,0.318362,-0.167942
6,two,0.504632,0.63153,0.04422,0.600264,0.527286


In [19]:
grouped['data1'].quantile(0.9)

float64    1.509568
object     1.354055
Name: data1, dtype: float64

In [16]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

In [18]:
df.groupby('key1').agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.360656,1.810004
b,2.149961,0.789561


In [20]:
grouped.describe()

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max,unique,top,freq
float64,data1,5,0.992822,0.833644,-0.425396,0.998623,1.30704,1.35928,1.72456,,,
float64,data2,5,-0.424711,0.731173,-1.22648,-0.813086,-0.728537,0.0610245,0.583523,,,
object,key1,5,,,,,,,,2.0,a,3.0
object,key2,5,,,,,,,,2.0,one,3.0


#### 변위치 분석과 버킷분석

In [21]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
                     'data2': np.random.randn(1000)})

In [22]:
quartiles = pd.cut(frame.data1, 4)

In [25]:
quartiles[:10]

0    (-2.926, -1.415]
1    (-1.415, 0.0894]
2    (-1.415, 0.0894]
3    (-1.415, 0.0894]
4      (1.594, 3.099]
5    (-1.415, 0.0894]
6    (-1.415, 0.0894]
7     (0.0894, 1.594]
8     (0.0894, 1.594]
9     (0.0894, 1.594]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.926, -1.415] < (-1.415, 0.0894] < (0.0894, 1.594] < (1.594, 3.099]]

In [26]:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
           'count': group.count(), 'mean': group.mean()}

grouped = frame.data2.groupby(quartiles)

In [27]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.926, -1.415]",71.0,2.038522,-0.122036,-2.373972
"(-1.415, 0.0894]",466.0,3.106209,0.027964,-3.63532
"(0.0894, 1.594]",404.0,2.949754,0.036451,-3.368952
"(1.594, 3.099]",59.0,1.679665,0.030859,-2.586902


In [28]:
grouping = pd.qcut(frame.data1, 10, labels=False)

In [29]:
grouped = frame.data2.groupby(grouping)

In [30]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.977008,0.050725,-2.373972
1,100.0,2.612139,0.032959,-2.858593
2,100.0,3.106209,0.043569,-3.63532
3,100.0,2.970474,-0.010063,-2.382901
4,100.0,1.99166,-0.098242,-2.867819
5,100.0,2.274496,0.058363,-2.074779
6,100.0,2.949754,-0.032665,-2.578946
7,100.0,2.678849,-0.022067,-3.368952
8,100.0,2.27655,0.117538,-1.771031
9,100.0,1.905827,0.069019,-2.586902


#### group을 이용한 결측치 처리

In [31]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index =states)

In [32]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan #의도적으로 NaN으로 만들기

In [34]:
data.groupby(group_key).mean()

East   -0.361822
West    0.175437
dtype: float64

In [36]:
fill_mean = lambda g: g.fillna(g.mean()) #lambda는 자동으로 return