In [1]:
import datetime

import numpy as np
import pandas as pd

In [2]:
# Toy frame: two label columns (A, B) to group on and two columns of
# standard-normal noise (C, D) to aggregate.
df = pd.DataFrame(
    {
        'A': 'foo bar foo bar foo bar foo foo'.split(),
        'B': 'one one two three two two one three'.split(),
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

In [3]:
df

Unnamed: 0,A,B,C,D
0,foo,one,1.523312,-2.097531
1,bar,one,1.274458,0.130431
2,foo,two,0.025057,0.122469
3,bar,three,-0.587006,0.736016
4,foo,two,-0.897645,-0.132321
5,bar,two,-0.540995,-0.579324
6,foo,one,0.986143,1.045397
7,foo,three,1.621226,-0.076459


In [4]:
# Six values whose index labels 1, 2, 3 each occur twice, so grouping on the
# index (level 0) gives three groups of two values.
s = pd.Series(data=[1, 2, 3, 10, 20, 30], index=[1, 2, 3] * 2)

In [9]:
gs = s.groupby(level=0)

In [13]:
gs.prod()

1    10
2    40
3    90
dtype: int64

In [16]:
# GroupBy.groups is a mapping keyed by group label; walk its keys to list
# the labels that were found.
for group_key in gs.groups.keys():
    print(group_key)

1
2
3


In [18]:
gs.count()

1    2
2    2
3    2
dtype: int64

In [19]:
gs.ohlc()

Unnamed: 0,open,high,low,close
1,1,10,1,10
2,2,20,2,20
3,3,30,3,30


In [20]:
# Label arrays for a two-level index: the outer label comes first, the inner
# label second, position by position.
arrays = [
    'bar bar baz baz foo foo qux qux'.split(),
    'one two one two one two one two'.split(),
]

In [23]:
muI = pd.MultiIndex.from_arrays(arrays,names=['first','second'])

In [25]:
s = pd.Series(np.random.randn(8), index=muI)

In [26]:
s

first  second
bar    one      -1.310236
       two      -0.675718
baz    one      -0.146804
       two       0.064948
foo    one      -1.681707
       two      -0.377968
qux    one      -0.359398
       two      -0.961174
dtype: float64

In [28]:
s.groupby(level='second').sum()

second
one   -3.498145
two   -1.949911
dtype: float64

In [29]:
# Series.sum(level=...) is shorthand for groupby(level=...).sum() (deprecated in pandas 1.3, removed in 2.0)

In [30]:
# Per-label totals for the 'second' index level. The older spelling
# s.sum(level='second') was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the level explicitly works on every pandas version.
s.groupby(level='second').sum()

second
one   -3.498145
two   -1.949911
dtype: float64

In [32]:
# Frame indexed by the (first, second) MultiIndex; column A repeats values so
# it can be used as a grouping key together with the index levels.
df = pd.DataFrame(
    {'A': [1] * 4 + [2] * 2 + [3] * 2, 'B': np.arange(8)},
    index=muI,
)

In [33]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [34]:
df.groupby([pd.Grouper(level=1), 'A']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [35]:
df.groupby(['second', 'A']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
second,A,Unnamed: 2_level_1
one,1,2
one,2,4
one,3,6
two,1,4
two,2,5
two,3,7


In [36]:
# Since pandas 0.20, index level names can be passed as groupby keys, just like column names

Grouper(level=1, axis=0, sort=False)

In [51]:
df.groupby('A').sum()

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,6
2,9
3,13


In [45]:
df.groupby('A').sum()['B']

A
1     6
2     9
3    13
Name: B, dtype: int64

In [46]:
df.groupby('A')['B'].sum()

A
1     6
2     9
3    13
Name: B, dtype: int64

In [53]:
df['B'].groupby(df['A']).sum()

A
1     6
2     9
3    13
Name: B, dtype: int64

In [54]:
# groupby operations are largely about the index!

In [55]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,0
bar,two,1,1
baz,one,1,2
baz,two,1,3
foo,one,2,4
foo,two,2,5
qux,one,3,6
qux,two,3,7


In [66]:
gpd = df.groupby(['first','A'])

In [71]:
# Iterating a GroupBy yields (key, sub-frame) pairs; report the Python type
# of each half for every group.
for group_key, group_frame in gpd:
    print("name:\n",type(group_key))
    print("t:\n",type(group_frame))

name:
 <class 'tuple'>
t:
 <class 'pandas.core.frame.DataFrame'>
name:
 <class 'tuple'>
t:
 <class 'pandas.core.frame.DataFrame'>
name:
 <class 'tuple'>
t:
 <class 'pandas.core.frame.DataFrame'>
name:
 <class 'tuple'>
t:
 <class 'pandas.core.frame.DataFrame'>


In [72]:
gpdi = df.groupby(['first','A'],as_index=False)

In [73]:
gpdi.sum()

Unnamed: 0,A,B
0,1,1
1,1,5
2,2,9
3,3,13


In [75]:
gpd.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
first,A,Unnamed: 2_level_1
bar,1,1
baz,1,5
foo,2,9
qux,3,13


In [79]:
# Pool of country codes from which random group labels are drawn.
countries = np.array('US UK GR JP'.split())
countries

array(['US', 'UK', 'GR', 'JP'],
      dtype='<U2')

In [86]:
# One country code per row: 1000 positions drawn uniformly (with replacement)
# from the four entries of `countries`.
key = countries.take(np.random.randint(low=0, high=4, size=1000))

In [83]:
key

array(['US', 'GR', 'JP', 'UK'],
      dtype='<U2')

In [85]:
# 1000 rows of standard-normal noise in three columns.
data_f = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'])

In [88]:
data_f.groupby(key).count()

Unnamed: 0,A,B,C
GR,280,280,280
JP,226,226,226
UK,251,251,251
US,243,243,243


In [91]:
# Blank out column C at 100 randomly drawn row labels. Labels are drawn with
# replacement, so fewer than 100 distinct rows may end up as NaN.
data_f.loc[np.random.randint(low=0, high=1000, size=100), 'C'] = np.nan

In [93]:
data_f.groupby(key).count()

Unnamed: 0,A,B,C
GR,280,280,247
JP,226,226,207
UK,251,251,224
US,243,243,217


In [94]:
# Replace every missing value with the mean of its own column (the mean is
# taken over the whole column, not per group).
data_f = data_f.fillna(data_f.mean())

In [95]:
data_f.groupby(key).count()

Unnamed: 0,A,B,C
GR,280,280,280
JP,226,226,226
UK,251,251,251
US,243,243,243


In [101]:
# Twenty rows in two equal groups: A is 1 for the first ten rows and 5 for
# the last ten; B simply counts 0..19.
df_re = pd.DataFrame(
    {
        'A': [1 if row < 10 else 5 for row in range(20)],
        'B': np.arange(20),
    }
)
df_re

Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
5,1,5
6,1,6
7,1,7
8,1,8
9,1,9


In [102]:
df_re.groupby('A').rolling(5).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,,
1,1,,
1,2,,
1,3,,
1,4,1.0,2.0
1,5,1.0,3.0
1,6,1.0,4.0
1,7,1.0,5.0
1,8,1.0,6.0
1,9,1.0,7.0


In [97]:
df_re.groupby('A').rolling(5).B.mean()

A    
1  0      NaN
   1      NaN
   2      NaN
   3      NaN
   4      2.0
   5      3.0
   6      4.0
   7      5.0
   8      6.0
   9      7.0
5  10     NaN
   11     NaN
   12     NaN
   13     NaN
   14    12.0
   15    13.0
   16    14.0
   17    15.0
   18    16.0
   19    17.0
Name: B, dtype: float64

In [107]:
df_re.groupby('A').expanding().sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1.0,0.0
1,1,2.0,1.0
1,2,3.0,3.0
1,3,4.0,6.0
1,4,5.0,10.0
1,5,6.0,15.0
1,6,7.0,21.0
1,7,8.0,28.0
1,8,9.0,36.0
1,9,10.0,45.0


In [124]:
df_re.groupby('A')

<pandas.core.groupby.SeriesGroupBy object at 0x112a394e0>

In [130]:
def f(x):
    """Pass-through predicate that shows what it is called with.

    Prints the received object and its type, then returns ``True``. Used
    with ``GroupBy.filter`` it therefore keeps every group while revealing
    what the callback is handed.
    """
    received_type = type(x)
    print('x is:', x)
    print(received_type)
    return True

In [131]:
df_re.groupby('A').filter(f)

x is:    A  B
0  1  0
1  1  1
2  1  2
3  1  3
4  1  4
5  1  5
6  1  6
7  1  7
8  1  8
9  1  9
<class 'pandas.core.frame.DataFrame'>
x is:     A   B
10  5  10
11  5  11
12  5  12
13  5  13
14  5  14
15  5  15
16  5  16
17  5  17
18  5  18
19  5  19
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,A,B
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
5,1,5
6,1,6
7,1,7
8,1,8
9,1,9


In [132]:
# Eight rows split into groups of unequal size by B: two 'a', four 'b', two 'c'.
dff = pd.DataFrame(dict(A=np.arange(8), B=list('aabbbbcc')))

In [137]:
dff.groupby('B').head(5)

Unnamed: 0,A,B
0,0,a
1,1,a
2,2,b
3,3,b
4,4,b
5,5,b
6,6,c
7,7,c


In [138]:
dff.groupby('B').filter(lambda x:len(x)>3, dropna=False)

Unnamed: 0,A,B
0,,
1,,
2,2.0,b
3,3.0,b
4,4.0,b
5,5.0,b
6,,
7,,


In [140]:
df_re.groupby('B').sum()

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


In [141]:
# 1000 consecutive days of standard-normal noise, starting 2000-01-01.
tsdf = pd.DataFrame(
    np.random.randn(1000, 3),
    index=pd.date_range(start='1/1/2000', periods=1000),
    columns=list('ABC'),
)

In [144]:
# Ten rows of noise whose index cycles through 1, 2, 3, so index labels
# repeat and can be grouped on.
ts = pd.DataFrame(
    np.random.randn(10, 3),
    index=[1, 2, 3] * 3 + [1],
    columns=list('ABC'),
)

In [146]:
ts.groupby(level=0).sum()

Unnamed: 0,A,B,C
1,2.012314,2.658578,-1.755085
2,2.154391,-0.464173,0.506495
3,-0.489107,0.494968,1.264105


In [148]:
# Series counterpart of the frame above: ten random values on an index that
# cycles through 1, 2, 3.
s = pd.Series(data=np.random.randn(10), index=[1, 2, 3] * 3 + [1])

In [150]:
s.groupby(level=0).sum()

1   -3.134994
2   -2.392805
3    1.806793
dtype: float64

In [152]:
pd.Categorical(['a', 'a', 'a'], categories=['a', 'b'])

[a, a, a]
Categories (2, object): [a, b]

In [154]:
# Purchase log for the time-based grouping examples: one row per purchase
# with the branch, the buyer, the quantity bought and the purchase timestamp.
# `datetime` is imported in the notebook's first cell with the other imports.
df = pd.DataFrame({
          'Branch' : 'A A A A A A A B'.split(),
          'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
          'Quantity': [1,3,5,1,8,1,9,3],
          'Date' : [
              datetime.datetime(2013,1,1,13,0),
              datetime.datetime(2013,1,1,13,5),
              datetime.datetime(2013,10,1,20,0),
              datetime.datetime(2013,10,2,10,0),
              datetime.datetime(2013,10,1,20,0),
              datetime.datetime(2013,10,2,10,0),
              datetime.datetime(2013,12,2,12,0),
              datetime.datetime(2013,12,2,14,0),
             ]
         })

In [155]:
df

Unnamed: 0,Branch,Buyer,Date,Quantity
0,A,Carl,2013-01-01 13:00:00,1
1,A,Mark,2013-01-01 13:05:00,3
2,A,Carl,2013-10-01 20:00:00,5
3,A,Carl,2013-10-02 10:00:00,1
4,A,Joe,2013-10-01 20:00:00,8
5,A,Joe,2013-10-02 10:00:00,1
6,A,Joe,2013-12-02 12:00:00,9
7,B,Carl,2013-12-02 14:00:00,3


In [156]:
# Bucket purchases by month-end ('1M') of the 'Date' column and by buyer, then
# total the quantities. 'Quantity' is selected explicitly: on pandas >= 2.0 a
# bare .sum() no longer drops the string column 'Branch' but sums it too.
df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer'])[['Quantity']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
Date,Buyer,Unnamed: 2_level_1
2013-01-31,Carl,1
2013-01-31,Mark,3
2013-10-31,Carl,6
2013-10-31,Joe,9
2013-12-31,Carl,3
2013-12-31,Joe,9


In [162]:
df = df.set_index('Date')

In [163]:
# Add a 'Date' column holding each index timestamp moved forward by two
# month-ends. Note that the new column shares its name with the index.
df['Date'] = df.index + pd.offsets.MonthEnd(2)

In [164]:
df

Unnamed: 0_level_0,Branch,Buyer,Quantity,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-02-28 13:00:00,A,Carl,1,2013-04-30 13:00:00
2013-02-28 13:05:00,A,Mark,3,2013-04-30 13:05:00
2013-11-30 20:00:00,A,Carl,5,2014-01-31 20:00:00
2013-11-30 10:00:00,A,Carl,1,2014-01-31 10:00:00
2013-11-30 20:00:00,A,Joe,8,2014-01-31 20:00:00
2013-11-30 10:00:00,A,Joe,1,2014-01-31 10:00:00
2014-01-31 12:00:00,A,Joe,9,2014-03-31 12:00:00
2014-01-31 14:00:00,B,Carl,3,2014-03-31 14:00:00


In [165]:
# Three-month bins on the 'Date' index level, split by buyer. 'Quantity' is
# selected explicitly because the frame also holds a string column ('Branch')
# and a datetime column ('Date'), neither of which can be meaningfully summed.
# The frequency alias is spelled '3M' to match the '1M' used earlier.
df.groupby([pd.Grouper(freq='3M', level='Date'), 'Buyer'])[['Quantity']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
Date,Buyer,Unnamed: 2_level_1
2013-02-28,Carl,1
2013-02-28,Mark,3
2013-11-30,Carl,6
2013-11-30,Joe,9
2014-02-28,Carl,3
2014-02-28,Joe,9


In [170]:
# Group by (three-month bin of the 'Date' index level, buyer) and take the row
# at position 1, i.e. the second row, of each group. The frequency alias is
# spelled '3M' to match the '1M' used earlier in the notebook.
g = df.groupby([pd.Grouper(freq='3M', level='Date'), 'Buyer'])
g.nth(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Branch,Date,Quantity
Date,Buyer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-11-30,Carl,A,2014-01-31 10:00:00,1
2013-11-30,Joe,A,2014-01-31 10:00:00,1


In [171]:
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')

DatetimeIndex(['2014-04-01', '2014-04-02', '2014-04-03', '2014-04-04',
               '2014-04-07', '2014-04-08', '2014-04-09', '2014-04-10',
               '2014-04-11', '2014-04-14', '2014-04-15', '2014-04-16',
               '2014-04-17', '2014-04-18', '2014-04-21', '2014-04-22',
               '2014-04-23', '2014-04-24', '2014-04-25', '2014-04-28',
               '2014-04-29', '2014-04-30', '2014-05-01', '2014-05-02',
               '2014-05-05', '2014-05-06', '2014-05-07', '2014-05-08',
               '2014-05-09', '2014-05-12', '2014-05-13', '2014-05-14',
               '2014-05-15', '2014-05-16', '2014-05-19', '2014-05-20',
               '2014-05-21', '2014-05-22', '2014-05-23', '2014-05-26',
               '2014-05-27', '2014-05-28', '2014-05-29', '2014-05-30',
               '2014-06-02', '2014-06-03', '2014-06-04', '2014-06-05',
               '2014-06-06', '2014-06-09', '2014-06-10', '2014-06-11',
               '2014-06-12', '2014-06-13', '2014-06-16', '2014-06-17',
      

In [173]:
df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])

In [174]:
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])

Unnamed: 0,Unnamed: 1,a,b
2014,4,1,1
2014,4,1,1
2014,4,1,1
2014,5,1,1
2014,5,1,1
2014,5,1,1
2014,6,1,1
2014,6,1,1
2014,6,1,1


In [175]:
df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})

# Group the *columns* by their column totals (a, b, c -> 1; d -> 9) and sum
# within each group. groupby(axis=1) is deprecated since pandas 2.1, so the
# frame is transposed, grouped along its rows, and transposed back.
df.T.groupby(df.sum()).sum().T

Unnamed: 0,1,9
0,2,2
1,1,3
2,0,4


In [176]:
df.sum()

a    1
b    1
c    1
d    9
dtype: int64