# Pandas Documentation on Group By

In this notebook, you will work through the Pandas documentation on "group by".

## Imports

In [1]:
import numpy as np
import pandas as pd

## Pandas group by: split-apply-combine

In this notebook, you are going to learn how to use Pandas by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas Documentation for [Group By](http://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copy and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - Grouping with a Grouper specification
  - Plotting
  - Examples

## Grading

In [2]:
df = pd.DataFrame({ 'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                    'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                    'C' : np.random.randn(8),
                    'D' : np.random.randn(8)})

In [3]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.497341,-0.682266
1,bar,one,0.448929,-0.661433
2,foo,two,-0.316369,1.183555
3,bar,three,-0.495823,-0.820406
4,foo,two,-0.062008,0.687446
5,bar,two,0.804803,0.007221
6,foo,one,-1.463474,-0.099845
7,foo,three,1.178544,-1.080946


In [4]:
grouped = df.groupby('A')

In [5]:
grouped = df.groupby(['A', 'B'])

In [6]:
def get_letter_type(letter):
    if letter.lower() in 'aeiou':
        return 'vowel'
    else:
        return 'consonant'

In [7]:
grouped = df.groupby(get_letter_type, axis=1)

In [8]:
lst = [1, 2, 3, 1, 2, 3]

In [9]:
s = pd.Series([1, 2, 3, 10, 20, 30], lst)

In [10]:
grouped = s.groupby(level=0)

In [11]:
grouped.first()

1    1
2    2
3    3
dtype: int64

In [12]:
grouped.last()

1    10
2    20
3    30
dtype: int64

In [13]:
grouped.sum()

1    11
2    22
3    33
dtype: int64

In [14]:
# GroupBy sorting
df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})

In [15]:
df2.groupby(['X']).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


In [16]:
df2.groupby(['X'], sort=False).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [17]:
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

In [18]:
df3.groupby(['X']).get_group('A')

Unnamed: 0,X,Y
0,A,1
2,A,3


In [19]:
df3.groupby(['X']).get_group('B')

Unnamed: 0,X,Y
1,B,4
3,B,2


In [20]:
# GroupBy object attributes
df.groupby('A').groups

{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [21]:
df.groupby(get_letter_type, axis=1).groups

{'consonant': ['B', 'C', 'D'], 'vowel': ['A']}

In [22]:
grouped = df.groupby(['A', 'B'])

In [23]:
grouped.groups

{('bar', 'one'): [1],
 ('bar', 'three'): [3],
 ('bar', 'two'): [5],
 ('foo', 'one'): [0, 6],
 ('foo', 'three'): [7],
 ('foo', 'two'): [2, 4]}

In [24]:
len(grouped)

6

In [25]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.497341,-0.682266
1,bar,one,0.448929,-0.661433
2,foo,two,-0.316369,1.183555
3,bar,three,-0.495823,-0.820406
4,foo,two,-0.062008,0.687446
5,bar,two,0.804803,0.007221
6,foo,one,-1.463474,-0.099845
7,foo,three,1.178544,-1.080946


In [26]:
gb = df.groupby('A')

In [27]:
gb.agg

<bound method DataFrameGroupBy.agg of <pandas.core.groupby.DataFrameGroupBy object at 0x7f43bd0ab7b8>>

In [28]:
# GroupBy with MultiIndex
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [29]:
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])

In [30]:
s = pd.Series(np.random.randn(8), index=index)

In [31]:
s

first  second
bar    one       0.336256
       two       0.734362
baz    one       2.479012
       two       2.171215
foo    one       2.281394
       two       0.015794
qux    one      -0.176135
       two      -1.695181
dtype: float64

In [32]:
grouped = s.groupby(level=0)

In [33]:
grouped.sum()

first
bar    1.070618
baz    4.650228
foo    2.297187
qux   -1.871316
dtype: float64

In [34]:
s.groupby(level='second').sum()

second
one    4.920528
two    1.226190
dtype: float64

In [35]:
s.sum(level='second')

second
one    4.920528
two    1.226190
dtype: float64

In [36]:
s

first  second
bar    one       0.336256
       two       0.734362
baz    one       2.479012
       two       2.171215
foo    one       2.281394
       two       0.015794
qux    one      -0.176135
       two      -1.695181
dtype: float64

In [37]:
s.groupby(level=['first', 'second']).sum()

first  second
bar    one       0.336256
       two       0.734362
baz    one       2.479012
       two       2.171215
foo    one       2.281394
       two       0.015794
qux    one      -0.176135
       two      -1.695181
dtype: float64

In [38]:
# DataFrame column selection in GroupBy
grouped = df.groupby(['A'])

In [39]:
grouped_C = grouped['C']

In [40]:
grouped_D = grouped['D']

In [41]:
df['C'].groupby(df['A'])

<pandas.core.groupby.SeriesGroupBy object at 0x7f43bce5b860>

In [42]:
# Iterating through groups
grouped = df.groupby('A')

In [43]:
for name, group in grouped:
    print(name)
    print(group)

bar
     A      B         C         D
1  bar    one  0.448929 -0.661433
3  bar  three -0.495823 -0.820406
5  bar    two  0.804803  0.007221
foo
     A      B         C         D
0  foo    one -1.497341 -0.682266
2  foo    two -0.316369  1.183555
4  foo    two -0.062008  0.687446
6  foo    one -1.463474 -0.099845
7  foo  three  1.178544 -1.080946


In [44]:
for name, group in df.groupby(['A', 'B']):
    print(name)
    print(group)

('bar', 'one')
     A    B         C         D
1  bar  one  0.448929 -0.661433
('bar', 'three')
     A      B         C         D
3  bar  three -0.495823 -0.820406
('bar', 'two')
     A    B         C         D
5  bar  two  0.804803  0.007221
('foo', 'one')
     A    B         C         D
0  foo  one -1.497341 -0.682266
6  foo  one -1.463474 -0.099845
('foo', 'three')
     A      B         C         D
7  foo  three  1.178544 -1.080946
('foo', 'two')
     A    B         C         D
2  foo  two -0.316369  1.183555
4  foo  two -0.062008  0.687446


In [45]:
# Selecting a group
grouped.get_group('bar')

Unnamed: 0,A,B,C,D
1,bar,one,0.448929,-0.661433
3,bar,three,-0.495823,-0.820406
5,bar,two,0.804803,0.007221


In [46]:
df.groupby(['A', 'B']).get_group(('bar', 'one'))

Unnamed: 0,A,B,C,D
1,bar,one,0.448929,-0.661433


In [47]:
# Aggregation
grouped = df.groupby('A')

In [48]:
grouped.aggregate(np.sum)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.75791,-1.474618
foo,-2.160647,0.007945


In [49]:
grouped = df.groupby(['A', 'B'])

In [50]:
grouped.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.448929,-0.661433
bar,three,-0.495823,-0.820406
bar,two,0.804803,0.007221
foo,one,-2.960814,-0.78211
foo,three,1.178544,-1.080946
foo,two,-0.378376,1.871002


In [51]:
grouped = df.groupby(['A', 'B'], as_index=False)

In [52]:
grouped.aggregate(np.sum)

Unnamed: 0,A,B,C,D
0,bar,one,0.448929,-0.661433
1,bar,three,-0.495823,-0.820406
2,bar,two,0.804803,0.007221
3,foo,one,-2.960814,-0.78211
4,foo,three,1.178544,-1.080946
5,foo,two,-0.378376,1.871002


In [53]:
df.groupby('A', as_index=False).sum()

Unnamed: 0,A,C,D
0,bar,0.75791,-1.474618
1,foo,-2.160647,0.007945


In [54]:
df.groupby(['A', 'B']).sum().reset_index()

Unnamed: 0,A,B,C,D
0,bar,one,0.448929,-0.661433
1,bar,three,-0.495823,-0.820406
2,bar,two,0.804803,0.007221
3,foo,one,-2.960814,-0.78211
4,foo,three,1.178544,-1.080946
5,foo,two,-0.378376,1.871002


In [55]:
grouped.size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [56]:
grouped.describe()

Unnamed: 0,Unnamed: 1,C,D
0,count,1.0,1.0
0,mean,0.448929,-0.661433
0,std,,
0,min,0.448929,-0.661433
0,25%,0.448929,-0.661433
0,50%,0.448929,-0.661433
0,75%,0.448929,-0.661433
0,max,0.448929,-0.661433
1,count,1.0,1.0
1,mean,-0.495823,-0.820406


In [57]:
# Applying multiple functions at once
grouped = df.groupby('A')

In [58]:
grouped['C'].agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,0.75791,0.252637,0.672164
foo,-2.160647,-0.432129,1.111644


In [59]:
grouped['D'].agg({'result1' : np.sum,
                  'result2' : np.mean})

Unnamed: 0_level_0,result2,result1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.491539,-1.474618
foo,0.001589,0.007945


In [60]:
grouped.agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,0.75791,0.252637,0.672164,-1.474618,-0.491539,0.439192
foo,-2.160647,-0.432129,1.111644,0.007945,0.001589,0.93772


In [61]:
# Applying different functions to DataFrame columns
grouped.agg({'C' : np.sum,
             'D' : lambda x: np.std(x, ddof=1)})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.75791,0.439192
foo,-2.160647,0.93772


In [62]:
grouped.agg({'C' : 'sum', 'D' : 'std'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.75791,0.439192
foo,-2.160647,0.93772


In [63]:
# Cython-optimized aggregation functions
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.75791,-1.474618
foo,-2.160647,0.007945


In [64]:
df.groupby(['A', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.448929,-0.661433
bar,three,-0.495823,-0.820406
bar,two,0.804803,0.007221
foo,one,-1.480407,-0.391055
foo,three,1.178544,-1.080946
foo,two,-0.189188,0.935501


In [65]:
# Transformation
index = pd.date_range('10/1/1999', periods=1100)

In [66]:
ts = pd.Series(np.random.normal(0.5, 2, 1100), index)

In [67]:
ts = pd.rolling_mean(ts, 100, 100).dropna()

In [68]:
ts.head()

2000-01-08    0.140280
2000-01-09    0.148248
2000-01-10    0.174400
2000-01-11    0.204874
2000-01-12    0.263071
Freq: D, dtype: float64

In [69]:
ts.tail()

2002-09-30    0.601449
2002-10-01    0.550766
2002-10-02    0.553439
2002-10-03    0.565804
2002-10-04    0.553481
Freq: D, dtype: float64

In [70]:
key = lambda x: x.year

In [71]:
zscore = lambda x: (x - x.mean()) / x.std()

In [72]:
transformed = ts.groupby(key).transform(zscore)

In [73]:
# Original Data
grouped = ts.groupby(key)

In [74]:
grouped.mean()

2000    0.546268
2001    0.608713
2002    0.473512
dtype: float64

In [75]:
grouped.std()

2000    0.228063
2001    0.100528
2002    0.110977
dtype: float64

In [76]:
# Transformed Data
grouped_trans = transformed.groupby(key)

In [77]:
grouped_trans.mean()

2000   -6.216012e-17
2001   -2.956539e-16
2002   -6.897812e-16
dtype: float64

In [78]:
grouped_trans.std()

2000    1
2001    1
2002    1
dtype: float64

In [79]:
compare = pd.DataFrame({'Original': ts, 'Transformed': transformed})

In [80]:
# compare.plot()

In [81]:
data_df = df
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.497341,-0.682266
1,bar,one,0.448929,-0.661433
2,foo,two,-0.316369,1.183555
3,bar,three,-0.495823,-0.820406
4,foo,two,-0.062008,0.687446
5,bar,two,0.804803,0.007221
6,foo,one,-1.463474,-0.099845
7,foo,three,1.178544,-1.080946


In [82]:
countries = np.array(['US', 'UK', 'GR', 'JP'])

In [83]:
key = countries[np.random.randint(0, 4, 1000)]

In [84]:
# grouped = data_df.groupby(key)

In [85]:
f = lambda x: x.fillna(x.mean())

In [86]:
transformed = grouped.transform(f)

In [87]:
# grouped_trans = transformed.groupby(key)

In [88]:
grouped.mean()

2000    0.546268
2001    0.608713
2002    0.473512
dtype: float64

In [89]:
grouped_trans.mean()

2000   -6.216012e-17
2001   -2.956539e-16
2002   -6.897812e-16
dtype: float64

In [90]:
grouped.count()

2000    359
2001    365
2002    277
dtype: int64

In [91]:
grouped_trans.count()

2000    359
2001    365
2002    277
dtype: int64

In [92]:
grouped_trans.size()

2000    359
2001    365
2002    277
dtype: int64

In [93]:
grouped.ffill()

2000-01-08    0.140280
2000-01-09    0.148248
2000-01-10    0.174400
2000-01-11    0.204874
2000-01-12    0.263071
2000-01-13    0.314556
2000-01-14    0.327686
2000-01-15    0.341662
2000-01-16    0.358131
2000-01-17    0.410695
2000-01-18    0.425109
2000-01-19    0.379064
2000-01-20    0.374013
2000-01-21    0.392010
2000-01-22    0.375647
2000-01-23    0.383377
2000-01-24    0.370982
2000-01-25    0.364801
2000-01-26    0.346129
2000-01-27    0.328972
2000-01-28    0.366238
2000-01-29    0.346015
2000-01-30    0.347612
2000-01-31    0.315399
2000-02-01    0.297416
2000-02-02    0.284434
2000-02-03    0.258989
2000-02-04    0.296923
2000-02-05    0.318920
2000-02-06    0.319761
                ...   
2002-09-05    0.457250
2002-09-06    0.480020
2002-09-07    0.496621
2002-09-08    0.538427
2002-09-09    0.548940
2002-09-10    0.529086
2002-09-11    0.524095
2002-09-12    0.538170
2002-09-13    0.528956
2002-09-14    0.564836
2002-09-15    0.530961
2002-09-16    0.529763
2002-09-17 

In [94]:
# Filtration
sf = pd.Series([1, 1, 2, 3, 3, 3])

In [95]:
sf.groupby(sf).filter(lambda x: x.sum() > 2)

3    3
4    3
5    3
dtype: int64

In [96]:
dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})

In [97]:
dff.groupby('B').filter(lambda x: len(x) > 2)

Unnamed: 0,A,B
2,2,b
3,3,b
4,4,b
5,5,b


In [98]:
dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False)


Unnamed: 0,A,B
0,,
1,,
2,2.0,b
3,3.0,b
4,4.0,b
5,5.0,b
6,,
7,,


In [99]:
dff['C'] = np.arange(8)

In [100]:
dff.groupby('B').filter(lambda x: len(x['C']) > 2)

Unnamed: 0,A,B,C
2,2,b,2
3,3,b,3
4,4,b,4
5,5,b,5


In [101]:
dff.groupby('B').head(2)

Unnamed: 0,A,B,C
0,0,a,0
1,1,a,1
2,2,b,2
3,3,b,3
6,6,c,6
7,7,c,7


In [102]:
# Dispatching to instance methods
grouped = df.groupby('A')

In [103]:
grouped.agg(lambda x: x.std())

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.672164,0.439192
foo,1.111644,0.93772


In [104]:
grouped.std()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.672164,0.439192
foo,1.111644,0.93772


In [105]:
tsdf = pd.DataFrame(np.random.randn(1000, 3),
                    index=pd.date_range('1/1/2000', periods=1000),
                    columns=['A', 'B', 'C'])

In [106]:
tsdf.ix[::2] = np.nan

In [107]:
grouped = tsdf.groupby(lambda x: x.year)

In [108]:
grouped.fillna(method='pad')

Unnamed: 0,A,B,C
2000-01-01,,,
2000-01-02,0.176632,1.025764,0.461940
2000-01-03,0.176632,1.025764,0.461940
2000-01-04,-0.064850,1.820830,1.306955
2000-01-05,-0.064850,1.820830,1.306955
2000-01-06,-1.571093,-0.198214,1.208498
2000-01-07,-1.571093,-0.198214,1.208498
2000-01-08,-0.137219,-0.273748,0.017977
2000-01-09,-0.137219,-0.273748,0.017977
2000-01-10,0.648978,-0.740600,0.287325


In [109]:
s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3])

In [110]:
g = pd.Series(list('abababab'))

In [111]:
gb = s.groupby(g)

In [112]:
gb.nlargest(3)

a  4    19.0
   0     9.0
   2     7.0
b  1     8.0
   3     5.0
   7     3.3
dtype: float64

In [113]:
gb.nsmallest(3)

a  6    4.2
   2    7.0
   0    9.0
b  5    1.0
   7    3.3
   3    5.0
dtype: float64

In [114]:
# Flexible apply
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.497341,-0.682266
1,bar,one,0.448929,-0.661433
2,foo,two,-0.316369,1.183555
3,bar,three,-0.495823,-0.820406
4,foo,two,-0.062008,0.687446
5,bar,two,0.804803,0.007221
6,foo,one,-1.463474,-0.099845
7,foo,three,1.178544,-1.080946


In [115]:
grouped = df.groupby('A')

In [116]:
grouped['C'].apply(lambda x: x.describe())

A         
bar  count    3.000000
     mean     0.252637
     std      0.672164
     min     -0.495823
     25%     -0.023447
     50%      0.448929
     75%      0.626866
     max      0.804803
foo  count    5.000000
     mean    -0.432129
     std      1.111644
     min     -1.497341
     25%     -1.463474
     50%     -0.316369
     75%     -0.062008
     max      1.178544
dtype: float64

In [117]:
grouped = df.groupby('A')['C']

In [118]:
def f(group):
    return pd.DataFrame({'original' : group,
                         'demeaned' : group - group.mean()})

In [119]:
grouped.apply(f)

Unnamed: 0,demeaned,original
0,-1.065211,-1.497341
1,0.196293,0.448929
2,0.115761,-0.316369
3,-0.748459,-0.495823
4,0.370122,-0.062008
5,0.552166,0.804803
6,-1.031344,-1.463474
7,1.610673,1.178544


In [120]:
def f(x):
    return pd.Series([ x, x**2 ], index = ['x', 'x^s'])

In [121]:
s

0     9.0
1     8.0
2     7.0
3     5.0
4    19.0
5     1.0
6     4.2
7     3.3
dtype: float64

In [122]:
s.apply(f)

Unnamed: 0,x,x^s
0,9.0,81.0
1,8.0,64.0
2,7.0,49.0
3,5.0,25.0
4,19.0,361.0
5,1.0,1.0
6,4.2,17.64
7,3.3,10.89


In [123]:
d = pd.DataFrame({"a":["x", "y"], "b":[1,2]})

In [124]:
def identity(df):
    print(df)
    return df

In [125]:
d.groupby("a").apply(identity)

   a  b
0  x  1
   a  b
0  x  1
   a  b
1  y  2


Unnamed: 0,a,b
0,x,1
1,y,2


In [126]:
# Other useful features
# Automatic exclusion of “nuisance” columns
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.497341,-0.682266
1,bar,one,0.448929,-0.661433
2,foo,two,-0.316369,1.183555
3,bar,three,-0.495823,-0.820406
4,foo,two,-0.062008,0.687446
5,bar,two,0.804803,0.007221
6,foo,one,-1.463474,-0.099845
7,foo,three,1.178544,-1.080946


In [127]:
# df.groupby('A').std().df.groupby('A').std()

In [128]:
# Grouping with ordered factors
data = pd.Series(np.random.randn(100))

In [129]:
factor = pd.qcut(data, [0, .25, .5, .75, 1.])

In [130]:
data.groupby(factor).mean()

[-2.894, -0.504]   -1.160551
(-0.504, 0.155]    -0.112413
(0.155, 0.625]      0.377110
(0.625, 2.379]      1.088398
dtype: float64

In [131]:
# Taking the first rows of each group
df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])

In [132]:
df

Unnamed: 0,A,B
0,1,2
1,1,4
2,5,6


In [133]:
g = df.groupby('A')

In [134]:
g.head(1)

Unnamed: 0,A,B
0,1,2
2,5,6


In [135]:
g.tail(1)

Unnamed: 0,A,B
1,1,4
2,5,6


In [136]:
# Taking the nth row of each group
df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])

In [137]:
g = df.groupby('A')

In [138]:
g.nth(0)

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,
5,6.0


In [139]:
g.nth(-1)

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4
5,6


In [140]:
g.nth(1)

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4


In [141]:
# nth(0) is the same as g.first()
g.nth(0, dropna='any')

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4
5,6


In [142]:
g.first()

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4
5,6


In [143]:
# nth(-1) is the same as g.last()
g.nth(-1, dropna='any')  # NaNs denote group exhausted when using dropna

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4
5,6


In [144]:
g.last()

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,4
5,6


In [145]:
g.B.nth(0, dropna=True)

A
1    4
5    6
Name: B, dtype: float64

In [146]:
df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])

In [147]:
g = df.groupby('A',as_index=False)

In [148]:
g.nth(0)

Unnamed: 0,A,B
0,1,
2,5,6.0


In [149]:
g.nth(-1)

Unnamed: 0,A,B
1,1,4
2,5,6


In [150]:
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')

In [151]:
df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])

In [152]:
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

Unnamed: 0,a,b
2014-04-01,1,1
2014-04-04,1,1
2014-04-30,1,1
2014-05-01,1,1
2014-05-06,1,1
2014-05-30,1,1
2014-06-02,1,1
2014-06-05,1,1
2014-06-30,1,1


In [153]:
# Enumerate group items
df = pd.DataFrame(list('aaabba'), columns=['A'])

In [154]:
df

Unnamed: 0,A
0,a
1,a
2,a
3,b
4,b
5,a


In [155]:
df.groupby('A').cumcount()

0    0
1    1
2    2
3    0
4    1
5    3
dtype: int64

In [156]:
df.groupby('A').cumcount(ascending=False)

0    3
1    2
2    1
3    1
4    0
5    0
dtype: int64