In [1]:
import pandas as pd

In [2]:
# Read the tab-separated Gapminder file into a DataFrame
gapminder = pd.read_csv(
    '../data/gapminder.tsv',
    delimiter='\t',
)

In [3]:
# Preview the first five rows
gapminder.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [4]:
# recap of the earlier approach: mean of lifeExp within each year
(
    gapminder
    .groupby('year')['lifeExp']
    .mean()
)

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [5]:
# take the groupby apart by hand: keep only the rows whose year is 1952
y1952 = gapminder.loc[gapminder['year'].eq(1952)]
y1952.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
12,Albania,Europe,1952,55.23,1282697,1601.056136
24,Algeria,Africa,1952,43.077,9279525,2449.008185
36,Angola,Africa,1952,30.015,4232095,3520.610273
48,Argentina,Americas,1952,62.485,17876956,5911.315053


In [6]:
# mean of lifeExp over the 1952 rows only
y1952.loc[:, 'lifeExp'].mean()

49.05761971830987

Methods you can use on a groupby object:

- count
- size
- mean
- std
- min
- quantile(q=0.25)
- max
- sum
- var
- sem
- describe
- first
- last
- nth

In [7]:
# summary statistics of lifeExp for each continent
(
    gapminder
    .groupby('continent')['lifeExp']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,624.0,48.86533,9.15021,23.599,42.3725,47.792,54.4115,76.442
Americas,300.0,64.658737,9.345088,37.579,58.41,67.048,71.6995,80.653
Asia,396.0,60.064903,11.864532,28.801,51.42625,61.7915,69.50525,82.603
Europe,360.0,71.903686,5.433178,43.585,69.57,72.241,75.4505,81.757
Oceania,24.0,74.326208,3.795611,69.12,71.205,73.665,77.5525,81.235


In [8]:
## agg accepts functions from other libraries,
## as well as functions you define yourself
import numpy as np

# `aggregate` and `agg` are two spellings of the same call;
# only the last expression in the cell is displayed
(
    gapminder
    .groupby('continent')['lifeExp']
    .aggregate(np.mean)
)
(
    gapminder
    .groupby('continent')['lifeExp']
    .agg(np.mean)
)

continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [9]:
def my_mean(values):
    """Return the arithmetic mean of ``values``.

    Parameters
    ----------
    values : sized collection of numbers
        Anything that supports both ``len()`` and ``np.sum``.

    Returns
    -------
    The sum of ``values`` divided by its number of elements.
    """
    count = len(values)
    total = np.sum(values)
    return total / count

In [10]:
# a user-defined function is passed to agg the same way as a library one
(
    gapminder
    .groupby('continent')['lifeExp']
    .agg(my_mean)
)

continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [11]:
# several aggregations at once: hand agg a list of functions
(
    gapminder
    .groupby('year')['lifeExp']
    .agg([np.count_nonzero, np.mean, np.std])
)

Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142.0,49.05762,12.225956
1957,142.0,51.507401,12.231286
1962,142.0,53.609249,12.097245
1967,142.0,55.67829,11.718858
1972,142.0,57.647386,11.381953
1977,142.0,59.570157,11.227229
1982,142.0,61.533197,10.770618
1987,142.0,63.212613,10.556285
1992,142.0,64.160338,11.22738
1997,142.0,65.014676,11.559439


In [12]:
# Passing a dict of {new_name: function} to rename the result columns
# triggers the deprecation warning shown in this cell's output.
(
    gapminder
    .groupby('year')['lifeExp']
    .agg({'ncount': np.count_nonzero, 'mean': np.mean, 'std': np.std})
    .reset_index()
)

is deprecated and will be removed in a future version
  after removing the cwd from sys.path.


Unnamed: 0,year,ncount,mean,std
0,1952,142.0,49.05762,12.225956
1,1957,142.0,51.507401,12.231286
2,1962,142.0,53.609249,12.097245
3,1967,142.0,55.67829,11.718858
4,1972,142.0,57.647386,11.381953
5,1977,142.0,59.570157,11.227229
6,1982,142.0,61.533197,10.770618
7,1987,142.0,63.212613,10.556285
8,1992,142.0,64.160338,11.22738
9,1997,142.0,65.014676,11.559439


Details on this deprecation (pandas 0.20 release notes): http://pandas.pydata.org/pandas-docs/version/0.20/whatsnew.html#deprecate-groupby-agg-with-a-dictionary-when-renaming


In [13]:
# aggregate with a list of functions, then rename the resulting columns
(
    gapminder
    .groupby('year')['lifeExp']
    .agg([np.count_nonzero, np.mean, np.std])
    .rename(columns={
        'count_nonzero': 'count',
        'mean': 'avg',
        'std': 'std_dev',
    })
    .reset_index()
)

Unnamed: 0,year,count,avg,std_dev
0,1952,142.0,49.05762,12.225956
1,1957,142.0,51.507401,12.231286
2,1962,142.0,53.609249,12.097245
3,1967,142.0,55.67829,11.718858
4,1972,142.0,57.647386,11.381953
5,1977,142.0,59.570157,11.227229
6,1982,142.0,61.533197,10.770618
7,1987,142.0,63.212613,10.556285
8,1992,142.0,64.160338,11.22738
9,1997,142.0,65.014676,11.559439


Other things to look into for groupby:

- transform (returns same number of rows)
- filter (returns a subset)