In [1]:
import pandas as pd

## Groupby Method

Grouping is a process that involves one or more of the following steps:

	- Splitting the data into groups based on some criteria.
	- Applying a function to each group independently.
	- Combining the results into a data structure.

Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html



## Load data

In [2]:
# Read the drinks dataset, then discard the beer and spirit serving columns
# so that only the wine and total-alcohol figures remain.
df = (
    pd.read_csv('../datasets/drinks.csv')
    .drop(columns=['beer_servings', 'spirit_servings'])
)
# preview the first five rows
df.head()

Unnamed: 0,country,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0.0,AS
1,Albania,54,4.9,EU
2,Algeria,14,0.7,AF
3,Andorra,312,12.4,EU
4,Angola,45,5.9,AF


### Split the data into groups based on 'continent' values.

df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=NoDefault.no_default, observed=False, dropna=True)

In [3]:
# Build one group per distinct value of the 'continent' column.
# Displaying the result only shows the GroupBy object's repr, not any data.
continents_group = df.groupby(by='continent')
continents_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f22ec0774f0>

#### Groupby Object

The `groupby()` method returns a [GroupBy object](https://pandas.pydata.org/docs/reference/groupby.html).

In [4]:
# The `groups` attribute is a dict mapping each group name to the index labels of its rows.
continents_group.groups

{'AF': [2, 4, 18, 22, 26, 27, 28, 29, 31, 33, 34, 38, 39, 47, 49, 53, 55, 56, 58, 62, 63, 66, 70, 71, 88, 95, 96, 97, 100, 101, 104, 107, 108, 114, 115, 117, 123, 124, 142, 148, 150, 152, 153, 158, 159, 162, 164, 172, 175, 179, 183, 191, 192], 'AS': [0, 12, 13, 19, 24, 30, 36, 46, 77, 78, 79, 80, 82, 85, 86, 87, 90, 91, 92, 94, 102, 103, 112, 116, 119, 127, 128, 134, 137, 138, 141, 149, 154, 161, 167, 168, 169, 171, 176, 177, 181, 186, 189, 190], 'EU': [1, 3, 7, 9, 10, 15, 16, 21, 25, 42, 44, 45, 48, 57, 60, 61, 64, 65, 67, 75, 76, 81, 83, 93, 98, 99, 105, 111, 113, 120, 126, 135, 136, 139, 140, 147, 151, 155, 156, 160, 165, 166, 170, 180, 182], 'OC': [8, 40, 59, 89, 106, 110, 118, 121, 125, 129, 131, 146, 157, 173, 178, 187], 'SA': [6, 20, 23, 35, 37, 52, 72, 132, 133, 163, 185, 188]}

### Get a single group by get_group():

In [5]:
# get_group() returns the rows that belong to a single group - here the 'SA' continent.
continents_group.get_group('SA')

Unnamed: 0,country,wine_servings,total_litres_of_pure_alcohol,continent
6,Argentina,221,8.3,SA
20,Bolivia,8,3.8,SA
23,Brazil,16,7.2,SA
35,Chile,172,7.6,SA
37,Colombia,3,4.2,SA
52,Ecuador,3,4.2,SA
72,Guyana,1,7.1,SA
132,Paraguay,74,7.3,SA
133,Peru,21,6.1,SA
163,Suriname,7,5.6,SA


### Compute counts of each group, excluding missing values.

In [6]:
# count() reports, for each group, how many non-missing values every remaining column holds.
continents_group.count()

Unnamed: 0_level_0,country,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,53,53,53
AS,44,44,44
EU,45,45,45
OC,16,16,16
SA,12,12,12


In [7]:
# Without groupby the same counts need one boolean filter per continent, written out
# by hand for every value - repetitive and less efficient than a single groupby call.
# Only the last expression of the cell is displayed.
df.loc[df['continent'] == 'AF'].count()
# ... (one such line for each remaining continent)
df.loc[df['continent'] == 'SA'].count()


country                         12
wine_servings                   12
total_litres_of_pure_alcohol    12
continent                       12
dtype: int64

### Compute max of group values

In [8]:
# max() is evaluated column by column within each group, so the values shown in one output
# row need not come from the same country (e.g. AF lists Zimbabwe next to 233 wine servings,
# although Zimbabwe's own row has 4). For 'country' it is the alphabetically last name.
continents_group.max()

Unnamed: 0_level_0,country,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,Zimbabwe,233,9.1
AS,Yemen,123,11.5
EU,United Kingdom,370,14.4
OC,Vanuatu,212,10.4
SA,Venezuela,221,8.3


### Compute sum of group values

In [9]:
# Sum the numeric columns within each group. numeric_only=True is passed explicitly so the
# non-numeric 'country' column is left out on every pandas version, instead of relying on the
# deprecated implicit dropping of such columns (which also triggers a FutureWarning).
continents_group.sum(numeric_only=True)

  continents_group.sum()


Unnamed: 0_level_0,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
AF,862,159.4
AS,399,95.5
EU,6400,387.8
OC,570,54.1
SA,749,75.7


### Get nth value/subset from each group

Get the nth row from each group if n is an int, or a subset of rows if n is a list of ints.

In [10]:
# nth() positions are zero-based: 1 selects the second row and -1 the last row of each group
# (use 0 instead of 1 to get the first row).
continents_group.nth([1,-1])

Unnamed: 0_level_0,country,wine_servings,total_litres_of_pure_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,Angola,45,5.9
AF,Zimbabwe,4,4.7
AS,Bahrain,7,2.0
AS,Yemen,0,0.1
EU,Andorra,312,12.4
EU,United Kingdom,195,10.4
OC,Cook Islands,74,5.9
OC,Vanuatu,11,0.9
SA,Bolivia,8,3.8
SA,Venezuela,3,7.7
