# Groupby-Aggregation
<hr style="border:2px solid black">

## 1. Introduction

**load packages**

In [1]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

### 1.1 Example: Large Countries Data

**read data**

In [2]:
df = pd.read_csv('../data/large_countries_2015.csv')

**show dataframe**

In [3]:
df

Unnamed: 0.1,Unnamed: 0,population,fertility,continent
0,Bangladesh,160995600.0,2.12,Asia
1,Brazil,207847500.0,1.78,South America
2,China,1376049000.0,1.57,Asia
3,India,1311051000.0,2.43,Asia
4,Indonesia,257563800.0,2.28,Asia
5,Japan,126573500.0,1.45,Asia
6,Mexico,127017200.0,2.13,North America
7,Nigeria,182202000.0,5.89,Africa
8,Pakistan,188924900.0,3.04,Asia
9,Philippines,100699400.0,2.98,Asia


In [34]:
df = df.rename(columns={'Unnamed: 0': 'country'})
df

Unnamed: 0,country,population,fertility,continent
0,China,1376.0,1.57,Asia
1,India,1311.1,2.43,Asia
2,United States,321.8,1.97,North America
3,Indonesia,257.6,2.28,Asia
4,Brazil,207.8,1.78,South America
5,Pakistan,188.9,3.04,Asia
6,Nigeria,182.2,5.89,Africa
7,Bangladesh,161.0,2.12,Asia
8,Russia,143.5,1.61,Europe
9,Mexico,127.0,2.13,North America


**population in million**

In [4]:
# convert raw head-counts to millions, rounded to one decimal place
# NOTE: this overwrites the column in place, so re-running the cell divides again
df['population'] = df['population'].div(1_000_000).round(1)
df.head()

Unnamed: 0.1,Unnamed: 0,population,fertility,continent
0,Bangladesh,161.0,2.12,Asia
1,Brazil,207.8,1.78,South America
2,China,1376.0,1.57,Asia
3,India,1311.1,2.43,Asia
4,Indonesia,257.6,2.28,Asia


**sort by population**

In [5]:
# ascending=False puts the highest population first (the default is lowest to highest);
# ignore_index=True renumbers the rows 0..n-1 after sorting;
# inplace=True modifies df itself instead of returning a sorted copy
df.sort_values(
    by="population",
    ascending=False,
    ignore_index=True,
    inplace=True
)
df.head()

Unnamed: 0.1,Unnamed: 0,population,fertility,continent
0,China,1376.0,1.57,Asia
1,India,1311.1,2.43,Asia
2,United States,321.8,1.97,North America
3,Indonesia,257.6,2.28,Asia
4,Brazil,207.8,1.78,South America


***Q: What is the average population size of the large countries?***

In [9]:
country_avg = df['population'].mean()

print(f"""
countrywise average population: {round(country_avg, 1)} million
""")


countrywise average population: 375.3 million



***Q: What is the average population in large countries in each continent?***

In [17]:
continent_avg = df.groupby('continent')['population'].mean()
type(continent_avg)

pandas.core.series.Series

### There are 3 ways to convert this back to a DataFrame...

In [19]:
pd.DataFrame(continent_avg).reset_index()

Unnamed: 0,continent,population
0,Africa,182.2
1,Asia,503.128571
2,Europe,143.5
3,North America,224.4
4,South America,207.8


In [16]:
continent_avg = continent_avg.reset_index()
continent_avg

Unnamed: 0,continent,population
0,Africa,182.2
1,Asia,503.128571
2,Europe,143.5
3,North America,224.4
4,South America,207.8


In [20]:
continent_avg = df.groupby('continent')[['population']].mean().reset_index()
continent_avg

Unnamed: 0,continent,population
0,Africa,182.2
1,Asia,503.128571
2,Europe,143.5
3,North America,224.4
4,South America,207.8


In [138]:
# largest single-country population per continent, kept as a one-column DataFrame
# (selecting with a list of columns makes the aggregation return a DataFrame directly)
cntrx_max = df.groupby('continent')[['population']].max()
cntrx_max

Unnamed: 0_level_0,population
continent,Unnamed: 1_level_1
Africa,182.2
Asia,1376.0
Europe,143.5
North America,321.8
South America,207.8


**This is an example of `Data Wrangling`**

### 1.2 Data Wrangling

- the process of transforming data from one format to another
- makes data more appropriate and valuable for inquiries in data analytics
- dataframes can be wrangled through:

|example|what it does|
|:--:|:--:|
|`melting`|converts a dataframe from wide to long format|
|`pivoting`|converts a dataframe from long to wide format|
|`stacking`|moves columns to hierarchical row index|
|`unstacking`|creates columns from hierarchical row index|
|`concatenating`|sticks two dataframes together|
|`merging`|joins two dataframes on specific indexes/columns|
|`groupby-aggregation`|performs a split-apply-combine operation based on some criteria|

<hr style="border:2px solid black">

## 2. Aggregation Functions

- an aggregation function returns a single summary value computed from the values of multiple rows

### 2.1 Pandas Default Functions

**`.sum()`**

In [25]:
df.sum()

Unnamed: 0    ChinaIndiaUnited StatesIndonesiaBrazilPakistan...
population                                               4504.2
fertility                                                 29.25
continent     AsiaAsiaNorth AmericaAsiaSouth AmericaAsiaAfri...
dtype: object

In [26]:
df['population'].sum()

4504.2

**`.count()`**

In [22]:
df.count()

Unnamed: 0    12
population    12
fertility     12
continent     12
dtype: int64

**`.mean()`**

In [29]:
df

Unnamed: 0.1,Unnamed: 0,population,fertility,continent
0,China,1376.0,1.57,Asia
1,India,1311.1,2.43,Asia
2,United States,321.8,1.97,North America
3,Indonesia,257.6,2.28,Asia
4,Brazil,207.8,1.78,South America
5,Pakistan,188.9,3.04,Asia
6,Nigeria,182.2,5.89,Africa
7,Bangladesh,161.0,2.12,Asia
8,Russia,143.5,1.61,Europe
9,Mexico,127.0,2.13,North America


In [27]:
# NOTE: without parentheses, `df.mean` is not called - the cell only displays the
# bound method (its repr includes the whole DataFrame); no mean is computed here
df.mean

<bound method NDFrame._add_numeric_operations.<locals>.mean of        Unnamed: 0  population  fertility      continent
0           China      1376.0       1.57           Asia
1           India      1311.1       2.43           Asia
2   United States       321.8       1.97  North America
3       Indonesia       257.6       2.28           Asia
4          Brazil       207.8       1.78  South America
5        Pakistan       188.9       3.04           Asia
6         Nigeria       182.2       5.89         Africa
7      Bangladesh       161.0       2.12           Asia
8          Russia       143.5       1.61         Europe
9          Mexico       127.0       2.13  North America
10          Japan       126.6       1.45           Asia
11    Philippines       100.7       2.98           Asia>

In [28]:
# need to specify the numeric values only to instruct specific columns

df.mean(numeric_only=True)

population    375.3500
fertility       2.4375
dtype: float64

**`.min()`**

In [24]:
df.min(numeric_only=True)

population    100.70
fertility       1.45
dtype: float64

In [139]:
# find the country with the largest population
# (idxmax gives the row label of the maximum; .loc then reads that row's country)
df.loc[df["population"].idxmax(), "country"]

'China'

In [113]:
df[df["population"]==df["population"].min()]

Unnamed: 0,country,population,fertility,continent,country_code
11,Philippines,100.7,2.98,Asia,PHI


**`.max()`**

In [36]:
df.max(numeric_only=False)

country       United States
population           1376.0
fertility              5.89
continent     South America
dtype: object

**`.describe()`**

In [37]:
# apply a predefined set of aggregate functions
df.describe()

Unnamed: 0,population,fertility
count,12.0,12.0
mean,375.35,2.4375
std,456.517642,1.200781
min,100.7,1.45
25%,139.375,1.7375
50%,185.55,2.125
75%,273.65,2.5675
max,1376.0,5.89


### 2.2 Custom Functions

**user-defined function**

In [38]:
# The quartile coefficient of dispersion (QCD) is a unit-free, relative measure of
# spread: (Q3 - Q1) / (Q3 + Q1). Because it is dimensionless, it can be used to
# compare the dispersion of variables measured on different scales.

def qcd(series):
    """
    Return the quartile coefficient of dispersion, (Q3 - Q1) / (Q3 + Q1).

    Parameters
    ----------
    series : pd.Series or pd.DataFrame
        Numeric data. For a DataFrame, non-numeric columns (e.g. text columns)
        are dropped first, because quantiles are undefined for them.

    Returns
    -------
    float or pd.Series
        A single value for a Series; one value per numeric column for a DataFrame.
    """
    if isinstance(series, pd.DataFrame):
        # quantile() cannot handle text columns, so keep only the numeric ones
        series = series.select_dtypes(include='number')
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    return (q3 - q1) / (q3 + q1)

qcd(df)

  q3, q1 = series.quantile(0.75), series.quantile(0.25)


population    0.325101
fertility     0.192799
dtype: float64

**implement with `.agg()`**

In [40]:
# apply a customized set of aggregate functions
df[['population','fertility']].agg(func=[qcd])

Unnamed: 0,population,fertility
qcd,0.325101,0.192799


In [None]:
# apply a customized set of aggregate functions
df[['population','fertility']].agg(
    func=['count','sum','mean', qcd]
)

In [41]:
# can also apply this to find the mean of the given columns... 
#...not that there's much point as we already have the .mean() function

df[['population','fertility']].agg(func='mean')

population    375.3500
fertility       2.4375
dtype: float64

In [43]:
df[['population','fertility']].mean()

population    375.3500
fertility       2.4375
dtype: float64

<hr style="border:2px solid black">

## 3. Groupby-Aggregation Mechanism

- split dataframe into groups based on some discrete-valued column(s)... **not continuous**
- apply an aggregate function to each group independently
- combine the results

<img src="groupby_mechanism.png" width=500>

### 3.1 Splitting

In [44]:
# applying groupby to the continent column of DF...
# to create seperate groups for each continent.

g1 = df.groupby('continent')

In [46]:
g1

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x179d86b90>

In [45]:
type(g1)

pandas.core.groupby.generic.DataFrameGroupBy

In [47]:
# iterating over a GroupBy object yields (group key, sub-DataFrame) pairs
for continent, sub_df in g1:
    print(continent)
    print(sub_df)
    print('\n')

Africa
   country  population  fertility continent
6  Nigeria       182.2       5.89    Africa


Asia
        country  population  fertility continent
0         China      1376.0       1.57      Asia
1         India      1311.1       2.43      Asia
3     Indonesia       257.6       2.28      Asia
5      Pakistan       188.9       3.04      Asia
7    Bangladesh       161.0       2.12      Asia
10        Japan       126.6       1.45      Asia
11  Philippines       100.7       2.98      Asia


Europe
  country  population  fertility continent
8  Russia       143.5       1.61    Europe


North America
         country  population  fertility      continent
2  United States       321.8       1.97  North America
9         Mexico       127.0       2.13  North America


South America
  country  population  fertility      continent
4  Brazil       207.8       1.78  South America




In [48]:
# groups
g1.groups

# shows each continent groups and the corresponding indexes

{'Africa': [6], 'Asia': [0, 1, 3, 5, 7, 10, 11], 'Europe': [8], 'North America': [2, 9], 'South America': [4]}

In [49]:
# get the DataFrame of a specific group
g1.get_group('North America')

Unnamed: 0,country,population,fertility,continent
2,United States,321.8,1.97,North America
9,Mexico,127.0,2.13,North America


### 3.2 Split-Apply-Combine

**Example 1**

In [50]:
# .first() is a built-in aggregate function: for each group it returns the first
# non-null value of every column (here, effectively the first row of each group)
df.groupby('continent').first()

Unnamed: 0_level_0,country,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,Nigeria,182.2,5.89
Asia,China,1376.0,1.57
Europe,Russia,143.5,1.61
North America,United States,321.8,1.97
South America,Brazil,207.8,1.78


**Example 2**

In [51]:
# list() function converts a string to a list

# split by an array of equal length
my_groups = list("AAABBBBACCCA")
my_groups

['A', 'A', 'A', 'B', 'B', 'B', 'B', 'A', 'C', 'C', 'C', 'A']

In [52]:
# .shape[0] defines the rows
# below, we're checking that the number of rows = number of elements in the list

df.shape[0]==len(my_groups)

True

In [53]:
g2 = df.groupby(my_groups)

In [54]:
g2.get_group('A')

Unnamed: 0,country,population,fertility,continent
0,China,1376.0,1.57,Asia
1,India,1311.1,2.43,Asia
2,United States,321.8,1.97,North America
7,Bangladesh,161.0,2.12,Asia
11,Philippines,100.7,2.98,Asia


In [55]:
g2['population'].sum()

A    3270.6
B     836.5
C     397.1
Name: population, dtype: float64

**Example 3**

In [56]:
df2 = df.set_index('country')
df2.head()

Unnamed: 0_level_0,population,fertility,continent
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,1376.0,1.57,Asia
India,1311.1,2.43,Asia
United States,321.8,1.97,North America
Indonesia,257.6,2.28,Asia
Brazil,207.8,1.78,South America


In [57]:
# split by a Dictionary with keys on the Index
language = {
    'Bangladesh':'BN',
    'Brazil':'PT',
    'China':'CN',
    'India':'HD',
    'Indonesia':'ID',
    'Japan':'JP',
    'Mexico':'ES',
    'Nigeria':'NG',
    'Pakistan':'AR',
    'Philippines':'PP',
    'Russia':'RU',
    'United States':'EN'
}

In [60]:
g3 = df2.groupby(language)
g3[['population','fertility']].mean()

Unnamed: 0_level_0,population,fertility
country,Unnamed: 1_level_1,Unnamed: 2_level_1
AR,188.9,3.04
BN,161.0,2.12
CN,1376.0,1.57
EN,321.8,1.97
ES,127.0,2.13
HD,1311.1,2.43
ID,257.6,2.28
JP,126.6,1.45
NG,182.2,5.89
PP,100.7,2.98


**Example 4**

In [73]:
# split by a function - here: number of characters in the index (country name, for this example)
# numeric_only=True restricts the mean to the numeric columns; df2 still contains the
# text column 'continent', which cannot be averaged
g4 = df2.groupby(len)
g4.mean(numeric_only=True)

  g4.mean()


Unnamed: 0_level_0,population,fertility
country,Unnamed: 1_level_1,Unnamed: 2_level_1
5,937.9,1.816667
6,159.433333,1.84
7,182.2,5.89
8,188.9,3.04
9,257.6,2.28
10,161.0,2.12
11,100.7,2.98
13,321.8,1.97


In [75]:
# split by a list of the above
g5 = df2.groupby(['continent', len, language])
g5.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,population,fertility
continent,country,country,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,7,NG,182.2,5.89
Asia,5,CN,1376.0,1.57
Asia,5,HD,1311.1,2.43
Asia,5,JP,126.6,1.45
Asia,8,AR,188.9,3.04
Asia,9,ID,257.6,2.28
Asia,10,BN,161.0,2.12
Asia,11,PP,100.7,2.98
Europe,6,RU,143.5,1.61
North America,6,ES,127.0,2.13


In [77]:
# split by first letter and count the relevant number of entries

df.groupby(df['country'].str[0])['country'].count()

country
B    2
C    1
I    2
J    1
M    1
N    1
P    2
R    1
U    1
Name: country, dtype: int64

<hr style="border:2px solid black">

## 4. Apply & Transform

### 4.1 `.apply()`


`.apply()` should be used as an alternative to Python's `for` loop... DO NOT USE a `for` loop to process a DataFrame row by row

In [79]:
df

Unnamed: 0,country,population,fertility,continent
0,China,1376.0,1.57,Asia
1,India,1311.1,2.43,Asia
2,United States,321.8,1.97,North America
3,Indonesia,257.6,2.28,Asia
4,Brazil,207.8,1.78,South America
5,Pakistan,188.9,3.04,Asia
6,Nigeria,182.2,5.89,Africa
7,Bangladesh,161.0,2.12,Asia
8,Russia,143.5,1.61,Europe
9,Mexico,127.0,2.13,North America


In [78]:
# length of country names
df['country'].apply(len)

# output the character length for each row in the 'country' column

0      5
1      5
2     13
3      9
4      6
5      8
6      7
7     10
8      6
9      6
10     5
11    11
Name: country, dtype: int64

In [80]:
# using user-defined function on a column

def triplet(x):
    """Return the first three characters of `x`, upper-cased."""
    prefix = x[:3]
    return prefix.upper()

df['country'].apply(triplet)

0     CHI
1     IND
2     UNI
3     IND
4     BRA
5     PAK
6     NIG
7     BAN
8     RUS
9     MEX
10    JAP
11    PHI
Name: country, dtype: object

In [82]:
# store the three-letter codes in a new column
# (reuses triplet() from the previous cell instead of redefining it)
# note: the codes are not unique - India and Indonesia both map to 'IND'
df['country_code'] = df['country'].apply(triplet)

df.head()

Unnamed: 0,country,population,fertility,continent,country_code
0,China,1376.0,1.57,Asia,CHI
1,India,1311.1,2.43,Asia,IND
2,United States,321.8,1.97,North America,UNI
3,Indonesia,257.6,2.28,Asia,IND
4,Brazil,207.8,1.78,South America,BRA


## lambda!

In [83]:
# lambda creates a function without defining (saving) it... 
# can apply the above def function in a similar way:

df['country'].apply(
    lambda x: x[:3].upper()
)


0     CHI
1     IND
2     UNI
3     IND
4     BRA
5     PAK
6     NIG
7     BAN
8     RUS
9     MEX
10    JAP
11    PHI
Name: country, dtype: object

In [84]:
# using your own function on groups
def diff_from_mean(gdf):
    """Return each row's 'population' minus the mean 'population' of `gdf`."""
    population = gdf['population']
    return population - population.mean()

df.groupby('continent').apply(diff_from_mean)

continent        
Africa         6       0.000000
Asia           0     872.871429
               1     807.971429
               3    -245.528571
               5    -314.228571
               7    -342.128571
               10   -376.528571
               11   -402.428571
Europe         8       0.000000
North America  2      97.400000
               9     -97.400000
South America  4       0.000000
Name: population, dtype: float64

### 4.2 `.transform()`

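- `.transform()` takes each pd.Series (column) of a pd.DataFrame as input — per group when used after `.groupby()` —
applies the specified function to it, and returns a result of the same size as the input
(a single aggregated value, such as a group mean, is repeated for every row of that group).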

In [94]:
df

Unnamed: 0,country,population,fertility,continent,country_code
0,China,1376.0,1.57,Asia,CHI
1,India,1311.1,2.43,Asia,IND
2,United States,321.8,1.97,North America,UNI
3,Indonesia,257.6,2.28,Asia,IND
4,Brazil,207.8,1.78,South America,BRA
5,Pakistan,188.9,3.04,Asia,PAK
6,Nigeria,182.2,5.89,Africa,NIG
7,Bangladesh,161.0,2.12,Asia,BAN
8,Russia,143.5,1.61,Europe,RUS
9,Mexico,127.0,2.13,North America,MEX


In [124]:
# for the example below, this applies the continent's population and fertility mean against each country 

df.set_index('country').groupby('continent')[['population','fertility']].transform('mean')

Unnamed: 0_level_0,population,fertility
country,Unnamed: 1_level_1,Unnamed: 2_level_1
China,503.128571,2.267143
India,503.128571,2.267143
United States,224.4,2.05
Indonesia,503.128571,2.267143
Brazil,207.8,1.78
Pakistan,503.128571,2.267143
Nigeria,182.2,5.89
Bangladesh,503.128571,2.267143
Russia,143.5,1.61
Mexico,224.4,2.05


In [92]:
# this can be checked using the below

df.groupby('continent').describe()

Unnamed: 0_level_0,population,population,population,population,population,population,population,population,fertility,fertility,fertility,fertility,fertility,fertility,fertility,fertility
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Africa,1.0,182.2,,182.2,182.2,182.2,182.2,182.2,1.0,5.89,,5.89,5.89,5.89,5.89,5.89
Asia,7.0,503.128571,576.55886,100.7,143.8,188.9,784.35,1376.0,7.0,2.267143,0.620154,1.45,1.845,2.28,2.705,3.04
Europe,1.0,143.5,,143.5,143.5,143.5,143.5,143.5,1.0,1.61,,1.61,1.61,1.61,1.61,1.61
North America,2.0,224.4,137.744401,127.0,175.7,224.4,273.1,321.8,2.0,2.05,0.113137,1.97,2.01,2.05,2.09,2.13
South America,1.0,207.8,,207.8,207.8,207.8,207.8,207.8,1.0,1.78,,1.78,1.78,1.78,1.78,1.78


In [133]:
# Transformation by function reference
# The below applies the length of the grouped series (count of entries) against each row...
# aka, number of countries within the same continent, including itself.

df.groupby('continent')['country'].transform(len)

0     7
1     7
2     2
3     7
4     1
5     7
6     1
7     7
8     1
9     2
10    7
11    7
Name: country, dtype: int64

In [106]:
# can check this again below... number of countries in each continent
# (selecting [['country']] already yields a DataFrame, so no pd.DataFrame() wrapper is needed)
country_count = df.groupby('continent')[['country']].count()

country_count

Unnamed: 0_level_0,country
continent,Unnamed: 1_level_1
Africa,1
Asia,7
Europe,1
North America,2
South America,1


In [131]:
# Transformation with your own function
def normalize(array):
    """Standardize `array`: subtract its mean, then divide by its standard deviation."""
    centered = array - array.mean()
    return centered / array.std()

df[['population', 'fertility']].transform(normalize)

Unnamed: 0,population,fertility
0,2.19192,-0.722446
1,2.049756,-0.006246
2,-0.117301,-0.38933
3,-0.257931,-0.131165
4,-0.367018,-0.54756
5,-0.408418,0.501757
6,-0.423094,2.875212
7,-0.469533,-0.264411
8,-0.507866,-0.689135
9,-0.54401,-0.256083
