### df.groupby()

In [None]:
# groupby() follows a split-apply-combine pattern: rows are split into groups
# according to some criteria, a function is applied to every group on its own,
# and the per-group results are combined into a single data structure.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

import pandas as pd

# Sample data: one tuple per garment, in the column order given below.
clothes = pd.DataFrame(
    [
        ('pants', 'red', 20, 125),
        ('shirt', 'blue', 35, 440),
        ('shirt', 'green', 50, 680),
        ('pants', 'blue', 40, 200),
        ('shirt', 'green', 100, 395),
        ('pants', 'red', 75, 485),
    ],
    columns=['type', 'color', 'price_usd', 'mass_g'],
)

clothes

In [None]:
# Count how many rows carry each distinct value of the 'type' column.
clothes.value_counts(subset='type')

In [None]:
# Splitting the frame on the 'type' column returns a DataFrameGroupBy object;
# nothing has been aggregated yet, so displaying it shows only the object itself.
grouped = clothes.groupby(by='type')
grouped

In [None]:
# Average each numeric column within every 'type' group. numeric_only=True
# leaves out the string-valued 'color' column, for which a mean is undefined.
grouped.mean(numeric_only=True)

In [None]:
# A list of column names groups on every distinct combination of their values;
# min() then reports the smallest value of each remaining column per group.
clothes.groupby(by=['type', 'color']).min()

In [None]:
# size() returns just the number of observations (rows) in each group.
clothes.groupby(by=['type', 'color']).size()

In [None]:
# Many built-in aggregation methods are available on grouped objects, including:
# count(): The number of non-null values in each group
# sum(): The sum of values in each group
# mean(): The mean of values in each group
# median(): The median of values in each group
# min(): The minimum value in each group
# max(): The maximum value in each group
# std(): The standard deviation of values in each group
# var(): The variance of values in each group

### df.agg()

In [None]:
# agg() lets you apply several functions to a dataframe in a single call.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.agg.html

# Sample data: one tuple per garment, with two price and two mass readings each.
clothes2 = pd.DataFrame(
    [
        ('pants', 'red', 20, 10, 125, 130),
        ('shirt', 'blue', 35, 30, 440, 450),
        ('shirt', 'green', 50, 45, 680, 700),
        ('pants', 'blue', 40, 35, 200, 300),
        ('shirt', 'green', 100, 80, 395, 400),
        ('pants', 'red', 75, 70, 485, 500),
    ],
    columns=['type', 'color', 'price1_usd', 'price2_usd', 'mass1_g', 'mass2_g'],
)

clothes2

In [None]:
# Select the 'price1_usd' and 'mass1_g' columns, then compute both the sum
# and the mean of each; the result has one row per function.
clothes2[['price1_usd', 'mass1_g']].agg(func=['sum', 'mean'])

In [None]:
# Each column can get its own set of functions: pass agg() a dictionary that
# maps a column name to the function (or list of functions) to apply to it.
clothes2.agg(
    func={
        'price1_usd': ['mean', 'median'],
        'mass1_g': 'sum',
    },
)

In [None]:
# Here sum() and mean() run across the columns (axis='columns', i.e. axis 1):
# rather than collapsing each column, they produce one result per row.

clothes2[['price1_usd', 'price2_usd']].agg(['sum', 'mean'], axis='columns')

In [None]:
# Same row-wise aggregation, this time over the two mass columns.
clothes2[['mass1_g', 'mass2_g']].agg(['sum', 'mean'], axis='columns')

### groupby() with agg()

In [None]:
# groupby() and agg() are often used together.
# Display the original clothes dataframe again as the starting point.
clothes

In [None]:
# Group the items in clothes by color, then compute the mean and the max of
# the price_usd and mass_g columns within each color group.

clothes.groupby(by='color').agg(
    func={
        'price_usd': ['mean', 'max'],
        'mass_g': ['mean', 'max'],
    },
)

### MultiIndex

In [None]:
# Grouping on two columns and applying two functions yields a dataframe with a
# MultiIndex on both axes: (color, type) rows and (column, function) columns.
# NOTE: this rebinds `grouped`, which earlier named the DataFrameGroupBy
# produced by grouping on 'type'; from here on it is an aggregated DataFrame.
grouped = clothes.groupby(by=['color', 'type']).agg(func=['mean', 'min'])
grouped

In [None]:
# Inspecting the row index shows that it is a MultiIndex object
# (one level per grouping column: color, then type).
grouped.index

In [None]:
# The column index is a MultiIndex object as well
# (original column name on top, aggregation function name below).
grouped.columns

In [None]:
# To select from a dataframe that has a MultiIndex, use loc[] and write
# any multi-level key as a tuple (i.e. put the labels in parentheses).

# To select a first-level (top) column, along with all of its sub-columns:
grouped.loc[:, 'price_usd']

In [None]:
# To select a second-level (bottom) column, give the full (top, bottom) key as a tuple:
grouped.loc[:, ('price_usd', 'min')]

In [None]:
# To select by a first-level (left-most) row label, here every row whose color is 'blue':
grouped.loc['blue', :]

In [None]:
# To select a bottom-level (right-most) row, give the full (color, type) key as a tuple:
grouped.loc[('green', 'shirt'), :]

In [None]:
# You can also select an individual value by giving a full row key and a full column key:
grouped.loc[('blue', 'shirt'), ('mass_g', 'mean')]

In [None]:
# as_index=False keeps the grouping keys ('color' and 'type') as ordinary
# columns of the result instead of turning them into a MultiIndex row index.
clothes.groupby(['color', 'type'], as_index=False).mean()

In [None]:
# For comparison: with the default as_index=True the grouping keys
# ('color', 'type') become the row MultiIndex of the result.
grouped2 = clothes.groupby(by=['color', 'type']).mean()
grouped2