# Pandas Data Grammar Illustration

This notebook illustrates the grammar of data manipulation in pandas using the bank dataset.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read the data file into a DataFrame.
# `os` is imported in the notebook's import cell so that all dependencies
# are declared in one place.

PROJECT_NAME = "bank"

# Build the path ~/<course_id>/<PROJECT_NAME>/data/bank/<csv file> step by step.
home_dir = os.path.expanduser("~")
course_id = 'cpsc6300'
project_dir = os.path.join(home_dir, course_id, PROJECT_NAME)
project_data_dir = os.path.join(project_dir, "data", "bank")
data_file_path = os.path.join(project_data_dir, "bank-additional-bank-additional-full.csv")

# The file is semicolon-delimited, hence sep=";".
df = pd.read_csv(data_file_path, sep=";")

## Select a data slice

In [3]:
### head() and tail()

In [4]:
# Preview the first five rows of the DataFrame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
# Preview the last five rows of the DataFrame.
df.tail(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41187,74,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,3,999,1,failure,-1.1,94.767,-50.8,1.028,4963.6,no


### bracket selection

In [6]:
# Bracket selection with a list of labels keeps only the listed columns.
selected_columns = ["age", "job"]
df[selected_columns]

Unnamed: 0,age,job
0,56,housemaid
1,57,services
2,37,services
3,40,admin.
4,56,services
...,...,...
41183,73,retired
41184,46,blue-collar
41185,56,retired
41186,44,technician


### iloc

In [7]:
# Print the built-in documentation for `.iloc`, the integer-position based indexer.
help(df.iloc)

Help on _iLocIndexer in module pandas.core.indexing object:

class _iLocIndexer(_LocationIndexer)
 |  Purely integer-location based indexing for selection by position.
 |  
 |  ``.iloc[]`` is primarily integer position based (from ``0`` to
 |  ``length-1`` of the axis), but may also be used with a boolean
 |  array.
 |  
 |  Allowed inputs are:
 |  
 |  - An integer, e.g. ``5``.
 |  - A list or array of integers, e.g. ``[4, 3, 0]``.
 |  - A slice object with ints, e.g. ``1:7``.
 |  - A boolean array.
 |  - A ``callable`` function with one argument (the calling Series or
 |    DataFrame) and that returns valid output for indexing (one of the above).
 |    This is useful in method chains, when you don't have a reference to the
 |    calling object, but would like to base your selection on some value.
 |  
 |  ``.iloc`` will raise ``IndexError`` if a requested indexer is
 |  out-of-bounds, except *slice* indexers which allow out-of-bounds
 |  indexing (this conforms with python/numpy *sli

In [8]:
# Pick rows and columns purely by integer position:
# rows at positions 10, 12 and 15, columns at positions 0 and 1.
row_positions = [10, 12, 15]
column_positions = [0, 1]
df.iloc[row_positions, column_positions]

Unnamed: 0,age,job
10,41,blue-collar
12,29,blue-collar
15,54,retired


### loc

In [9]:
# Print the built-in documentation for `.loc`, the label-based indexer.
help(df.loc)

Help on _LocIndexer in module pandas.core.indexing object:

class _LocIndexer(_LocationIndexer)
 |  Access a group of rows and columns by label(s) or a boolean array.
 |  
 |  ``.loc[]`` is primarily label based, but may also be used with a
 |  boolean array.
 |  
 |  Allowed inputs are:
 |  
 |  - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
 |    interpreted as a *label* of the index, and **never** as an
 |    integer position along the index).
 |  - A list or array of labels, e.g. ``['a', 'b', 'c']``.
 |  - A slice object with labels, e.g. ``'a':'f'``.
 |  
 |        start and the stop are included
 |  
 |  - A boolean array of the same length as the axis being sliced,
 |    e.g. ``[True, False, True]``.
 |  - A ``callable`` function with one argument (the calling Series or
 |    DataFrame) and that returns valid output for indexing (one of the above)
 |  
 |  See more at :ref:`Selection by Label <indexing.label>`
 |  
 |  Raises
 |  ------
 |  KeyError
 |      If any 

In [10]:
# Select the rows whose index labels are 0 and 13, keeping every column.
df.loc[[0, 13], :]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
13,57,housemaid,divorced,basic.4y,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [11]:
# Select by label on both axes: index labels 0 and 13, columns "age" and "job".
row_labels = [0, 13]
column_labels = ["age", "job"]
df.loc[row_labels, column_labels]

Unnamed: 0,age,job
0,56,housemaid
13,57,housemaid


In [12]:
# Build a boolean mask first, then pass it to .loc to keep only the matching rows.
is_basic_4y = df["education"] == "basic.4y"
df.loc[is_basic_4y]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
13,57,housemaid,divorced,basic.4y,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
21,55,blue-collar,married,basic.4y,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
33,54,management,married,basic.4y,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
34,54,blue-collar,divorced,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41158,35,technician,divorced,basic.4y,no,no,no,cellular,nov,tue,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.035,4963.6,yes
41159,35,technician,divorced,basic.4y,no,yes,no,cellular,nov,tue,...,1,9,4,success,-1.1,94.767,-50.8,1.035,4963.6,yes
41162,60,blue-collar,married,basic.4y,no,yes,no,cellular,nov,tue,...,2,4,1,success,-1.1,94.767,-50.8,1.035,4963.6,no
41163,35,technician,divorced,basic.4y,no,yes,no,cellular,nov,tue,...,3,4,2,success,-1.1,94.767,-50.8,1.035,4963.6,yes


## Sort

In [13]:
# Print the built-in documentation for DataFrame.sort_values (sort by column values).
help(df.sort_values)

Help on method sort_values in module pandas.core.frame:

sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False) method of pandas.core.frame.DataFrame instance
    Sort by the values along either axis.
    
    Parameters
    ----------
            by : str or list of str
                Name or list of names to sort by.
    
                - if `axis` is 0 or `'index'` then `by` may contain index
                  levels and/or column labels.
                - if `axis` is 1 or `'columns'` then `by` may contain column
                  levels and/or index labels.
    
                .. versionchanged:: 0.23.0
    
                   Allow specifying index or column level names.
    axis : {0 or 'index', 1 or 'columns'}, default 0
         Axis to be sorted.
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a list of bools, must

In [14]:
# Sort by education first; rows with the same education are then ordered by age.
sort_keys = ["education", "age"]
df.sort_values(by=sort_keys)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
30142,18,student,single,basic.4y,no,no,no,cellular,apr,thu,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.365,5099.1,no
30349,18,student,single,basic.4y,no,yes,yes,cellular,apr,thu,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.365,5099.1,no
39903,18,student,single,basic.4y,no,yes,no,cellular,jun,tue,...,1,999,0,nonexistent,-1.7,94.055,-39.8,0.737,4991.6,no
41088,18,student,single,basic.4y,no,yes,no,telephone,nov,tue,...,1,13,2,success,-1.1,94.767,-50.8,1.049,4963.6,yes
30048,19,student,single,basic.4y,no,no,yes,cellular,apr,wed,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40727,86,retired,married,unknown,unknown,yes,no,cellular,sep,tue,...,1,999,0,nonexistent,-1.1,94.199,-37.5,0.877,4963.6,yes
39655,92,retired,married,unknown,no,yes,no,cellular,may,thu,...,1,6,2,success,-1.8,93.876,-40.0,0.683,5008.7,no
39734,92,retired,divorced,unknown,unknown,no,no,cellular,may,wed,...,3,999,1,failure,-1.8,93.876,-40.0,0.697,5008.7,yes
40450,92,retired,married,unknown,no,no,yes,cellular,aug,tue,...,1,3,1,success,-1.7,94.027,-38.3,0.904,4991.6,yes


In [15]:
# Print the built-in documentation for DataFrame.sort_index (sort by index labels).
help(df.sort_index)

Help on method sort_index in module pandas.core.frame:

sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, ignore_index: bool = False) method of pandas.core.frame.DataFrame instance
    Sort object by labels (along an axis).
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        The axis along which to sort.  The value 0 identifies the rows,
        and 1 identifies the columns.
    level : int or level name or list of ints or list of level names
        If not None, sort on values in specified index level(s).
    ascending : bool, default True
        Sort ascending vs. descending.
    inplace : bool, default False
        If True, perform operation in-place.
    kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
        Choice of sorting algorithm. See also ndarray.np.sort for more
        information.  `mergesort` is the only stable algorithm. For
        Da

In [16]:
# Order the rows by their index labels, largest label first.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
41187,74,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,3,999,1,failure,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [17]:
# Same row filter as the earlier .loc example: keep rows whose education is "basic.4y".
df.loc[df["education"].eq("basic.4y")]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
13,57,housemaid,divorced,basic.4y,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
21,55,blue-collar,married,basic.4y,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
33,54,management,married,basic.4y,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
34,54,blue-collar,divorced,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41158,35,technician,divorced,basic.4y,no,no,no,cellular,nov,tue,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.035,4963.6,yes
41159,35,technician,divorced,basic.4y,no,yes,no,cellular,nov,tue,...,1,9,4,success,-1.1,94.767,-50.8,1.035,4963.6,yes
41162,60,blue-collar,married,basic.4y,no,yes,no,cellular,nov,tue,...,2,4,1,success,-1.1,94.767,-50.8,1.035,4963.6,no
41163,35,technician,divorced,basic.4y,no,yes,no,cellular,nov,tue,...,3,4,2,success,-1.1,94.767,-50.8,1.035,4963.6,yes


### Unique values

In [18]:
# Print the built-in documentation for Series.unique.
help(pd.Series.unique)

Help on function unique in module pandas.core.series:

unique(self)
    Return unique values of Series object.
    
    Uniques are returned in order of appearance. Hash table-based unique,
    therefore does NOT sort.
    
    Returns
    -------
    ndarray or ExtensionArray
        The unique values returned as a NumPy array. See Notes.
    
    See Also
    --------
    unique : Top-level unique method for any 1-d array-like object.
    Index.unique : Return Index with unique values from an Index object.
    
    Notes
    -----
    Returns the unique values as a NumPy array. In case of an
    extension-array backed Series, a new
    :class:`~api.extensions.ExtensionArray` of that type with just
    the unique values is returned. This includes
    
        * Categorical
        * Period
        * Datetime with Timezone
        * Interval
        * Sparse
        * IntegerNA
    
    See Examples section.
    
    Examples
    --------
    >>> pd.Series([2, 1, 3, 3], name='A').uniqu

In [19]:
# List the distinct job categories in order of first appearance.
job_column = df["job"]
job_column.unique()

array(['housemaid', 'services', 'admin.', 'blue-collar', 'technician',
       'retired', 'management', 'unemployed', 'self-employed', 'unknown',
       'entrepreneur', 'student'], dtype=object)

### drop_duplicates

In [20]:
# Print the built-in documentation for DataFrame.drop_duplicates.
help(pd.DataFrame.drop_duplicates)

Help on function drop_duplicates in module pandas.core.frame:

drop_duplicates(self, subset: Union[Hashable, Sequence[Hashable], NoneType] = None, keep: Union[str, bool] = 'first', inplace: bool = False, ignore_index: bool = False) -> Union[ForwardRef('DataFrame'), NoneType]
    Return DataFrame with duplicate rows removed.
    
    Considering certain columns is optional. Indexes, including time indexes
    are ignored.
    
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns.
    keep : {'first', 'last', False}, default 'first'
        Determines which duplicates (if any) to keep.
        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
    inplace : bool, default False
        Whether to drop duplicates in place or to ret

In [21]:
# Distinct (education, job) combinations; the first occurrence of each pair is kept.
education_job_pairs = df[["education", "job"]]
education_job_pairs.drop_duplicates()

Unnamed: 0,education,job
0,basic.4y,housemaid
1,high.school,services
3,basic.6y,admin.
5,basic.9y,services
6,professional.course,admin.
...,...,...
13691,professional.course,unknown
16269,illiterate,housemaid
26680,illiterate,self-employed
28626,illiterate,entrepreneur


## Transform

In [22]:
# Lookup table from each education category to a numeric level.
# NOTE(review): the values look like approximate years of schooling — confirm.
# 'unknown' maps to NaN so it is treated as a missing value downstream.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0; it also
# matches the spelling used elsewhere in this notebook.
education_map = {
    'illiterate': 0,
    'basic.4y': 4,
    'basic.6y': 6,
    'basic.9y': 9,
    'high.school': 12,
    'professional.course': 12,
    'university.degree': 16,
    'unknown': np.nan,
}
education_map

{'illiterate': 0,
 'basic.4y': 4,
 'basic.6y': 6,
 'basic.9y': 9,
 'high.school': 12,
 'professional.course': 12,
 'university.degree': 16,
 'unknown': nan}

In [23]:
# Translate each education label through education_map and store the result
# as a new column next to the original one.
education_levels = df["education"].map(education_map)
df["education_numeric"] = education_levels

In [24]:
# Print the built-in documentation for Series.map, used above to build education_numeric.
help(pd.Series.map)

Help on function map in module pandas.core.series:

map(self, arg, na_action=None)
    Map values of Series according to input correspondence.
    
    Used for substituting each value in a Series with another value,
    that may be derived from a function, a ``dict`` or
    a :class:`Series`.
    
    Parameters
    ----------
    arg : function, collections.abc.Mapping subclass or Series
        Mapping correspondence.
    na_action : {None, 'ignore'}, default None
        If 'ignore', propagate NaN values, without passing them to the
        mapping correspondence.
    
    Returns
    -------
    Series
        Same index as caller.
    
    See Also
    --------
    Series.apply : For applying more complex functions on a Series.
    DataFrame.apply : Apply a function row-/column-wise.
    DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
    
    Notes
    -----
    When ``arg`` is a dictionary, values in Series that are not in the
    dictionary (as keys) ar

In [25]:
# Check the first five rows to confirm the new education_numeric column is present.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,education_numeric
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,4.0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,12.0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,12.0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,6.0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,12.0


### Assignment

In [26]:
# Replace the placeholder string 'nonexistent' in poutcome with a real missing value.
# Using .loc with a row mask and a column label assigns into df itself.
is_nonexistent = df["poutcome"] == "nonexistent"
df.loc[is_nonexistent, "poutcome"] = np.nan

### Sample

In [27]:
# Draw five random rows; the fixed random_state makes the draw repeatable.
sample_size = 5
df.sample(n=sample_size, random_state=1)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,education_numeric
35577,32,blue-collar,married,basic.9y,unknown,yes,yes,cellular,may,mon,...,999,0,,-1.8,92.893,-46.2,1.244,5099.1,no,9.0
13950,33,blue-collar,single,basic.6y,unknown,yes,yes,cellular,jul,fri,...,999,0,,1.4,93.918,-42.7,4.963,5228.1,no,6.0
29451,25,self-employed,divorced,university.degree,no,yes,no,cellular,apr,mon,...,999,0,,-1.8,93.075,-47.1,1.405,5099.1,yes,16.0
32295,34,blue-collar,single,high.school,no,yes,no,cellular,may,fri,...,999,0,,-1.8,92.893,-46.2,1.313,5099.1,no,12.0
27477,53,technician,married,professional.course,no,yes,no,cellular,nov,fri,...,999,0,,-0.1,93.2,-42.0,4.021,5195.8,no,12.0


## Aggregated statistics

In [28]:
# Largest and smallest age in the dataset, shown as a (max, min) tuple.
age_series = df["age"]
age_series.max(), age_series.min()

(98, 17)

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for two numeric columns.
summary_columns = ["age", "nr.employed"]
df[summary_columns].describe()

Unnamed: 0,age,nr.employed
count,41188.0,41188.0
mean,40.02406,5167.035911
std,10.42125,72.251528
min,17.0,4963.6
25%,32.0,5099.1
50%,38.0,5191.0
75%,47.0,5228.1
max,98.0,5228.1


## Delete Columns 

In [30]:
# Remove the "contact" column.
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: a second run is a no-op rather than a KeyError.
df = df.drop(columns=["contact"], errors="ignore")

## Group By

In [31]:
# Sum every numeric column within each education level.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns by default; without it the string columns would be
# included in the sum, which is not what the table below shows.
df.groupby("education").sum(numeric_only=True)

Unnamed: 0_level_0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,education_numeric
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
basic.4y,198763,1105921,10860,4032784,614,901.7,391117.097,-168041.7,15667.385,21590440.0,16704.0
basic.6y,92709,606038,5859,2247004,296,529.8,214645.943,-94265.1,8647.583,11860450.0,13752.0
basic.9y,236125,1579509,15308,5918860,855,962.6,565953.446,-249820.2,22354.188,31266520.0,54405.0
high.school,361553,2482338,24440,9175870,1769,313.4,890459.91,-389550.2,33836.834,49144920.0,114180.0
illiterate,873,4982,41,16989,2,-2.4,1679.712,-719.1,63.298,93092.0,0.0
professional.course,210140,1324035,13559,5037296,855,907.1,490586.795,-210370.7,19453.926,27107130.0,62916.0
university.degree,473082,3081222,31193,11581596,2341,-341.8,1137628.499,-486425.6,42948.942,62826140.0,194688.0
unknown,75266,454198,4494,1632040,392,102.3,162123.062,-69028.5,6181.57,8931180.0,0.0


In [32]:
# Print the built-in documentation for DataFrame.groupby.
help(df.groupby)

Help on method groupby in module pandas.core.frame:

groupby(by=None, axis=0, level=None, as_index: bool = True, sort: bool = True, group_keys: bool = True, squeeze: bool = False, observed: bool = False) -> 'groupby_generic.DataFrameGroupBy' method of pandas.core.frame.DataFrame instance
    Group DataFrame using a mapper or by a Series of columns.
    
    A groupby operation involves some combination of splitting the
    object, applying a function, and combining the results. This can be
    used to group large amounts of data and compute operations on these
    groups.
    
    Parameters
    ----------
    by : mapping, function, label, or list of labels
        Used to determine the groups for the groupby.
        If ``by`` is a function, it's called on each value of the object's
        index. If a dict or Series is passed, the Series or dict VALUES
        will be used to determine the groups (the Series' values are first
        aligned; see ``.align()`` method). If an ndarray 