# Counting Values

> Counting unique elements in a sequence in various ways (absolute and cumulative, count and percentage).

In [None]:
#| default_exp value_counts_plus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd

In [None]:
#| export
def value_counts_plus(
    series,
    dropna=False,
    show_top=10,
    sort_others=False,
    style=True,
    name='data',
    background_gradient='cividis'):
    """
    Provide a few ways of showing counts of values of items in ``series``.

    Every distinct value gets its absolute count, cumulative count, share of
    the total and cumulative share. When there are more than ``show_top``
    distinct values, the remaining ones are summed into one "Others:" row.

    Parameters
    ----------
    series : pandas Series or list
        A sequence of items to count.
    dropna : bool
        Whether or not to drop missing values.
    show_top : int
        How many of the top rows to display (the "Others:" row, when present,
        is shown in addition to these).
    sort_others : bool
        Whether or not to place "Others" in the bottom (default) or in its
        sorted order position. Cumulative columns follow the displayed order.
    style : bool
        Whether or not to style values for easier reading. If set to ``True``
        the result would not be a DataFrame, and cannot be further manipulated.
        Set the value to ``False`` to get a DataFrame as the return value.
    name : str
        The name of the column that you want displayed in the final table. It
        appears in the caption and defaults to "data". It should differ from
        the computed column names ("count", "cum_count", "perc", "cum_perc"),
        otherwise the columns collide.
    background_gradient: str
        The name of the color map to be used as the gradient. Many color maps
        are available: cividis, viridis, copper, cool, magma, and more. You can
        reverse the color by appending _r to the end of the colormap name
        cividis_r for example. Enter a random string to get an error message
        with all available colormaps.

    Returns
    -------
    value_counts_df : pandas.DataFrame or pandas.io.formats.style.Styler
        A table with the columns ``name``, "count", "cum_count", "perc" and
        "cum_perc". A plain DataFrame when ``style`` is ``False``, otherwise a
        Styler wrapping the same data.
    """
    # Newer pandas versions name the value_counts() index after the input
    # Series. Clearing that name makes reset_index() always emit an "index"
    # column, so the rename below works for named Series too, and a Series
    # whose name equals ``name`` no longer clashes with the counts column.
    val_counts = (pd.Series(series)
                  .value_counts(dropna=dropna)
                  .rename(name)
                  .rename_axis(None))
    if len(val_counts) > show_top:
        # Collapse everything past the top rows into a single "Others:" row.
        val_counts = pd.concat([
            val_counts.iloc[:show_top],
            pd.Series(val_counts.iloc[show_top:].sum(), index=['Others:'])]).rename(name)
        if sort_others:
            val_counts = val_counts.sort_values(ascending=False)
        # One extra row so that "Others:" survives the head() call below.
        show_top += 1
    # At this point the column called ``name`` still holds the counts; the
    # final rename moves the labels to ``name`` and the counts to "count".
    count_df = (val_counts
                .reset_index()
                .assign(cum_count=lambda df: df[name].cumsum(),
                        perc=lambda df: df[name].div(df[name].sum()),
                        cum_perc=lambda df: df['perc'].cumsum())
                .rename(columns={'index': name, name: 'count'}))
    if not style:
        return count_df.head(show_top)
    return (count_df
            .head(show_top).style
            .format({'count': '{:,}',
                     'perc': '{:.1%}',
                     'cum_count': '{:,}',
                     'cum_perc': '{:.1%}'})
            .background_gradient(background_gradient)
            .highlight_null()
            .set_caption(f'<h2>Counts of <b>{name}</b></h2>'))


### Counting a list of status codes - default

In [None]:
import random
from http import HTTPStatus

import pandas as pd

In [None]:
# The number of HTTPStatus members differs between Python versions, and
# random.choices() requires exactly one weight per population item, so the
# weights are derived from the population size instead of being hard-coded.
codes = [s.value for s in HTTPStatus]
pattern = [.01, 0.04, 0.1]
# Repeat the pattern for all but the last two codes, which keep fixed weights.
weights = [pattern[i % len(pattern)] for i in range(len(codes) - 2)] + [0.1, 0.3]
status_codes = random.choices(codes, weights=weights, k=10000)


In [None]:
value_counts_plus(status_codes)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,899,899,9.0%,9.0%
1,207,332,1231,3.3%,12.3%
2,508,331,1562,3.3%,15.6%
3,201,314,1876,3.1%,18.8%
4,410,311,2187,3.1%,21.9%
5,416,303,2490,3.0%,24.9%
6,204,302,2792,3.0%,27.9%
7,404,301,3093,3.0%,30.9%
8,102,300,3393,3.0%,33.9%
9,307,297,3690,3.0%,36.9%


In [None]:
#| hide
# The default call (style=True) must return a Styler, not a plain DataFrame.
assert isinstance(value_counts_plus(status_codes), pd.io.formats.style.Styler)

### Changing the number of displayed rows with `show_top`

In [None]:
value_counts_plus(status_codes, show_top=15)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,899,899,9.0%,9.0%
1,207,332,1231,3.3%,12.3%
2,508,331,1562,3.3%,15.6%
3,201,314,1876,3.1%,18.8%
4,410,311,2187,3.1%,21.9%
5,416,303,2490,3.0%,24.9%
6,204,302,2792,3.0%,27.9%
7,404,301,3093,3.0%,30.9%
8,102,300,3393,3.0%,33.9%
9,307,297,3690,3.0%,36.9%


In [None]:
#| hide
assert value_counts_plus(status_codes, show_top=15, style=False).shape[0] == 16

In [None]:
#| hide
# Make sure it works when there are fewer unique values than show_top
# (no "Others:" row is created in that case).
assert isinstance(value_counts_plus([1.1, 2, 3, 4], show_top=15), pd.io.formats.style.Styler)

### Sorting "Others:" 

In [None]:
value_counts_plus(status_codes, sort_others=True)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,Others:,6310,6310,63.1%,63.1%
1,511,899,7209,9.0%,72.1%
2,207,332,7541,3.3%,75.4%
3,508,331,7872,3.3%,78.7%
4,201,314,8186,3.1%,81.9%
5,410,311,8497,3.1%,85.0%
6,416,303,8800,3.0%,88.0%
7,204,302,9102,3.0%,91.0%
8,404,301,9403,3.0%,94.0%
9,102,300,9703,3.0%,97.0%


In [None]:
#| hide
assert value_counts_plus(status_codes, sort_others=True, style=False)['data'].iloc[0] == 'Others:'

### Removing table styling if you want a pure `DataFrame`:

In [None]:
value_counts_plus(status_codes, style=False)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,899,899,0.0899,0.0899
1,207,332,1231,0.0332,0.1231
2,508,331,1562,0.0331,0.1562
3,201,314,1876,0.0314,0.1876
4,410,311,2187,0.0311,0.2187
5,416,303,2490,0.0303,0.249
6,204,302,2792,0.0302,0.2792
7,404,301,3093,0.0301,0.3093
8,102,300,3393,0.03,0.3393
9,307,297,3690,0.0297,0.369


In [None]:
#| hide
# With style=False the result must be a plain DataFrame.
assert isinstance(value_counts_plus(status_codes, style=False), pd.DataFrame)

### Changing the colormap used with `background_gradient`

In [None]:
value_counts_plus(status_codes, background_gradient='Greens')

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,899,899,9.0%,9.0%
1,207,332,1231,3.3%,12.3%
2,508,331,1562,3.3%,15.6%
3,201,314,1876,3.1%,18.8%
4,410,311,2187,3.1%,21.9%
5,416,303,2490,3.0%,24.9%
6,204,302,2792,3.0%,27.9%
7,404,301,3093,3.0%,30.9%
8,102,300,3393,3.0%,33.9%
9,307,297,3690,3.0%,36.9%


In [None]:
value_counts_plus(status_codes, background_gradient='cool')

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,899,899,9.0%,9.0%
1,207,332,1231,3.3%,12.3%
2,508,331,1562,3.3%,15.6%
3,201,314,1876,3.1%,18.8%
4,410,311,2187,3.1%,21.9%
5,416,303,2490,3.0%,24.9%
6,204,302,2792,3.0%,27.9%
7,404,301,3093,3.0%,30.9%
8,102,300,3393,3.0%,33.9%
9,307,297,3690,3.0%,36.9%


### Convert the table to raw HTML for embedding in a blog or web page

In [None]:
print(value_counts_plus(status_codes).to_html()[:700])

<style type="text/css">
#T_296aa_row0_col1, #T_296aa_row0_col3 {
  background-color: #083370;
  color: #f1f1f1;
}
#T_296aa_row0_col2, #T_296aa_row0_col4, #T_296aa_row3_col1, #T_296aa_row3_col3, #T_296aa_row4_col1, #T_296aa_row4_col3, #T_296aa_row5_col1, #T_296aa_row5_col3, #T_296aa_row6_col1, #T_296aa_row6_col3, #T_296aa_row7_col1, #T_296aa_row7_col3, #T_296aa_row8_col1, #T_296aa_row8_col3, #T_296aa_row9_col1, #T_296aa_row9_col3 {
  background-color: #00224e;
  color: #f1f1f1;
}
#T_296aa_row1_col1, #T_296aa_row1_col3, #T_296aa_row2_col1, #T_296aa_row2_col3 {
  background-color: #00234f;
  color: #f1f1f1;
}
#T_296aa_row1_col2, #T_296aa_row1_col4 {
  background-color: #00295d;
  color: #f1f1f1


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()