# Counting Values

> Count how often each unique value occurs in a sequence, shown in several ways (absolute and cumulative, count and percentage).

In [None]:
#| default_exp value_counts_plus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd

In [None]:
#| export
def value_counts_plus(
    series,
    dropna=False,
    show_top=10,
    sort_others=False,
    style=True,
    background_gradient='cividis'):
    """
    Provide a few ways of showing counts of values of items in ``series``.

    Each distinct value gets its absolute count, cumulative count, share of
    the total and cumulative share. When there are more than ``show_top``
    distinct values, the remaining ones are summed into one extra row
    labelled ``"Others:"``.

    Parameters
    ----------
    series : pandas Series or list
        A sequence of items to count.
    dropna : bool
        Whether or not to drop missing values.
    show_top : int
        How many of the most frequent values to display as their own rows.
    sort_others : bool
        Whether to move the "Others:" row to its sorted position by count
        (``True``) or to keep it as the last row (``False``, the default).
    style : bool
        Whether or not to style values for easier reading. If set to ``True`` the result would
        not be a DataFrame, and cannot be further manipulated. Set the value to ``False`` to
        get a DataFrame as the return value.
    background_gradient : str
        The colormap passed to ``Styler.background_gradient``. Only used when
        ``style`` is ``True``.

    Returns
    -------
    value_counts_df : pandas.DataFrame, or its styled version when ``style`` is ``True``
        One row per displayed value with the columns ``data``, ``count``,
        ``cum_count``, ``perc`` and ``cum_perc``.
    """
    col = 'data'
    series = pd.Series(series).rename(col)
    val_counts = series.value_counts(dropna=dropna)
    if len(val_counts) > show_top:
        # ``iloc`` makes the slicing explicitly positional, whatever labels
        # (possibly integers) the counted values produce in the index.
        others = pd.Series(val_counts.iloc[show_top:].sum(), index=['Others:'])
        val_counts = pd.concat([val_counts.iloc[:show_top], others])
        if sort_others:
            # mergesort is a stable sort kind, chosen so that rows with equal
            # counts are not shuffled relative to each other.
            val_counts = val_counts.sort_values(ascending=False, kind='mergesort')
    # Build the frame from the index and the values directly instead of going
    # through ``to_frame().reset_index()``: the names that ``value_counts``
    # gives to its result and to its index differ between pandas versions,
    # so relying on them breaks the column lookups.
    count_df = pd.DataFrame({col: val_counts.index,
                             'count': val_counts.to_numpy()})
    count_df['cum_count'] = count_df['count'].cumsum()
    count_df['perc'] = count_df['count'].div(count_df['count'].sum())
    count_df['cum_perc'] = count_df['perc'].cumsum()
    if not style:
        return count_df
    return (count_df.style
            .format({'count': '{:,}',
                     'perc': '{:.1%}',
                     'cum_count': '{:,}',
                     'cum_perc': '{:.1%}'})
            .background_gradient(background_gradient)
            .highlight_null()
            .set_caption(f'<h2>Counts of <b>{series.name}</b></h2>'))


### Counting a list of status codes with the default settings

In [None]:
import random
from http import HTTPStatus

import pandas as pd

In [None]:
# Draw 10,000 random HTTP status codes with uneven weights, so that some codes
# show up clearly more often than others in the examples below.
# The weights cycle through three base values for all but the last two codes,
# which get 0.1 and 0.3. They are derived from len(HTTPStatus) because the
# number of members is not the same in every Python version, and
# random.choices raises ValueError when the two lengths differ.
status_codes = random.choices(
    [s.value for s in HTTPStatus],
    weights=[(0.01, 0.04, 0.1)[i % 3] for i in range(len(HTTPStatus) - 2)] + [0.1, 0.3],
    k=10000)


In [None]:
value_counts_plus(status_codes)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,922,922,9.2%,9.2%
1,505,343,1265,3.4%,12.7%
2,502,325,1590,3.2%,15.9%
3,428,322,1912,3.2%,19.1%
4,201,311,2223,3.1%,22.2%
5,508,310,2533,3.1%,25.3%
6,207,309,2842,3.1%,28.4%
7,303,304,3146,3.0%,31.5%
8,300,302,3448,3.0%,34.5%
9,421,298,3746,3.0%,37.5%


In [None]:
value_counts_plus(status_codes)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,864,864,8.6%,8.6%
1,307,329,1193,3.3%,11.9%
2,303,327,1520,3.3%,15.2%
3,413,319,1839,3.2%,18.4%
4,428,318,2157,3.2%,21.6%
5,502,316,2473,3.2%,24.7%
6,404,307,2780,3.1%,27.8%
7,204,304,3084,3.0%,30.8%
8,508,304,3388,3.0%,33.9%
9,401,300,3688,3.0%,36.9%


### Changing the number of displayed rows with `show_top`

In [None]:
value_counts_plus(status_codes, show_top=15)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,864,864,8.6%,8.6%
1,307,329,1193,3.3%,11.9%
2,303,327,1520,3.3%,15.2%
3,413,319,1839,3.2%,18.4%
4,428,318,2157,3.2%,21.6%
5,502,316,2473,3.2%,24.7%
6,404,307,2780,3.1%,27.8%
7,204,304,3084,3.0%,30.8%
8,508,304,3388,3.0%,33.9%
9,401,300,3688,3.0%,36.9%


### Sorting "Others:" 

In [None]:
value_counts_plus(status_codes, sort_others=True)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,Others:,6312,6312,63.1%,63.1%
1,511,864,7176,8.6%,71.8%
2,307,329,7505,3.3%,75.1%
3,303,327,7832,3.3%,78.3%
4,413,319,8151,3.2%,81.5%
5,428,318,8469,3.2%,84.7%
6,502,316,8785,3.2%,87.9%
7,404,307,9092,3.1%,90.9%
8,204,304,9396,3.0%,94.0%
9,508,304,9700,3.0%,97.0%


### Removing table styling if you want a pure `DataFrame`:

In [None]:
value_counts_plus(status_codes, style=False)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,864,864,0.0864,0.0864
1,307,329,1193,0.0329,0.1193
2,303,327,1520,0.0327,0.152
3,413,319,1839,0.0319,0.1839
4,428,318,2157,0.0318,0.2157
5,502,316,2473,0.0316,0.2473
6,404,307,2780,0.0307,0.278
7,204,304,3084,0.0304,0.3084
8,508,304,3388,0.0304,0.3388
9,401,300,3688,0.03,0.3688


### Changing the colormap used with `background_gradient`

In [None]:
value_counts_plus(status_codes, background_gradient='Greens')

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,511,864,864,8.6%,8.6%
1,307,329,1193,3.3%,11.9%
2,303,327,1520,3.3%,15.2%
3,413,319,1839,3.2%,18.4%
4,428,318,2157,3.2%,21.6%
5,502,316,2473,3.2%,24.7%
6,404,307,2780,3.1%,27.8%
7,204,304,3084,3.0%,30.8%
8,508,304,3388,3.0%,33.9%
9,401,300,3688,3.0%,36.9%


### Convert the table to raw HTML for embedding in a blog or web page

In [None]:
print(value_counts_plus(status_codes).to_html()[:700])

<style type="text/css">
#T_0e3a9_row0_col1, #T_0e3a9_row0_col3 {
  background-color: #053371;
  color: #f1f1f1;
}
#T_0e3a9_row0_col2, #T_0e3a9_row0_col4, #T_0e3a9_row3_col1, #T_0e3a9_row3_col3, #T_0e3a9_row4_col1, #T_0e3a9_row4_col3, #T_0e3a9_row5_col1, #T_0e3a9_row5_col3, #T_0e3a9_row6_col1, #T_0e3a9_row6_col3, #T_0e3a9_row7_col1, #T_0e3a9_row7_col3, #T_0e3a9_row8_col1, #T_0e3a9_row8_col3, #T_0e3a9_row9_col1, #T_0e3a9_row9_col3 {
  background-color: #00224e;
  color: #f1f1f1;
}
#T_0e3a9_row1_col1, #T_0e3a9_row1_col3, #T_0e3a9_row2_col1, #T_0e3a9_row2_col3 {
  background-color: #00234f;
  color: #f1f1f1;
}
#T_0e3a9_row1_col2, #T_0e3a9_row1_col4 {
  background-color: #00295d;
  color: #f1f1f1


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()