# Counting Values

> Counting unique elements in a sequence in various ways (absolute and cumulative, count and percentage).

In [None]:
#| default_exp value_counts_plus

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd

In [None]:
#| export
def value_counts_plus(
    series,
    dropna=False,
    show_top=10,
    sort_others=False,
    style=True,
    size=10,
    thousands=',',
    decimal='.',
    name='data',
    background_gradient='cividis'):
    """
    Provide a few ways of showing counts of values of items in ``series``.

    The values are counted, the ``show_top`` most frequent ones are kept and
    the remaining ones are summed into a single "Others:" row. Cumulative
    counts, percentages and cumulative percentages are then added.

    Parameters
    ----------
    series : pandas Series or list
        A sequence of items to count.
    dropna : bool
        Whether or not to drop missing values.
    show_top : int
        How many of the top rows to display.
    sort_others : bool
        If ``False`` (default) the "Others:" row stays at the bottom. If
        ``True`` it is placed in its sorted position according to its count.
    style : bool
        Whether or not to style values for easier reading. If set to ``True``
        the result would not be a DataFrame, and cannot be further manipulated.
        Set the value to ``False`` to get a DataFrame as the return value.
    size : int
        The size in points of the font of the table. This results in the whole
        table being resized.
    thousands : str
        The character to use to separate thousands if `style=True`. Defaults to
        `,` but you can change to `.` or space, or any other character you want.
    decimal : str
        The character to use to display decimal number if `style=True`. Defaults to
        `.` but you can change to `,` or any other character you want.
    name : str
        The name of the column that you want displayed in the final table. It
        appears in the caption and defaults to "data".
    background_gradient: str
        The name of the color map to be used as the gradient. Many color maps
        are available: cividis, viridis, copper, cool, magma, and more. You can
        reverse the color by appending _r to the end of the colormap name
        cividis_r for example. Enter a random string to get an error message
        with all available colormaps.

    Returns
    -------
    value_counts_df : pandas.DataFrame or pandas.io.formats.style.Styler
        A DataFrame (``style=False``) or a styled table (``style=True``)
        showing counts based on the provided arguments. The DataFrame has a
        0-based index and the columns ``name``, ``count``, ``cum_count``,
        ``perc`` and ``cum_perc``.

    Raises
    ------
    ValueError
        If ``name`` clashes with one of the generated column names.
    """
    final_col_names = ['count', 'cum_count', 'perc', 'cum_perc']
    if name in final_col_names:
        raise ValueError(f"Please make sure you use a name other than {final_col_names}")

    # Name the index and the counts column explicitly so the layout is always
    # [name, 'count'], regardless of how the installed pandas version labels
    # the result of ``value_counts``.
    val_counts = (pd.Series(series)
                  .rename(name)
                  .value_counts(dropna=dropna)
                  .rename_axis(name)
                  .reset_index(name='count'))

    if len(val_counts) > show_top:
        # Collapse everything after the top rows into one aggregated row.
        others_df = pd.DataFrame({
            name: ['Others:'],
            'count': val_counts[show_top:]['count'].sum()
            }, index=[show_top])
        val_counts = pd.concat([
            val_counts[:show_top],
            others_df
        ])
        if sort_others:
            # Stable sort keeps the original order of tied counts, and
            # ignore_index keeps the returned frame 0-based after reordering.
            val_counts = val_counts.sort_values(
                by=['count'],
                ascending=False,
                kind='mergesort',
                ignore_index=True)

    # cum_perc is derived from perc, which ``assign`` makes available because
    # keyword arguments are evaluated in order.
    count_df = (val_counts
                .assign(
                    cum_count=lambda df: df['count'].cumsum(),
                    perc=lambda df: df['count'].div(df['count'].sum()),
                    cum_perc=lambda df: df['perc'].cumsum())
                )
    if not style:
        return count_df
    return (count_df
            .style
            .format({'count': '{:,}',
                     'cum_count': '{:,}',
                     'perc': '{:.1%}',
                     'cum_perc': '{:.1%}'},
                    thousands=thousands,
                    decimal=decimal)
            .background_gradient(background_gradient)
            # Display rows as 1-based ranks and use reader-friendly headers.
            .relabel_index(range(1, len(count_df)+1), axis=0)
            .relabel_index([name, 'count', 'cum. count', '%', 'cum. %'], axis=1)
            .highlight_null()
            .set_caption(f'<h2>Counts of <b>{name}</b></h2>')
            .set_table_attributes(f'style=font-size:{size}pt;'))


In [None]:
# Rebind the name so the examples below render with a 15pt font by default;
# an explicit `size=` argument in a call still takes precedence.
from functools import partial
value_counts_plus = partial(value_counts_plus, size=15)

### Counting a list of color names - default

In [None]:
# import random
# from http import HTTPStatus

import pandas as pd

import random
import numpy as np
import matplotlib as mpl



In [None]:
# status_codes = random.choices(
#     [s.value for s in HTTPStatus],
#     weights=[.01, 0.04, 0.1] * 20 + [0.1, 0.3],
#     k=10000)

# Draw 10,000 color names with uneven weights so some values are clearly more
# frequent than others. random.choices needs one weight per item, so the
# 4 * 37 = 148 weights assume matplotlib exposes 148 named colors.
color_names = list(mpl.colors.cnames)
colors = random.choices(color_names, weights=[0.9, 0.04, 0.05, 0.09] * 37, k=10_000)
# Append 240 missing values so the examples also show how NaN is counted.
colors += [np.nan] * 240
colors[:20]

['olivedrab',
 'darkred',
 'cyan',
 'pink',
 'red',
 'darkred',
 'cyan',
 'rosybrown',
 'midnightblue',
 'red',
 'plum',
 'fuchsia',
 'palegoldenrod',
 'lightsteelblue',
 'limegreen',
 'lightcoral',
 'lightcoral',
 'darkmagenta',
 'ghostwhite',
 'olivedrab']

In [None]:
#| hide
# With the default style=True the result must be a pandas Styler.
assert isinstance(value_counts_plus(colors), pd.io.formats.style.Styler)

### Changing the number of displayed rows with `show_top`

In [None]:
#| echo
# Keep the 15 most frequent values; any remaining values are summed into a
# single "Others:" row.
value_counts_plus(colors, show_top=15)

Unnamed: 0,data,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,darkmagenta,240,1462,2.3%,14.3%
7,steelblue,238,1700,2.3%,16.6%
8,lightseagreen,237,1937,2.3%,18.9%
9,blanchedalmond,236,2173,2.3%,21.2%
10,palegoldenrod,236,2409,2.3%,23.5%


In [None]:
#| hide
# 15 top rows plus the aggregated "Others:" row give 16 rows in total.
assert value_counts_plus(colors, show_top=15, style=False).shape[0] == 16

In [None]:
#| hide
# make sure it works with low unique values < show_top
assert isinstance(value_counts_plus([1.1, 2, 3, 4], show_top=15), pd.io.formats.style.Styler)

### Sorting "Others:" 

In [None]:
#| echo
# Position the "Others:" row by its count instead of leaving it at the bottom.
value_counts_plus(colors, sort_others=True)

Unnamed: 0,data,count,cum. count,%,cum. %
1,Others:,7831,7831,76.5%,76.5%
2,darkred,254,8085,2.5%,79.0%
3,slategray,247,8332,2.4%,81.4%
4,darkslategray,241,8573,2.4%,83.7%
5,,240,8813,2.3%,86.1%
6,lavenderblush,240,9053,2.3%,88.4%
7,darkmagenta,240,9293,2.3%,90.8%
8,steelblue,238,9531,2.3%,93.1%
9,lightseagreen,237,9768,2.3%,95.4%
10,blanchedalmond,236,10004,2.3%,97.7%


In [None]:
#| hide
# "Others:" aggregates most of the data here, so sorting by count puts it first.
assert value_counts_plus(colors, sort_others=True, style=False)['data'].iloc[0] == 'Others:'

## Changing the name of the data and caption title with `name`

In [None]:
#| echo
# `name` labels the first column and is used in the table caption.
value_counts_plus(colors, name='Status codes')

Unnamed: 0,Status codes,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,darkmagenta,240,1462,2.3%,14.3%
7,steelblue,238,1700,2.3%,16.6%
8,lightseagreen,237,1937,2.3%,18.9%
9,blanchedalmond,236,2173,2.3%,21.2%
10,palegoldenrod,236,2409,2.3%,23.5%


## Change the size of table: `size`

In [None]:
# Render the table with a 5pt font.
value_counts_plus(colors, size=5)

Unnamed: 0,data,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,darkmagenta,240,1462,2.3%,14.3%
7,steelblue,238,1700,2.3%,16.6%
8,lightseagreen,237,1937,2.3%,18.9%
9,blanchedalmond,236,2173,2.3%,21.2%
10,palegoldenrod,236,2409,2.3%,23.5%


In [None]:
# Render the table with a 20pt font.
value_counts_plus(colors, size=20)

Unnamed: 0,data,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,darkmagenta,240,1462,2.3%,14.3%
7,steelblue,238,1700,2.3%,16.6%
8,lightseagreen,237,1937,2.3%,18.9%
9,blanchedalmond,236,2173,2.3%,21.2%
10,palegoldenrod,236,2409,2.3%,23.5%


## Completely change the caption using an HTML string with `set_caption`

In [None]:
# The styled result is a pandas Styler, so calling set_caption on it replaces
# the default caption that value_counts_plus builds from `name`.
caption = '<h4>Status codes</h4>Top 5 values <a href="https://example.com">raw data</a>'
value_counts_plus(
    colors,
    name='Statuses',
    show_top=5).set_caption(caption)

Unnamed: 0,Statuses,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,Others:,9018,10240,88.1%,100.0%


### Removing table styling if you want a pure `DataFrame`:
* Counting in non-styled DataFrames is 0-based in case you want to further process it
* Columns are displayed in a slightly different manner

In [None]:
# style=False returns the plain DataFrame with unformatted numbers and the raw
# column names (count, cum_count, perc, cum_perc).
value_counts_plus(colors, style=False)

Unnamed: 0,data,count,cum_count,perc,cum_perc
0,darkred,254,254,0.024805,0.024805
1,slategray,247,501,0.024121,0.048926
2,darkslategray,241,742,0.023535,0.072461
3,,240,982,0.023438,0.095898
4,lavenderblush,240,1222,0.023438,0.119336
5,darkmagenta,240,1462,0.023438,0.142773
6,steelblue,238,1700,0.023242,0.166016
7,lightseagreen,237,1937,0.023145,0.18916
8,blanchedalmond,236,2173,0.023047,0.212207
9,palegoldenrod,236,2409,0.023047,0.235254


In [None]:
#| hide
# style=False must return a DataFrame rather than a Styler.
assert isinstance(value_counts_plus(colors, style=False), pd.DataFrame)

### Changing the color theme with `background_gradient`

In [None]:
# Use the "Greens" colormap for the background gradient.
value_counts_plus(colors, background_gradient='Greens')

Unnamed: 0,data,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,darkmagenta,240,1462,2.3%,14.3%
7,steelblue,238,1700,2.3%,16.6%
8,lightseagreen,237,1937,2.3%,18.9%
9,blanchedalmond,236,2173,2.3%,21.2%
10,palegoldenrod,236,2409,2.3%,23.5%


In [None]:
# Use the "cool" colormap for the background gradient.
value_counts_plus(colors, background_gradient='cool')

Unnamed: 0,data,count,cum. count,%,cum. %
1,darkred,254,254,2.5%,2.5%
2,slategray,247,501,2.4%,4.9%
3,darkslategray,241,742,2.4%,7.2%
4,,240,982,2.3%,9.6%
5,lavenderblush,240,1222,2.3%,11.9%
6,darkmagenta,240,1462,2.3%,14.3%
7,steelblue,238,1700,2.3%,16.6%
8,lightseagreen,237,1937,2.3%,18.9%
9,blanchedalmond,236,2173,2.3%,21.2%
10,palegoldenrod,236,2409,2.3%,23.5%


### Convert the table to raw HTML for embedding in a blog or web page

In [None]:
# to_html() returns the styled table as an HTML string; only the first 700
# characters are printed to keep the output short.
print(value_counts_plus(colors).to_html()[:700])

<style type="text/css">
#T_ea459_row0_col1, #T_ea459_row0_col2, #T_ea459_row0_col3, #T_ea459_row0_col4, #T_ea459_row1_col1, #T_ea459_row1_col3, #T_ea459_row2_col1, #T_ea459_row2_col3, #T_ea459_row3_col1, #T_ea459_row3_col3, #T_ea459_row4_col1, #T_ea459_row4_col3, #T_ea459_row5_col1, #T_ea459_row5_col3, #T_ea459_row6_col1, #T_ea459_row6_col3, #T_ea459_row7_col1, #T_ea459_row7_col3, #T_ea459_row8_col1, #T_ea459_row8_col3, #T_ea459_row9_col1, #T_ea459_row9_col3 {
  background-color: #00224e;
  color: #f1f1f1;
}
#T_ea459_row1_col2, #T_ea459_row1_col4 {
  background-color: #002758;
  color: #f1f1f1;
}
#T_ea459_row2_col2, #T_ea459_row2_col4 {
  background-color: #002b62;
  color: #f1f1f1;
}
#T_ea4


In [None]:
#| hide
import nbdev

# Export the cells marked with `#| export` to the library module.
nbdev.nbdev_export()