In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Seed NumPy's global random state so the random draws made later in this
# notebook come out the same on every Restart & Run All.
np.random.seed(1337)

In [2]:
# Synthetic demo frame: one row per id, a numeric value and two label columns.
n = 500

row_ids = ["id_{:03d}".format(k) for k in range(n)]

# The three draws below consume the global NumPy random state in this exact
# order (x, then category, then subcategory); reordering them would change
# the generated data.
x_values = np.random.randn(n)
category_labels = np.random.choice(["foo", "bar", "baz"], n)
subcategory_labels = np.random.choice(["hello", "world"], n)

d_plot = pd.DataFrame(
    dict(
        name=row_ids,
        x=x_values,
        category=category_labels,
        subcategory=subcategory_labels,
    )
)

In [3]:
# Basic vertical box plot
#
# One notched box per entry of `cat_values`, with every underlying point
# drawn alongside its box.

# --- Parameters -------------------------------------------------------------
d_plot = d_plot  # no-op self-assignment; presumably a slot for swapping in another frame
val_name = "x"
cat_name = "category"
cat_values = ["foo", "bar", "baz"]

# --- Guard ------------------------------------------------------------------
# boxpoints="all" draws each selected row as its own marker, so cap the count.
n_selected = int(d_plot[cat_name].isin(cat_values).sum())
if n_selected > 2000:
    raise ValueError("Too many values to plot")

# --- Figure -----------------------------------------------------------------
fig = go.Figure()
for label in cat_values:
    values = d_plot.loc[d_plot[cat_name] == label, val_name]
    box = go.Box(
        y=values,
        name=label,
        showlegend=True,
        boxpoints="all",
        line={"width": 1},
        marker={"color": "cornflowerblue", "size": 3},
        notched=True,
    )
    fig.add_trace(box)

n_boxes = len(fig.data)
fig.update_layout(
    title=f"Distribution of {val_name} by {cat_name}",
    template="plotly_white",
    height=800,
    width=100 + n_boxes * 120,  # figure widens by 120 per trace
    showlegend=False,  # legend is switched off at the layout level
)

# range / dtick are left as None (unset); fill them in to pin the value axis.
fig.update_yaxes(title=val_name.capitalize(), range=None, dtick=None)
fig.update_xaxes(title=cat_name.capitalize())
fig.show()

In [4]:
# Basic horizontal box plot
#
# One notched box per entry of `cat_values`, laid out along the x axis, with
# every underlying point drawn alongside its box.

# --- Parameters -------------------------------------------------------------
d_plot = d_plot  # no-op self-assignment; presumably a slot for swapping in another frame
val_name = "x"
# The category column must be bound to `cat_name`: every statement below reads
# that name, so the cell has to define it itself to run on its own.
cat_name = "category"
cat_values = ["foo", "bar", "baz"]

# --- Guard ------------------------------------------------------------------
# boxpoints="all" draws each selected row as its own marker, so cap the count.
if len(d_plot[d_plot[cat_name].isin(cat_values)]) > 2000:
    raise ValueError("Too many values to plot")

# --- Figure -----------------------------------------------------------------
fig = go.Figure()
for category in cat_values:
    fig.add_trace(
        go.Box(
            # Passing the values as `x` (and no `y`) is what makes the boxes horizontal.
            x=d_plot[d_plot[cat_name] == category][val_name],
            name=category,
            showlegend=True,
            boxpoints="all",
            line_width=1,
            marker=dict(color="cornflowerblue", size=3),
            notched=True,
        )
    )
fig.update_layout(
    template="plotly_white",
    width=900,
    height=50 + len(fig.data) * 120,  # figure grows by 120 per trace
    title=f"Distribution of {val_name} by {cat_name}",
    showlegend=False,  # legend is switched off at the layout level
)
# range / dtick are left as None (unset); fill them in to pin the value axis.
fig.update_xaxes(title=val_name.capitalize(), range=None, dtick=None)
fig.update_yaxes(title=cat_name.capitalize())
fig.show()

In [5]:
# Vertical box plot with multiple categories
#
# Boxes are grouped by `cat_name` along the x axis; each entry of
# `subcat_values` becomes its own trace, coloured from `colors` by position.

# --- Parameters -------------------------------------------------------------
d_plot = d_plot  # no-op self-assignment; presumably a slot for swapping in another frame
val_name = "x"
cat_name = "category"
cat_values = ["foo", "bar", "baz"]
subcat_name = "subcategory"
subcat_values = ["hello", "world"]
colors = ["cornflowerblue", "coral"]

# --- Guard ------------------------------------------------------------------
# Row masks are computed once and reused for the guard and for every trace.
in_cat_values = d_plot[cat_name].isin(cat_values)
in_subcat_values = d_plot[subcat_name].isin(subcat_values)
if int((in_cat_values & in_subcat_values).sum()) > 2000:
    raise ValueError("Too many values to plot")

# --- Figure -----------------------------------------------------------------
fig = go.Figure()
for idx, subcategory in enumerate(subcat_values):
    # Rows of this subcategory, restricted to the requested categories.
    rows = d_plot[in_cat_values & (d_plot[subcat_name] == subcategory)]
    trace = go.Box(
        x=rows[cat_name],
        y=rows[val_name],
        name=subcategory,
        showlegend=True,
        boxpoints="all",
        line={"width": 1},
        marker={"color": colors[idx], "size": 3},
        notched=True,
    )
    fig.add_trace(trace)

n_slots = len(cat_values) * len(subcat_values)
fig.update_layout(
    title=f"Distribution of {val_name} by {cat_name} and {subcat_name}",
    legend_title=subcat_name.capitalize(),
    showlegend=True,
    boxmode="group",  # place subcategory boxes side by side within each category
    template="plotly_white",
    height=800,
    width=100 + n_slots * 120,  # figure widens by 120 per category/subcategory pair
)

# range / dtick are left as None (unset); fill them in to pin the value axis.
fig.update_yaxes(title=val_name.capitalize(), range=None, dtick=None)
# Force the category axis to follow the order given in cat_values.
fig.update_xaxes(
    title=cat_name.capitalize(),
    categoryorder="array",
    categoryarray=cat_values,
)
fig.show()

In [6]:
# Horizontal box plot with multiple categories
#
# Boxes are grouped by `cat_name` along the y axis; each entry of
# `subcat_values` becomes its own trace, coloured from `colors` by position.

# --- Parameters -------------------------------------------------------------
d_plot = d_plot  # no-op self-assignment; presumably a slot for swapping in another frame
val_name = "x"
cat_name = "category"
cat_values = ["foo", "bar", "baz"]
subcat_name = "subcategory"
subcat_values = ["hello", "world"]
colors = ["cornflowerblue", "coral"]

# --- Guard ------------------------------------------------------------------
# Row masks are computed once and reused for the guard and for every trace.
in_cat_values = d_plot[cat_name].isin(cat_values)
in_subcat_values = d_plot[subcat_name].isin(subcat_values)
if int((in_cat_values & in_subcat_values).sum()) > 2000:
    raise ValueError("Too many values to plot")

# --- Figure -----------------------------------------------------------------
fig = go.Figure()
for idx, subcategory in enumerate(subcat_values):
    # Rows of this subcategory, restricted to the requested categories.
    rows = d_plot[in_cat_values & (d_plot[subcat_name] == subcategory)]
    trace = go.Box(
        x=rows[val_name],
        y=rows[cat_name],
        name=subcategory,
        showlegend=True,
        boxpoints="all",
        line={"width": 1},
        marker={"color": colors[idx], "size": 3},
        notched=True,
    )
    fig.add_trace(trace)

n_slots = len(cat_values) * len(subcat_values)
fig.update_layout(
    title=f"Distribution of {val_name} by {cat_name} and {subcat_name}",
    legend_title=subcat_name.capitalize(),
    showlegend=True,
    boxmode="group",  # place subcategory boxes side by side within each category
    template="plotly_white",
    width=900,
    height=50 + n_slots * 120,  # figure grows by 120 per category/subcategory pair
)
# Both x and y are supplied, so the horizontal orientation is set explicitly.
fig.update_traces(orientation="h")

# range / dtick are left as None (unset); fill them in to pin the value axis.
fig.update_xaxes(title=val_name.capitalize(), range=None, dtick=None)
# Category axis follows cat_values, with the axis direction reversed.
fig.update_yaxes(
    title=cat_name.capitalize(),
    categoryarray=cat_values,
    categoryorder="array",
    autorange="reversed",
)
fig.show()