In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
PALETTE = ["#bdb8ad", "#ece7e0", "#c6d4e1", "#44749d"]

In [3]:
dataset_map = {
    "automobile": "automobile",
    "titanic": "titanic",
    "top_women_chess_players": "women",
    "suicide": "suicide",
    "chess": "chess",
    "adult": "adult",
    "asia_conflicts": "conflicts",
    "rain_australia": "rain",
    "hotel_bookings": "hotel",
    "diamonds": "diamonds",
    "basketball": "basketball",
    "rain_australia": "rain",
    "heart": "heart",
    "diabetes": "diabetes",
    "default": "credit",
    "solar": "solar",
}

dataset_order = [
    "heart",
    "diabetes",
    "automobile",
    "titanic",
    "women",
    "credit",
    "solar",
    "suicide",
    "diamonds",
    "chess",
    "adult",
    "basketball",
    "conflicts",
    "rain",
    "hotel",
]

In [5]:
# read data
df_plot = pd.read_json("../benchmark/results/plot.json", lines=True)
df_plot_corr = pd.read_json(
    "../benchmark/results/plot_correlation.json", lines=True
)
df_plot_miss = pd.read_json(
    "../benchmark/results/plot_missing.json", lines=True
)

# collect run times in one dataset
df = df_plot[["dataset", "mem_size"]]
df["plot"] = df_plot["elapsed"]
df["plot_correlation"] = df_plot_corr["elapsed"]
df["plot_missing"] = df_plot_miss["elapsed"]

# get list of datasets in increasing order of size for plotting
df = df.sort_values("mem_size")
df = df.drop(columns=["mem_size"])

# shorten dataset names
df["dataset"] = df["dataset"].apply(dataset_map.__getitem__)
df = df.set_index("dataset")
df.columns = ["plot(df)", "plot_correlation(df)", "plot_missing(df)"]

# create new dataset with proportion of total time for each function
df2 = df.div(df.sum(axis=1), axis=0)

# formatted size for tooltip
df["size"] = [
    "24KB",
    "60KB",
    "459KB",
    "3MB",
    "4MB",
    "7MB",
    "13MB",
    "14MB",
    "16MB",
    "231MB",
    "383MB",
    "400MB",
    "1GB",
    "1.1GB",
    "4.5GB",
]

# wrangle data into columns "dataset", "size", "function", and "time"
df = df.reset_index()
df = pd.melt(
    df,
    id_vars=["dataset", "size"],
    value_vars=["plot(df)", "plot_correlation(df)", "plot_missing(df)"],
)
df2 = df2.reset_index()
df2 = pd.melt(
    df2,
    id_vars=["dataset"],
    value_vars=["plot(df)", "plot_correlation(df)", "plot_missing(df)"],
)

# add percent ot total time to the final df
df["pct"] = df2["value"]
df.columns = ["dataset", "size", "function", "time", "pct"]
df = df.join(df.groupby("dataset").time.sum(), on="dataset", rsuffix="_sum")
df["dataset++"] = df["dataset"] + df["time_sum"].apply(lambda n: f" ({n:.1f}s)")
df["sort_order"] = df["dataset"].apply(dataset_order.index)
df = df.sort_values("sort_order")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["plot"] = df_plot["elapsed"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["plot_correlation"] = df_plot_corr["elapsed"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["plot_missing"] = df_plot_miss["elapsed"]


In [6]:
scale = alt.Scale(range=PALETTE)

alt.Chart(df).mark_bar().encode(
    x=alt.X("dataset++:N", sort=df["dataset++"].unique(), title="Dataset (Total Time)"),
    y=alt.Y(
        "pct:Q",
        title="Percent of total time",
        axis=alt.Axis(format="%",titleFontSize=18),
        scale=alt.Scale(domain=[0, 1]),
    ),
    color=alt.Color("function", title="EDA Task", scale=scale, legend=alt.Legend(labelFontSize=16, titleFontSize=12)),
    order=alt.Order("function", sort="ascending"),
    tooltip=[
        alt.Tooltip("pct:Q", title="percent", format=".2f"),
        alt.Tooltip("time:Q", title="runtime", format=".2f"),
        alt.Tooltip("size:O", title="memory"),
        alt.Tooltip("dataset:O", title="dataset"),
    ],
).properties(width=400, height=150).configure_axis(labelFontSize=16, titleFontSize=12)