In [1]:
import matplotlib.pyplot as plt
import polars as pl
from scipy import stats
import seaborn as sns

from lib import *

# Display up to 36 characters of each string value when Polars renders a frame;
# the trailing semicolon suppresses the cell's echoed return value.
pl.Config.set_fmt_str_lengths(36);

## Send coefficient experiment

In [2]:
def save(g: sns.FacetGrid, name: str, **kwargs) -> None:
    """Write a seaborn grid to ``<name>.png``.

    Args:
        g: Grid object exposing ``savefig``.
        name: Output path without the ``.png`` extension.
        **kwargs: Forwarded to ``g.savefig``. ``dpi`` defaults to 500 but may
            be overridden by the caller.
    """
    # setdefault (rather than a hard-coded dpi=500 keyword) lets callers pass
    # dpi=... without triggering "got multiple values for keyword argument".
    kwargs.setdefault("dpi", 500)
    g.savefig(f"{name}.png", **kwargs)

### Accuracy

In [None]:
# Load the dataset registered under the name "send-coefficient" via the
# project helper (load_dataset is not defined in this notebook).
df = load_dataset("send-coefficient")

In [None]:
# Accuracy results keyed on the send coefficient. The `percentiles` argument
# is 0, 0.01, 0.1 followed by the integers 1 through 6.
results = compute_accuracy_results(
    process_parameter_dataset(df),
    percentiles=[0, 0.01, 0.1] + list(range(1, 7)),
    parameter="send_coefficient",
)

In [None]:
# Display the aggregated accuracy results as the cell output.
results.aggregated

In [None]:
# Persist the aggregated accuracy table as CSV.
# NOTE(review): presumably the outputs/ directory must already exist — confirm.
results.aggregated.write_csv('outputs/send-coefficient_accuracy_aggregate.csv')

In [None]:
# Complementary ECDF of accuracy with one curve per send coefficient,
# restricted to send coefficients of at least 1, saved as a PNG.
data = results.tabular.filter(pl.col("send_coefficient").ge(1))
g = (
    sns.FacetGrid(
        data,
        hue="send_coefficient",
        palette="GnBu_d",
        height=5,
        legend_out=False,
        xlim=(0.94, 1.005),
        ylim=(0.90, 1.005),
    )
    .map_dataframe(sns.ecdfplot, y="accuracy", complementary=True)
    .set_axis_labels(x_var="Proportion", y_var="Accuracy")
    .add_legend(title="Send coefficient")
)
save(g, "outputs/send-coefficient_accuracy_aggregate")

In [None]:
# Complementary ECDFs of accuracy, one stacked panel per network type and one
# curve per send coefficient (>= 1 only), drawn from a 10% subsample.
# The subsample is seeded so that the saved figure is identical on every
# Restart & Run All; the seed matches the one used for the runtime split.
data = results.tabular.filter(pl.col("send_coefficient") >= 1).sample(fraction=0.1, seed=12345)
data = format_network_types(data)
g = sns.FacetGrid(
    data,
    col="network_type",
    col_order=data["network_type"].unique().sort(),
    hue="send_coefficient",
    col_wrap=1,
    xlim=(0.88, 1.005),
    ylim=(0.90, 1.005),
    despine=False,
    palette="GnBu_d",
    aspect=4,
)
g.map_dataframe(sns.ecdfplot, y="accuracy", complementary=True, alpha=0.7)
g.set_axis_labels("Proportion", "Accuracy")
g.set_titles("{col_name}")
g.add_legend(title="Send coefficient")
save(g, "outputs/send-coefficient_accuracy_network-type")

In [None]:
# ECDF of n_receives per network type (one column each), one curve per send
# coefficient (>= 1 only), drawn from a 50% subsample.
data = results.tabular
data = data.filter(pl.col("send_coefficient") >= 1)
# Seeded so the figure is reproducible across kernel restarts.
data = data.sample(fraction=0.5, seed=12345)
g = sns.FacetGrid(
    data,
    hue="send_coefficient",
    col="network_type",
    height=5,
    sharex=False,
    sharey=False,
)
# The column must be passed as x=...: ecdfplot's only positional parameter is
# `data`, which map_dataframe already supplies as a keyword, so a positional
# column name raises "got multiple values for argument 'data'".
g.map_dataframe(sns.ecdfplot, x="n_receives")
g.add_legend()
plt.show()

## Tolerance experiment

In [None]:
# Load the dataset registered under the name 'tolerance' (rebinding `df`) and
# preview its first three rows.
df = load_dataset('tolerance')
df.head(3)

In [None]:
# Accuracy results keyed on the tolerance parameter; rebinds `results`.
results = compute_accuracy_results(
    process_parameter_dataset(df),
    percentiles=[0, 0.01, 0.1, 1],
    parameter='tolerance',
)

In [None]:
# Display the aggregated accuracy results for the tolerance experiment.
results.aggregated

In [None]:
# Complementary ECDF of accuracy with one curve per tolerance value.
# seaborn draws the legend itself so each label is guaranteed to match its
# curve; the previous hand-built legend referenced an undefined name (`acc`)
# and relied on the order in which unique() happened to return values.
ax = sns.ecdfplot(results.tabular, y='accuracy', hue='tolerance', complementary=True)
ax.set_xlim(0.9875)  # lower bound only; the upper bound stays automatic
ax.set_ylim(0.99)    # lower bound only; the upper bound stays automatic
ax.set_ylabel('Accuracy')
ax.set_xlabel('Proportion')
sns.move_legend(ax, 'lower left')
# plt.savefig('accuracy.png', dpi=500)

## Efficiency experiments

In [None]:
# Load and preprocess the 'send-coefficient' dataset in one step, then preview
# the first three rows.
df = process_parameter_dataset(load_dataset('send-coefficient'))
df.head(3)

In [None]:
# Density of n_receives. The original call was left unterminated with an empty
# x column, which is a syntax error that stops the notebook from running.
# TODO: supply the intended x variable for a bivariate density.
sns.kdeplot(df, y='n_receives');

In [None]:
# Median n_influenced for each (dataset, network source, score source,
# send coefficient) group.
tabular = df.group_by(
    "dataset_id", "network_source", "score_source", 'send_coefficient'
).agg(
    pl.col('n_influenced').median().alias('n_influenced_median')
)

In [None]:
# Efficiency results for the send coefficient with by_network_type enabled.
results = compute_efficiency_results(
    df,
    parameter='send_coefficient',
    by_network_type=True,
)
# results.hvplot.box(y='n_updates', by='send_coefficient')
# results.hvplot.scatter(y='n_updates', x='send_coefficient', by='network_type')

In [None]:
# Normalized, aggregated efficiency results with min_parameter_value=1;
# rebinds `results`.
results = compute_efficiency_results(
    df,
    parameter='send_coefficient',
    normalize=True,
    min_parameter_value=1,
    aggregate=True,
)
# results.hvplot.scatter(y='n_receives', x='send_coefficient')

In [None]:
# Display the efficiency results bound to `results` as the cell output.
results

In [None]:
# Options shared by both efficiency computations below.
kwargs = dict(normalize=True, min_parameter_value=0)

# Same call twice: once with the helper's defaults, once with
# by_network_type=True.
efficiency = compute_efficiency_results(df, parameter='send_coefficient', **kwargs)
efficiency_nt = compute_efficiency_results(
    df,
    parameter='send_coefficient',
    **kwargs,
    by_network_type=True,
)

In [None]:
def percentiles(df, metric, percentiles=(0, 10, 25, 50, 75, 90, 95, 99, 100),
                group_by=('network_type', 'send_coefficient')):
    """Tabulate quantiles of one column per group.

    Args:
        df: Frame to summarize.
        metric: Name of the column whose quantiles are computed.
        percentiles: Percentile ranks on a 0-100 scale; each is divided by
            100 before being passed to ``quantile``.
        group_by: Column names to group on and then sort by.

    Returns:
        One row per group with one column per percentile, named as the LaTeX
        label ``$$P_{p}$$``, sorted by the grouping columns.
    """
    column = pl.col(metric)
    quantile_exprs = {}
    for pct in percentiles:
        # Doubled braces emit the literal { } that LaTeX needs around the subscript.
        quantile_exprs[f'$$P_{{{pct!r}}}$$'] = column.quantile(pct / 100)
    grouped = df.group_by(*group_by)
    summary = grouped.agg(**quantile_exprs)
    return summary.sort(group_by)

In [None]:
# Quantiles of msg_reachability per (network_type, tolerance) group.
# NOTE(review): `df` was loaded from the 'send-coefficient' dataset — confirm
# that it carries a 'tolerance' column.
percentiles(
    df,
    metric='msg_reachability',
    group_by=('network_type', 'tolerance'),
)

In [None]:
# Quantiles of msg_reachability per network type.
percentiles(
    df,
    metric='msg_reachability',
    group_by=['network_type'],
)

In [None]:
# Quantiles of msg_reachability per tolerance value.
# NOTE(review): `df` was loaded from the 'send-coefficient' dataset — confirm
# that it carries a 'tolerance' column.
percentiles(
    df,
    metric='msg_reachability',
    group_by=['tolerance'],
)

In [None]:
# Quantiles of n_influenced using the function's default grouping.
percentiles(df, metric='n_influenced')

In [None]:
# Quantiles of n_influenced per network type.
percentiles(
    df,
    metric='n_influenced',
    group_by=['network_type'],
)

In [None]:
# Quantiles of n_influenced per tolerance value.
# NOTE(review): `df` was loaded from the 'send-coefficient' dataset — confirm
# that it carries a 'tolerance' column.
percentiles(
    df,
    metric='n_influenced',
    group_by=['tolerance'],
)

In [None]:
# Quantiles of n_influences using the function's default grouping.
percentiles(df, metric='n_influences')

In [None]:
# Quantiles of n_influences per network type.
percentiles(
    df,
    metric='n_influences',
    group_by=['network_type'],
)

In [None]:
# Quantiles of n_influences per send coefficient.
percentiles(
    df,
    metric='n_influences',
    group_by=['send_coefficient'],
)

In [None]:
# msg_reachability histograms for the 'BarabasiAlbert' rows: one panel per
# send coefficient, laid out three per row.
(
    df
    .filter(pl.col('network_type') == 'BarabasiAlbert')
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [None]:
# msg_reachability histograms for the 'RandomRegular' rows: one panel per
# send coefficient, laid out three per row.
(
    df
    .filter(pl.col('network_type') == 'RandomRegular')
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [None]:
# msg_reachability histograms for the 'GnmRandom' rows: one panel per send
# coefficient, laid out three per row.
# Uses the .hvplot accessor like the sibling cells: .cols(3) needs the
# HoloViews layout hvplot returns, whereas the frame's built-in .plot
# namespace is not guaranteed to be hvplot-backed (Altair in newer Polars).
(
    df
    .filter(pl.col.network_type == 'GnmRandom')
    .hvplot.hist(y='msg_reachability', by='send_coefficient', subplots=True, width=250, height=250, xlim=(-0.5, 7.5),
                 shared_axes=False)
    .cols(3)
)

In [None]:
# msg_reachability histograms for the 'WattsStrogatz' rows: one panel per
# send coefficient, laid out three per row.
(
    df
    .filter(pl.col('network_type') == 'WattsStrogatz')
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [None]:
# msg_reachability histograms over all rows: one panel per send coefficient,
# laid out three per row.
df.hvplot.hist(
    y='msg_reachability',
    by='send_coefficient',
    subplots=True,
    shared_axes=False,
    xlim=(-0.5, 7.5),
    width=250,
    height=250,
).cols(3)

In [None]:
# Density of n_influenced, one panel per send coefficient with shared axes,
# laid out three per row.
# NOTE(review): clabel says 'Network type' although the split is by
# send_coefficient — confirm the intended label.
df.hvplot.kde(
    y='n_influenced', by='send_coefficient',
    subplots=True, shared_axes=True,
    xlim=(0, 100),
    xlabel='Influence set cardinality', ylabel='Density', clabel='Network type',
    fill_alpha=0.5,
    height=250, width=250,
).cols(3)

In [None]:
# Density of n_influenced for the 'WattsStrogatz' rows, overlaid per send
# coefficient in a single panel.
# NOTE(review): clabel says 'Network type' although the split is by
# send_coefficient — confirm the intended label.
(
    df
    .filter(pl.col('network_type') == 'WattsStrogatz')
    .hvplot.kde(
        y='n_influenced', by='send_coefficient',
        # subplots=True,
        # shared_axes=False,
        xlim=(0, 100),
        xlabel='Influence set cardinality', ylabel='Density', clabel='Network type',
        fill_alpha=0.5,
    )
)

In [None]:
# Density of n_influenced, overlaid per network type. The four colors are
# taken from cc.CET_CBTL4 at four evenly spaced positions between 0 and 255.
df.hvplot.kde(
    y='n_influenced', by='network_type',
    xlim=(0, 100),
    xlabel='Influence set cardinality', ylabel='Density', clabel='Network type',
    color=[cc.CET_CBTL4[int(pos)] for pos in np.linspace(0, 255, 4)],
)

In [None]:
# Density of n_influences, overlaid per network type. The four colors are
# taken from cc.CET_CBTL4 at four evenly spaced positions between 0 and 255.
df.hvplot.kde(
    y='n_influences', by='network_type',
    xlim=(0, 100),
    xlabel='Source set cardinality', ylabel='Density', clabel='Network type',
    color=[cc.CET_CBTL4[int(pos)] for pos in np.linspace(0, 255, 4)],
)

## Experiment 2: Runtime baseline

**Objective**: Determine if the runtime across distribution types is the same.

In [3]:
# Load the 'runtime-baseline' dataset and run it through the runtime
# preprocessing helper in one step.
df = process_runtime_dataset(load_dataset('runtime-baseline'))


ANOVA assumes normality, so we first check that assumption with the Shapiro–Wilk test.

https://en.wikipedia.org/wiki/Analysis_of_variance#Assumptions

https://www.pythonfordatascience.org/parametric-assumptions-python

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html#scipy.stats.shapiro

In [7]:
# Shapiro–Wilk normality test applied through the project helper.
# NOTE(review): by_distributions=True presumably groups the runtimes by data
# distribution before testing — confirm against the helper.
apply_hypothesis_test(df, stats.shapiro, by_distributions=True)

ShapiroResult(statistic=0.67198017890183, pvalue=2.042930122606384e-17)

In [8]:
# Same Shapiro–Wilk test with by_network_type=True; the recorded output is a
# dict with one result per network type.
apply_hypothesis_test(df, stats.shapiro, by_distributions=True, by_network_type=True)

{'BarabasiAlbert': ShapiroResult(statistic=0.6455841858114166, pvalue=1.3871243031399161e-08),
 'GnmRandom': ShapiroResult(statistic=0.47224272708253245, pvalue=8.131432237889243e-11),
 'RandomRegular': ShapiroResult(statistic=0.5834436164177577, pvalue=1.872371357241357e-09),
 'WattsStrogatz': ShapiroResult(statistic=0.43728166802888524, pvalue=3.332332165092599e-11)}

The $p$-values are all far below 0.05, so we reject the null hypothesis of normality and cannot use parametric ANOVA.

To use non-parametric ANOVA, we must still ensure the homoscedasticity assumption holds.

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fligner.html

In [9]:
# Fligner–Killeen test for equal variances applied through the project helper.
apply_hypothesis_test(df, stats.fligner, by_distributions=True)

FlignerResult(statistic=2.2556518288003913, pvalue=0.9443471249508537)

In [10]:
# Same Fligner–Killeen test with by_network_type=True; the recorded output is
# a dict with one result per network type.
apply_hypothesis_test(df, stats.fligner, by_distributions=True, by_network_type=True)

{'BarabasiAlbert': FlignerResult(statistic=4.53526416757902, pvalue=0.7164668174850962),
 'GnmRandom': FlignerResult(statistic=4.8649337519478255, pvalue=0.6764418408563662),
 'RandomRegular': FlignerResult(statistic=3.9002887300636266, pvalue=0.7911879662932961),
 'WattsStrogatz': FlignerResult(statistic=2.4743184854305103, pvalue=0.9290184609818963)}

The $p$-values are high (all above 0.6), so the null hypothesis of homoscedasticity cannot be rejected.

The Kruskal-Wallis test is the non-parametric equivalent of one-way ANOVA.

https://en.wikipedia.org/wiki/Kruskal–Wallis_test

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html#scipy.stats.kruskal

In [11]:
# Kruskal–Wallis H-test applied through the project helper.
apply_hypothesis_test(df, stats.kruskal, by_distributions=True)

KruskalResult(statistic=7.030418187414728, pvalue=0.425719618877831)

In [12]:
# Same Kruskal–Wallis test with by_network_type=True; the recorded output is a
# dict with one result per network type.
apply_hypothesis_test(df, stats.kruskal, by_distributions=True, by_network_type=True)

{'BarabasiAlbert': KruskalResult(statistic=5.240544856740269, pvalue=0.63063439449162),
 'GnmRandom': KruskalResult(statistic=3.2309255079007024, pvalue=0.8628497317188233),
 'RandomRegular': KruskalResult(statistic=3.0218591549295746, pvalue=0.8829727404160843),
 'WattsStrogatz': KruskalResult(statistic=5.9706147772138, pvalue=0.5431846923466834)}

The $p$-values are high (all above 0.4), so the Kruskal–Wallis null hypothesis that the groups share the same median runtime cannot be rejected.

**Conclusion:** There is no statistically significant difference in runtime across data distributions.

## Runtime experiment

In [None]:
# Load and preprocess the 'runtime' dataset in one step, then preview the
# first five rows.
df = process_runtime_dataset(load_dataset('runtime'))
df.head(5)

In [None]:
# Reproducible train/test split of the runtime data.
# fraction=1.0 is required: DataFrame.sample() with neither n nor fraction
# returns a single row, so the shuffle must ask for the whole frame explicitly.
dataset = (
    df
    .select("n_nodes", "n_edges", "msg_runtime")
    .sample(fraction=1.0, shuffle=True, seed=12345)
)
x = dataset.select("n_nodes", "n_edges")   # features
y = dataset.select("msg_runtime")          # target
train_fraction = 0.66
test_fraction = 1 - train_fraction
# Derive the row count from train_fraction instead of repeating the literal.
n_train = int(dataset.height * train_fraction)
n_test = dataset.height - n_train
x_train = x.head(n_train)
y_train = y.head(n_train)  # was sliced from x, which made the target equal to the features
# slice(n_train) keeps every row after the training rows, including when n_train == 0.
x_test = x.slice(n_train)
y_test = y.slice(n_train)
assert x_train.height == y_train.height == n_train
assert x_test.height == y_test.height == n_test

In [None]:
# Preview the shuffled modelling frame. The original line was an unterminated
# string and call, which is a syntax error that stops the notebook from running.
# TODO: narrow the selection if a specific column was intended.
dataset.select('n_nodes', 'n_edges', 'msg_runtime').head(5)

In [None]:
# Derived graph statistics:
#   density     = 2 * n_edges / (n_nodes**2 - n_nodes)
#   mean_degree = 2 * n_edges / n_nodes
df = df.with_columns(
    (2 * pl.col('n_edges') / (pl.col('n_nodes') ** 2 - pl.col('n_nodes'))).alias('density'),
    (2 * pl.col('n_edges') / pl.col('n_nodes')).alias('mean_degree'),
)

In [None]:
# Scatter of msg_runtime against n_nodes * mean_degree, held in a throwaway
# column named 'test'.
sns.scatterplot(
    df.with_columns(test=pl.col('n_nodes') * pl.col('mean_degree')),
    x='test',
    y='msg_runtime',
);

In [None]:
# Render a runtime-vs-edges scatter for the 'BarabasiAlbert' rows and export
# it to test.svg.
# Uses the .hvplot accessor like the other cells in this notebook: hv.render
# needs a HoloViews object, which the frame's built-in .plot namespace is not
# guaranteed to return (it is Altair-backed in newer Polars).
# NOTE(review): hv, bokeh and driver are not defined in any visible cell —
# confirm they are provided by `from lib import *`.
rendered = hv.render(
    df
    .filter(pl.col('network_type').eq('BarabasiAlbert'))
    .hvplot.scatter(x='n_edges', y='msg_runtime', color='network_type')
)
bokeh.io.export_svg(rendered, filename='test.svg', webdriver=driver)