In [1]:
%reload_ext autoreload
%autoreload 2

import colorcet as cc
import holoviews as hv
from hvplot import plotting
import hvplot.polars  # noqa
import numpy as np
import polars as pl
from bokeh import io
from scipy import stats
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from sklearn import linear_model
from webdriver_manager.firefox import GeckoDriverManager 

from lib import *

# Show up to 20 rows when polars renders a table.
pl.Config.set_tbl_rows(20)

# Open a Firefox WebDriver session; `driver` is later passed to bokeh's
# `io.export_png` as its `webdriver` argument.
# NOTE(review): no visible cell calls `driver.quit()` — confirm the browser is closed elsewhere.
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

## Runtime baseline experiment

**Objective**: Determine if the runtime across distribution types is the same.

We cannot use ANOVA because the samples are not normally distributed (see the Shapiro–Wilk test results below).

https://www.pythonfordatascience.org/parametric-assumptions-python

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html#scipy.stats.f_oneway

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html#scipy.stats.shapiro

In [20]:
# Load the baseline runtime measurements and apply the shared runtime preprocessing.
df = process_runtime_dataset(load_dataset('runtime-baseline'))

In [24]:
# Shapiro-Wilk normality test on the message-passing runtimes of each network type.
for net in get_network_types(df):
    runtimes = get_message_passing_runtimes(df, net)
    shapiro_result = stats.shapiro(runtimes)
    print(net, '\t', shapiro_result)

BarabasiAlbert 	 ShapiroResult(statistic=0.6455841858114167, pvalue=1.3871243031399214e-08)
GnmRandom 	 ShapiroResult(statistic=0.4722427270825328, pvalue=8.131432237889303e-11)
RandomRegular 	 ShapiroResult(statistic=0.5834436164177577, pvalue=1.872371357241357e-09)
WattsStrogatz 	 ShapiroResult(statistic=0.43728166802888524, pvalue=3.332332165092599e-11)


We can use the nonparametric Kruskal–Wallis H-test instead to compare the medians across samples.

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html#scipy.stats.kruskal

In [25]:
# Run a Kruskal-Wallis H-test per network type, unpacking the returned
# runtimes so that each element is passed to the test as its own sample.
# NOTE(review): every network type reports statistic ~= 39.0 with the same
# p-value. H equals N - 1 when every group holds a single observation, so
# `*samples` may be unpacking individual runtimes rather than per-group
# samples — confirm what `get_message_passing_runtimes` returns and how the
# groups should be formed before relying on these p-values.
for network_type in get_network_types(df):
    samples = get_message_passing_runtimes(df, network_type)
    print(network_type, '\t', stats.kruskal(*samples))

BarabasiAlbert 	 KruskalResult(statistic=39.00000000000002, pvalue=0.4698781977712059)
GnmRandom 	 KruskalResult(statistic=39.00000000000001, pvalue=0.46987819777120615)
RandomRegular 	 KruskalResult(statistic=39.00000000000001, pvalue=0.46987819777120615)
WattsStrogatz 	 KruskalResult(statistic=39.00000000000002, pvalue=0.4698781977712059)


## Runtime experiment

In [45]:
# Load the runtime experiment and apply the shared runtime preprocessing.
df = process_runtime_dataset(load_dataset('runtime'))

In [38]:
# Message-passing runtime against edge count, coloured by network type.
df.plot.scatter(
    x='n_edges',
    y='msg_runtime',
    color='network_type',
)

## Send coefficient experiment

In [26]:
# Load the send-coefficient experiment; this rebinds `df`, replacing the
# runtime data loaded in the previous section.
df = load_dataset(name='send-coefficient')

In [10]:
# Accuracy per send coefficient at percentiles 0, 0.01, 0.1 and 1 through 6.
accuracy = df.accuracy_results(
    percentiles=[0, 0.01, 0.1] + list(range(1, 7)),
)
accuracy

send_coefficient,frequency,accuracy_p0,accuracy_p0.01,accuracy_p0.1,accuracy_p1,accuracy_p2,accuracy_p3,accuracy_p4,accuracy_p5,accuracy_p6,normalized_frequency
f64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,11258,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1.1,2899,0.944608,0.964611,0.979388,0.993281,0.997386,0.999814,1.0,1.0,1.0,0.980985
1.2,2622,0.916469,0.951758,0.971798,0.989989,0.995163,0.998132,1.0,1.0,1.0,0.976088
1.3,2535,0.88289,0.937096,0.961036,0.98634,0.992565,0.996313,0.99898,1.0,1.0,0.971659
1.4,2465,0.882113,0.922487,0.95256,0.981569,0.9899,0.994546,0.997636,1.0,1.0,0.967378
1.5,2622,0.860312,0.910898,0.946823,0.979661,0.988392,0.993467,0.996968,0.999669,1.0,0.963214
1.6,2392,0.855067,0.90064,0.939854,0.976501,0.986631,0.992109,0.995943,0.998952,1.0,0.958785
1.7,2329,0.847622,0.890528,0.934432,0.973824,0.985078,0.991087,0.995385,0.998617,1.0,0.954745
1.8,2856,0.840172,0.88372,0.930491,0.97133,0.983497,0.989901,0.994547,0.998071,1.0,0.950811
1.9,3144,0.826147,0.879437,0.924808,0.969326,0.9823,0.989292,0.994121,0.997736,1.0,0.945987


In [49]:
# Same accuracy table split by network type, at percentiles 0, 0.01, 0.1 and 1 through 11.
accuracy_nt = df.accuracy_results(
    percentiles=[0, 0.01, 0.1] + list(range(1, 12)),
    by_network_type=True,
)
accuracy_nt

network_type,send_coefficient,frequency,accuracy_p0,accuracy_p0.01,accuracy_p0.1,accuracy_p1,accuracy_p2,accuracy_p3,accuracy_p4,accuracy_p5,accuracy_p6,accuracy_p7,accuracy_p8,accuracy_p9,accuracy_p10,accuracy_p11,normalized_frequency
str,f64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""BarabasiAlbert""",1.0,7647,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"""BarabasiAlbert""",1.1,1169,0.950307,0.959333,0.973929,0.988234,0.992071,0.994533,0.99635,0.99766,0.998739,0.9997,1.0,1.0,1.0,1.0,0.947833
"""BarabasiAlbert""",1.2,880,0.916469,0.948319,0.96306,0.983075,0.988704,0.991763,0.994108,0.995937,0.997346,0.99857,0.999682,1.0,1.0,1.0,0.939859
"""BarabasiAlbert""",1.3,941,0.88289,0.927868,0.949986,0.9779,0.985225,0.989179,0.991719,0.994012,0.995881,0.997478,0.99878,1.0,1.0,1.0,0.933855
"""BarabasiAlbert""",1.4,868,0.883863,0.907635,0.940443,0.969804,0.979543,0.985391,0.989104,0.992042,0.994533,0.996278,0.997793,0.999152,1.0,1.0,0.927436
"""BarabasiAlbert""",1.5,868,0.8616,0.892708,0.931439,0.96626,0.978125,0.983806,0.987807,0.990849,0.993401,0.995587,0.997273,0.998848,1.0,1.0,0.921515
"""BarabasiAlbert""",1.6,787,0.855067,0.873413,0.921867,0.962369,0.974672,0.981378,0.985828,0.989146,0.991974,0.994493,0.996459,0.998288,0.999934,1.0,0.915593
"""BarabasiAlbert""",1.7,812,0.847622,0.872279,0.911754,0.95875,0.971658,0.97908,0.984338,0.988042,0.991238,0.994017,0.996234,0.998134,0.99978,1.0,0.910225
"""BarabasiAlbert""",1.8,780,0.840172,0.869234,0.907749,0.954759,0.96796,0.977073,0.982503,0.986757,0.989966,0.992764,0.995358,0.997441,0.999354,1.0,0.904685
"""BarabasiAlbert""",1.9,869,0.826147,0.861794,0.903636,0.951893,0.96664,0.975889,0.981551,0.986155,0.989509,0.99249,0.995197,0.997296,0.999311,1.0,0.899364


In [14]:
# Reshape the accuracy table to long form: one row per (send coefficient, percentile).
# The needed columns are selected explicitly instead of dropping the unwanted
# ones: the `accuracy` table built without `by_network_type=True` has no
# `network_type` column, and dropping a missing column raises in polars.
melted = (
    accuracy
    .select('send_coefficient', pl.col('^accuracy_p.*$'))
    .melt(id_vars=['send_coefficient'], variable_name='percentile', value_name='accuracy')
    # 'accuracy_p0.01' -> 0.01, so the percentile is numeric rather than a label.
    .with_columns(pl.col('percentile').str.strip_prefix("accuracy_p").cast(pl.Float64))
)

In [18]:
# Box plot of accuracy per percentile, overlaid with the individual values
# coloured by send coefficient, then exported to 'accuracy-box.png'.
box = melted.hvplot.box(y='accuracy', by='percentile', box_fill_alpha=0)

scatter = melted.hvplot.scatter(
    x='percentile',
    y='accuracy',
    c='send_coefficient',
    cmap=cc.b_linear_kry_5_98_c75,
    clabel='Send coefficient',
    size=10,
)
scatter = scatter.opts(jitter=0.0)

plot = (box * scatter).opts(
    xlabel='Percentile',
    ylabel='Accuracy',
    show_legend=False,
    backend_opts={"plot.toolbar.autohide": True},
)

# PNG export renders through the Firefox session held in `driver`.
io.export_png(hv.render(plot), filename='accuracy-box.png', webdriver=driver)

'/Users/rtatton/Code/sharetrace/sharetrace-akka/evaluation/accuracy-box.png'

In [19]:
# Display the box/scatter overlay inline.
plot

In [51]:
# Update counts against send coefficient, split by network type.
# NOTE(review): `by='network_type'` presumes the result still has a
# `network_type` column even though `by_network_type=True` is not passed — confirm.
results = df.efficiency_results()
results.hvplot.scatter(x='send_coefficient', y='n_updates', by='network_type')

In [46]:
# Normalised update counts per network type, plotted against send coefficient.
results = df.efficiency_results(normalize=True, by_network_type=True)
# Column name corrected from 'n_ updates' (stray space) to 'n_updates', the
# column plotted from the un-normalised results.
results.hvplot.scatter(y='n_updates', x='send_coefficient', by='network_type')

In [24]:
# Options shared by both efficiency tables.
kwargs = dict(normalize=True, min_parameter=0)

# Overall table first, then the same computation split by network type.
efficiency = df.efficiency_results(**kwargs)
efficiency_nt = df.efficiency_results(by_network_type=True, **kwargs)

In [51]:
def percentiles(df, metric, percentiles=(0, 10, 25, 50, 75, 90, 95, 99, 100),
                group_by=('network_type', 'send_coefficient')):
    """Summarise `metric` as a table of quantiles per group.

    Args:
        df: Frame-like object providing `group_by`; the aggregated result must
            support `.collect()` (presumably a polars LazyFrame or a wrapper
            around one — confirm against `lib`).
        metric: Name of the column to summarise.
        percentiles: Percentile values on a 0-100 scale; each is divided by
            100 before being passed to `quantile`.
        group_by: Column names to group by and then sort by.

    Returns:
        The collected result with the grouping columns plus one column per
        percentile, named `$$P_{p}$$`.
    """
    quantile_columns = {}
    for pct in percentiles:
        # Braces are doubled so the column name keeps literal `{`/`}` around the value.
        quantile_columns[f'$$P_{{{pct!r}}}$$'] = pl.col(metric).quantile(pct / 100)
    grouped = df.group_by(*group_by)
    summary = grouped.agg(**quantile_columns).sort(group_by)
    return summary.collect()

In [57]:
# Quantiles of `msg_reachability` per (network_type, tolerance) group.
# NOTE(review): the output recorded for this cell has no `tolerance` column and
# matches the network_type-only table — it looks stale; re-run to confirm.
percentiles(df, 'msg_reachability', group_by=('network_type', 'tolerance'))

network_type,$$P_{0}$$,$$P_{10}$$,$$P_{25}$$,$$P_{50}$$,$$P_{75}$$,$$P_{90}$$,$$P_{95}$$,$$P_{99}$$,$$P_{100}$$
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""BarabasiAlbert""",0.0,0.0,0.0,0.0,1.0,2.0,3.0,4.0,7.0
"""GnmRandom""",0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0
"""RandomRegular""",0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0
"""WattsStrogatz""",0.0,0.0,0.0,1.0,2.0,3.0,3.0,4.0,9.0


In [53]:
# Quantiles of `msg_reachability` per network type.
percentiles(df, 'msg_reachability', group_by=['network_type'])

network_type,$$P_{0}$$,$$P_{10}$$,$$P_{25}$$,$$P_{50}$$,$$P_{75}$$,$$P_{90}$$,$$P_{95}$$,$$P_{99}$$,$$P_{100}$$
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""BarabasiAlbert""",0.0,0.0,0.0,0.0,1.0,2.0,3.0,4.0,7.0
"""GnmRandom""",0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0
"""RandomRegular""",0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0
"""WattsStrogatz""",0.0,0.0,0.0,1.0,2.0,3.0,3.0,4.0,9.0


In [55]:
# Quantiles of `msg_reachability` per tolerance value.
# NOTE(review): requires `df` to have a `tolerance` column — confirm which dataset is loaded.
percentiles(df, 'msg_reachability', group_by=['tolerance'])

tolerance,$$P_{0}$$,$$P_{10}$$,$$P_{25}$$,$$P_{50}$$,$$P_{75}$$,$$P_{90}$$,$$P_{95}$$,$$P_{99}$$,$$P_{100}$$
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.001,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.002,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.003,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.004,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.005,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.006,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.007,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0
0.008,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,9.0
0.009,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0
0.01,0.0,0.0,0.0,0.0,2.0,2.0,3.0,4.0,8.0


In [58]:
# Quantiles of `n_influenced` using the default grouping (network_type, send_coefficient).
percentiles(df, 'n_influenced')

network_type,send_coefficient,$$P_{0}$$,$$P_{10}$$,$$P_{25}$$,$$P_{50}$$,$$P_{75}$$,$$P_{90}$$,$$P_{95}$$,$$P_{99}$$,$$P_{100}$$
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""BarabasiAlbert""",1.0,0.0,0.0,0.0,0.0,9.0,47.0,107.0,446.0,4413.0
"""GnmRandom""",1.0,0.0,0.0,0.0,0.0,15.0,70.0,145.0,394.0,2119.0
"""RandomRegular""",1.0,0.0,0.0,0.0,0.0,16.0,72.0,145.0,383.0,1809.0
"""WattsStrogatz""",1.0,0.0,0.0,0.0,3.0,19.0,57.0,102.0,256.0,1675.0


In [59]:
# Quantiles of `n_influenced` per network type.
percentiles(df, 'n_influenced', group_by=['network_type'])

network_type,$$P_{0}$$,$$P_{10}$$,$$P_{25}$$,$$P_{50}$$,$$P_{75}$$,$$P_{90}$$,$$P_{95}$$,$$P_{99}$$,$$P_{100}$$
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""BarabasiAlbert""",0.0,0.0,0.0,0.0,9.0,47.0,107.0,446.0,4413.0
"""GnmRandom""",0.0,0.0,0.0,0.0,15.0,70.0,145.0,394.0,2119.0
"""RandomRegular""",0.0,0.0,0.0,0.0,16.0,72.0,145.0,383.0,1809.0
"""WattsStrogatz""",0.0,0.0,0.0,3.0,19.0,57.0,102.0,256.0,1675.0


In [60]:
# Quantiles of `n_influenced` per tolerance value.
# NOTE(review): requires `df` to have a `tolerance` column — confirm which dataset is loaded.
percentiles(df, 'n_influenced', group_by=['tolerance'])

tolerance,$$P_{0}$$,$$P_{10}$$,$$P_{25}$$,$$P_{50}$$,$$P_{75}$$,$$P_{90}$$,$$P_{95}$$,$$P_{99}$$,$$P_{100}$$
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.001,0.0,0.0,0.0,0.0,15.0,62.0,127.0,372.0,4067.0
0.002,0.0,0.0,0.0,0.0,15.0,61.0,126.0,371.0,4413.0
0.003,0.0,0.0,0.0,0.0,15.0,61.0,126.0,368.0,4317.0
0.004,0.0,0.0,0.0,0.0,15.0,61.0,125.0,365.0,4321.0
0.005,0.0,0.0,0.0,0.0,15.0,61.0,125.0,364.0,4281.0
0.006,0.0,0.0,0.0,0.0,15.0,61.0,125.0,361.0,4154.0
0.007,0.0,0.0,0.0,0.0,15.0,60.0,124.0,358.0,3775.0
0.008,0.0,0.0,0.0,0.0,15.0,60.0,124.0,359.0,4231.0
0.009,0.0,0.0,0.0,0.0,15.0,60.0,123.0,355.0,3766.0
0.01,0.0,0.0,0.0,0.0,15.0,60.0,122.0,355.0,4192.0


In [None]:
# Quantiles of `n_influences` using the default grouping (network_type, send_coefficient).
percentiles(df, 'n_influences')

In [None]:
# Quantiles of `n_influences` per network type.
percentiles(df, 'n_influences', group_by=['network_type'])

In [None]:
# Quantiles of `n_influences` per send coefficient.
percentiles(df, 'n_influences', group_by=['send_coefficient'])

In [83]:
# Histograms of message reachability for the BarabasiAlbert networks,
# one panel per send coefficient, three panels per row.
(
    df
    .filter(pl.col('network_type').eq('BarabasiAlbert'))
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [27]:
# Histograms of message reachability for the RandomRegular networks,
# one panel per send coefficient, three panels per row.
(
    df
    .filter(pl.col('network_type').eq('RandomRegular'))
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [20]:
# Histograms of message reachability for the GnmRandom networks,
# one panel per send coefficient, three panels per row.
(
    df
    .filter(pl.col('network_type').eq('GnmRandom'))
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [None]:
# Histograms of message reachability for the WattsStrogatz networks,
# one panel per send coefficient, three panels per row.
(
    df
    .filter(pl.col('network_type').eq('WattsStrogatz'))
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [29]:
# Histograms of message reachability over all network types together,
# one panel per send coefficient, three panels per row.
(
    df
    .hvplot.hist(
        y='msg_reachability',
        by='send_coefficient',
        subplots=True,
        shared_axes=False,
        xlim=(-0.5, 7.5),
        width=250,
        height=250,
    )
    .cols(3)
)

In [28]:
# Kernel density of `n_influenced`, one panel per send coefficient
# (three per row, shared axes), with the x-axis limited to 0-100.
(
    df
    .hvplot.kde(
        y='n_influenced',
        by='send_coefficient',
        subplots=True,
        shared_axes=True,
        xlim=(0, 100),
        xlabel='Influence set cardinality',
        ylabel='Density',
        # The curves are split by send coefficient, so the label says so
        # (previously 'Network type', which did not match `by`).
        clabel='Send coefficient',
        fill_alpha=0.5,
        height=250,
        width=250,
    ).cols(3)
)

In [30]:
# Kernel density of `n_influenced` for the WattsStrogatz networks, one curve
# per send coefficient on a single set of axes, x-axis limited to 0-100.
(
    df
    .filter(pl.col.network_type == 'WattsStrogatz')
    .hvplot.kde(
        y='n_influenced',
        by='send_coefficient',
        xlim=(0, 100),
        xlabel='Influence set cardinality',
        ylabel='Density',
        # The curves are split by send coefficient, so the label says so
        # (previously 'Network type', which did not match `by`).
        clabel='Send coefficient',
        fill_alpha=0.5,
    )
)

In [31]:
# Kernel density of `n_influenced`, one curve per network type, coloured with
# four evenly spaced entries of the CET_CBTL4 palette.
df.hvplot.kde(
    y='n_influenced',
    by='network_type',
    xlim=(0, 100),
    xlabel='Influence set cardinality',
    ylabel='Density',
    clabel='Network type',
    color=[cc.CET_CBTL4[k] for k in map(int, np.linspace(0, 255, 4))],
)

In [33]:
# Kernel density of `n_influences`, one curve per network type, coloured with
# four evenly spaced entries of the CET_CBTL4 palette.
df.hvplot.kde(
    y='n_influences',
    by='network_type',
    xlim=(0, 100),
    xlabel='Source set cardinality',
    ylabel='Density',
    clabel='Network type',
    color=[cc.CET_CBTL4[k] for k in map(int, np.linspace(0, 255, 4))],
)

In [7]:
# Load the runtime experiment; this rebinds `df`.
# NOTE(review): earlier cells load the same dataset name via `load_dataset('runtime')`
# followed by `process_runtime_dataset` — confirm which loading API is current.
df = RuntimeDataset.load('runtime')

In [8]:
# Fraction of each run's total runtime that was spent in message passing
# (a ratio, despite the `runtime_percent` column name).
# The runtime table contains a row with total_runtime == -1 (presumably a
# missing-value sentinel — confirm); such non-positive totals get a null ratio
# instead of a meaningless negative one.
runtimes = (
    df
    .select('msg_runtime', 'total_runtime')
    .with_columns(
        runtime_percent=pl.when(pl.col('total_runtime') > 0)
        .then(pl.col('msg_runtime') / pl.col('total_runtime'))
    )
)

In [9]:
# Display the loaded runtime table.
df

id,key,network_id,network_type,n_nodes,n_edges,ct_random_type,sv_random_type,st_random_type,send_coefficient,tolerance,msg_runtime,total_runtime
str,str,str,str,i64,i64,str,str,str,f64,f64,i64,i64
"""1bf9a375-f230-4599-a64f-334404…","""1""","""84b256a5-b69e-4372-acfc-645d19…","""GnmRandom""",40000,3000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,45450,107022
"""1bf9a375-f230-4599-a64f-334404…","""2""","""947ed9d5-1119-41d4-bd63-2a573c…","""GnmRandom""",40000,3000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,12714,72063
"""1bf9a375-f230-4599-a64f-334404…","""3""","""060c0aad-4b4e-4369-b01b-d08b8e…","""GnmRandom""",40000,3000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,18017,73006
"""1bf9a375-f230-4599-a64f-334404…","""4""","""723d53dd-4539-41b4-9c45-783fcf…","""GnmRandom""",40000,3000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,13678,71712
"""1bf9a375-f230-4599-a64f-334404…","""5""","""88547a58-edd0-4710-bb91-034578…","""GnmRandom""",40000,3000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,14364,72129
"""1bf9a375-f230-4599-a64f-334404…","""6""","""386f9f45-b551-469a-a36f-8fd225…","""GnmRandom""",40000,3000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,29234,-1
"""e46e6f27-e7da-44b8-8af6-9f3a81…","""1""","""8579e7d4-e1eb-46cc-afb1-476d86…","""GnmRandom""",70000,5000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,38650,94422
"""e46e6f27-e7da-44b8-8af6-9f3a81…","""2""","""b3ae70f7-43fc-4589-bc8c-417368…","""GnmRandom""",70000,5000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,49075,107077
"""e46e6f27-e7da-44b8-8af6-9f3a81…","""3""","""86813600-78a0-4533-acc5-479cba…","""GnmRandom""",70000,5000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,35029,91951
"""e46e6f27-e7da-44b8-8af6-9f3a81…","""4""","""1abd27df-a7fe-45b2-861c-157a77…","""GnmRandom""",70000,5000000,"""Uniform""","""Uniform""","""Uniform""",1.0,0.0,39042,96820


In [12]:
# Message-passing runtime against edge count for the 10,000-node networks,
# with network type as the `groupby` dimension.
# NOTE(review): after the filter every row has n_nodes == 10_000, so colouring
# by `n_nodes` yields a single colour.
(
    df
    .filter(pl.col('n_nodes').eq(10_000))
    .hvplot.scatter(
        x='n_edges',
        y='msg_runtime',
        groupby='network_type',
        color='n_nodes',
        subplots=True,
        size=5,
        width=500,
        height=250,
    )
)

In [None]:
# Materialise the regression inputs as one NumPy array with columns
# (n_nodes, n_edges, msg_runtime).
data = df.select('n_nodes', 'n_edges', 'msg_runtime').collect().to_numpy()

# Features are every column but the last; the target is the last column.
x, y = data[:, :-1], data[:, -1]

In [None]:
# Ridge regression with regularisation strength alpha=20.
reg = linear_model.Ridge(alpha=20)

In [None]:
# Fit msg_runtime (y) on the (n_nodes, n_edges) feature matrix (x).
reg.fit(x, y)

In [None]:
# Fitted coefficients, in feature-column order: n_nodes, then n_edges.
reg.coef_

In [None]:
# Evenly spaced grid 100, 200, ..., 999900 used for both node and edge counts.
nodes = np.arange(1e2, 1e6, 100)
edges = nodes.copy()

In [None]:
# Synthetic grid of node and edge counts (100 to 999900 in steps of 100).
# Rebinds `df`, so the runtime dataset is no longer reachable under that name.
df = pl.DataFrame({
    column: np.arange(1e2, 1e6, 100)
    for column in ('nodes', 'edges')
})

In [None]:
# Score the fitted model on the same data it was trained on.
reg.score(x, y)

In [None]:
# Predicted runtime on the grid from the fitted model:
# intercept + coef[0] * nodes + coef[1] * edges.
node_term = pl.col('nodes') * reg.coef_[0]
edge_term = pl.col('edges') * reg.coef_[1]
df = df.with_columns(runtime=reg.intercept_ + node_term + edge_term)

In [None]:
# Predicted runtime against edge count on the synthetic grid.
df.hvplot.scatter(x='edges', y='runtime')

In [None]:
import seaborn as sns

In [None]:
# Regression plot of msg_runtime against the last feature column (n_edges);
# the trailing `;` suppresses the returned object's repr.
sns.regplot(x=x[:, -1], y=y);

In [None]:
# Display the full feature matrix.
# NOTE(review): looks like leftover inspection — consider `x[:5]` or removing the cell.
x