In [60]:
import hvplot.polars
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
import statsmodels.stats.api as sm

from scipy import stats

In [24]:
def load_dataset(kind: str) -> pl.LazyFrame:
    """Lazily scan the Parquet file that stores one experiment's results.

    Args:
        kind: Name of the sub-directory of ``./data`` that holds
            ``experiment.parquet`` (e.g. ``'runtime-baseline'``).

    Returns:
        A lazy frame over ``./data/<kind>/experiment.parquet``. Nothing is
        read from disk until the frame is collected.
    """
    parquet_path = f'./data/{kind}/experiment.parquet'
    return pl.scan_parquet(parquet_path)

def get_network_types(df: pl.LazyFrame) -> pl.Series:
    """Return the distinct values of the ``network_type`` column.

    Only the ``network_type`` column is selected before collecting, so the
    lazy engine does not have to materialise the other columns of the dataset
    just to list the network types.

    Args:
        df: Lazy frame that contains a ``network_type`` column.

    Returns:
        A series of the unique network types, in order of first appearance.
    """
    return (
        df
        # maintain_order keeps the result (and any loop over it) deterministic.
        .select(pl.col('network_type').unique(maintain_order=True))
        .collect()
        .get_column('network_type')
    )

## Runtime baseline experiment

**Objective**: Determine if the runtime across distribution types is the same.

We cannot use ANOVA because the samples are not normally distributed (see the Shapiro–Wilk test results below, where every p-value is far below 0.05).

https://www.pythonfordatascience.org/parametric-assumptions-python

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html#scipy.stats.f_oneway

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html#scipy.stats.shapiro

In [15]:
def get_runtimes(df, network_type, as_numpy=False):
    """Gather message runtimes for one network type, one sample per group.

    Rows are grouped by the combination of ``ct_random_type``,
    ``sv_random_type`` and ``st_random_type``; each group contributes the list
    of its ``msg_runtime`` values. The order of the groups is not fixed,
    because the grouping does not request a maintained order.

    Args:
        df: Lazy frame with the columns ``key``, ``network_type``,
            ``msg_runtime`` and the three ``*_random_type`` columns.
        network_type: Value of ``network_type`` to keep.
        as_numpy: When true, wrap the nested lists in ``np.array``. This gives
            a regular 2-D array only if every group has the same number of
            runtimes.

    Returns:
        A list of per-group runtime lists, or the ``np.array`` built from it
        when ``as_numpy`` is true.
    """
    # Rows whose key is the string '1' are excluded.
    # NOTE(review): presumably a warm-up / first run -- confirm the intent.
    is_kept_run = pl.col('key') != '1'
    is_requested_network = pl.col('network_type') == network_type

    grouped = (
        df
        .filter(is_kept_run)
        .filter(is_requested_network)
        .group_by(['ct_random_type', 'sv_random_type', 'st_random_type'])
        .agg(pl.col('msg_runtime'))
        .collect()
    )
    runtimes_per_group = grouped.get_column('msg_runtime').to_list()

    if as_numpy:
        return np.array(runtimes_per_group)
    return runtimes_per_group

In [25]:
# Lazily scan the runtime-baseline experiment; rows are only read when a
# downstream query calls `.collect()`.
df = load_dataset('runtime-baseline')
# Distinct network types in this dataset; the per-network tests iterate over them.
network_types = get_network_types(df)

In [17]:
# Shapiro-Wilk normality check per network type: the runtimes of all
# random-type groups are pooled into a single 1-D sample before testing.
for network_type in network_types:
    data = get_runtimes(df, network_type, as_numpy=True).ravel()
    print(network_type, stats.shapiro(data), sep=' \t ')

RandomRegular 	 ShapiroResult(statistic=0.31384527727963507, pvalue=1.1721280890703945e-16)
BarabasiAlbert 	 ShapiroResult(statistic=0.8246320735534654, pvalue=8.402036681101393e-08)
WattsStrogatz 	 ShapiroResult(statistic=0.905059682305744, pvalue=4.909102938770769e-05)
GnmRandom 	 ShapiroResult(statistic=0.46285027473053075, pvalue=8.615878127358096e-15)


Instead, we can use the nonparametric Kruskal–Wallis H-test to compare the medians across samples.

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html#scipy.stats.kruskal

In [18]:
# Kruskal-Wallis H-test per network type: every random-type group is passed
# to the test as its own sample.
for network_type in network_types:
    samples = get_runtimes(df, network_type)
    print(f'{network_type} \t {stats.kruskal(*samples)}')

RandomRegular 	 KruskalResult(statistic=6.744449984184931, pvalue=0.45596385748786317)
BarabasiAlbert 	 KruskalResult(statistic=8.056005933551374, pvalue=0.32768128702517296)
WattsStrogatz 	 KruskalResult(statistic=5.17628242929798, pvalue=0.6384603989978268)
GnmRandom 	 KruskalResult(statistic=10.159023455857016, pvalue=0.1797394478879054)


## Parameter experiment

In [113]:
# NOTE: this rebinds `df` and `network_types`, replacing the runtime-baseline
# data loaded earlier. Re-running the runtime-baseline cells after this cell
# would silently operate on the parameter dataset instead.
df = load_dataset('parameter')
network_types = get_network_types(df)

In [114]:
# Peek at a single row to inspect the schema of the parameter dataset.
df.limit(1).collect()

id,key,network_id,network_type,nodes,edges,ct_random_type,sv_random_type,st_random_type,transmission_rate,send_coefficient,n_influenced,n_influences,msg_reachability,msg_runtime,total_runtime,exposure_diffs,exposure_scores,n_receives,n_updates,n_contacts
str,str,str,str,i64,i64,str,str,str,f64,f64,list[i64],list[i64],list[i64],i64,i64,list[f64],list[f64],list[i64],list[i64],list[i64]
"""1714888805622""","""1""","""1714888811775""","""RandomRegular""",5000,50000,"""Uniform""","""Uniform""","""Uniform""",0.8,0.8,"[5, 0, … 22]","[37, 38, … 44]","[1, 0, … 4]",21507,67019,"[0.16572, 0.70054, … 0.0]","[0.687609, 0.769085, … 0.996443]","[77, 76, … 83]","[4, 2, … 1]","[20, 20, … 20]"


In [112]:
# 0.5 quantile of all exposure scores for the RandomRegular network, one row
# per send coefficient. The filter is on a grouping key, so applying it before
# the aggregation yields the same rows as applying it afterwards.
# To break the result down further, add 'ct_random_type', 'sv_random_type'
# and 'st_random_type' to the grouping keys (and filter on them as needed).
(
    df
    .filter(pl.col('network_type') == 'RandomRegular')
    .group_by(['network_type', 'send_coefficient'], maintain_order=True)
    .agg(pl.col('exposure_scores').flatten().quantile(0.5))
    .collect()
)

network_type,send_coefficient,exposure_scores
str,f64,f64
"""RandomRegular""",0.8,0.77021
"""RandomRegular""",0.9,0.772429
"""RandomRegular""",1.0,0.772429
"""RandomRegular""",1.1,0.772429
"""RandomRegular""",1.2,0.772429
"""RandomRegular""",1.3,0.772429
"""RandomRegular""",1.4,0.772429
"""RandomRegular""",1.5,0.772429
"""RandomRegular""",1.6,0.772429
