Run with the following conda environment: `../../conda_envs/training_env`

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pylab as pylab
import seaborn as sns
import matplotlib
from matplotlib.lines import Line2D

In [None]:
# Root folder holding the preprocessed traces (relative to this notebook).
data_folder = "../data/"
# Join on a slash-stripped root so the paths never contain "//": later cells
# recover the trace name with path.split("/")[2], which would be an empty
# string if the separator were doubled.
_data_root = data_folder.rstrip("/")
path_caida = f"{_data_root}/caida/preprocessed_5-20pk_tcpudpicmp/135000_all_proto.csv"
path_mawi = f"{_data_root}/mawi/preprocessed_5-20pk_tcpudpicmp/1920_all_proto.csv"
path_uni = f"{_data_root}/uni/preprocessed_5-20pk_tcpudpicmp/0_all_proto.csv" # UNI trace with the most flows

In [None]:
# Global matplotlib styling, applied before any figure is created.
plt.rc("font", size=16)     # base font size
plt.rc("pdf", fonttype=42)  # font type used when saving PDF figures

# Per-element font sizes layered on top of the base size.
params = {
    "legend.fontsize": 14,
    "legend.title_fontsize": 16,
    "axes.labelsize": 20,
    "axes.titlesize": 22,
    "xtick.labelsize": 18,
    "ytick.labelsize": 18,
    "figure.titlesize": 25,
}
pylab.rcParams.update(params)

In [None]:
# Per-trace containers, all keyed by the CSV path of the trace:
#   all_data    -> full flow table read from the CSV
#   all_sizes   -> its "flow_size" column
#   all_cutoffs -> 99th percentile of the flow sizes
all_data, all_sizes, all_cutoffs = {}, {}, {}

for path in (path_uni, path_caida, path_mawi):
    data = pd.read_csv(path, header=0)
    sizes = data["flow_size"]
    cutoff = np.percentile(sizes, 99)
    all_data[path], all_sizes[path], all_cutoffs[path] = data, sizes, cutoff

In [None]:
# Empirical CDF of the flow sizes for each trace, keyed by CSV path.
all_sizes_cdfs = {}

for path, sizes in all_sizes.items():
    # Number of flows observed for every distinct flow size.
    counts = sizes.value_counts()
    sizes_cdf = pd.DataFrame(
        {"flow_size": counts.index, "n_flows": counts.values}
    ).sort_values(by="flow_size", ascending=True)

    # Running flow count over increasing sizes, normalised by the total.
    n_flows_total = sizes_cdf["n_flows"].sum()
    sizes_cdf["cumsum"] = sizes_cdf["n_flows"].cumsum()
    sizes_cdf["cdf"] = sizes_cdf["cumsum"] / n_flows_total
    sizes_cdf["tmp"] = 1 # For CDF extraction to .txt file fed to the scheduling simulator
    all_sizes_cdfs[path] = sizes_cdf

In [None]:
# Share of the total traffic carried by flows of each size, keyed by CSV path.
all_traffic_shares = {}

for path, sizes in all_sizes.items():
    # Number of flows observed for every distinct flow size.
    counts = sizes.value_counts()
    traffic_share = pd.DataFrame({"flow_size": counts.index, "n_flows": counts.values})

    # Volume contributed by all flows of one size, and its fraction of the
    # trace-wide volume.
    volume = traffic_share["flow_size"] * traffic_share["n_flows"]
    traffic_share["total_size"] = volume
    traffic_share["share"] = volume / volume.sum()

    # Accumulate the shares over increasing flow size.
    cdf = traffic_share.sort_values(by="flow_size", ascending=True)
    cdf["cum_share"] = cdf["share"].cumsum()
    all_traffic_shares[path] = cdf

In [None]:
# Report how many flows each trace contains.
for path, data in all_data.items():
    # The trace name is the third non-empty path component
    # ("../data/<trace>/..."). Empty components are dropped first because
    # str.split("/") yields "" between consecutive slashes, which would
    # otherwise shift the index and print an empty name.
    trace_name = [part for part in path.split("/") if part][2]
    print(f"{trace_name}: {data.shape[0]} flows")

In [None]:
# Line/annotation colour for each trace name.
palette = dict(
    caida="tab:blue",
    mawi="tab:orange",
    uni="tab:red",
    raphael="tab:red",
)

# (x, y) position handed to ax.annotate as `xy` for each trace's name label.
text_pos = dict(
    caida=(5, 0.2),
    mawi=(10, 0.42),
    uni=(1000, 0.2),
)

# One figure with, for every trace: the flow-size ECDF (solid), the cumulative
# traffic share (dashed) and a dotted vertical marker at the 99th-percentile
# flow size.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

for path, data in all_data.items():
    # The trace name is the third non-empty path component
    # ("../data/<trace>/..."). Empty components are dropped first because
    # str.split("/") yields "" between consecutive slashes, which would
    # otherwise make the palette/text_pos lookups fail with a KeyError.
    trace_name = [part for part in path.split("/") if part][2]
    color = palette[trace_name]

    # Look the per-trace values up by path instead of zipping several dicts,
    # so the pairing does not depend on their insertion orders matching.
    cdf = all_traffic_shares[path]
    cutoff = int(all_cutoffs[path])

    ax.annotate(trace_name.upper(), xy=text_pos[trace_name], fontsize=16, color=color)
    sns.ecdfplot(ax=ax, data=data, x="flow_size", log_scale=True, label="flows", c=color, lw=2)

    # The marker starts at the cumulative traffic share of the first flow size
    # at or above the cutoff and runs up to 1.
    ax.vlines(
        x=cutoff, ymax=1, ymin=cdf[cdf["flow_size"] >= cutoff]["cum_share"].values[0],
        color=color, ls="dotted", lw=1, label="elephants", zorder=-1
    )

    # "cum_share" is the running sum of "share" over increasing flow size.
    ax.plot(
        cdf["flow_size"],
        cdf["cum_share"],
        color=color,
        label="traffic share", ls="--", lw=2,
    )

ax.set_xlabel("Flow size [# packets]")
ax.set_ylabel("CDF")
ax.set_xticks([1, 10, 100, 1000, 10000, 100000, 1000000])

# Build a trace-independent legend: one black proxy line per curve type.
# Entries are selected by label rather than by position in the handle list,
# so the result does not depend on the order in which the Axes reports its
# lines and collections.
handles, labels = ax.get_legend_handles_labels()

# Legend entries in display order, mapped to the line style of their proxy
# (None keeps the default line style).
legend_styles = {
    "flows": None,
    "traffic share": "--",
    "elephants": "dotted",
}

new_labels = [lbl for lbl in legend_styles if lbl in labels]
new_handles = []
for lbl in new_labels:
    proxy = Line2D([], [])
    proxy.set_color("black")
    if legend_styles[lbl] is not None:
        proxy.set_linestyle(legend_styles[lbl])
    new_handles.append(proxy)

legend1 = ax.legend(
    new_handles,
    new_labels,
    loc="lower right",
    labelspacing=0.25, columnspacing=0.5, ncol=1, framealpha=0., handlelength=1.5, markerfirst=False
)
fig.add_artist(legend1)
plt.tight_layout()
# NOTE(review): savefig expects ./models/out/ to exist already - confirm.
plt.savefig('./models/out/01_sizes_distributions.pdf', format='pdf', dpi=1200)
plt.show()