Run with the following conda environment: `../../conda_envs/training_env`

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pylab as pylab
from matplotlib import ticker as mticker

In [None]:
# Root of the dataset tree, relative to the notebook's working directory.
# NOTE(review): `data_folder` ends with "/" and every suffix below starts with "/",
# so the resulting paths contain "//" -- presumably harmless on the target OS; confirm.
data_folder = "../data/"

# Per-trace filename prefixes; the loading cell appends "<protocol>.csv" to each of them.
path_caida, path_mawi, path_uni = (
    f"{data_folder}/caida/preprocessed_5-20pk_tcpudpicmp/135000_",
    f"{data_folder}/mawi/preprocessed_5-20pk_tcpudpicmp/1920_",
    f"{data_folder}/uni/preprocessed_5-20pk_tcpudpicmp/151_",
)

In [None]:
# Global matplotlib styling shared by every figure produced in this notebook.
plt.rc('pdf', fonttype=42)
plt.rc('font', size=16)

# Explicit sizes for legends, axis labels/titles, tick labels and the figure title.
params = {
    'figure.titlesize': 25,
    'axes.titlesize': 22,
    'axes.labelsize': 20,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'legend.title_fontsize': 16,
    'legend.fontsize': 14,
}
pylab.rcParams.update(params)

In [None]:
# Protocols for which one CSV per trace is loaded.
protocols = ["tcp", "udp", "icmp"]

# Flow-size threshold used when counting flows with `flow_size >= n_pk`.
n_pk = 5

# Names of the feature columns of interest.
# NOTE(review): not referenced by the other cells visible in this notebook -- confirm it is still needed.
features = [
    "pk_size_mean_6",
    "pk_size_std_6",
    "iat_mean_5",
    "iat_std_5",
]

In [None]:
# Read one CSV per (trace, protocol) and tag every frame with the protocol it came from.
data_uni, data_mawi, data_caida = [], [], []
for proto in protocols:
    csv_uni = pd.read_csv(f"{path_uni}{proto}.csv", header=0, engine="pyarrow").assign(protocol=proto)
    data_uni.append(csv_uni)

    csv_caida = pd.read_csv(f"{path_caida}{proto}.csv", header=0, engine="pyarrow").assign(protocol=proto)
    data_caida.append(csv_caida)

    # NOTE(review): only the MAWI files get `first_ts` parsed as dates -- confirm this
    # asymmetry with the other two traces is intentional.
    csv_mawi = pd.read_csv(
        f"{path_mawi}{proto}.csv", header=0, engine="pyarrow", parse_dates=['first_ts']
    ).assign(protocol=proto)
    data_mawi.append(csv_mawi)

# Collapse each per-protocol list into a single frame labelled with its trace name.
data_mawi = pd.concat(data_mawi).assign(trace="MAWI")
data_caida = pd.concat(data_caida).assign(trace="CAIDA")
data_uni = pd.concat(data_uni).assign(trace="UNI")

# Stack the three traces, turn +/-inf into NaN and renumber the rows.
# `reset_index()` keeps the previous row labels in a new "index" column.
df = (
    pd.concat([data_caida, data_mawi, data_uni])
    .replace([np.inf, -np.inf], np.nan)
    .reset_index()
)

In [None]:
# Independent snapshot of the merged frame (`copy()` is deep by default).
df_untouched = df.copy()

In [None]:
# Number of rows with a non-null `first_ts`, per (trace, protocol).
flows_by_trace_protocol = df.groupby(["trace", "protocol"])
flows_by_trace_protocol["first_ts"].count()

In [None]:
# Same count, restricted to flows whose `flow_size` is at least `n_pk`.
has_min_size = df["flow_size"] >= n_pk
df.loc[has_min_size].groupby(["trace", "protocol"])["first_ts"].count()

In [None]:
# Last ten CAIDA flow sizes after an ascending sort (i.e. the largest ones).
df.loc[df["trace"] == "CAIDA", "flow_size"].sort_values().tail(10)

In [None]:
# Last ten MAWI flow sizes after an ascending sort (i.e. the largest ones).
df.loc[df["trace"] == "MAWI", "flow_size"].sort_values().tail(10)

Protocol breakdown

In [None]:
def _flow_size_percentile(flows, trace, protos=None, q=99):
    """Return the q-th percentile of `flow_size` for one trace.

    Parameters
    ----------
    flows : pd.DataFrame
        Frame with `trace`, `protocol` and `flow_size` columns.
    trace : str
        Value of the `trace` column to select.
    protos : list of str, optional
        Protocols to keep; `None` keeps every protocol of the trace.
    q : float
        Percentile in [0, 100], forwarded to `np.percentile`.

    Returns
    -------
    float
        The requested percentile, or NaN when no row matches the selection.
    """
    selected = flows["trace"] == trace
    if protos is not None:
        selected = selected & flows["protocol"].isin(protos)
    sizes = flows.loc[selected, "flow_size"]
    if sizes.empty:
        # A trace may have no flow for a given protocol; report "no cutoff" explicitly
        # rather than relying on what np.percentile does with an empty input.
        return np.nan
    return np.percentile(sizes, q=q)


# 99th-percentile flow size ("top 1%" cutoff) per trace, for each protocol selection.
mawi_cutoff_all_proto = _flow_size_percentile(df, "MAWI")
caida_cutoff_all_proto = _flow_size_percentile(df, "CAIDA")
uni_cutoff_all_proto = _flow_size_percentile(df, "UNI")

mawi_cutoff_tcp_udp = _flow_size_percentile(df, "MAWI", ["tcp", "udp"])
caida_cutoff_tcp_udp = _flow_size_percentile(df, "CAIDA", ["tcp", "udp"])
uni_cutoff_tcp_udp = _flow_size_percentile(df, "UNI", ["tcp", "udp"])

mawi_cutoff_tcp = _flow_size_percentile(df, "MAWI", ["tcp"])
caida_cutoff_tcp = _flow_size_percentile(df, "CAIDA", ["tcp"])
uni_cutoff_tcp = _flow_size_percentile(df, "UNI", ["tcp"])

mawi_cutoff_udp = _flow_size_percentile(df, "MAWI", ["udp"])
caida_cutoff_udp = _flow_size_percentile(df, "CAIDA", ["udp"])
uni_cutoff_udp = _flow_size_percentile(df, "UNI", ["udp"])

mawi_cutoff_icmp = _flow_size_percentile(df, "MAWI", ["icmp"])
caida_cutoff_icmp = _flow_size_percentile(df, "CAIDA", ["icmp"])
uni_cutoff_icmp = _flow_size_percentile(df, "UNI", ["icmp"])

In [None]:
def _short_flow_fraction(flows, trace, protos=None, min_pk=5):
    """Return the fraction of flows of one trace whose `flow_size` is below `min_pk`.

    Parameters
    ----------
    flows : pd.DataFrame
        Frame with `trace`, `protocol` and `flow_size` columns.
    trace : str
        Value of the `trace` column to select.
    protos : list of str, optional
        Protocols to keep; `None` keeps every protocol of the trace.
    min_pk : int
        Flow-size threshold; flows strictly below it count as short.

    Returns
    -------
    float
        Fraction in [0, 1], or NaN when no row matches the selection.
    """
    selected = flows["trace"] == trace
    if protos is not None:
        selected = selected & flows["protocol"].isin(protos)
    sizes = flows.loc[selected, "flow_size"].to_numpy()
    if sizes.size == 0:
        # Nothing selected: return NaN directly instead of averaging an empty array.
        return np.nan
    return (sizes < min_pk).mean()


# Fraction of flows with fewer than 5 packets per trace, for each protocol selection.
mawi_5pkq_all_proto = _short_flow_fraction(df, "MAWI")
caida_5pkq_all_proto = _short_flow_fraction(df, "CAIDA")
uni_5pkq_all_proto = _short_flow_fraction(df, "UNI")

mawi_5pkq_tcp_udp = _short_flow_fraction(df, "MAWI", ["tcp", "udp"])
caida_5pkq_tcp_udp = _short_flow_fraction(df, "CAIDA", ["tcp", "udp"])
uni_5pkq_tcp_udp = _short_flow_fraction(df, "UNI", ["tcp", "udp"])

mawi_5pkq_tcp = _short_flow_fraction(df, "MAWI", ["tcp"])
caida_5pkq_tcp = _short_flow_fraction(df, "CAIDA", ["tcp"])
uni_5pkq_tcp = _short_flow_fraction(df, "UNI", ["tcp"])

mawi_5pkq_udp = _short_flow_fraction(df, "MAWI", ["udp"])
caida_5pkq_udp = _short_flow_fraction(df, "CAIDA", ["udp"])
uni_5pkq_udp = _short_flow_fraction(df, "UNI", ["udp"])

mawi_5pkq_icmp = _short_flow_fraction(df, "MAWI", ["icmp"])
caida_5pkq_icmp = _short_flow_fraction(df, "CAIDA", ["icmp"])
uni_5pkq_icmp = _short_flow_fraction(df, "UNI", ["icmp"])

Flow size distribution

In [None]:
def _annotate_cdf_facet(ax, title, cutoff, short_fraction, xlabel, mark_5pk):
    """Decorate one ECDF facet with the top-1% cutoff and, optionally, the 5-packet mark."""
    # Dotted guides meeting at (cutoff, 99 %), plus a label next to the vertical one.
    ax.vlines(x=cutoff, ls='dotted', ymin=0, ymax=99, color="black")
    ax.hlines(y=99, ls='dotted', xmin=1, xmax=cutoff, color="black")
    ax.annotate(f"Top 1%: {int(round(cutoff, 0))} pk", xy=(cutoff, 50), xytext=(5, 0), textcoords="offset points", size=16)
    if mark_5pk:
        # `short_fraction` is in [0, 1] while the ECDF is drawn in percent.
        ax.vlines(x=5, ls='dotted', ymin=0, ymax=short_fraction*100, color="black")
        ax.hlines(y=short_fraction*100, ls='dotted', xmin=1, xmax=5, color="black")
        ax.annotate("5pk", xy=(5, 10), xytext=(5, 0), textcoords="offset points", size=16)
    ax.set_ylabel("CDF [%]")
    ax.set_xlabel(xlabel)
    ax.set_title(title)


def plot_cdf(df, protocols, cutoff_mawi, cutoff_caida, cutoff_uni, pk_mawi, pk_caida, pk_uni, save_path):
    """Plot the flow-size ECDF of each trace (one facet row per trace) and save it as a PDF.

    Parameters
    ----------
    df : pd.DataFrame
        Flows with `protocol`, `trace` and `flow_size` columns.
    protocols : list of str
        Protocols to keep; also used to build the x-axis label.
    cutoff_mawi, cutoff_caida, cutoff_uni : float
        Flow size at which the "Top 1%" marker is drawn for each trace.
    pk_mawi, pk_caida, pk_uni : float
        Fraction (0-1) of flows below 5 packets, used as the height of the "5pk" marker.
    save_path : str
        Destination of the PDF file.

    A trace with no facet (no row left after the protocol filter) is skipped and
    reported on stdout as "No <trace>".
    """
    g = sns.FacetGrid(df[df["protocol"].isin(protocols)], row="trace", hue="trace")
    g.fig.set_size_inches(6,5)
    g.map(sns.ecdfplot, "flow_size", log_scale=True, legend="brief", stat="percent")

    xlabel = f'Flow size ({"+".join([p.upper() for p in protocols])})'
    icmp_only = list(protocols) == ["icmp"]
    # NOTE(review): the 5-packet marker is skipped for ICMP-only plots on MAWI and UNI
    # but always drawn for CAIDA -- kept as in the previous version; confirm it is intended.
    facets = (
        ("MAWI", cutoff_mawi, pk_mawi, not icmp_only),
        ("UNI", cutoff_uni, pk_uni, not icmp_only),
        ("CAIDA", cutoff_caida, pk_caida, True),
    )
    for trace, cutoff, short_fraction, mark_5pk in facets:
        # With only `row` assigned, `axes_dict` is keyed by the trace name.
        ax = g.axes_dict.get(trace)
        if ax is None:
            print(f"No {trace}")
            continue
        _annotate_cdf_facet(ax, trace, cutoff, short_fraction, xlabel, mark_5pk)

    plt.locator_params(axis='y', nbins=3)
    plt.tight_layout()
    plt.savefig(f'{save_path}', format='pdf', dpi=1200)
    plt.show()

In [None]:
# Flow-size CDF per trace, TCP flows only.
plot_cdf(
    df=df,
    protocols=["tcp"],
    cutoff_caida=caida_cutoff_tcp, pk_caida=caida_5pkq_tcp,
    cutoff_mawi=mawi_cutoff_tcp, pk_mawi=mawi_5pkq_tcp,
    cutoff_uni=uni_cutoff_tcp, pk_uni=uni_5pkq_tcp,
    save_path="./models/out/12Aapp_TCP-flow-sizes_caida_mawi.pdf",
)

In [None]:
# Flow-size CDF per trace, UDP flows only.
plot_cdf(
    df=df,
    protocols=["udp"],
    cutoff_caida=caida_cutoff_udp, pk_caida=caida_5pkq_udp,
    cutoff_mawi=mawi_cutoff_udp, pk_mawi=mawi_5pkq_udp,
    cutoff_uni=uni_cutoff_udp, pk_uni=uni_5pkq_udp,
    save_path="./models/out/12Bapp_UDP-flow-sizes_caida_mawi.pdf",
)

In [None]:
# Flow-size CDF per trace, ICMP flows only.
plot_cdf(
    df=df,
    protocols=["icmp"],
    cutoff_caida=caida_cutoff_icmp, pk_caida=caida_5pkq_icmp,
    cutoff_mawi=mawi_cutoff_icmp, pk_mawi=mawi_5pkq_icmp,
    cutoff_uni=uni_cutoff_icmp, pk_uni=uni_5pkq_icmp,
    save_path="./models/out/12Capp_ICMP-flow-sizes_caida_mawi.pdf",
)

In [None]:
# Histogram of the number of flows per protocol, one facet row per trace.
# NOTE(review): `palette` is forwarded to histplot without a `hue` variable, so it
# presumably has no effect (bar colours come from FacetGrid's own hue) -- confirm.
g = sns.FacetGrid(df, row="trace", hue="protocol", sharey=False)
g.fig.set_size_inches(6, 5)
g.map(sns.histplot, "protocol", palette="viridis")

axes = g.fig.axes
for ax in axes:
    # Replace the default facet title by the bare trace name.
    facet_title = ax.get_title()
    for trace_name in ("MAWI", "CAIDA", "UNI"):
        if trace_name in facet_title:
            ax.set_title(trace_name)
            break
    ax.set_ylabel("# flows")

# Upper-case the protocol tick labels on the last axes of the figure.
last_ax = axes[-1]
last_ax.set_xticklabels([label.get_text().upper() for label in last_ax.get_xticklabels()])

# Hand-picked y ticks per facet.
# NOTE(review): these values and the axes order are hard-coded for the current data -- confirm.
axes[0].set_yticks([500_000, 1_000_000])
axes[1].set_yticks([500_000, 1_000_000])
axes[2].set_yticks([2000, 4000])
axes[2].set_xlabel("Protocol")

plt.locator_params(axis='y', nbins=3)
plt.tight_layout()
plt.savefig('./models/out/13Aapp_protocols-count_caida_mawi.pdf', format='pdf', dpi=1200)
plt.show()

In [None]:
# Flow-size CDF per trace, TCP and UDP flows together.
plot_cdf(
    df=df,
    protocols=["tcp", "udp"],
    cutoff_caida=caida_cutoff_tcp_udp, pk_caida=caida_5pkq_tcp_udp,
    cutoff_mawi=mawi_cutoff_tcp_udp, pk_mawi=mawi_5pkq_tcp_udp,
    cutoff_uni=uni_cutoff_tcp_udp, pk_uni=uni_5pkq_tcp_udp,
    save_path='./models/out/13Bapp_TCP+UDP-flow-sizes_caida_mawi.pdf',
)

In [None]:
# Flow-size CDF per trace, all three protocols together.
plot_cdf(
    df=df,
    protocols=["tcp", "udp", "icmp"],
    cutoff_caida=caida_cutoff_all_proto, pk_caida=caida_5pkq_all_proto,
    cutoff_mawi=mawi_cutoff_all_proto, pk_mawi=mawi_5pkq_all_proto,
    cutoff_uni=uni_cutoff_all_proto, pk_uni=uni_5pkq_all_proto,
    save_path='./models/out/13Capp_TCP+UDP+ICMP-flow-sizes_caida_mawi.pdf',
)