## Effect of embedding vector length on the final score

In [1]:
# NOTE(review): hard-coded absolute path to one user's checkout. Presumably needed so
# that the `http2vec` imports and the relative `data/...` paths used below resolve --
# confirm, and adjust the path when running on another machine.
cd /home/mateuszg/http2vec

/home/mateuszg/http2vec


In [2]:
import ipywidgets as widgets
from ipywidgets import Dropdown, IntSlider
import pandas as pd

from http2vec.evaluation import metrics_generator
from http2vec.evaluation import get_data
from http2vec.evaluation import get_split
from http2vec.evaluation import get_predictions


# Language models iterated over by the cells below.
language_models = [
    "bow",
    "roberta",
    "fasttext",
]

# Dataset names offered in every `dataset` dropdown below.
datasets = [
    "MALICIOUSURL",
    "UNSW-NB15",
    "CSIC2010",
    "ISCXURL2016",
]


@widgets.interact(
    # value= is set explicitly: when a ready-made widget instance is passed,
    # interact uses the widget's own value and ignores the defaults declared
    # in the function signature below.
    dataset=Dropdown(options=datasets, value="CSIC2010"),
    classifier=Dropdown(options=["rf", "lr", "svc", "mlp"], value="lr"),
)
def update(
    dataset="CSIC2010",
    classifier="lr"
):
    """Display the `metrics_generator` table of every language model.

    Called by the widget each time a dropdown changes.

    Args:
        dataset (str): one of the names in `datasets`.
        classifier (str): classifier key forwarded to `metrics_generator`.
    """
    for lm in language_models:
        df = metrics_generator(
            language_model=lm,
            dataset=dataset,
            classifier=classifier,
        )
        # Print the model name as a heading for the table that follows.
        print(lm)
        display(df)

interactive(children=(Dropdown(description='dataset', options=('MALICIOUSURL', 'UNSW-NB15', 'CSIC2010', 'ISCXU…

In [3]:
import os
import json

def metrics_generator_clust(language_model, dataset, classifier="kmeans",
                            base_dir="data/clustering"):
    """Generate DataFrame with clustering metrics, one row per embedding length.

    Scans `base_dir` for entries whose name starts with
    `<language_model>-<dataset>` and ends with `-<length>` (digits only), and
    reads the `classifier` entry of each entry's `metrics.json`.

    Args:
        language_model (str): language model prefix, e.g. roberta, bow
        dataset (str): dataset name without the model prefix, e.g. CSIC2010
        classifier (str): key to select inside every metrics.json, e.g. kmeans
        base_dir (str): directory that holds the per-run result directories

    Returns: DataFrame built from the selected metrics plus a "length" column,
        sorted by "length". Runs whose metrics.json is missing, unreadable or
        has no `classifier` entry are skipped.
    """
    prefix = f"{language_model}-{dataset}"
    rows = []
    lens = []

    for filename in os.listdir(base_dir):
        length = filename.split("-")[-1]
        if not (filename.startswith(prefix) and length.isdigit()):
            continue
        try:
            with open(os.path.join(base_dir, filename, "metrics.json")) as f:
                metrics = json.load(f)[classifier]
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: skip runs with a missing/unreadable file (OSError),
            # invalid JSON (ValueError) or no entry for this classifier
            # (KeyError/TypeError). Anything else is a real bug and propagates.
            continue
        rows.append(metrics)
        lens.append(int(length))

    ms = pd.DataFrame(rows)
    ms["length"] = lens
    # The former `ms.style.set_caption(...)` call was dropped: it built a Styler
    # that was immediately discarded, so it never affected the returned frame.
    return ms.sort_values(by="length")

@widgets.interact(
    # value= is set explicitly: when a ready-made widget instance is passed,
    # interact uses the widget's own value and ignores the defaults declared
    # in the function signature below.
    dataset=Dropdown(options=datasets, value="CSIC2010"),
    classifier=Dropdown(options=["kmeans", "ac", "dbscan"], value="kmeans"),
)
def update(
    dataset="CSIC2010",
    classifier="kmeans"
):
    """Display the `metrics_generator_clust` table of every language model.

    Called by the widget each time a dropdown changes.

    Args:
        dataset (str): one of the names in `datasets`.
        classifier (str): key forwarded to `metrics_generator_clust`.
    """
    for lm in language_models:
        df = metrics_generator_clust(
            language_model=lm,
            dataset=dataset,
            classifier=classifier,
        )
        # Print the model name as a heading for the table that follows.
        print(lm)
        display(df)

interactive(children=(Dropdown(description='dataset', options=('MALICIOUSURL', 'UNSW-NB15', 'CSIC2010', 'ISCXU…

In [4]:
import matplotlib.pyplot as plt
import tikzplotlib
# Switch matplotlib to its "ggplot" style sheet for the plots drawn below.
plt.style.use("ggplot")

In [5]:
import tikzplotlib

@widgets.interact(
    # value= is set explicitly: when a ready-made widget instance is passed,
    # interact uses the widget's own value and ignores the defaults declared
    # in the function signature below.
    dataset=Dropdown(options=datasets, value="CSIC2010"),
    classifier=Dropdown(options=["rf", "lr", "svc"], value="lr"),
)
def update(
    dataset="CSIC2010",
    classifier="lr"
):
    """Plot F1 against embedding length for every language model.

    Draws one scatter plus one line per language model on a shared axis,
    exports the figure with tikzplotlib and displays a min/mean/max F1 table.

    Args:
        dataset (str): one of the names in `datasets`; also used as plot title.
        classifier (str): classifier key forwarded to `metrics_generator`.
    """
    ax = None
    f1s_avg = []
    for lm in language_models:
        df = metrics_generator(language_model=lm, dataset=dataset, classifier=classifier)
        f1s_avg.append([df["f1"].min(), df["f1"].mean(), df["f1"].max()])
        # The first call creates the axis (ax=None); later calls draw onto it.
        ax = df.plot(x="length", y="f1", kind="scatter", title=dataset, ax=ax)
        df.plot.line(x="length", y="f1", ax=ax)
    # NOTE(review): labels are matched to plotted artists by position, and both
    # scatters and lines are on the axis -- verify that each label ends up on
    # the intended language model with the installed matplotlib version.
    plt.legend(language_models)
    #tikzplotlib.clean_figure()
    # Include the selected classifier in the file name; the former hard-coded
    # "lr" made rf/svc exports overwrite (and masquerade as) the lr figure.
    tikzplotlib.save(f'{dataset}-{classifier}-length.tex')
    #plt.savefig(f'{dataset}-{classifier}-length.pgf', backend='pgf')
    display(pd.DataFrame(f1s_avg, index=language_models, columns=["min F1", "mean F1", "max F1"]))

interactive(children=(Dropdown(description='dataset', options=('MALICIOUSURL', 'UNSW-NB15', 'CSIC2010', 'ISCXU…

# Two-way table (contingency table of predictions vs. attack category)

In [6]:
from sklearn.metrics import f1_score

@widgets.interact(
    # value= is set explicitly: when a ready-made widget instance is passed,
    # interact uses the widget's own value and ignores the defaults declared
    # in the function signature below.
    dataset=Dropdown(options=datasets, value="CSIC2010"),
    lm=Dropdown(options=language_models, value="bow"),
    size=Dropdown(options=[96, 192, 384, 768, 1536, 3072], value=96),
    classifier=Dropdown(options=["rf", "lr", "svc"], value="lr"),
)
def update(
    dataset="CSIC2010",
    lm="bow",
    size=96,
    classifier="lr"
):
    """Print the F1-score and show predictions cross-tabulated by attack category.

    Args:
        dataset (str): one of the names in `datasets`.
        lm (str): one of the names in `language_models`.
        size (int): embedding length forwarded to `get_split`/`get_predictions`.
        classifier (str): classifier key forwarded to `get_predictions`.

    Raises:
        ValueError: if some test id has no prediction.
    """
    data = get_data(dataset)
    # Index by string id without mutating the frame returned by get_data.
    data = data.set_index(data["id"].astype(str))
    train_ids, test_ids = get_split(dataset=dataset, language_model=lm, size=size)
    predictions = get_predictions(dataset=dataset, language_model=lm, size=size, classifier=classifier)

    # Explicit copy so the column assignment below never targets a slice view.
    data = data.loc[test_ids].copy()
    if (data["attack_cat"] == "").all():
        # No attack categories in this dataset: fall back to the label itself.
        data["attack_cat"] = data["label"]

    # assumes `predictions` maps the same string ids as `test_ids` to a
    # predicted label -- TODO confirm against get_predictions.
    predictions = pd.DataFrame(predictions.items(), columns=["index", "pred"]).set_index("index")
    # f1_score compares its arguments position by position, so put the
    # predictions in exactly the same row order as the test data first.
    predictions = predictions.reindex(data.index)
    if predictions["pred"].isna().any():
        raise ValueError("missing predictions for some test ids")
    f1 = f1_score(data["label"], predictions["pred"], pos_label="anomaly")
    print("F1-score: ", f1)

    data = pd.merge(predictions, data["attack_cat"], left_index=True, right_index=True)
    display(pd.crosstab(data.pred, data.attack_cat))

interactive(children=(Dropdown(description='dataset', options=('MALICIOUSURL', 'UNSW-NB15', 'CSIC2010', 'ISCXU…