#  Validation between datasets

In [1]:
cd ~/http2vec

/home/mateuszg/http2vec


In [2]:
import json
import pandas as pd

from http2vec.evaluation import *


# Values substituted for `size` in the cross-validation result paths and
# reported as the "length" column in the comparison tables below.
sizes = [
    96,
    192,
    384,
    768,
    1536,
    3072,
]

# Language-model identifiers as they appear in the experiment directory names.
language_models = [
    "bow",
    "roberta",
    "fasttext",
]

# Dataset identifiers as they appear in the experiment directory names.
datasets = [
    "MALICIOUSURL",
    "UNSW-NB15",
    "CSIC2010",
    "ISCXURL2016",
]

def load_crossvalidation(
    language_model,
    dataset,
    size,
    against_dataset
):
    """Read the F1 score stored for one cross-dataset experiment.

    The experiment is identified by the four arguments, which are joined
    into the directory name
    ``data/cross-validation/<language_model>-<dataset>-<size>-<against_dataset>``
    (relative to the current working directory).

    Returns:
        The value stored under the ``"f1"`` key of that directory's
        ``metrics.json``.

    Raises:
        FileNotFoundError: if the metrics file does not exist.
        KeyError: if the file has no ``"f1"`` entry.
    """
    metrics_path = f"data/cross-validation/{language_model}-{dataset}-{size}-{against_dataset}/metrics.json"
    with open(metrics_path) as metrics_file:
        metrics = json.load(metrics_file)
    return metrics["f1"]

# Module-level source/target pair. The `lm` dropdown cell further down reads
# these globals directly (it has no dataset selector of its own).
dataset, a_dataset = "CSIC2010", "UNSW-NB15"


In [3]:
import matplotlib.pyplot as plt
import tikzplotlib
plt.style.use("ggplot")


In [4]:
import ipywidgets as widgets
from ipywidgets import Dropdown, IntSlider

@widgets.interact(
    dataset=Dropdown(options=[
        "CSIC2010->UNSW-NB15",
        "MALICIOUSURL->ISCXURL2016",
    ])
)
def update(
    dataset="CSIC2010->UNSW-NB15"
):
    """Compare cross-dataset F1 of the three language models across lengths.

    Args:
        dataset: A ``"<source>-><target>"`` string; it is split on ``"->"``
            into the two dataset names passed to ``load_crossvalidation``.

    Side effects:
        Displays the F1 table, draws one scatter+line series per language
        model on a shared axes, and writes the figure to
        ``<source>-><target>.tex`` in the working directory via tikzplotlib.
    """
    dataset, a_dataset = dataset.split("->")

    # (identifier used in the result paths, column name / legend label)
    models = [
        ("bow", "BoW"),
        ("fasttext", "fastText"),
        ("roberta", "RoBERTa"),
    ]

    # Every model is loaded for the same `sizes`, so the table can be built
    # column by column instead of merging three frames on "length".
    columns = {"length": list(sizes)}
    for model_key, label in models:
        columns[label] = [
            load_crossvalidation(model_key, dataset, size, a_dataset)
            for size in sizes
        ]
    df = pd.DataFrame(columns)

    display(df)

    ax = None
    line_handles = []
    for _, label in models:
        ax = df.plot(x="length", y=label, kind="scatter", ax=ax)
        df.plot.line(x="length", y=label, ax=ax)
        # Remember the line that was just added so the legend can reference
        # it explicitly instead of relying on artist ordering.
        line_handles.append(ax.lines[-1])

    # Passing only labels to legend() pairs them with artists by implicit
    # order; with scatter collections and lines interleaved on the axes that
    # pairing can put a label on the wrong artist. Explicit handles avoid it.
    ax.legend(handles=line_handles, labels=[label for _, label in models])
    ax.set_title(f'{dataset}->{a_dataset}')
    ax.set_xlabel("length")
    ax.set_ylabel("F1")

    tikzplotlib.save(f'{dataset}->{a_dataset}.tex')

interactive(children=(Dropdown(description='dataset', options=('CSIC2010->UNSW-NB15', 'MALICIOUSURL->ISCXURL20…

In [5]:
import ipywidgets as widgets
from ipywidgets import Dropdown, IntSlider

@widgets.interact(
    lm=Dropdown(options=["bow", "roberta", "fasttext"]),
)
def update(
    lm="bow",
):
    """Plot in-dataset F1 against cross-dataset F1 for one language model.

    Reads the module-level ``dataset`` and ``a_dataset`` globals for the
    source and target datasets.

    Args:
        lm: Language-model identifier used to look up both result sets.

    Side effects:
        Displays the merged table and draws both F1 series on one axes.
    """
    f1s = {size: load_crossvalidation(lm, dataset, size, a_dataset) for size in sizes}
    # NOTE(review): metrics_generator comes from http2vec.evaluation; it is
    # assumed to return a frame with "f1" and "length" columns - confirm.
    df = metrics_generator(language_model=lm, dataset=dataset, classifier="rf")[["f1", "length"]]
    df = pd.DataFrame(f1s.items(), columns=["length", "f1-cross"]).merge(df, on="length")
    display(df)

    ax = df.plot(x="length", y="f1", kind="scatter", title=dataset)
    df.plot.line(x="length", y="f1", ax=ax)
    f1_line = ax.lines[-1]
    df.plot.line(x="length", y="f1-cross", ax=ax)
    f1_cross_line = ax.lines[-1]

    # Passing only labels to legend() pairs them with artists by implicit
    # order; the scatter collection shares the axes with the two lines, so
    # the handles are given explicitly to keep each label on its own line.
    ax.legend(handles=[f1_line, f1_cross_line], labels=["f1", "f1-cross"])
    ax.set_xlabel("length")
    ax.set_ylabel("F1")

interactive(children=(Dropdown(description='lm', options=('bow', 'roberta', 'fasttext'), value='bow'), Output(…

In [6]:
from sklearn.metrics import f1_score


def get_predictions_against(dataset, language_model, size, against_dataset):
    """Load the saved cross-dataset predictions for one experiment.

    The file read is
    ``data/cross-validation/<language_model>-<dataset>-<size>-<against_dataset>/saved/predictions-cross.json``
    (relative to the current working directory).

    Returns:
        The parsed JSON content of that file, unchanged.
    """
    experiment = f"{language_model}-{dataset}-{size}-{against_dataset}"
    with open(f"data/cross-validation/{experiment}/saved/predictions-cross.json") as predictions_file:
        return json.load(predictions_file)


@widgets.interact(
    dataset=Dropdown(options=[
        "CSIC2010->UNSW-NB15",
        "MALICIOUSURL->ISCXURL2016",
    ]),
    lm=Dropdown(options=language_models),
    size=Dropdown(options=sizes),
)
def update(
    dataset="CSIC2010->UNSW-NB15",
    lm="bow",
    size=96, 
):
    """Break down in-dataset vs. cross-dataset predictions by attack category.

    Args:
        dataset: A ``"<source>-><target>"`` string, split on ``"->"``.
        lm: Language-model identifier.
        size: Length value selecting the experiment.

    Side effects:
        Prints both F1 scores (positive label ``"anomaly"``) and displays one
        prediction-by-``attack_cat`` crosstab for each prediction set, both
        evaluated on the target dataset's test ids.
    """
    dataset, against_dataset = dataset.split("->")
    classifier = "rf"

    # NOTE(review): get_data / get_split / get_predictions come from
    # http2vec.evaluation; the frame is assumed to carry "id", "label" and
    # "attack_cat" columns, and the prediction mappings are assumed to be
    # keyed by the same string ids - confirm.
    data = get_data(against_dataset)
    data.index = data["id"].astype(str)
    _, test_ids = get_split(dataset=against_dataset, language_model=lm, size=size)
    predictions = get_predictions(dataset=against_dataset, language_model=lm, size=size, classifier=classifier)

    # Copy the slice so the column assignment below cannot touch (or warn
    # about) the frame returned by get_data.
    data = data.loc[test_ids].copy()
    if (data["attack_cat"] == "").all():
        # No category information at all: fall back to the binary label.
        data["attack_cat"] = data["label"]

    predictions = pd.DataFrame(
        predictions.items(), columns=["index", "pred"]
    ).set_index("index")

    predictions2 = get_predictions_against(
        dataset=dataset,
        language_model=lm,
        size=size,
        against_dataset=against_dataset
    )
    predictions2 = pd.DataFrame(
        predictions2.items(), columns=["index", "pred_cross"]
    ).set_index("index")

    # f1_score compares its inputs position by position, so the predictions
    # are reordered to follow the label rows by id rather than relying on
    # the mapping order happening to match test_ids.
    f1 = f1_score(
        data["label"],
        predictions.loc[data.index, "pred"],
        pos_label="anomaly",
    )
    f1_cross = f1_score(
        data["label"],
        predictions2.loc[data.index, "pred_cross"],
        pos_label="anomaly",
    )

    data = pd.merge(predictions, data["attack_cat"], left_index=True, right_index=True)
    data = pd.merge(predictions2, data, left_index=True, right_index=True)

    print("Original dataset results, F1:", f1)
    display(pd.crosstab(data.pred, data.attack_cat))

    print("Crossvalidation, F1", f1_cross)
    # Both columns come from the merged frame, matching the crosstab above.
    display(pd.crosstab(data.pred_cross, data.attack_cat))

interactive(children=(Dropdown(description='dataset', options=('CSIC2010->UNSW-NB15', 'MALICIOUSURL->ISCXURL20…