In [5]:
import sys
import pathlib

from dotenv import dotenv_values

# honeyquest path hack: expose the directory three levels above the current
# working directory on sys.path so that `honeyquest.*` imports resolve.
__package__ = "honeyquest"
modulepath = (pathlib.Path.cwd() / "../../../").resolve().as_posix()
# Register the directory only once, so re-running the cell does not pile up
# duplicate entries.
if sys.path.count(modulepath) == 0:
    sys.path.append(modulepath)

from honeyquest.data.ops.loading import parse_all_queries, parse_index_buckets

In [6]:
# Notebook settings: start from ".env.shared" and merge ".env.local" on top,
# so a key present in both files takes its value from the local file.
NB_CONFIG = dict(dotenv_values(".env.shared"))
NB_CONFIG.update(dotenv_values(".env.local"))

# Both configured locations are joined onto the current working directory and
# resolved to absolute, POSIX-style path strings.
working_dir = pathlib.Path.cwd()
QUERY_DATA_PATH = (working_dir / NB_CONFIG["QUERY_DATA_PATH"]).resolve().as_posix()
QUERY_INDEX_PATH = (working_dir / NB_CONFIG["QUERY_INDEX_PATH"]).resolve().as_posix()

# Load the query collection and the bucket index consumed by the cells below.
QUERIES = parse_all_queries(QUERY_DATA_PATH)
BUCKETS = parse_index_buckets(QUERY_INDEX_PATH)

In [7]:
def label_statistic(queries, all_queries=None):
    """Count the selected queries per label.

    Args:
        queries: Iterable of query ids to include (for example one bucket
            taken from ``BUCKETS``). The ids are materialised into a set
            once, so one-shot iterables are handled correctly and every
            membership check is O(1). Ids are assumed to be hashable.
        all_queries: Optional mapping whose values expose ``id`` and
            ``label`` attributes. Defaults to the notebook-level ``QUERIES``.

    Returns:
        A ``(neutral, risky, deceptive)`` tuple of counts. Queries carrying
        any other label are not counted.
    """
    pool = QUERIES if all_queries is None else all_queries
    wanted = frozenset(queries)
    # Filter the pool a single time, then tally each label on the subset.
    selected = [q for q in pool.values() if q.id in wanted]
    return tuple(
        sum(1 for q in selected if q.label == label)
        for label in ("neutral", "risky", "deceptive")
    )


def type_statistics(queries, all_queries=None):
    """Count the selected queries per query type.

    Args:
        queries: Iterable of query ids to include (for example one bucket
            taken from ``BUCKETS``). The ids are materialised into a set
            once, so one-shot iterables are handled correctly and every
            membership check is O(1). Ids are assumed to be hashable.
        all_queries: Optional mapping whose values expose ``id`` and
            ``type`` attributes. Defaults to the notebook-level ``QUERIES``.

    Returns:
        A ``(filesystem, httpheaders, htaccess, networkrequests)`` tuple of
        counts. Queries of any other type are not counted, so the four
        numbers may add up to less than the number of selected queries.
    """
    pool = QUERIES if all_queries is None else all_queries
    wanted = frozenset(queries)
    # Filter the pool a single time, then tally each type on the subset.
    selected = [q for q in pool.values() if q.id in wanted]
    return tuple(
        sum(1 for q in selected if q.type == kind)
        for kind in ("filesystem", "httpheaders", "htaccess", "networkrequests")
    )


# Print a short per-bucket report: the upper-cased bucket name, the number of
# entries, the label counts and the type counts, followed by a blank line.
for bucket, queries in BUCKETS.items():
    print(bucket.upper())
    # Each row pairs a fixed-width caption with the function producing its value.
    for caption, value_of in (
        ("length ", len),
        ("labels ", label_statistic),
        ("types  ", type_statistics),
    ):
        print(caption, value_of(queries))
    print()


TUTORIAL
length  8
labels  (6, 1, 1)
types   (0, 0, 0, 0)

WARMUP
length  8
labels  (6, 0, 2)
types   (2, 2, 2, 2)

RELATED_WORK_ROWE
length  2
labels  (1, 0, 1)
types   (2, 0, 0, 0)

MAIN_SECTION_1
length  63
labels  (20, 12, 31)
types   (17, 17, 5, 24)

MAIN_SECTION_2
length  24
labels  (13, 0, 11)
types   (6, 14, 0, 4)

EXTRA_SECTION
length  77
labels  (40, 11, 26)
types   (9, 25, 8, 35)

