In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from datetime import date
from multiprocessing.pool import Pool, ThreadPool
from h2o.tree import H2OTree

In [None]:
# Load the database settings from the env file into os.environ.
# Each meaningful line is expected to look like KEY=VALUE; only the first "="
# splits, so values may themselves contain "=".
with open("../../database.env") as f:
    for line in f:
        line = line.strip()
        # Blank lines and "#" comments carry no setting; unpacking the split
        # result would raise ValueError on them, so skip them explicitly.
        if not line or line.startswith("#"):
            continue
        key, value = line.split('=', 1)
        os.environ[key] = value

In [None]:
# Build one connection-parameter dict per database.
# DB_NAME is written to the environment on every iteration (and therefore
# stays at the last database name afterwards); everything else is read from
# the environment as it is.
cons = []
for db_name in ("cf", "cf_win", "webkitmac"):
    os.environ["DB_NAME"] = db_name
    connection_params = dict(
        host=os.getenv("DB_HOST"),
        database=os.getenv("DB_NAME"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        port=os.getenv("DB_PORT"),
    )
    cons.append(connection_params)

## Load data and prepare

In [None]:
#import importlib
import helper
#importlib.reload(helper)
from helper import get_data, transform_data, remove_cannot_work, remove_unstable, observation_stats, check_sameness, create_trees
from tree import create_tree_dirs

In [None]:
def run(args, tree_creation=True):
    """Load and filter the data for one browser / inclusion-method pair.

    The module-level globals ``limit`` and ``log`` are read here (they are not
    parameters), so both have to be defined before this function is called.

    Args:
        args: Sequence ``(browser, inc_method, base_dir, cons)`` where ``cons``
            is the list of connection-parameter dicts.
        tree_creation: If true, ``create_trees`` is called on the filtered data.

    Returns:
        ``[browser, inc_method, "does not work"]`` when no rows are left after
        filtering; otherwise ``[browser, inc_method, same_channels, results]``
        when ``tree_creation`` is true, or ``[results, uf]`` when it is false.
    """
    browser, inc_method, base_dir, cons = args

    # Pick the connection parameters: webkit uses cons[2], window.open uses
    # cons[1], everything else cons[0].
    if browser == "webkit":
        con_index = 2
    elif inc_method == "window.open":
        con_index = 1
    else:
        con_index = 0
    param_dict = cons[con_index]

    # Load data
    raw, uf = get_data(browser, inc_method, param_dict, limit, log)
    if log:
        # Some stats
        print(raw["error_run"].value_counts())
        print(raw["run"].value_counts())

    # Transform every observation method to its own row + smooth some of them
    results, _methods = transform_data(raw, log)
    # Remove methods that cannot work
    results = remove_cannot_work(results, log)
    # Check stability and remove unstable ones
    results = remove_unstable(results, base_dir, conf=0.01, log=log)

    # Drop all retests to make tree building and stuff faster
    is_first_run = results["run"] == 0
    results = results.loc[is_first_run]
    # Drop all duplicate entries (they have the same result)
    identity_columns = ["browser", "inc_method", "observation_method", "url_id"]
    is_duplicate = results.duplicated(subset=identity_columns, keep="first")
    results = results.loc[~is_duplicate]

    if len(results) == 0:
        message = f"{browser}-{inc_method} has no working methods"
        if log:
            print(message)
        return [browser, inc_method, "does not work"]

    if log:
        # Display count of observations
        print("Count of observations:")
        observation_stats(results, threshold=100000000000)

    # Check sameness of observation methods
    # (Only one tree has to be checked manually for same observation methods)
    # Also remove very rare observations (probably errors)
    # Webkit has 32 as the smallest regular count
    results, same_channels = check_sameness(results, base_dir, log, threshold=32)

    if not tree_creation:
        return [results, uf]

    # Create trees
    results = create_trees(results, uf, base_dir, log)
    return [browser, inc_method, same_channels, results]

In [None]:
# Output directory for today's trees (one directory per ISO date), created up front.
base_dir = "/data/data/tbf/data_analysis/trees/" + date.today().isoformat()
create_tree_dirs([], dict(base_dir=base_dir))

In [None]:
#####################################################

In [None]:
# Single run for webkit/img without tree creation.
# `log` and `limit` are module-level globals that run() reads.
log, limit = True, None
# limit = 10000
results, uf = run(("webkit", "img", base_dir, cons), tree_creation=False)


In [None]:
#_ = check_sameness(results, base_dir, log=True, threshold=32)

In [None]:
def change_status_groups(row):
    """Return the status group for a row, giving 204/205/304/305 a group of their own.

    For the string status codes "204", "205", "304" and "305" the result is the
    code wrapped as ``"['<code>']"``; for anything else the row's existing
    ``"ecohd_status_groups"`` value is returned unchanged.
    """
    code = row["Status-Code"]
    gets_own_group = code in ("204", "205", "304", "305")
    return f"['{code}']" if gets_own_group else row["ecohd_status_groups"]
# uf["ecohd_status_groups2"] = uf.apply(change_status_groups, axis=1)
# tree_model, rule_fit_model = create_trees(results, uf, base_dir, log=True, return_early=False)

In [None]:
###########################################

In [None]:
# Batch run: one run() call per (browser, inc_method) pair.
# `log` and `limit` are module-level globals that run() reads.
log = False
limit = None
# browsers = ["chromium", "firefox", "webkit"]
browsers = ["webkit"]
# pool_size = 5
pool_size = 2
inc_methods = [
    "img", "iframe-dircsp", "iframe", "fetch-creds-no-cors-integrity", "audio", "link-stylesheet",
    "fetch-creds-cors-manual", "video", "object", "link-prefetch", "fetch-creds-no-cors",
    "double-script", "window.open", "embed", "style-import", "embed-img", "iframe-csp",
    "script", "fetch-creds-cors-integrity", "fetch-creds-cors",
]
jobs = [[browser, inc_method, base_dir, cons] for browser in browsers for inc_method in inc_methods]

results = []
# With logging enabled a single worker thread is used (presumably to keep the
# printed output in this process and in order); otherwise the jobs are spread
# over `pool_size` worker processes.
pool_cls, workers = (ThreadPool, 1) if log else (Pool, pool_size)
with pool_cls(workers) as p:
    results = p.map(run, jobs)

for res in results:
    print(res)

In [None]:
import pickle
# Cache the run results on disk: restore them when `results` is not defined in
# this session, otherwise (over)write the cache with the current value.
# NOTE(review): `results` is also bound by the single exploratory run further
# up, so whatever `results` currently holds is what gets pickled.
if 'results' not in locals():
    # NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote.
    with open("results.pickle", "rb") as f:
        results = pickle.load(f)
else:
    with open("results.pickle", "wb") as f:
        pickle.dump(results, f)

In [None]:
import pandas as pd
import re


def get_same(row):
    """Return the row's groups of same channels as a sorted list of sorted lists.

    The groups are read from ``row["same_channels"][inc_method][browser][2]``.
    Best effort: if that lookup or the sorting fails for any reason (for
    example because ``same_channels`` is not the expected nested structure),
    an empty list is returned.
    """
    inc_method = row["inc_method"]
    browser = row["browser"]
    try:
        groups = row["same_channels"][inc_method][browser][2]
        return sorted(sorted(group) for group in groups)
    except Exception:
        return []
    

def get_working(row):
    """Split a row's method entries into working and non-working method names.

    Each entry in ``row["working_methods"]`` is expected to look like
    ``"<browser>-<inc_method>-<method> <status>"``. A method counts as working
    only when its status is exactly ``"works"``.

    Args:
        row: Mapping with the keys ``"browser"``, ``"inc_method"`` and
            ``"working_methods"`` (an iterable of entry strings, or ``None``).

    Returns:
        Tuple ``(working, non_working)`` of alphabetically sorted method-name
        lists; ``([], [])`` when ``working_methods`` is ``None``.

    Raises:
        ValueError: If an entry does not start with the row's
            ``"<browser>-<inc_method>-"`` prefix followed by ``"<method> <status>"``.
    """
    browser = row["browser"]
    inc_method = row["inc_method"]
    entries = row["working_methods"]
    if entries is None:
        return [], []
    # Escape the names before building the pattern: inclusion methods such as
    # "window.open" contain regex metacharacters that must match literally.
    prefix = re.escape(f"{browser}-{inc_method}-")
    # Compiled once per row instead of once per entry.
    entry_pattern = re.compile(f"{prefix}(.*?) (.*)")
    working = []
    non_working = []
    for entry in entries:
        match = entry_pattern.match(entry)
        if match is None:
            # Fail with a readable message instead of a TypeError on match[1].
            raise ValueError(f"Unexpected entry {entry!r} for {browser}-{inc_method}")
        method = match[1]
        if match[2] == "works":
            working.append(method)
        else:
            non_working.append(method)
    return sorted(working), sorted(non_working)

def get_df(results):
    """Build the summary DataFrame from the list of per-run result rows.

    The rows are loaded into the columns ``browser``, ``inc_method``,
    ``same_channels`` and ``working_methods``. ``same_channels`` is then
    replaced by the string form of ``get_same``'s result, and
    ``working_methods`` / ``failing_methods`` are filled from the two lists
    returned by ``get_working``.
    """
    column_names = ["browser", "inc_method", "same_channels", "working_methods"]
    df = pd.DataFrame(results, columns=column_names)

    same_per_row = df.apply(get_same, axis=1)
    df["same_channels"] = same_per_row.apply(str)

    working_split = df.apply(get_working, axis=1, result_type="expand")
    df[["working_methods", "failing_methods"]] = working_split
    return df

df_res = get_df(results)

# Show the summary sorted by inclusion method and browser, with wide columns
# and up to 80 rows visible.
for df in [df_res]:
    with pd.option_context("display.max_colwidth", 500, "display.max_rows", 80):
        #df = df.reindex(sorted(df.columns), axis=1)
        display(df.sort_values(["inc_method", "browser"]))

In [None]:
####################################################

## Time estimation
- How long it will take depending on response space and server properties

In [None]:
from itertools import combinations
from functools import reduce

In [None]:
# Size of the test space: repetitions * browsers * inclusion methods * responses.
repetitions = 2
# NOTE: `browsers` and `inc_methods` are plain counts here; the tree-building
# cell further up binds the same names to lists.
browsers = 3
inc_methods = 20
response_space = 63 * (12+1) * 2 * 2 * 8 * 2 * 2 * 2 * 3 * 3

############################################### Reduction of tests
# Do not test all inclusion methods
#inc_methods = 16  # Removed some methods: fetch-cors-(1,2,3) (needs cors, were rare bugs), embed-img (does not work in webkit, embed in firefox, img in chrome)

# Do not test all status-codes (for every combination)
# Every group is a tuple (single codes are written as 1-tuples); only the number of groups is used below.
status_groups = [(101,), (100, 102, 103), (200,), (204, 205),
                 (201, 202, 203, 206, 207, 208, 226), (300,), (304, 305), (301, 302, 303, 307, 308),
                 (407,), (400, 401, 402, 403, 404, 405, 406, 408, 410, 411, 412, 413, 414, 415, 416, 417, 418, 421, 422, 423, 424, 425, 426, 428, 429, 431, 451),
                 (500, 501, 502, 503, 504, 505, 506, 507, 508, 510, 511),
                 (999,)]
print(f"Different status-code groups (from prior research): {len(status_groups)}")
# Removed some responses: merge all status-codes that behave the same together (manually extracted from the previous trees)
# Might miss some edge cases?
# One idea to not miss them: always use one random status-code from the group (and not only decide for one prototype code)?
# -> however, this might be problematic for over-fitting trees
# we can remap back to the group if this happens/just build trees twice!

response_space_groups = (12+1) * (len(status_groups)) * 2 * 2 * 8 * 2 * 2 * 2 * 3 * 3

# Do not test all combinations:
# Reduce max combination depth from 10 to 6?
# only combinations up to depth 5 are needed? (manually extracted from the previous trees)
# Get this automatically!
# for some (e.g, postMessage) "empty" of 3.5 values is needed
# to be safe test up to depth 6?? (or better only use 5?; when checking the trees empty was included)
# how? status-code is always needed; other things can be empty, thus empty case does not need to be included?
# Choose 5 out of 9 = 126;
# (63 * N1 * N2 * N3 * N4 * N5) + (63 * N1 * N2 * N3 * N4 * N6) ...
# values = [13, 2, 2, 8, 2, 2, 2, 3, 3]
values = [12, 1, 1, 7, 1, 1, 1, 2, 2]  # Empty value is not needed anymore: when not chosen it is empty!
combs = list(combinations(values, 5))  # (5+1) chosen and (4) empty
# combs = list(combinations(values, 4))  # (4+1) chosen and (5) empty
# Number of value assignments for each combination of chosen dimensions.
space = [reduce(lambda a, b: a * b, comb) for comb in combs]
# print(space)

# Every combination is multiplied by the status-code factor, as in the formula
# above. (A reduce() without an initial value would add the first combination
# without that factor, so the total is computed as factor * sum instead.)
response_space_lim = 63 * sum(space)
response_space_lim_groups = len(status_groups) * sum(space)  # Both response space reductions at the same time

# Or random X percent sampling? (might be problematic for the trees/overfitting)
sampling = 1
#sampling = X.

##############################################################

print(f"Total response space: {response_space:,}, status-code groups: {response_space_groups:,}, combination depth 5+1: {response_space_lim:,}, both: {response_space_lim_groups:,}\n")
# Select which response space the estimate uses (leave exactly one active):
# response_space = response_space  # full response space
response_space = response_space_groups
# response_space = response_space_lim
# response_space = response_space_lim_groups
total_tests = repetitions * browsers * inc_methods * response_space * sampling
print(f"total tests: {total_tests:,}")

In [None]:
# Convert the number of tests into wall-clock time when `parallel` browsers run at once.
test_time = 1.7  # Roughly, also depends on browser, inc_method, ...
parallel = 100
#parallel = 300
total_time = test_time * total_tests  # seconds if everything ran sequentially
estimated_time = total_time / parallel  # seconds with `parallel` browsers
estimated_hours = estimated_time / 3600
estimated_days = estimated_hours / 24

print(f"Total time: {total_time:,}s; estimated time ({parallel} browsers): {estimated_time:,}s")
print(f"Estimated time in hours: {estimated_hours}, days: {estimated_days}")

In [None]:
# Time server Chromium/Firefox; ~100 browsers, id_runs=1000
# The product is divided by the number of seconds per day (60 * 60 * 24).
# NOTE(review): the factors 360, 20, 2 and 60 are not named anywhere in this
# notebook; 20 presumably is the number of inclusion methods (cf. inc_methods = 20
# in the estimation cell) — confirm and consider naming them.
print(f"Days server: {(360 * 20 * 2 * 60) / (60 * 60 * 24)}")
# Time iMac Webkit; 8 browsers, id_runs=40
# NOTE(review): same structure as above; the meaning of 8986, 1 and 30 is not
# stated in this notebook — confirm.
print(f"Days iMac: {(8986 * 20 * 1 * 30) / (60 * 60 * 24)}")