In [None]:
import glob
import json
from anytree.importer import JsonImporter
from anytree.exporter import UniqueDotExporter
from anytree import RenderTree, PreOrderIter
import pandas as pd

In [None]:
base_dir = "trees/2022-04-22-2"

# Collect every Status-Code artefact below the result directory:
# serialised trees, rendered SVGs and the raw observation files.
tree_files, svg_files, channel_files = [], [], []
for base_dir in [base_dir]:
    tree_files.extend(glob.glob(f"{base_dir}/anytree/*Status-Code*"))
    svg_files.extend(glob.glob(f"{base_dir}/svg/*Status-Code*"))
    channel_files.extend(glob.glob(f"{base_dir}/obs/*Status-Code*"))
    print(len(svg_files))

# (file name, imported root node) for every serialised tree.
tree_list = []
for tree_file in tree_files:
    with open(tree_file, "r") as f:
        # The JSON key "path" is renamed to "path2" before importing; later
        # cells read the label as `node.path2` and iterate `node.path` as the
        # sequence of nodes leading to a node.
        tree_txt = f.read().replace('"path":', '"path2":')
    tree_name = tree_file.split("anytree/")[1]
    tree_list.append((tree_name, JsonImporter().import_(tree_txt)))

In [None]:
def create_path(leaf):
    """Condense the constraints on the way to ``leaf`` into one entry per property.

    Each node in ``leaf.path`` carries a ``path2`` label of the form
    ``"<property>:<value>"`` (split on the first colon only). Labels whose
    property is ``root`` are skipped. When a property appears more than once,
    the value with the shortest string is kept; on equal length the earlier
    one stays.

    Returns:
        ``(constraints, pred)`` where ``constraints`` is the list of
        ``(property, value)`` pairs sorted ascending and ``pred`` is
        ``leaf.pred``.
    """
    pred = leaf.pred
    shortest = {}
    for node in leaf.path:
        prop, value = node.path2.split(":", maxsplit=1)
        if prop == "root":
            continue
        known = shortest.get(prop)
        if known is None or len(value) < len(known):
            shortest[prop] = value
    return sorted(shortest.items()), pred
          

In [None]:
# Lookup tables filled from all imported trees:
#   paths_dict: JSON-encoded leaf path -> trees containing it and their predictions
#   trees_dict: JSON-encoded sorted list of all leaf paths of a tree -> trees sharing it
#   nodes_dict: node label -> trees containing a node with that label
paths_dict = {}
trees_dict = {}
nodes_dict = {}
all_paths = 0
for tree_name, tree in tree_list:
    paths = [create_path(leaf) for leaf in tree.leaves]

    nodes = {node.path2 for node in PreOrderIter(tree)}
    for node in nodes:
        nodes_dict.setdefault(node, {"tree_names": []})["tree_names"].append(tree_name)

    # The fingerprint uses the leaf paths only; predictions are left out.
    # Have a more relaxed fingerprint? (Ignore some of the paths or something like that?)
    # ...
    path_fingerprint = sorted(path for path, _pred in paths)
    fingerprint_key = json.dumps(path_fingerprint)
    # The first tree seen for a fingerprint is stored as its representative.
    tree_entry = trees_dict.setdefault(
        fingerprint_key, {"tree_names": [], "org_paths": [], "tree": tree}
    )
    tree_entry["tree_names"].append(tree_name)
    tree_entry["org_paths"].append(paths)

    all_paths += len(paths)
    for path, pred in paths:
        path_entry = paths_dict.setdefault(
            json.dumps(path), {"tree_names": [], "preds": [], "org_path": path}
        )
        path_entry["tree_names"].append(tree_name)
        path_entry["preds"].append(pred)


df = pd.DataFrame(columns=["browser", "inclusion_method", "observation_method", "channel", "obs_num", "tree_num"])

# Sort paths by the most frequently occurring ones first; the second sort key is the length of the path
class reversor:
    """Sort-key wrapper that inverts the ordering of the wrapped value.

    Wrapping one component of a tuple sort key sorts that component in
    descending order while the other components stay ascending, e.g.
    ``key=lambda item: (reversor(count), length)``.

    Comparisons against anything that is not a ``reversor`` return
    ``NotImplemented`` so Python falls back to its default handling instead
    of failing on a missing ``obj`` attribute.
    """

    def __init__(self, obj):
        self.obj = obj

    def __eq__(self, other):
        if not isinstance(other, reversor):
            return NotImplemented
        return other.obj == self.obj

    def __lt__(self, other):
        if not isinstance(other, reversor):
            return NotImplemented
        # Operands are swapped on purpose: "less than" here means "greater than" for the wrapped values.
        return other.obj < self.obj
# Paths shared by the most predictions come first; ties go to the shorter path.
paths_dict = dict(
    sorted(
        paths_dict.items(),
        key=lambda item: (reversor(len(item[1]["preds"])), len(item[1]["org_path"])),
    )
)
# Fingerprints shared by the most trees come first.
trees_dict = dict(
    sorted(trees_dict.items(), key=lambda item: reversor(len(item[1]["tree_names"])))
)

# The position in this list is the `tree_num` used in `df` and in later cells.
enum_trees_dict = list(enumerate(trees_dict.items()))
for num, (fp, vals) in enum_trees_dict:
    for file_name in vals["tree_names"]:
        # These two method names contain an underscore themselves; swap it for a
        # dash so that splitting on "_" yields exactly five fields.
        normalised = file_name.replace("fetch_response", "fetch-response")
        normalised = normalised.replace("fetch_errormessage", "fetch-errormessage")
        inc_method, observation_method, browser, _, obs_num = normalised.split("_")
        channel = f"{inc_method}_{observation_method}_{browser}"
        df.loc[len(df)] = [browser, inc_method, observation_method, channel, obs_num.split(".json")[0], num]

print(f"All paths: {all_paths}, all unique paths: {len(paths_dict.keys())}")
print(f"All trees: {len(tree_list)}, all unique trees: {len(trees_dict.keys())}, all svg trees: {len(svg_files)}, all channels: {len(channel_files)}")
print(f"All unique nodes: {len(nodes_dict.keys())}")
# Number of channels with unique trees
chan_group = df.groupby("tree_num")["channel"].unique().to_frame()
chan_group["len"] = chan_group["channel"].str.len()
single_channel = chan_group.loc[chan_group["len"] == 1, "channel"]
channels_with_unique_trees = set(single_channel.apply(str).to_list())
print(f"All channels with unique trees: {len(channels_with_unique_trees)}")

print("Keep in mind that binary outcomes result in one tree, and non-binary outcomes result in num-outcomes trees. So, there should be more trees than svg trees.")
print("Total number of svg trees should be 187 (chromium + firefox cf), 280 (cf without win, cf_win, webkitmac)")

In [None]:
def to_list(string):
    """Split the printed form of an array (e.g. ``"[0 1 12]"``) into its items.

    The first and last character are dropped and the remainder is split on
    whitespace; the items are returned as strings.
    """
    inner = string[1:-1]
    return inner.split()

def check_browser(ll):
    """Return the browsers mentioned in any of the strings in ``ll``.

    A browser counts as present when its name is a substring of at least one
    entry. The result always lists browsers in the fixed order chromium,
    firefox, webkit, so the output no longer depends on set iteration order
    (which varies between interpreter runs for strings).
    """
    known_browsers = ("chromium", "firefox", "webkit")
    found = set()
    for l in ll:
        for name in known_browsers:
            if name in l:
                found.add(name)
    return [name for name in known_browsers if name in found]

with pd.option_context("display.max_rows", 137, "display.max_colwidth", None):
    # One row per channel with the array of tree numbers observed for it.
    groups = df.groupby("channel")["tree_num"].unique().to_frame().reset_index()
    display(groups)

    # Channels whose tree-number array prints identically are merged into one row.
    groups["tree_num_str"] = groups["tree_num"].apply(str)
    groups = groups.groupby("tree_num_str")["channel"].unique().to_frame().reset_index()
    groups = groups.assign(
        tree_num=lambda frame: frame["tree_num_str"].apply(to_list),
        len=lambda frame: frame["tree_num"].str.len(),
        tree_num_str_len=lambda frame: frame["tree_num_str"].str.len(),
    )
    # Fewest trees first, then shorter printed form, then the printed form itself.
    groups = (
        groups.sort_values(["len", "tree_num_str_len", "tree_num_str"])[["tree_num", "channel"]]
        .reset_index(drop=True)
    )
    groups["browser"] = groups["channel"].apply(check_browser)
    display(groups)

In [None]:
from IPython.display import SVG
def render(node):
    """Text shown for ``node`` when a tree is printed with ``RenderTree.by_attr``.

    Inner nodes show their ``path2`` label. Leaves show ``<pred>-<path2>``
    with the prediction wrapped in ANSI colour codes: red (31) when it is
    below 0.5, green (32) otherwise.
    """
    if not node.is_leaf:
        return node.path2
    colour = "\x1b[31m" if node.pred < 0.5 else "\x1b[32m"
    return f"{colour}{node.pred}\x1b[0m-{node.path2}"
# Walk through the channel groups and print every tree that the directly
# preceding group did not already contain.
last_trees = []
for _, row in groups.iterrows():
    current_trees, channels = row["tree_num"], row["channel"]
    print(f"Current channels: {channels}, current trees: {current_trees}")
    unique_trees = sorted(set(current_trees).difference(last_trees))
    for tree_num in unique_trees:
        tree_num = int(tree_num)
        representative = enum_trees_dict[tree_num][1][1]["tree"]
        print(RenderTree(representative).by_attr(render))
    # Remember the first channel of the group just processed.
    example_tree = channels[0]
    last_trees = current_trees
    print("\n")


In [None]:
import asyncio

import ipywidgets as widgets


# Output area that the interactive browser in this cell appends text and SVGs to.
out = widgets.Output()
# Fixed layout width for the output area.
out.layout.width='1800px'


def wait_for_click(btn):
    """Return a future that resolves on the next click of ``btn``.

    The future's result is the button's ``description``. The click handler is
    one-shot: once the future is done (resolved or cancelled) the handler is
    unregistered again. Without that, every call would leave another handler
    on the button, and from the second click on the stale handlers would call
    ``set_result`` on an already finished future, which raises
    ``asyncio.InvalidStateError``.
    """
    future = asyncio.Future()

    def on_button_clicked(btn):
        # Guard against a click arriving before the handler has been removed.
        if not future.done():
            future.set_result(btn.description)

    def unregister(_finished):
        btn.on_click(on_button_clicked, remove=True)

    # Done-callbacks are scheduled on the event loop, so the handler is removed
    # after the current click dispatch has finished, not while it is iterating.
    future.add_done_callback(unregister)
    btn.on_click(on_button_clicked)

    return future

btn = widgets.Button(description="Next channel")


# NOTE: a `global` statement at module level has no effect; `example_tree` is
# an ordinary module variable that is assigned elsewhere in the notebook.
global example_tree
def show_others(btn): 
    """Append the SVG trees of the two other browsers for the current example.

    ``example_tree`` is split at its last underscore into the tree name and
    the browser. The SVGs for firefox, chromium and webkit are all loaded
    from ``{base_dir}/svg``; then the two that do not belong to the current
    browser are appended to ``out``, each preceded by a caption. Any browser
    other than firefox or chromium is handled like webkit.
    """
    tree_name, browser = example_tree.rsplit("_", 1)

    svg_by_browser = {
        name: SVG(f"{base_dir}/svg/{tree_name}_{name}_Status-Code.svg")
        for name in ("firefox", "chromium", "webkit")
    }
    if browser == "firefox":
        others = ("chromium", "webkit")
    elif browser == "chromium":
        others = ("firefox", "webkit")
    else:
        others = ("chromium", "firefox")

    for name in others:
        out.append_stdout(f"{name.capitalize()}:")
        out.append_display_data(svg_by_browser[name])
    
btn_show_others = widgets.Button(description="Show others!")
btn_show_others.on_click(show_others)

async def f():
    """Step through the channel groups, one group per click on ``btn``.

    For each row of ``groups`` the coroutine writes the channels and tree
    numbers to ``out``, prints every tree not shown for any earlier group,
    shows the SVG of the group's first channel (also stored in the module
    variable ``example_tree``), waits for a click and then clears ``out``.
    """
    global example_tree
    all_trees = set()
    for i, row in groups.iterrows():
        current_trees, channels = row["tree_num"], row["channel"]
        out.append_stdout(f"{i+1}/{len(groups)}; Current channels: {channels}, \n current trees: {current_trees}\n")
        unique_trees = sorted(set(current_trees) - all_trees)
        out.append_stdout(f"Trees different to any previous channel: {unique_trees}\n")
        for tree_num in unique_trees:
            representative = enum_trees_dict[int(tree_num)][1][1]["tree"]
            out.append_stdout(RenderTree(representative).by_attr(render))
            out.append_stdout("\n")
        example_tree = channels[0]
        out.append_stdout("\n")
        tree_name = f"{base_dir}/svg/{example_tree}_Status-Code.svg"
        out.append_stdout(tree_name)
        out.append_display_data(SVG(tree_name))
        all_trees.update(current_trees)
        await wait_for_click(btn)
        # Start the next group with an empty output area.
        out.outputs = ()

# Schedule the coroutine on the running event loop and show the controls.
asyncio.ensure_future(f())

display(btn, out, btn_show_others)

# Tree export 
- img width for paper
- link-stylesheet_events-fired
- object_events-fired 
- convert/merge several trees into one?!

In [None]:
# Rows for the `object` inclusion method observed via `events-fired`.
object_events_rows = (df["inclusion_method"] == "object") & (df["observation_method"] == "events-fired")
display(df.loc[object_events_rows])

In [None]:
# channel name -> {outcome key: observed value}, read from the observation files.
channel_dict = {}
for channel in channel_files:
    # File name without directory and without the "_Status-Code..." suffix.
    channel_name = channel.split("/")[-1].split("_Status-Code")[0]
    # The handle is deliberately not called `f`: this notebook also defines a
    # coroutine function `f`, and rebinding that name to a closed file object
    # here would break any later call to it.
    with open(channel) as channel_fh:
        info = json.load(channel_fh)
    channel_dict[channel_name] = {key: entry["observation"] for key, entry in info.items()}

In [None]:
# Print the representative tree of four hand-picked fingerprints.
for tree_num in [0, 1, 75, 81]:
    print(RenderTree(enum_trees_dict[tree_num][1][1]["tree"]).by_attr(render))


In [None]:
# For the selected fingerprints, check whether all trees sharing the
# fingerprint also agree with the first tree once predictions are included.
for tree_num in [0, 1, 75, 81]:
    tree_info = enum_trees_dict[tree_num][1][1]
    paths = tree_info["org_paths"]
    print(len(paths))
    org_path = paths[0]
    print(tree_info["tree_names"])
    for entry in paths:
        if entry == org_path:
            print(True)
            continue
        print("Path:", entry)
        print("Org Path:", org_path)

In [None]:
# File name -> imported root node, for direct lookup by name.
tree_name_dict = {name: tree for name, tree in tree_list}

In [None]:
# Tree export for paper
# Rows for the `img` inclusion method observed via `height`, ordered by browser.
height_trees = df.loc[(df["inclusion_method"] == "img") & (df["observation_method"] == "height")].sort_values("browser")
display(height_trees)

def process_row(proto):
    """Describe one row of ``df`` together with its observed outcomes.

    ``proto`` must provide the keys ``obs_num``, ``channel``, ``browser``,
    ``inclusion_method`` and ``observation_method``. The observations of the
    row's channel are looked up in the module-level ``channel_dict``.

    Returns:
        ``(browser, inclusion_method, observation_method, obs_num, positive,
        negative)`` where ``positive`` is the observation stored under
        ``obs_num`` and ``negative`` is the list of all other distinct
        observations of that channel, in order of first appearance. The list
        used to come from a set difference, which made its order vary between
        runs.
    """
    obs_num = proto["obs_num"]
    observations = channel_dict[proto["channel"]]
    positive = observations[obs_num]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    negative = list(
        dict.fromkeys(value for value in observations.values() if value != positive)
    )
    return (
        proto["browser"],
        proto["inclusion_method"],
        proto["observation_method"],
        obs_num,
        positive,
        negative,
    )

def process_group(group):
    """Run ``process_row`` on every row of ``group`` and collect the results in a list."""
    return [process_row(proto) for _, proto in group.iterrows()]
    
# Apply process_group to the rows of each browser; the bare `res` on the last
# code line makes the notebook display the result.
res = height_trees.groupby("browser").apply(process_group)
res

# print(RenderTree(enum_trees_dict[9][1][1]["tree"]).by_attr(render))
# print(RenderTree(enum_trees_dict[15][1][1]["tree"]).by_attr(render))

In [None]:
from chefboost import Chefboost as chef
import shutil

In [None]:
def convert_paths(paths):
    """Turn ``(path, pred)`` pairs into a DataFrame with one row per pair.

    The ``pred`` column holds the module-level ``negative`` label when the
    prediction is below 0.5 and ``positive`` otherwise. Every
    ``(property, value)`` entry of the path becomes a column, with single
    quotes removed from the value.

    Returns ``None`` as soon as the chosen label is a list whose length is
    not 1; otherwise the DataFrame built from all rows.
    """
    records = []
    for path, pred in paths:
        label = negative if pred < 0.5 else positive
        if type(label) == list and len(label) != 1:
            return
        record = {"pred": label}
        for prop, value in path:
            record[prop] = value.replace("'", "")
        records.append(record)
    return pd.DataFrame(records)

def expand_nans(data):
    """Replace every missing cell by all values seen in its column, one row each.

    For each column the distinct non-missing values are collected up front.
    Then, column by column, each missing cell is replaced by that column's
    array of values and the frame is exploded on the column, so a row with a
    missing cell becomes one row per possible value.

    The input frame is left untouched; the function works on a copy. The
    first column assignment used to be written into the caller's frame.

    Returns:
        The expanded DataFrame. Index labels repeat for rows created by the
        explosion.
    """
    data = data.copy()
    cols = data.columns
    fill_values = {col: data[col].dropna().unique() for col in cols}
    for col in cols:
        candidates = fill_values[col]
        data[col] = data[col].apply(lambda x: candidates if pd.isna(x) else x)
        data = data.explode(col)
    return data

def create_trees(data, name):
    """Fit ``data`` with each chefboost algorithm and keep the generated rules file.

    For ID3, C4.5, CART and CHAID in turn, ``chef.fit`` is called with
    ``pred`` as target label and parallelism disabled; afterwards
    ``outputs/rules/rules.py`` is moved to
    ``trees/vendors/py/{name}_{algo}.py``.
    """
    algorithms = ("ID3", "C4.5", "CART", "CHAID")
    for algo in algorithms:
        settings = {'algorithm': algo, 'enableParallelism': False}
        chef.fit(data, config=settings, target_label='pred')
        shutil.move("outputs/rules/rules.py", f"trees/vendors/py/{name}_{algo}.py")


In [None]:
import hashlib
import subprocess

# md5 digest of an inner node's path repr -> small integer id; filled by nodenamefunc.
node_dict = {}
# NOTE(review): node_id is not referenced anywhere else in the visible notebook — confirm before removing.
node_id = 0

def nodenamefunc(node):
    """Name of ``node`` in the exported graph.

    Leaves are named after the module-level ``negative`` label (prediction
    below 0.5) or ``positive`` label (otherwise). Inner nodes are named
    ``"<split>?<id>"`` where the id is assigned on first sight of the md5
    digest of ``repr(node.path)`` and remembered in ``node_dict``.
    """
    if node.is_leaf:
        return negative if node.pred < 0.5 else positive
    digest = hashlib.md5(repr(node.path).encode("utf-8")).hexdigest()
    # A new digest gets the next free number, a known one keeps its number.
    id_var = node_dict.setdefault(digest, len(node_dict))
    return f"{node.split}?{id_var}"
    
def edgeattrfunc(node, child):
    """Edge attribute string: the child's condition as a wrapped label.

    The label is the part of ``child.path2`` after the first colon with all
    double quotes removed. A comma that comes more than 20 characters
    (removed quotes included) after the start or the previous break is
    followed by a line break. ``node`` is not used.
    """
    condition = child.path2.split(":", maxsplit=1)[1]
    pieces = []
    since_break = 0
    for char in condition:
        since_break += 1
        if char == '"':
            continue
        if char == "," and since_break > 20:
            pieces.append(",\n")
            since_break = 0
        else:
            pieces.append(char)
    path_string = "".join(pieces)
    return f'label="{path_string}"'

def edgetypefunc(node, child):
    """Edge operator for the exported graph; always the directed ``->``."""
    return "->"

def nodeattrfunc(node):
    """Node attribute string: a box labelled with the outcome or the split.

    Leaves are labelled with the module-level ``negative`` label (prediction
    below 0.5) or ``positive`` label (otherwise); inner nodes with
    ``"<split>?"``.
    """
    if node.is_leaf:
        outcome = negative if node.pred < 0.5 else positive
        return f'shape=box, label="{outcome}"'
    return f'shape=box, label="{node.split}?"'

# `positive` / `negative` are module-level names read by nodenamefunc,
# nodeattrfunc and convert_paths. (The `global` statement itself has no effect
# at module level.) They are rebound by the inner loop below for every tree.
global positive, negative

# Export the trees of a fixed list of (inclusion method, observation method) pairs.
for inc, obs in [("img", "height"), ("object", "events-fired"), ("link-stylesheet", "events-fired"), ("audio", "error"), ("iframe", "el-securitypolicyviolation.smooth"), ("fetch-creds-cors", "performanceAPI")]:
    res_trees = df.loc[(df["inclusion_method"] == inc) & (df["observation_method"] == obs)].sort_values("browser")
    # One list of process_row tuples per browser.
    res = res_trees.groupby("browser").apply(process_group)
    for group in res:
        # Collects the path tables of all trees of this browser.
        tree_frame = pd.DataFrame()
        # Unpacking here assigns the module-level `positive` and `negative`.
        for num, (browser, inclusion_method, observation_method, obs_num, positive, negative) in enumerate(group):
            # A single negative outcome is unwrapped; a longer list is kept and
            # rendered through its string form below.
            if type(negative) == list:
                if len(negative) == 1:
                    negative = negative[0]
            # Both labels become strings without double quotes, since they are
            # placed inside quoted label attributes by nodeattrfunc.
            positive = f"{positive}".replace('"', '')
            negative = f"{negative}".replace('"', '')
            #root = enum_trees_dict[tree_num][1][1]["tree"]
            # Look the tree up by its file name rather than by fingerprint number.
            root = tree_name_dict[f"{inclusion_method}_{observation_method}_{browser}_Status-Code_{obs_num}.json"]
            paths = [create_path(leaf) for leaf in root.leaves]
            data = convert_paths(paths)
            tree_frame = pd.concat([tree_frame, data])
            
            
            exp = UniqueDotExporter(root, graph="digraph",
                                 nodenamefunc=nodenamefunc,
                                 nodeattrfunc=nodeattrfunc,
                                 edgeattrfunc=edgeattrfunc,
                                 edgetypefunc=edgetypefunc)
            #for line in exp:
            #    print(line)
            dot_file = f"trees/vendors/dot/{browser}_{inclusion_method}_{observation_method}_{num}.dot"
            # NOTE: despite its name, `pdf_file` is a .png path below trees/vendors/pdf.
            pdf_file = f"trees/vendors/pdf/{browser}_{inclusion_method}_{observation_method}_{num}.png"
            exp.to_dotfile(dot_file)
            #print(pdf_file)
            # Render the dot file to PNG with the `dot` command; its exit code is not checked.
            subprocess.call(["dot", dot_file, "-T", "png", "-o", pdf_file])
            try:
                # The `continue` skips the inline display; the lines after it are never reached.
                continue
                if browser != "webkit":
                    display(SVG(pdf_file))
            except Exception as e:
                print(e)
        # NOTE(review): the expanded frame is only consumed by the commented-out
        # create_trees call below, so this result is currently unused.
        tree_frame = expand_nans(tree_frame)
        # Too slow for large WebKit trees
        #create_trees(tree_frame, f"{browser}_{inclusion_method}_{observation_method}")
                
    

## Old path analysis

In [None]:
# Print every distinct tree with the names of the files that share it.
for vals in trees_dict.values():
    print(vals["tree_names"])
    print()
    print(RenderTree(vals["tree"]).by_attr("path2"))
    # Two empty lines between trees.
    print("\n")

In [None]:
# Per-path report: how many trees and inclusion methods share the path, its
# length and properties, and which browsers it occurs in.
for path_dict in paths_dict.values():
    tree_names = path_dict["tree_names"]
    org_path = path_dict["org_path"]
    # Browser names are matched as substrings of the printed name list.
    names_repr = str(tree_names)

    print(f"Example tree: {tree_names[0]}")
    print(f"Trees with this path: {len(tree_names)}")
    # The inclusion method is the part of the file name before the first underscore.
    incs = {name.split("_", maxsplit=1)[0] for name in tree_names}
    print(f"Inclusion methods with this path: {len(incs)}")
    print(f"Length of this path: {len(org_path)}")
    print(f"Properties in this path: {[val for val, _ in org_path]}")
    browsers = [name for name in ("chromium", "firefox", "webkit") if name in names_repr]
    print(f"Browsers in this path: {browsers}")
    # Long paths (8 or more properties) seen in chromium are printed in full.
    if len(org_path) >= 8 and "chromium" in browsers:
        print(tree_names)
        print(org_path)
        print()
    print()