In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import json

In [None]:
from helper import check_5_vals, stricter_pm, get_uniques, Conn

In [None]:
# Default database for this notebook. The loop below assigns every key found in the
# env file, so a PGDATABASE entry in that file would replace this value.
os.environ["PGDATABASE"] = "dil_cf"

# Load KEY=VALUE pairs from the shared env file into the process environment.
with open("../../database.env") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # Blank lines and comments are common in env files; the original unpacking
        # crashed on them with an opaque "not enough values to unpack" error.
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition("=")
        if not sep:
            # Still a ValueError (as before), but with a message that points at the line.
            raise ValueError(f"database.env line {line_no}: expected KEY=VALUE, got {line!r}")
        os.environ[key] = value

In [None]:
# Database handle from the local helper module.
# NOTE(review): presumably Conn() picks up the PG* environment variables set above — confirm in helper.py.
conn = Conn()

In [None]:
# Column labels are applied positionally to the "SELECT *" result, so each list
# has to follow the column order of its table.
site_columns = [
    "id", "rank", "site", "urls", "crawl_urls", "timeout_crawl", "error", "error_py",
    "crawled_urls", "after_basic", "after_trees", "after_trees_limit", "actual_urls",
    "insertion_time", "confirmed_urls", "timeout_dyn", "finished",
]
tuples = conn.select("SELECT * FROM site")
sites = pd.DataFrame(tuples, columns=site_columns).sort_values("rank")
display(sites.head())

accept_columns = [
    "id", "site", "rank", "browser", "version", "clicked_count", "clicked", "locator_count",
    "unique_locators", "locators", "cookies_before", "cookies_after", "cookies_new",
    "cookies_removed", "cookies_changed", "error", "insertion_time",
]
tuples = conn.select("SELECT * FROM accept")
accept = pd.DataFrame(tuples, columns=accept_columns).sort_values("rank")
display(accept.head())

# Optional tables; flip `more` to True to load them as well.
more = False
if more:
    dyn_conf_columns = [
        "id", "browser", "version", "site", "opg_url", "url", "inc_method", "state",
        "run", "observation", "error", "notes", "response", "insertion_time",
    ]
    tuples = conn.select("SELECT * FROM dyn_conf")
    dyn_conf = pd.DataFrame(tuples, columns=dyn_conf_columns)
    display(dyn_conf.head())

    responses_columns = [
        "id", "site", "url", "state", "req_headers", "resp_code", "resp_headers",
        "resp_body_hash", "resp_body_info", "frames", "error_text", "insertion_time",
    ]
    tuples = conn.select("SELECT * FROM responses")
    resp = pd.DataFrame(tuples, columns=responses_columns)
    display(resp.head())

# Results

## Tranco Top 10K

1. **URL Collection**:
    - Visit homepage (https://{site}/) wait until "load" (max: 30s) in Chromium
    - Extract all HTTP(S) links
    - Record all outgoing HTTP(S) requests


In [None]:
# Some entries are duplicated (race condition in dil.js).
# Order by id and keep only the last entry for every rank.
sites = sites.sort_values("id").drop_duplicates(subset="rank", keep="last")

In [None]:
# Sort both URL lists so they can be compared with == later on.
for url_col in ("crawl_urls", "crawled_urls"):
    sites[url_col] = sites[url_col].apply(sorted)
# True for every site where at least one URL was crawled.
sites["crawled_any"] = sites["crawled_urls"].str.len().ne(0)

In [None]:
# Overview of table structure
print("Overview:")
display(sites.head(2))


# Errors on the tested sites (URL + response collection)
print("Crawled sites:")
display(sites["crawled_any"].value_counts())
print("Errors on tested sites (crawled-any):")
#display(sites[["crawled_any", "error"]].apply(lambda x: (x["crawled_any"], x["error"].split("at http")[0].split("exceeded")[0].split("Browser closed")[0].split("{}")[0].split("keys")[0].split("sed!\n==")[0].split("hed!\n==")[0]), axis=1).value_counts().to_frame())
# Group errors by (crawled_any, shortened message): only the first line of the error is kept,
# cut at the first " at " so that differing trailing details collapse into one bucket.
display(sites[["crawled_any", "error"]].apply(lambda x: (x["crawled_any"], x["error"].split("\n")[0].split(" at ")[0]), axis=1).value_counts().to_frame())

display(sites["error_py"].value_counts().to_frame())

# URLs collected:
print(f"URLs collected on: {sites.loc[sites['urls'].str.len() != 0].shape[0]} sites")

print(f"URLs attempted to crawl on: {sites.loc[sites['crawl_urls'].str.len() != 0].shape[0]} sites")

# URLs crawled:
print(f"URLs crawled on: {sites.loc[sites['crawled_urls'].str.len() != 0].shape[0]} sites (the ones that are missing here crashed in collect_responses)")

# Same URLs crawled as tried:
# (the == comparison is element-wise on the per-site lists, which were sorted in an earlier cell)
print(f"All wanted URLs crawled on: {sites.loc[(sites['crawled_urls'] == sites['crawl_urls']) & (sites['crawled_urls'].str.len() != 0)].shape[0]} sites (either timeout or othe issue occured, e.g., crash in collect_responses)")

# Limit to crawled any sites
sites_crawled = sites.loc[sites["crawled_any"]]
print("Timeouts:")
# Counts per timeout flag individually, then for the combination of both flags.
display(sites_crawled[["timeout_crawl"]].value_counts().to_frame())
display(sites_crawled[["timeout_dyn"]].value_counts().to_frame())
display(sites_crawled[["timeout_crawl", "timeout_dyn"]].value_counts().to_frame())

2. **Response Collection**:
    - Open three chromium instances
    - Fresh state **_ano**, visited homepage state **_visited** + optional: "accepted cookies" state **_accepted**
        - **_accepted** additional infos: 
            - visit homepage (wait until "load", max: 30s)
            - wait 2s, save cookies
            - locate potential cookie confirmation "buttons"
                - e.g., 'button:has-text("Accept all cookies")'
                - list of ~100 locators
                - try to locate them for 10s
            - take screenshot: before
            - try to click on all located locators
                - one after the other
                - most specific first (length of locator string)
                - first hover them (2s), then click (2s)
            - wait 2s, then take screenshot: after
            - save cookies
            - if at least one locator was clicked and cookies changed (either new, removed, or value changed) -> success
    - Visit URLs in all states (2 or 3)
        - on every URL wait until "load" (max: 30s) (top-level request)
        - max 500 URLs (if more than 500 exist, random selection of all recorded URLs)
        - max 1 hour
        - Record traffic/responses (with playwright; does not record everything for errors and similar; other option would be HAR or proxy?)


In [None]:
def count(row):
    """Count the collected URL entries of one site row.

    `row["urls"]` is iterated as a sequence of mappings that each carry a
    "link" and a "request" value. An entry is counted under "Links" when its
    "link" value is truthy and under "Requests" when its "request" value is
    truthy (an entry can count for both); "Total" is the number of entries.
    """
    entries = row["urls"]
    n_links = 0
    n_requests = 0
    for entry in entries:
        n_links += 1 if entry["link"] else 0
        n_requests += 1 if entry["request"] else 0
    return {"Links": n_links, "Requests": n_requests, "Total": len(entries)}
# Per-site link/request counts for all sites with at least one URL scheduled for crawling.
# The row-wise apply is computed once and reused for both summaries (it used to run twice).
print("URLs collected stats:")
attempted_counts = sites.loc[sites['crawl_urls'].str.len() != 0].apply(count, result_type="expand", axis=1)
display(attempted_counts.describe())
display(attempted_counts.sum())

# Same statistics restricted to sites with at least one crawled URL.
# NOTE(review): count() reads the "urls" column, so these are the collected-URL counts of the
# crawled sites, not counts over "crawled_urls" — confirm that this is the intended statistic.
print("URLs crawled:")
crawled_counts = sites.loc[sites['crawled_urls'].str.len() != 0].apply(count, result_type="expand", axis=1)
display(crawled_counts.describe())
display(crawled_counts.sum())

3. **Pruning**:
    - Get all traffic data for all crawled URLs
    - Fit response data to trees:
        - Status-Code
        - smoothed (Security)-Headers: "content-type", "x-frame-options", "location", "content-disposition", "x-content-type-options", "cross-origin-opener-policy", "cross-origin-resource-policy", "content-security-policy"
        - body type: e.g., HTML, img, ... (inferred with `file` command)
    - Basic pruning: only keep URLs that have at least one attribute with more than one recorded value
    - Advanced pruning:
        - All Chromium and Firefox trees
        - Predict the outcome of every tree for every remaining URL-state pair
        - For every tree with at least two different predictions for a URL -> add URL-inclusion method to set of to confirm URLs
            - special cases for some trees (e.g., img-height):
                - even if all predictions are the same, they might be distinguishable (artifact of the smoothing)
                - if all predictions are positive (e.g, height=50), compare other property (e.g., bodyhash) and if that property differs -> add to set
            - example: `urls = {"img": {"https://google.com/": "cfw", "https://google.com/search/": "c"}, "iframe": {"https://google.com/": "f"}}`; every inc-url pair is tested in both browsers regardless of prediction

In [None]:
# sites crawled
# Short alias for sites_crawled (the rows of `sites` with crawled_any set, built in an earlier cell).
sc = sites_crawled
def get_urls(dat):
    """Flatten a nested {inclusion method: {url: browsers}} mapping.

    Returns a tuple (url_list, url_set): url_list holds one entry per
    (inclusion method, url) pair in iteration order, url_set holds the
    distinct URLs. The browser values are ignored.
    """
    url_list = [url for entry in dat.values() for url, _ in entry.items()]
    return url_list, set(url_list)

def count_pruning(row):
    """Summarise how many URLs of one site row remain after every pruning stage.

    The first three values are plain lengths of the row's "crawl_urls",
    "crawled_urls" and "after_basic" entries. For "after_trees",
    "after_trees_limit" and "actual_urls" the nested mapping is flattened with
    get_urls and both the number of inc-url pairs and the number of unique
    URLs are reported.
    """
    tree_pairs, tree_urls = get_urls(row["after_trees"])
    limit_pairs, limit_urls = get_urls(row["after_trees_limit"])
    actual_pairs, actual_urls = get_urls(row["actual_urls"])

    return {
        "crawl_urls": len(row["crawl_urls"]),
        "crawled_urls": len(row["crawled_urls"]),
        "after basic": len(row["after_basic"]),
        "after trees (total inc-url pairs)": len(tree_pairs),
        "after trees (unique urls)": len(tree_urls),
        "after trees limit (total inc-url pairs)": len(limit_pairs),
        "after trees limit (unique urls)": len(limit_urls),
        "actual URLs": len(actual_pairs),
        "actual URLs (unique)": len(actual_urls),
    }
# The data describes it without browsers!
pruning_columns = ["crawl_urls", "crawled_urls", "after_basic", "after_trees", "after_trees_limit", "actual_urls"]
print("Pruning stats:")
pruning_stats = sc[pruning_columns].apply(count_pruning, axis=1, result_type="expand")
display(pruning_stats.describe())

In [None]:
# Which inclusion methods are predicted?
# Sites/URLs
def collect_incs(row):
    """Count the predicted URLs per inclusion method for one site row.

    Reads the nested row["after_trees"] mapping ({inclusion method: {url: browsers}})
    and returns {"any": {inclusion method: number of URLs}}. Inclusion methods
    without any URL are left out of the result.
    """
    per_inc = {}
    for inc, url_map in row["after_trees"].items():
        n_urls = len(url_map.items())
        if n_urls:
            per_inc[inc] = n_urls
    return {"any": per_inc}
# One {"any": {inc: url count}} record per crawled site, expanded into a single "any" column.
met = sc[["after_trees"]].apply(collect_incs, axis=1, result_type="expand")
# Per inclusion method: "count" = number of sites where it was predicted, "sum" = number of URLs overall.
met_any = pd.json_normalize(met["any"]).agg(["count", "sum"]).T
met_any[["count", "sum"]]

4. **Dynamic confirmation**:
    - Test all remaining inclusion_method-url-browser pairs
        - max 25 URLs for one inclusion method
        - max 3h
    - Test all possible states (regardless of whether the prediction was only for one state-pair)
    - Prepare states: 
        - Same as in **response collection**
        - Additionally for Firefox
    - For every inc method:
        - For every URL:
            - For every browser; If browser should be tested:
                - For every state:
                    - wait 1s
                    - visit `http://observer.org/opg/<inc>/?url=<url>`
                    - wait until "networkidle", max: 30s; for window.open wait for "networkidle" or "domcontentloaded" of the new window
                    - wait another 750ms (2000ms)
                    - extract observations
                    - (record responses)
                - If observations for every state are the same -> remove browser from to_test list
         - Repeat up to 5 times
     - Get confirmed distinguishable pairs:
         - 5 times different observations for one observation method -> confirmed browser-inc_method-url-state_a-state_b(-observation_method) pair
         - additional sanity checking: 
             - the same observation is not allowed to be present in both states (e.g., random frame counts: [(0, 1), (0, 1), (1, 0), (1, 0), (0, 1)] -> not a confirmed pair)
             - additional constraints for some methods: 
                 - e.g., custom code for postMessage, frame_count
                 - heuristic: at least one value should occur two times for the same state?

In [None]:
# Timeout of dynamic sites
# Sites whose actual_urls entry is not an empty dict, i.e. sites with at least one entry
# selected for dynamic confirmation.
sites_dyn = sites.loc[sites["actual_urls"] != {}]
display(len(sites_dyn))
display(sites_dyn["timeout_dyn"].value_counts())

In [None]:
# Early abort stats
# Number of dyn_conf rows per run; the frame has no column names, so columns are positional (0 = run, 1 = count).
dyn_conf_run = pd.DataFrame(conn.select("SELECT run, COUNT(id) from dyn_conf GROUP BY run"))
# Appends a "sum" row. Note that it also sums the run column, which is not a meaningful number.
dyn_conf_run.loc["sum"] = dyn_conf_run.sum()
display(dyn_conf_run)

In [None]:
# Total time taken
# Span between the earliest and the latest insertion_time in the site table.
sites["insertion_time"].max() - sites["insertion_time"].min() 

In [None]:
from urllib.parse import urlparse

from publicsuffix2 import get_sld

# Convert data (one entry for every confirmed URL)
conf = sc.loc[sc["confirmed_urls"].str.len() != 0]

# Collect one frame per (site, state) and concatenate once at the end; calling pd.concat
# inside the loop re-copies everything accumulated so far on every iteration.
frames = []
for site, confirmed in zip(conf["site"], conf["confirmed_urls"]):
    for state, records in confirmed.items():
        new = pd.DataFrame.from_dict(records)
        new["site"] = site
        new["state"] = state
        # The observation methods are stored under the column name "0".
        frames.append(new.rename(columns={"0": "observation_methods"}))
# pd.concat raises on an empty list, so fall back to an empty frame as before.
confs_raw = pd.concat(frames) if frames else pd.DataFrame()
display(confs_raw.head())

confs_raw["observation_methods"] = confs_raw["observation_methods"].apply(sorted)

# The "url" column holds JSON-encoded strings; decode it, take the hostname and reduce it with get_sld.
confs_raw["real_site"] = confs_raw["url"].apply(lambda x: get_sld(urlparse(json.loads(x)).hostname))
# True when the confirmed URL belongs to the tested site itself.
confs_raw["same_site"] = confs_raw["site"] == confs_raw["real_site"]

In [None]:
# State pairs whose name contains "acc" involve the accepted-cookies state.
involves_accepted = confs_raw["state"].str.contains("acc")
# Only sites that distinguish visited from ano
confs_ano = confs_raw.loc[~involves_accepted]
# Only sites that either distinguish ano from accepted or visited from accepted
confs_acc = confs_raw.loc[involves_accepted]

## History sniffing results!

In [None]:
# History sniffing: visited-vs-anonymous pairs, restricted to same-site URLs.
confs = confs_ano.loc[confs_ano["same_site"] == True]

In [None]:
# Number of distinct sites with a confirmed same-site pair, followed by the ten most
# frequent observation-method combinations.
print("Unique sites", confs["site"].nunique())
print(confs["observation_methods"].value_counts().to_frame().head(10))

### pM and co. check heuristics
- check heuristics implemented in dil.py
- mostly good
- by using the additional heuristic: at least one state is not allowed to have 5 different values, for most methods almost nothing is lost
- only el-message is noisy (16738): additional restriction, one state is only allowed to have one value!

In [None]:
# Rows rejected by check_5_vals (helper module). Going by the message printed below, these are
# entries with 5 unique observations in both vals_a and vals_b — confirm against helper.py.
# .copy() makes confs_5 an independent frame, so the column assignment below does not write
# to a slice of confs_raw (SettingWithCopyWarning).
# NOTE(review): this is built from confs_raw (all states, all parties), not from confs.
confs_5 = confs_raw[~confs_raw.apply(check_5_vals, axis=1)].copy()
# Stringify the method lists so that they can be used as group keys.
confs_5["observation_methods"] = confs_5["observation_methods"].astype(str)
print(f"Methods that have entries with 5 unique observations in both vals_a and vals_b: {confs_5['observation_methods'].unique()}")
display(confs_5.groupby("observation_methods").count())
# One example row per method, with wide columns so that the values are not truncated.
with pd.option_context("display.max_colwidth", 1000):
    display(confs_5[["observation_methods", "vals_a", "vals_b"]].drop_duplicates(subset="observation_methods"))

In [None]:
# Switch to stricter heuristic for analysis!
# Keep only the same-site visited-vs-anonymous rows for which check_5_vals returns True.
confs_heuristic = confs[confs.apply(check_5_vals, axis=1)]

In [None]:
import difflib
import diff_match_patch as dmp_module
# HTML lives in IPython.display; importing it from IPython.core.display is deprecated.
from IPython.display import HTML
dmp = dmp_module.diff_match_patch()


def get_diff(row):
    """Return (site, url prefix, inc_method, HTML diff of vals_a vs vals_b) for one row.

    Both value representations are cut to 5000 characters before diffing and the
    URL is cut to 100 characters.
    """
    vals_a = str(row["vals_a"])[:5000]
    vals_b = str(row["vals_b"])[:5000]
    diff = dmp.diff_main(vals_a, vals_b)
    return row["site"], row["url"][:100], row["inc_method"], dmp.diff_prettyHtml(diff)


methods_to_inspect = ["['contentDocument']", "['events-fired-all', 'events-fired']", "['length']", "['performanceAPI.smooth']", "['el-message']"]
#methods_to_inspect = ["['el-securitypolicyviolation']", "['fetch_response']", "['el-blur']", "['history.length']", "['el-error']"]

# Stringify the method lists once, on copies, so that they can be compared with the labels above.
# This used to be redone (together with the get_diff definition) for every method.
labelled_frames = [
    (confs_5.assign(observation_methods=confs_5["observation_methods"].astype(str)), "5 values"),
    (confs_heuristic.assign(observation_methods=confs_heuristic["observation_methods"].astype(str)), "less than 5 values"),
]

with pd.option_context("display.max_colwidth", 100):
    for method in methods_to_inspect:
        for frame, name in labelled_frames:
            print(name, method)
            # Show the diffs of the first five rows for this method.
            sample = frame.loc[frame["observation_methods"] == method].iloc[:5]
            display(HTML(sample.apply(get_diff, axis=1, result_type="expand").to_html(escape=False)))

In [None]:
# Even with strict (!=5) heuristic, pM still contains many FP likely values
with pd.option_context("display.max_colwidth", 1000):
    # Rows whose only observation method is el-message (postMessage), before ...
    pm = confs_heuristic.loc[confs_heuristic["observation_methods"].apply(str) == "['el-message']"]
    display(pm[["vals_a", "vals_b"]].head(3))
    # ... and after applying the stricter_pm helper.
    pm = pm.loc[pm.apply(stricter_pm, axis=1)]
    display(pm[["vals_a", "vals_b"]].head(3))
    
# Use even stricter heuristic for pMs:
# one state is only allowed to have a maximum of one observation, this leads to some FNs, but should remove all FPs

# stricter_pm is applied to every row here, not only to the el-message rows.
# NOTE(review): presumably it lets rows of other methods pass unchanged — confirm in helper.py.
confs_heuristic = confs_heuristic.loc[confs_heuristic.apply(stricter_pm, axis=1)]

## Continue main history sniffing analysis

In [None]:
# confs_heuristic is a filtered slice of confs; copy it before adding a column so that the
# assignment does not target a view of the parent frame (SettingWithCopyWarning).
confs_heuristic = confs_heuristic.copy()
# Channel = inclusion method + stringified list of observation methods.
confs_heuristic["channel"] = confs_heuristic["inc_method"] + "-" +  confs_heuristic["observation_methods"].apply(str)
# Number of distinct sites with a confirmed pair.
display(confs_heuristic["site"].nunique())
# Number of sites per (sorted) combination of browsers in which they were confirmed.
display(confs_heuristic.groupby(["site"])["browser"].unique().apply(sorted).astype(str).to_frame().value_counts().to_frame())
# Number of distinct sites per browser and state pair.
display(confs_heuristic.groupby(["browser", "state"])["site"].nunique().to_frame())

In [None]:
# Explode the observation methods to have one row for every observation method;
# the channel is then inclusion method + single observation method.
c_exp = confs_heuristic.explode("observation_methods")
c_exp = c_exp.assign(channel=c_exp["inc_method"] + "-" + c_exp["observation_methods"])
# Apply the pM heuristic again, now on the exploded rows
passes_pm = c_exp.apply(stricter_pm, axis=1)
c_exp = c_exp.loc[passes_pm]

In [None]:
# Average URLs/site
# (distribution of the number of distinct confirmed URLs per site)
display(c_exp.groupby("site")["url"].nunique().to_frame().describe())
# Average inc-url-pairs/site
# (distribution of the number of distinct opg_url values per site)
display(c_exp.groupby("site")["opg_url"].nunique().to_frame().describe())

In [None]:
# Fraction of vulnerable sites (a value in 0..1, not a percentage), relative to all
# sites with at least one crawled URL (sc).
c_exp["site"].nunique()/len(sc)

In [None]:
# Chromium-only subset. The browser column holds JSON-encoded strings, hence the embedded double quotes.
c_exp_c = c_exp.loc[c_exp["browser"] == '"chromium"']

In [None]:
# Fancy tables with Sites both browser, only chromium, only firefox, (sorted by sum)
browser_data = {}
pretty_names = {"inc_method": "Inclusion Method", "observation_methods": "Observation Method"}

for grouping, name in [(["inc_method"], "incs"), (["observation_methods"], "methods"), (["inc_method", "observation_methods"], "channels")]:
    # "events-fired-all" rows are left out; get_uniques (helper module) yields the per-group
    # columns, which are expanded via pd.Series and sorted by "Sum".
    df = (
        c_exp.loc[c_exp["observation_methods"] != "events-fired-all"]
        .groupby(grouping)
        .apply(get_uniques)
        .apply(pd.Series)
        .sort_values("Sum", ascending=False)
        .reset_index()
        .rename(columns=pretty_names)
    )
    # Inclusion methods are JSON-encoded strings; decode them wherever they are part of the grouping.
    if "inc_method" in grouping:
        df["Inclusion Method"] = df["Inclusion Method"].apply(json.loads)
    df = df.set_index([pretty_names[key] for key in grouping])
    df = df.rename(index={"fetch_response": "fetch-response"})
    browser_data[name] = df

In [None]:
#for name in ["incs", "methods", "channels"]:
# Top-20 channels table for the paper. Only "channels" works here, because the index
# rewrite below expects (inclusion method, observation method) tuples.
for name in ["channels"]:
    df = browser_data[name][["Both", "Only C", "Only FF", "Sum"]].head(20)
    # Drop the ".smooth" suffix from the observation method names.
    df.index = pd.MultiIndex.from_tuples([(x[0], x[1].replace('.smooth', '')) for x in df.index]).set_names(['Inclusion Method', 'Observation Method'])
    
    # Two-level header: a shared "Vulnerable sites" group above the per-browser columns.
    df.columns = pd.MultiIndex.from_arrays([["Vulnerable sites", "Vulnerable sites", "Vulnerable sites", "Vulnerable sites"], ["Both", "Only Chromium", "Only Firefox", "Sum"]])
    display(df)
    latex_table = df.style.to_latex(hrules=True, multicol_align="c")
    print(latex_table)
    # The res/ directory has to exist already; open() does not create it.
    with open(f"res/paper_history_{name}.tex", "w") as f:
        f.write(latex_table)

In [None]:
# Same table as in the previous cell, but with all rows (no head(20)) and exported as CSV.
for name in ["channels"]:
    df = browser_data[name][["Both", "Only C", "Only FF", "Sum"]]
    # Drop the ".smooth" suffix from the observation method names.
    df.index = pd.MultiIndex.from_tuples([(x[0], x[1].replace('.smooth', '')) for x in df.index]).set_names(['Inclusion Method', 'Observation Method'])
    
    df.columns = pd.MultiIndex.from_arrays([["Vulnerable sites", "Vulnerable sites", "Vulnerable sites", "Vulnerable sites"], ["Both", "Only Chromium", "Only Firefox", "Sum"]])
    display(df)
    df.to_csv(f"res/paper_history_{name}_full.csv")

In [None]:
# `df` is the variable left behind by the loop in the previous cell (the full channels table),
# so this cell only gives the intended numbers when it is run directly after that cell.
# NOTE(review): the counts use the "Only Chromium" / "Only Firefox" columns, i.e. channels with
# at least one site exclusive to that browser — confirm that "working channels" means this.
print(f"{len(df.loc[df[('Vulnerable sites', 'Only Chromium')] > 0])} working channels in chrome visited")
print(f"{len(df.loc[df[('Vulnerable sites', 'Only Firefox')] > 0])} working channels in firefox visited")

In [None]:
# Show the channels table with up to 84 rows before pandas truncates the output.
with pd.option_context("display.max_rows", 84):
    display(browser_data["channels"][["Both", "Only C", "Only FF", "Sum"]])

### Investigation of differences in browsers

In [None]:
# Row masks; the browser and inc_method values are JSON-encoded strings, hence the embedded quotes.
is_chromium = c_exp["browser"] == '"chromium"'
is_firefox = c_exp["browser"] == '"firefox"'
is_window_open = c_exp["inc_method"] == '"window.open"'

# Sites per browser with (w) and without (nw) window.open as inclusion method.
c_w = c_exp.loc[is_chromium & is_window_open]["site"].unique()
f_w = c_exp.loc[is_firefox & is_window_open]["site"].unique()
c_nw = c_exp.loc[is_chromium & ~is_window_open]["site"].unique()
f_nw = c_exp.loc[is_firefox & ~is_window_open]["site"].unique()

# Sites only having window.open in Chromium
c_ow = set(c_w).difference(c_nw)
# Sites only having window.open in Chromium and at least one non-window.open in Firefox
print(len(c_ow.intersection(f_nw)))

# Sites only having window.open in one of both
w_only_one = set(c_w).symmetric_difference(f_w)
print(len(w_only_one))

In [None]:
# Many sites only vuln in Firefox (non-window.open) use SameSite None without Secure (e.g., manderson.org, walmart.ca)!
# some of these sites use https://experienceleague.adobe.com/docs/experience-platform/tags/client-side/satellite-object.html?lang=en#cookie-set or similar to set the cookies in javascript
# they do not set SameSite nor Secure, Chromium defaults to Lax, Firefox defaults to None and warns about Secure not being set

# Per-inclusion-method table built in an earlier cell.
browser_data["incs"]

In [None]:
# window.open rows observed through the "length" method. .copy() turns the selection into an
# independent frame, so adding the helper columns below does not write to a slice of c_exp
# (SettingWithCopyWarning).
wo_len = c_exp.loc[(c_exp["observation_methods"] == "length") & (c_exp["inc_method"] == '"window.open"')].copy()
# Sorted, stringified "length" observations of both states, usable as value_counts keys.
wo_len["a"] = wo_len["vals_a"].apply(lambda x: x["length"]).apply(sorted).astype(str)
wo_len["b"] = wo_len["vals_b"].apply(lambda x: x["length"]).apply(sorted).astype(str)
# Restrict to the sites in w_only_one (built in an earlier cell).
wo_len_only_one = wo_len.loc[wo_len["site"].isin(w_only_one)]
# Most common (a, b) observation pairs: overall, then for the restricted set.
display(wo_len[["a", "b"]].value_counts().head(10).to_frame())
display(wo_len_only_one[["a", "b"]].value_counts().head(10).to_frame())

In [None]:
# Manual investigation notes on the browser differences. The commented-out display() calls
# below are the alternative queries; only the last one is active.

# Firefox double-script and style-import
# They all work due to 'timing-allow-origin' which was not in the response space

# Img
# 1 Site only working in Firefox has strange endless redirect in Chromium (thus not setting cookies)
# Sites only working in Chromium, Chromium gets blocked by WAF/bot detection for some reason, Firefox does not

# Iframe history length
# client side redirects?

# Iframe-dircsp
# probably lax?, + all kind of stuff

# el-securitypolicyviolation and not iframe-csp
# frame-ancestors bug in Firefox


with pd.option_context("display.max_colwidth", 500):
    #display(confs_heuristic.loc[(confs_heuristic["browser"] == '"firefox"') & (confs_heuristic["inc_method"] == '"double-script"')])
    #display(confs_heuristic.loc[(confs_heuristic["browser"] == '"firefox"') & (confs_heuristic["inc_method"] == '"style-import"')])
    #display(confs_heuristic.loc[(confs_heuristic["inc_method"] == '"img"')].drop_duplicates(subset=["browser", "site"]))
    #display(c_exp.loc[(c_exp["inc_method"] == '"iframe"') & (c_exp["observation_methods"] == "history.length")].drop_duplicates(subset=["browser", "site"]))
    #display(c_exp.loc[(c_exp["inc_method"] == '"iframe-dircsp"')].drop_duplicates(subset=["browser", "site"]))
    #display(c_exp.loc[(c_exp["observation_methods"] == "sheet")].drop_duplicates(subset=["browser", "site"]))
    # One example row per (browser, site) for el-securitypolicyviolation outside of iframe-csp.
    display(c_exp.loc[(c_exp["observation_methods"] == "el-securitypolicyviolation") & (c_exp["inc_method"] != '"iframe-csp"')].drop_duplicates(subset=["browser", "site"]))   

## Cookie acceptance sniffing
- Chromium only

In [None]:
# Chromium x2 + Firefox
ac = accept
# At least one locator was clicked
clicked = ac["clicked_count"] != 0
# At least one cookie is new, was removed, or changed its value
cookies_differ = (ac["cookies_new"].str.len() != 0) | (ac["cookies_removed"].str.len() != 0) | (ac["cookies_changed"].str.len() != 0)

# Worked: clicked + changes
worked = ac.loc[clicked & cookies_differ]
# Did not work: clicked + no changes
dnw = ac.loc[clicked & ~cookies_differ]
# Changed without click
cnc = ac.loc[~clicked & cookies_differ]
# No click no change
ncnc = ac.loc[~clicked & ~cookies_differ]

# Distinct sites per browser for each of the four outcomes
for name, df in [("worked", worked), ("DNW", dnw), ("CNC", cnc), ("NCNC", ncnc)]:
    print(name)
    display(df.groupby("browser")["site"].nunique())

In [None]:
# Initial Chromium accept crawl
# drop_duplicates keeps the first row per site by default, i.e. the earliest Chromium entry by insertion_time.
ac = accept.loc[accept["browser"] == "chromium"].sort_values("insertion_time").drop_duplicates(subset="site").sort_values("rank")

In [None]:
# At least one locator was clicked
clicked = ac["clicked_count"] != 0
# At least one cookie is new, was removed, or changed its value
cookies_differ = (ac["cookies_new"].str.len() != 0) | (ac["cookies_removed"].str.len() != 0) | (ac["cookies_changed"].str.len() != 0)

# Worked: clicked + cookies changed
worked = ac.loc[clicked & cookies_differ]
# Did not work: clicked + no changes
dnw = ac.loc[clicked & ~cookies_differ]
# Changed without click
cnc = ac.loc[~clicked & cookies_differ]
# No click no change
ncnc = ac.loc[~clicked & ~cookies_differ]

print("Accept cookies module stats:")
print(f"Worked: clicked + cookies changed on {len(worked)} sites\nDid not work: clicked + no changes on {len(dnw)} sites\nChanged without click on: {len(cnc)} sites\nNo change no click on: {len(ncnc)} sites")

In [None]:
# Second Chromium accept crawl (for dyn confirm)
# Only use sites that were attempted to be crawled?!
# keep="last" selects the latest Chromium entry per site by insertion_time.
ac = accept.loc[accept["browser"] == "chromium"].sort_values("insertion_time").drop_duplicates(subset="site", keep="last").sort_values("rank")
# Restrict to the sites in sites_dyn (sites with a non-empty actual_urls entry).
ac = ac.loc[ac["site"].isin(sites_dyn["site"])]

In [None]:
# At least one locator was clicked
clicked = ac["clicked_count"] != 0
# At least one cookie is new, was removed, or changed its value
cookies_differ = (ac["cookies_new"].str.len() != 0) | (ac["cookies_removed"].str.len() != 0) | (ac["cookies_changed"].str.len() != 0)

# Worked: clicked + cookies changed
worked = ac.loc[clicked & cookies_differ]
# Did not work: clicked + no changes
dnw = ac.loc[clicked & ~cookies_differ]
# Changed without click
cnc = ac.loc[~clicked & cookies_differ]
# No click no change
ncnc = ac.loc[~clicked & ~cookies_differ]

print("Accept cookies module stats:")
print(f"Worked: clicked + cookies changed on {len(worked)} sites\nDid not work: clicked + no changes on {len(dnw)} sites\nChanged without click on: {len(cnc)} sites\nNo change no click on: {len(ncnc)} sites")
print(f"Locator found but no click on {len(ac.loc[(ac['clicked_count'] == 0) & (ac['locator_count'] != 0)])} sites (subset of two previous ones)")

In [None]:
# Comparative data (history sniffing, only chromium, strict heuristics, accepted worked)
# `worked` is the frame from the preceding cell (second Chromium accept crawl).
c_exp_comp = c_exp_c.loc[c_exp_c["site"].isin(worked["site"].unique())]

In [None]:
# apply strict heuristics!
cookie = confs_acc[confs_acc.apply(check_5_vals, axis=1)]
# One row per observation method, channel = inclusion method + observation method
co_exp = cookie.explode("observation_methods")
co_exp = co_exp.assign(channel=co_exp["inc_method"] + "-" + co_exp["observation_methods"])
# Apply pM heuristic
co_exp = co_exp.loc[co_exp.apply(stricter_pm, axis=1)]
# Only chromium and only first-party URLs
co_exp = co_exp.loc[(co_exp["browser"] == '"chromium"') & co_exp["same_site"]]

In [None]:
# Fraction of vulnerable sites (a value in 0..1, not a percentage), relative to the
# sites where accepting cookies worked.
co_exp["site"].nunique()/len(worked)

In [None]:
# History-sniffing rows (Chromium, accept worked) stacked with the cookie-acceptance rows.
co_hist = pd.concat([c_exp_comp, co_exp])
# Number of sites per (sorted) combination of distinguishable state pairs.
display(co_hist.groupby("site")["state"].unique().apply(sorted).value_counts().to_frame())

In [None]:
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from matplotlib import pyplot as plt

# NOTE(review): the subset sizes are hard-coded. They look like they were copied by hand from the
# state-combination table displayed in the previous cell — re-check them whenever the data changes.
# subset_areas only sets the drawn region sizes; the labels show the values from `subsets`.
venn3_unweighted(subsets = (20, 28, 701, 12, 203, 10, 123),
      set_labels = ('Accepted-Anonymous', 'Accepted-Visited', 'Visited-Anonymous'), alpha = 0.5,
      subset_areas = (20, 28, 701 - 500, 12, 203 - 100, 10 + 50, 123 - 50))
plt.savefig('res/venn_states.pdf')

In [None]:
# Site sets for both attack classes, computed once and reused below
cookie_sites = set(co_exp["site"].unique())
hist_sites = set(c_exp_comp["site"].unique())

print("Attempted sites", len(worked))
print("Cookie accepted sites", co_exp["site"].nunique())
print("History comparative data", c_exp_comp["site"].nunique())
print("Sites vuln in both", len(cookie_sites & hist_sites))
print("Sites vuln only to cookie", len(cookie_sites - hist_sites))
print("Sites vuln only to hist", len(hist_sites - cookie_sites))
print(hist_sites - cookie_sites)

# Distinct sites per browser/state pair, then state-pair combinations per site and per URL
display(co_exp.groupby(["browser", "state"])["site"].nunique().to_frame())
for key in ("site", "url"):
    display(co_exp.groupby(key)["state"].unique().apply(sorted).value_counts().to_frame())

In [None]:
# Channel and co analysis!
# Note: this replaces the browser_data dict built for the history sniffing tables.
browser_data = {}
pretty_names = {"inc_method": "Inclusion Method", "observation_methods": "Observation Method"}

for grouping, name in [(["inc_method"], "incs"), (["observation_methods"], "methods"), (["inc_method", "observation_methods"], "channels")]:
    # "events-fired-all" rows are left out; get_uniques (helper module) is called with cat="state".
    df = (
        co_hist.loc[co_hist["observation_methods"] != "events-fired-all"]
        .groupby(grouping)
        .apply(get_uniques, cat="state")
        .apply(pd.Series)
        .sort_values("Sum", ascending=False)
        .reset_index()
        .rename(columns=pretty_names)
    )
    # Inclusion methods are JSON-encoded strings; decode them wherever they are part of the grouping.
    if "inc_method" in grouping:
        df["Inclusion Method"] = df["Inclusion Method"].apply(json.loads)
    df = df.set_index([pretty_names[key] for key in grouping])
    df = df.rename(index={"fetch_response": "fetch-response"})
    browser_data[name] = df

In [None]:
#for name in ["incs", "methods", "channels"]:
# Top-20 cookie-acceptance channels table for the paper. Only "channels" works here, because the
# index rewrite below expects (inclusion method, observation method) tuples.
for name in ["channels"]:
    df = browser_data[name][["Both", "Only Cookie"]].head(20)# "Only Hist", "Sum"]].head(20)
    # Drop the ".smooth" suffix from the observation method names.
    df.index = pd.MultiIndex.from_tuples([(x[0], x[1].replace('.smooth', '')) for x in df.index]).set_names(['Inclusion Method', 'Observation Method'])
    df.columns = pd.MultiIndex.from_arrays([["Vulnerable sites", "Vulnerable sites"], ["History & Acceptance", "Only Acceptance"]])
    display(df)
    latex_table = df.style.to_latex(hrules=True, multicol_align="c")
    print(latex_table)
    # The res/ directory has to exist already; open() does not create it.
    with open(f"res/paper_cookie_{name}.tex", "w") as f:
        f.write(latex_table)

In [None]:
#for name in ["incs", "methods", "channels"]:
for name in ["channels"]:
    # Full (untruncated) table for the CSV export; "Only Hist" and "Sum" are not exported
    df = browser_data[name][["Both", "Only Cookie"]]
    # Drop the ".smooth" suffix from the observation method for presentation
    channel_pairs = [(inc, obs.replace('.smooth', '')) for inc, obs in df.index]
    df.index = pd.MultiIndex.from_tuples(channel_pairs).set_names(['Inclusion Method', 'Observation Method'])
    df.columns = pd.MultiIndex.from_arrays([["Vulnerable sites"] * 2, ["History & Acceptance", "Only Acceptance"]])
    display(df)
    df.to_csv("res/paper_cookie_channels_full.csv")

In [None]:
# NOTE: relies on `df` still holding the table assigned by the loop in the preceding cell
acceptance_only = df[('Vulnerable sites', 'Only Acceptance')]
print(f"{len(df.loc[acceptance_only > 0])} working channels in chrome acceptance")

In [None]:
# Investigate strange cases?!

# Sites that only show up in history sniffing
strange_sites = [
    'tinhte.vn', 'pngwing.com', 'psychologytoday.com', 'sondakika.com', 'sfweekly.com', 'analdin.com',
    'gamebanana.com', 'top.gg', 'ct.gov', 'rediff.com', 'united.com', 'cambridge.org',
]
test = co_hist.loc[co_hist["site"].isin(strange_sites)]
# Probably mostly noise (or it only works in a completely fresh browser);
# it is unlikely that the 5+5 tests were different, but not impossible

# el-blur:
# (for visited vs ano: mostly bot detection for some reason??)
# for accepted vs *: the blocking cookie banner has auto focus!

# securitypolicyviolation:
# redirect to the accept-cookie page!

with pd.option_context("display.max_colwidth", 500):
    # One example row per (browser, site) pair
    display(test.drop_duplicates(subset=["browser", "site"]))
    #display(co_hist.loc[co_hist["observation_methods"] == "el-blur"].drop_duplicates(subset=["site"]))
    #display(co_exp.loc[co_exp["observation_methods"] == "el-securitypolicyviolation"].drop_duplicates(subset=["site"]))



## Third-parties
- Third parties (vs. first parties)
- URL sources: links vs. requests

In [None]:
# Build the first-/third-party working set from confs_ano:
# 1. keep only rows accepted by check_5_vals,
# 2. explode the observation_methods lists into one row per observation method,
# 3. add a "channel" label of the form "<inc_method>-<observation_method>",
# 4. apply the pM heuristic (stricter_pm) again.
all_parties = confs_ano.loc[confs_ano.apply(check_5_vals, axis=1)].explode("observation_methods")
all_parties = all_parties.assign(channel=all_parties["inc_method"] + "-" + all_parties["observation_methods"])
# Later cells add columns to all_parties in place; take an explicit copy so those
# assignments operate on an independent frame instead of a filtered slice
# (avoids pandas' SettingWithCopyWarning).
all_parties = all_parties.loc[all_parties.apply(stricter_pm, axis=1)].copy()

In [None]:
# Number of distinct sites overall, then split by the same_site flag
display(all_parties["site"].nunique())
display(all_parties.groupby("same_site")["site"].nunique())
# Per site: which same_site values occur (only True, only False, or both), and how often
same_site_mix = all_parties.groupby("site")["same_site"].unique().reset_index()["same_site"].apply(sorted).astype(str)
display(same_site_mix.value_counts())

In [None]:
# Third-parties that occur often

# A lot of cookie syncing
# Many popular third-parties
# Rows with same_site == False are treated as third-party; count distinct sites per real_site
cross_site_rows = all_parties.loc[all_parties["same_site"] == False].rename(columns={"real_site": "Third-Party"})
third_parties = cross_site_rows.groupby("Third-Party")["site"].nunique().to_frame().sort_values("site", ascending=False)
top_third_parties = third_parties.head(10)
display(top_third_parties)
latex_table = top_third_parties.style.to_latex()
with open("res/paper_third_popular.tex", "w") as f:
    f.write(latex_table)
display(third_parties.describe())

In [None]:
# URLs that occur on more than one site: the ten URLs seen on the most distinct sites
sites_per_url = all_parties.groupby("url")["site"].nunique().to_frame()
sites_per_url.sort_values("site", ascending=False).head(10)

In [None]:
# Check how methods change with third-party vs first-party
# Channel and co analysis!
# Builds one table per grouping (inclusion method, observation method, and the
# combination of both = "channel") by applying get_uniques with cat="site" to all_parties.
browser_data = {}

display_names = {"inc_method": "Inclusion Method", "observation_methods": "Observation Method"}
groupings = [(["inc_method"], "incs"), (["observation_methods"], "methods"), (["inc_method", "observation_methods"], "channels")]
# Rows whose observation method is "events-fired-all" are excluded from every table
all_parties_filtered = all_parties.loc[all_parties["observation_methods"] != "events-fired-all"]

for grouping, name in groupings:
    df = all_parties_filtered.groupby(grouping).apply(get_uniques, cat="site").apply(pd.Series)
    df = df.sort_values("Sum", ascending=False).reset_index().rename(columns=display_names)
    # The index consists of exactly the (renamed) columns that were grouped on
    index_columns = [display_names[column] for column in grouping]
    if "Inclusion Method" in index_columns:
        # inc_method values are JSON-encoded; decode them for display
        df["Inclusion Method"] = df["Inclusion Method"].apply(json.loads)
    df = df.set_index(index_columns)
    df = df.rename(index={"fetch_response": "fetch-response"})
    browser_data[name] = df

In [None]:
# Show the first 20 rows of each table and write them to res/paper_third_<name>.tex
paper_columns = ["Both", "Only First", "Only Third", "Sum"]
for name in ["incs", "methods", "channels"]:
    df = browser_data[name][paper_columns].head(20)
    display(df)
    latex_table = df.style.to_latex()
    with open(f"res/paper_third_{name}.tex", "w") as f:
        f.write(latex_table)

In [None]:
# Check all channels/methods for the percentage of cross-site URLs
# Some have very high third-party rates (e.g., perfAPI), others have low third-party rates (e.g., length)
# NOTE: conf_t is an alias of all_parties (no copy is made), so the string cast
# below is also visible through all_parties.
conf_t = all_parties
conf_t["observation_methods"] = conf_t["observation_methods"].astype(str)
# "mean" is the mean of same_site per observation method, "count" the number of rows
same_site_stats = conf_t.groupby("observation_methods")["same_site"].agg(["mean", "count"])
same_site_stats.sort_values("count", ascending=False).head(20)

In [None]:
# Third-party investigation notes:
# Firefox:
# - State partitioning is doing "weird" stuff? Load site, load attack site in the same tab -> works;
#   load site, load attack site in another tab -> does not work
# Chromium:
# - Tabs do not matter, they are "synced"
# Third-party:
# - Cookies are set for the domain (e.g., .doubleclick.com) -> thus even though every site has a different
#   subdomain, the only information that leaks is that doubleclick cookies are already set? :(
# - This holds for, e.g., both perfAPI (doubleclick) and fetch_response (demdex)
# PerfAPI:
# - When the `timing-allow-origin=*` header allows timing information to leak cross-origin,
#   it also allows the resource size to leak!!
# Fetch_response:
# - CORS replay "misconfig"! "Access-Control-Allow-Credentials: true, Access-Control-Allow-Methods: GET, POST, OPTIONS, Access-Control-Allow-Origin: http://observer.org:8001"
# - More common than thought?
# - Redirect to set cookies, if no cookies are set already

with pd.option_context("display.max_colwidth", None):
    # One example row per site for the performanceAPI.smooth observation method (last five sites)
    perf_api_rows = all_parties.loc[all_parties["observation_methods"] == "performanceAPI.smooth"]
    display(perf_api_rows.drop_duplicates(subset="site").tail(5))
    #display(all_parties.loc[all_parties["observation_methods"] == "fetch_response"].drop_duplicates(subset="site").head(10))
    #display(all_parties.loc[all_parties["observation_methods"] == "win.performanceAPI.smooth"].drop_duplicates(subset="site").head(5))
    #display(all_parties.loc[all_parties["observation_methods"] == "width"].drop_duplicates(subset="site").head(5))
In [None]:
# Links vs resources/requests

In [None]:
# Lookup table: site -> {url -> collected URL entry}, built from the "urls" column of `sites`
site_dict = {}
for _, site_row in sites.iterrows():
    entries_by_url = {}
    for entry in site_row["urls"]:
        entries_by_url[entry["url"]] = entry
        # Print every entry that is flagged as both a link and a request
        if entry["link"] and entry["request"]:
            print(entry)
    site_dict[site_row["site"]] = entries_by_url

In [None]:
def get_source(row):
    """Classify where the URL of a row was collected from.

    Reads ``row["site"]`` and the JSON-encoded ``row["url"]`` and looks the URL up
    in the module-level ``site_dict`` (a KeyError is raised if the site or URL is
    missing there).

    Returns:
        "hompage" for the site's root URL (https, http, or https with "www."),
        otherwise "request" if the entry's request flag is set, otherwise "link"
        if its link flag is set, otherwise "invalid".
    """
    site = row["site"]
    url = json.loads(row["url"])
    entry = site_dict[site][url]
    # The http:// and www. variants should be redirects caused by visiting the https:// root
    homepage_variants = (f"https://{site}/", f"http://{site}/", f"https://www.{site}/")
    if url in homepage_variants:
        # NOTE(review): label is misspelled ("homepage"); kept unchanged because it is a returned value
        return "hompage"
    # "request" takes precedence over "link" when both flags are set
    for kind in ("request", "link"):
        if entry[kind]:
            return kind
    return "invalid"
# Tag every row with the source of its URL as classified by get_source
site_url_pairs = all_parties[["site", "url"]]
all_parties["source"] = site_url_pairs.apply(get_source, axis=1)

In [None]:
# URL sources: number of rows per source, number of distinct sites per source,
# and how often each combination of sources occurs on a site
display(all_parties["source"].value_counts())
display(all_parties.groupby("source")["site"].nunique())
sources_per_site = all_parties.groupby("site")["source"].unique().reset_index()["source"].apply(sorted).astype(str)
display(sources_per_site.value_counts())

In [None]:
display(all_parties[["same_site", "source"]].value_counts().to_frame())
# Combined label "<same_site>-<source>", e.g. "True-link" or "False-request"
all_parties["source_site"] = all_parties["same_site"].apply(str) + "-" + all_parties["source"]
# Per site: the sorted list of labels (as a string), then how many sites share each list
labels_per_site = all_parties.groupby("site")["source_site"].unique().reset_index()["source_site"].apply(sorted).astype(str)
source_table = labels_per_site.value_counts().to_frame()
display(source_table)

In [None]:
def split_data(row):
    """Split one row of the source/party summary into first- and third-party sources.

    One field of ``row`` holds a stringified list of ``"<same_site>-<source>"``
    labels (e.g. ``"['False-link', 'True-request']"``); the other holds the number
    of sites that show exactly this combination. Two field layouts are accepted:

    * ``index`` / ``source_site`` (``value_counts().to_frame().reset_index()`` on pandas < 2.0)
    * ``source_site`` / ``count`` (the same call on pandas >= 2.0)

    Args:
        row: Mapping-like row (pandas Series or dict) in one of the layouts above.

    Returns:
        dict with the sorted source names under "First-Party" and "Third-Party"
        and the site count under "Sites". Labels whose party part is "False" are
        third-party; everything else is first-party.
    """
    if "index" in row:
        labels, site_count = row["index"], row["source_site"]
    else:
        labels, site_count = row["source_site"], row["count"]
    third_party = []
    first_party = []
    # Strip the surrounding "[" / "]" of the stringified list, then walk its quoted items
    for entry in labels[1:-1].split(", "):
        if not entry:
            # Stringified empty list ("[]"): nothing to classify
            continue
        # Drop the quotes and split on the first "-" only, so a source name containing "-" stays intact
        party, source = entry[1:-1].split("-", 1)
        if party == "False":
            third_party.append(source)
        else:
            first_party.append(source)
    return {"First-Party": sorted(first_party), "Third-Party": sorted(third_party), "Sites": site_count}
# Expand the dicts returned by split_data into columns and stringify everything for the export
st = source_table.reset_index().apply(split_data, axis=1, result_type="expand").astype(str)
for party_column in ["First-Party", "Third-Party"]:
    # Turn the list repr "['link', 'request']" into the plain text "link, request"
    st[party_column] = st[party_column].apply(lambda x: x.replace("[", "").replace("]", "").replace("'", ""))
st = st.set_index(["First-Party", "Third-Party"])
display(st)
# Only the first ten rows go into the paper table
latex_table = st.head(10).style.to_latex()
with open("res/paper_third_source.tex", "w") as f:
    f.write(latex_table)