# hide
title: interactive referrer heatmap (top 10k)
enable: datatables
tags: data web interactive

In [1]:
# hide
import sys
sys.path.insert(0, "..")

import pandas as pd
import numpy as np

from html_tools import *
from har_research.har import *
from har_research.whois import WhoisCache

In [2]:
# hide
def build_host_to_host():
    """
    Count, for every recorded request that carries a referer, how often
    the referring host requested the target host.

    Walks all entries yielded by ``iter_har_entries`` for the recordings
    below ``automatic/recordings``. Requests without a ``Referer`` header,
    or with an empty one, are counted as requests but add no edge.

    Returns a tuple ``(host_to_host, num_files, num_requests)`` where
    ``host_to_host`` maps ``referer host -> {requested host: count}``,
    ``num_files`` is the number of distinct filenames seen and
    ``num_requests`` is the number of entries visited.
    """
    referrals = dict()
    seen_files = set()
    request_count = 0
    for filename, first_e, e in tqdm(iter_har_entries("automatic/recordings/*/*.json")):
        seen_files.add(filename)
        request_count += 1
        request = e["request"]
        url = parse_url(request["url"])

        # Only the first header named "referer" (case-insensitive) is considered.
        referer_header = next(
            (h for h in request["headers"] if h["name"].lower() == "referer"),
            None,
        )
        if referer_header is None or not referer_header["value"]:
            continue

        source_host = parse_url(referer_header["value"])["host"]
        target_host = url["host"]

        targets = referrals.setdefault(source_host, dict())
        targets[target_host] = targets.get(target_host, 0) + 1

    return referrals, len(seen_files), request_count

In [3]:
# hide
def reduce_host_to_host_by_count_requests(host_to_host, min_count):
    """
    Keep only hosts that occur at least ``min_count`` times in the mapping.

    A host's occurrence count is 1 if it is a referrer (a top-level key)
    plus 1 for every referrer that lists it as a target. Note that the
    per-edge request counts are not used for the threshold, they are only
    carried over into the result.

    Referrers whose targets are all filtered out are dropped as well.
    Returns a new mapping; the input is not modified.
    """
    occurrences = dict()
    for referrer, targets in host_to_host.items():
        for name in (referrer, *targets):
            occurrences[name] = occurrences.get(name, 0) + 1

    reduced = dict()
    for referrer, targets in host_to_host.items():
        if occurrences[referrer] < min_count:
            continue
        kept_targets = {
            target: count
            for target, count in targets.items()
            if occurrences[target] >= min_count
        }
        if kept_targets:
            reduced[referrer] = kept_targets
    return reduced

def reduce_host_to_host_by_count_hosts(host_to_host, min_count):
    """
    Keep only hosts that are referred to by at least ``min_count``
    distinct referrers.

    The threshold is applied to both sides: a referrer survives only if
    it is itself a target of enough distinct referrers, and each of its
    targets must pass the same check. Hosts that are never a target count
    as zero. Referrers left without any target are dropped.
    Returns a new mapping; the input is not modified.
    """
    referrers_of = dict()
    for referrer, targets in host_to_host.items():
        for target in targets:
            referrers_of.setdefault(target, set()).add(referrer)

    def num_referrers(name):
        return len(referrers_of.get(name, ()))

    reduced = dict()
    for referrer, targets in host_to_host.items():
        if num_referrers(referrer) < min_count:
            continue
        kept_targets = {
            target: count
            for target, count in targets.items()
            if num_referrers(target) >= min_count
        }
        if kept_targets:
            reduced[referrer] = kept_targets
    return reduced

def get_labels(host_to_host: dict):
    """
    Return the sorted axis labels of a host-to-host mapping.

    The result is a tuple ``(hosts_x, hosts_y)``: ``hosts_x`` holds the
    sorted top-level keys (referrers), ``hosts_y`` the sorted union of all
    inner keys (targets).
    """
    targets = {
        target
        for row in host_to_host.values()
        for target in row.keys()
    }
    return sorted(host_to_host.keys()), sorted(targets)

In [4]:
# hide
def trim_host_name(host: str) -> str:
    """
    Collapse host names that only differ in a generated sub-domain part
    into one wildcard name.

    Each rule is ``(required prefix, required suffix, replacement)``; the
    first matching rule wins. Hosts that match no rule are returned
    unchanged.
    """
    rules = (
        ("", ".safeframe.googlesyndication.com", "*.safeframe.googlesyndication.com"),
        ("", ".fls.doubleclick.net", "*.fls.doubleclick.net"),
        ("hal9", ".redintelligence.net", "hal9*.redintelligence.net"),
        ("", ".cloudfront.net", "*.cloudfront.net"),
        ("", ".ssl.cf2.rackcdn.com", "*.ssl.cf2.rackcdn.com"),
        ("tps", ".doubleverify.com", "tps*.doubleverify.com"),
    )
    for prefix, suffix, replacement in rules:
        # An empty prefix matches every host, so those rules test the suffix only.
        if host.endswith(suffix) and host.startswith(prefix):
            return replacement
    return host

def trim_host_to_host(host_to_host: dict) -> dict:
    """
    Apply ``trim_host_name`` to every referrer and target of the mapping.

    Hosts that collapse to the same trimmed name are merged and their
    request counts are summed. Returns a new mapping; the input is not
    modified.
    """
    merged = dict()
    for referrer, targets in host_to_host.items():
        # The row is created even when the referrer has no targets.
        row = merged.setdefault(trim_host_name(referrer), dict())
        for target, count in targets.items():
            target_name = trim_host_name(target)
            row[target_name] = row.get(target_name, 0) + count
    return merged

In [5]:
# hide
# Load the referrer counts from the JSON cache file if it exists; otherwise
# build them from the recordings and write the cache file.
# NOTE(review): `os` and `json` are not imported explicitly in this notebook;
# presumably they arrive through one of the star imports - confirm.
FILENAME = "10k-host-to-host.json"
if os.path.exists(FILENAME):
    with open(FILENAME) as fp:
        data = json.load(fp)
    host_to_host_org = data["host_to_host"]
    num_all_sessions = data["num_sessions"]
    num_all_requests = data["num_requests"]
else:
    host_to_host_org, num_all_sessions, num_all_requests  = build_host_to_host()
    with open(FILENAME, "w") as fp:
        json.dump({
            "num_sessions": num_all_sessions, 
            "num_requests": num_all_requests, 
            "host_to_host": host_to_host_org, 
        }, fp)

# Axis labels of the complete, unfiltered data (kept for the summary text).
all_xy = get_labels(host_to_host_org)
print("all:     from", len(all_xy[0]), "hosts to", len(all_xy[1]), "hosts")

# Merge generated sub-domains into wildcard names.
host_to_host = trim_host_to_host(host_to_host_org)
hosts_x, hosts_y = get_labels(host_to_host)
print("trimmed: from", len(hosts_x), "hosts to", len(hosts_y), "hosts")

# Keep only hosts that are referred to by at least 3 distinct hosts.
host_to_host = reduce_host_to_host_by_count_hosts(host_to_host, 3)
hosts_x, hosts_y = get_labels(host_to_host)
print("limited: from", len(hosts_x), "hosts to", len(hosts_y), "hosts")       

# From here on `host_to_host_org` is the reduced but untrimmed mapping;
# `all_xy` above still describes the data before this reduction.
host_to_host_org = reduce_host_to_host_by_count_hosts(host_to_host_org, 3)

all:     from 9958 hosts to 27084 hosts
trimmed: from 8379 hosts to 24462 hosts
limited: from 608 hosts to 1537 hosts


In [6]:
# hide
def get_whois(host: str, ask_the_web=True) -> dict:
    """
    Look up the whois record for ``host``.

    The lookup is first done for the ``short_host`` of the parsed host
    name. If that record has no ``network`` value, a second lookup for the
    full ``host`` is made and every field that is empty in the first
    record is filled from the second one.

    :param host: host name to look up
    :param ask_the_web: passed through to ``WhoisCache.get_best_effort``
    :return: the whois record, a mapping of field name to value
        (presumably a plain dict - confirm against ``WhoisCache``)
    """
    url = parse_url(host)
    data = WhoisCache().get_best_effort(url["short_host"], ask_the_web=ask_the_web)
    if not data["network"]:
        fallback = WhoisCache().get_best_effort(host, ask_the_web=ask_the_web)
        for key, value in fallback.items():
            # Only fill gaps, never overwrite a value of the first record.
            if value and not data[key]:
                data[key] = value
    return data

def get_whois_to_whois(host_to_host: dict, field: str, verbose=False):
    """
    Regroup a host-to-host mapping by one field of the hosts' whois records.

    Every referrer and target is replaced by ``get_whois(host)[field]``,
    or by ``"-"`` when that value is empty. Request counts of hosts that
    share the same label are summed.

    :param host_to_host: mapping ``referrer -> {target: count}``
    :param field: key of the whois record to group by
    :param verbose: wrap the iteration in ``tqdm`` when true
    :return: mapping ``referrer label -> {target label: count}``
    """
    def label_of(name):
        return get_whois(name)[field] or "-"

    grouped = dict()
    rows = tqdm(host_to_host.items()) if verbose else host_to_host.items()
    for referrer, targets in rows:
        # The row is created even when the referrer has no targets.
        row = grouped.setdefault(label_of(referrer), dict())
        for target, count in targets.items():
            target_label = label_of(target)
            row[target_label] = row.get(target_label, 0) + count

    return grouped

#whois_to_whois = get_whois_to_whois(host_to_host, "network")
#whois_x, whois_y = get_labels(whois_to_whois)
#print("from", len(whois_x), "hosts to", len(whois_y), "hosts")     

In [7]:
# hide
def render_table(host_to_host: dict, description: bool = False, with_whois: bool = False):
    # hide-code
    """
    Render a table with one row per host that occurs in ``host_to_host``,
    either as referrer or as target.

    Columns:
      - ``referred by hosts``: number of distinct referrers of the host
      - ``referred hosts``: number of distinct targets of the host
      - ``referred by requests %`` / ``referred requests %``: incoming and
        outgoing request counts in percent of all requests in the mapping
      - ``registrant`` / ``network`` (only with ``with_whois``): values
        from ``get_whois``, ``"-"`` when empty

    :param host_to_host: mapping ``referrer -> {target: count}``
    :param description: when true, column descriptions are passed to
        ``datatable``; otherwise ``None`` is passed
    :param with_whois: add the whois columns in front of the counts
    """
    h_in = dict()
    h_out = dict()
    h_in_set = dict()
    for host, other_hosts in host_to_host.items():
        h_out[host] = h_out.get(host, 0) + sum(other_hosts.values())
        for ohost, count in other_hosts.items():
            h_in[ohost] = h_in.get(ohost, 0) + count
            h_in_set.setdefault(ohost, set()).add(host)

    num_requests = sum(h_out.values())

    def percent(count):
        # A mapping without any counted request (e.g. only empty rows)
        # would otherwise raise ZeroDivisionError.
        if not num_requests:
            return 0.0
        return round(count / num_requests * 100, 2)

    all_hosts = sorted(set(h_in) | set(h_out))

    # Placeholders fix the column order; the real values are filled in below.
    front_data = {"registrant": "", "network": ""} if with_whois else {}
    df = pd.DataFrame([
        {
            **front_data,
            "referred by hosts": len(h_in_set.get(host, {})),
            "referred hosts": len(host_to_host.get(host, {})),
            "referred by requests %": percent(h_in.get(host, 0)),
            "referred requests %": percent(h_out.get(host, 0)),
        }
        for host in all_hosts
    ])
    df.index = all_hosts
    if with_whois:
        whois_data = [get_whois(h) for h in df.index]
        df["registrant"] = [w["registrant"] or "-" for w in whois_data]
        df["network"] = [w["network"] or "-" for w in whois_data]

    # Kept in its own name so that the bool parameter is not rebound as a dict.
    column_help = None
    if description:
        column_help = {}
        if with_whois:
            column_help = {
                "registrant": "The 'whois' registrant organization",
                "network": "The network provider according to whois-data",
            }
        column_help = {
            **column_help,
            "referred by hosts": "The number of servers that referred to this server.",
            "referred hosts": "The number of servers that have been referred by this server.",
            "referred by requests %": "The number of requests that referred to this server (in percent of all filtered requests).",
            "referred requests %": "The number of requests that have been referred by this server (in percent of all filtered requests).",
        }
    datatable(df, max_rows=df.shape[0], order=[[1, "desc"]], description=column_help)

#render_table(host_to_host, with_whois=True)
#render_table(whois_to_whois)

In [8]:
# hide
def render_heatmap(host_to_host: dict, hosts_x: list, hosts_y: list, filter_x: str = None):
    # hide-code
    """
    Render the host-to-host request counts as a heatmap between two
    horizontal rules.

    A matrix with one column per entry of ``hosts_x`` (referrers) and one
    row per entry of ``hosts_y`` (targets) is filled with the request
    counts; cells that stay zero are replaced by NaN before the matrix is
    handed to ``html_heatmap``.

    :param host_to_host: mapping ``referrer -> {target: count}``
    :param hosts_x: column labels; must contain every referrer
    :param hosts_y: row labels; must contain every target
    :param filter_x: passed through to ``html_heatmap``
    :raises ValueError: if a host of the mapping is missing in its label list
    """
    def first_positions(labels):
        # Same result as list.index() for every label, but built in one pass.
        positions = dict()
        for position, label in enumerate(labels):
            positions.setdefault(label, position)
        return positions

    def position_of(positions, label, axis_name):
        try:
            return positions[label]
        except KeyError:
            # list.index() raised ValueError here; keep that exception type.
            raise ValueError(f"{label!r} is not in {axis_name}") from None

    x_positions = first_positions(hosts_x)
    y_positions = first_positions(hosts_y)

    embeddings = [
        [0] * len(hosts_x)
        for _ in hosts_y
    ]
    for from_host, to_hosts in host_to_host.items():
        x_index = position_of(x_positions, from_host, "hosts_x")
        for to_host, count in to_hosts.items():
            y_index = position_of(y_positions, to_host, "hosts_y")
            embeddings[y_index][x_index] += count

    embeddings = pd.DataFrame(embeddings).replace({0: np.nan})

    html_display("<hr>")
    html_heatmap(
        embeddings, labels_x=hosts_x, labels_y=hosts_y,
        label_width="250px",
        max_label_length=36,
        keep_label_front=True,
        colors="viridis",
        min_cells_x=40,
        max_cells_x=50,
        max_cells_y=50,
        show_empty_x=False, show_empty_y=False,
        filter_x=filter_x,
    )
    html_display("    <hr>")

#render_heatmap(host_to_host, hosts_x, hosts_y)

In [9]:
# hide
def render_all(host_to_host, description: bool = False, filter_x: str = None, with_whois: bool = False):
    """
    Render the table and the heatmap for one host-to-host mapping.

    ``description`` and ``with_whois`` are passed to ``render_table``,
    ``filter_x`` to ``render_heatmap``. The heatmap axes are the labels
    returned by ``get_labels``.
    """
    axis_labels = get_labels(host_to_host)
    render_table(host_to_host, description=description, with_whois=with_whois)
    render_heatmap(host_to_host, *axis_labels, filter_x=filter_x)

A matrix of who requested whom according to the HTTP [Referer](https://en.wikipedia.org/wiki/HTTP_referer) value. Whenever a browser loads an [iframe](https://en.wikipedia.org/wiki/Iframe#Frames) or another object that itself requested more resources from the web, the `Referer` value in the HTTP header is set to the address of the server that provided the iframe or object. 

Obviously, **ad exchange networks** refer to each other a lot, and this can be clearly seen in the data below. The tables list each *referring* host and the accumulated numbers, while the heatmaps/matrices show the concrete number of connections between two hosts, where the X axis is the referrer (e.g. the source of an iframe) and the Y axis is the destination (e.g. an image loaded from another server into that iframe).

Those matrices are shown for

- [domain names](#domain): The actual full domain name of the server (multiple sub-domains that just change by number or hash-values are aggregated with `*`).
- [domain registrants](#domain-registrant): The `Registrant Organization` of the domain, according to [whois](https://en.wikipedia.org/wiki/WHOIS) information, if available. 
- [network providers](#network-provider): The network provider according to `whois` information.
- [network countries](#network-country): The country of the network provider according to `whois` information.

Each matrix shows only a limited window (at most 50 by 50 cells) and allows paging with the little number inputs. The filters accept comma-separated, case-sensitive [regular expressions](https://en.wikipedia.org/wiki/Regular_expression).

### The dataset

was created by browsing the first 10,000 pages listed by the [Tranco Top Sites](https://tranco-list.eu) data. Some extra pages are in there (e.g. the German newspapers from another article). Also, sites that do not support `HTTPS` are ignored.
 
[A script](https://github.com/defgsus/blog/blob/master/src/har_research/automatic/capture.py) used [selenium](https://www.selenium.dev/) and the Firefox browser to load each site and scrolled to the bottom of the page (or at most 20,000 pixels if no end is in sight).

The script tries to *accept* **cookie consents**, but there is no guarantee that it works. Clicking **Accept** or **Agree** buttons is relatively easy. In Europe, many websites use a third-party **GDPR consent framework** inside an iframe which cannot be accessed by the selenium javascript. It's possible to click *somewhere* on the iframe but it's not possible to get the position of the button. (And no, the Return-key does not work).

In [10]:
# hide-code
# Summary of the dataset size and of the filtering done in the loading cell.
html_display(f"""
<p>All-in-all, <b>{num_all_sessions}</b> browsing sessions are recorded, with a sum of <b>{num_all_requests}</b> web requests. 
I actually don't know why it's not at least 10,000 sessions. Some websites were unreachable and stuff but I need to find out why so many are missing.</p>

<p>When filtering for the <code>Referer</code> usage, <b>{len(all_xy[0])}</b> different domains referred to <b>{len(all_xy[1])}</b> different domains. 
Domains that have been referred to less than 3 times were removed.</p>

<p>This results in <b>{len(hosts_x)}</b> different domains referring to <b>{len(hosts_y)}</b> different domains, 
or <b>{len(set(hosts_x) | set(hosts_y))}</b> domains at all, whose affiliations can be investigated below.</p> 
""")

The full referrer count data can be downloaded: [10k-host-to-host.json](https://raw.githubusercontent.com/defgsus/blog/master/src/har_research/10k-host-to-host.json)

## Domain 

In [11]:
# hide-code
# Table and heatmap for the trimmed and reduced host names, with whois columns
# and column descriptions.
render_all(host_to_host, description=True, with_whois=True)#, filter_x="google")

Unnamed: 0,registrant,network,referred by hosts,referred hosts,referred by requests %,referred requests %
*.cloudfront.net,"Amazon.com, Inc.",-,30,26,0.38,0.34
*.fls.doubleclick.net,Google Inc.,Google LLC,16,121,0.45,1.82
*.safeframe.googlesyndication.com,Google LLC,Google LLC,21,283,0.1,26.86
*.ssl.cf2.rackcdn.com,"Rackspace US, Inc.","Akamai Technologies, Inc.",1,7,0.01,0.03
0.gravatar.com,"Automattic, Inc.","Automattic, Inc",1,0,0.0,0.0
1.www.s81c.com,International Business Machines Corporation,IBM,2,1,0.0,0.0
15.taboola.com,-,Fastly,3,0,0.0,0.0
2.ss.faisys.com,-,Shanghai UCloud Information Technology Company Limited,1,1,0.0,0.0
20059b.ha.azioncdn.net,Azion Technologies Inc.,Hivelocity Inc,1,1,0.0,0.0
4.adsco.re,-,"Total Uptime Technologies, LLC",1,0,0.0,0.0


## Domain registrant

In [12]:
# hide
# Regroup the reduced, untrimmed host mapping by the whois registrant.
whois_to_whois = get_whois_to_whois(host_to_host_org, "registrant")
whois_x, whois_y = get_labels(whois_to_whois)
# Short host names of the trimmed and reduced hosts (hosts_x / hosts_y).
short_domains = set(parse_url(host)["short_host"] for host in set(hosts_x) | set(hosts_y)) 
# NOTE(review): this is the number of distinct registrant labels, including the
# "-" placeholder for unknown registrants, not a number of domains. It is also
# derived from host_to_host_org while short_domains comes from the trimmed
# mapping - confirm that the two figures are meant to be compared.
num_registrants = len(set(whois_x) | set(whois_y))

In [13]:
# hide-code
# Summary sentence for the registrant section; uses short_domains and
# num_registrants from the previous cell.
html_display(f"""
<p>Of <b>{len(short_domains)}</b> short domains (meaning, the smallest domain name that identifies the server, e.g. <code>adform.net</code>), 
the <code>registrant organization</code> could be queried for only <b>{num_registrants}</b> domains. The unknown registrants are listed as -.</p>
""")

In [14]:
# hide-code
# Table and heatmap for the mapping grouped by registrant.
render_all(whois_to_whois)#, filter_x="Google, Amazon")

Unnamed: 0,referred by hosts,referred hosts,referred by requests %,referred requests %
"""OTM Vorld Vayd"", LLC",1,0,0.0,0.0
-,82,133,25.83,26.66
1plusX AG,1,0,0.01,0.0
27.865.757/0001-02,1,1,0.02,0.02
A.Mob SAS,2,0,0.01,0.0
"AAX, LLC",2,0,0.01,0.0
"ABC, Inc.",2,1,0.01,0.0
ABS-CBN Corporation,1,2,0.0,0.0
AO Kaspersky Lab,1,0,0.01,0.0
ATG Ad Tech Group GmbH,1,0,0.01,0.0


## Network provider 

The network provider could be determined for almost all servers. 

In [15]:
# hide-code
# Regroup the reduced, untrimmed host mapping by the whois network provider.
# This rebinds whois_to_whois, which held the registrant grouping before.
whois_to_whois = get_whois_to_whois(host_to_host_org, "network")
render_all(whois_to_whois)#, filter_x="Google, Amazon")

Unnamed: 0,referred by hosts,referred hosts,referred by requests %,referred requests %
-,4,6,0.14,0.16
16 COLLYER QUAY,1,0,0.02,0.0
"21ViaNet(China),Inc.",1,1,0.04,0.04
"A2 Hosting, Inc.",2,0,0.0,0.0
ACEVILLE PTE.LTD.,2,1,0.02,0.02
"ANS Communications, Inc",4,1,0.01,0.0
APNIC-STUB,4,1,0.01,0.01
AWIN LTD,1,0,0.01,0.0
Abuse-C Role,1,0,0.0,0.0
Adform DTC IPv4 Network,17,28,0.59,0.25


## Network country 

In [16]:
# hide-code
# Regroup the reduced, untrimmed host mapping by the whois "network_country"
# field. This rebinds whois_to_whois, which held the network grouping before.
whois_to_whois = get_whois_to_whois(host_to_host_org, "network_country")
render_all(whois_to_whois)

Unnamed: 0,referred by hosts,referred hosts,referred by requests %,referred requests %
-,2,4,0.14,0.16
AU,1,1,0.03,0.03
BD,0,1,0.0,0.0
BE,1,0,0.02,0.0
CA,5,6,1.47,0.74
CN,1,4,0.66,0.68
CZ,1,2,0.01,0.01
DE,9,12,4.11,6.17
DK,6,8,0.72,0.25
ES,3,2,0.05,0.03
