# hide
title: interactive network heatmap
tags: data web interactive

In [None]:
# hide
import sys
sys.path.insert(0, "../..")

import pandas as pd
import numpy as np
#from nb_helpers import *
#from html_stuff import *
#from clusters import *
#from datatables import *
from paper_statistics import WebsiteStatistics
from html_tools import *
from har_research.har import *
from kek import Kek
from har_research.whois import WhoisCache

In [2]:
# hide
def trim_host_name(host: str) -> str:
    """Map families of related host names onto one wildcard label.

    - any host ending in ``safeframe.googlesyndication.com`` becomes
      ``*.safeframe.googlesyndication.com``
    - any host starting with ``hal9`` and ending in ``.redintelligence.net``
      becomes ``hal9*.redintelligence.net``
    - every other host is returned unchanged

    NOTE(review): presumably these hosts carry a varying prefix that would
    otherwise scatter them over many heatmap rows/columns - confirm.
    """
    is_safeframe = host.endswith("safeframe.googlesyndication.com")
    is_hal9 = host.startswith("hal9") and host.endswith(".redintelligence.net")

    if is_safeframe:
        return "*.safeframe.googlesyndication.com"
    return "hal9*.redintelligence.net" if is_hal9 else host

# Count requests per (referring host -> requested host) pair over all HARs.
# Layout: host_to_host[referer_host][requested_host] -> number of requests.
host_to_host = dict()
for ws, har in WebsiteStatistics().iter_website_hars():
    for e in har:
        url = parse_url(e["request"]["url"])

        # Fall back to ws["url"] when the request carries no usable Referer.
        referer = ws["url"]
        for h in e["request"]["headers"]:
            if h["name"].lower() == "referer":
                if h["value"]:
                    referer = h["value"]
                # only the first Referer header is looked at
                break
        referer = trim_host_name(parse_url(referer)["host"])
        if not referer:
            # Report the entry without a referer host and skip only this entry.
            # (A `break` here would silently drop every remaining request of
            # this website's HAR from the counts.)
            print(e)
            continue
        goal_host = trim_host_name(url["host"])

        counts = host_to_host.setdefault(referer, dict())
        counts[goal_host] = counts.get(goal_host, 0) + 1

# Axis labels for the heatmap: X = referring hosts, Y = requested hosts, both sorted.
hosts_x = sorted(host_to_host.keys())
hosts_y = sorted({host for targets in host_to_host.values() for host in targets})
print("from", len(hosts_x), "hosts to", len(hosts_y), "hosts")

100%|██████████| 92/92 [00:13<00:00,  6.98it/s]

from 310 hosts to 1594 hosts





A matrix of who requested whom, according to the HTTP [Referer](https://en.wikipedia.org/wiki/HTTP_referer) header. The dataset is the same set of German newspapers as in
[this article]({% post_url 2021-03-19-ad-servers-today %}). For example, if a page loads an [iframe](https://en.wikipedia.org/wiki/Iframe#Frames) from `safeframe.googlesyndication.com`, that iframe pulls in all kinds of stuff from other servers, and the browser sets the `Referer` of those requests to the address the iframe came from.

You can filter and page around in this map. X = source (the referring host), Y = destination (the requested host).

In [9]:
# hide-code
# Dense count matrix: one row per destination host (hosts_y),
# one column per source host (hosts_x).
# The label -> position maps are built once so the loops below do not have to
# run a linear list.index() search for every (source, destination) pair.
x_index_of = {host: i for i, host in enumerate(hosts_x)}
y_index_of = {host: i for i, host in enumerate(hosts_y)}

embeddings = [
    [0] * len(hosts_x)
    for _ in hosts_y
]
for from_host, to_hosts in host_to_host.items():
    x_index = x_index_of[from_host]
    for to_host, count in to_hosts.items():
        y_index = y_index_of[to_host]
        embeddings[y_index][x_index] += count

# Zero counts are turned into NaN (presumably so html_heatmap renders them as
# empty cells - confirm against html_tools).
embeddings = pd.DataFrame(embeddings).replace({0: np.nan})

max_y_label_len = max(len(l) for l in hosts_y)

html_heatmap(
    embeddings, labels_x=hosts_x, labels_y=hosts_y,
    label_width="250px",
    max_label_length=36,
    keep_label_front=True,
    colors="viridis",
    min_cells_x=40,
    show_empty_x=False, show_empty_y=False,
    filter_x="google, doubleclick",
)