In [1]:
import math
import secrets
import html as html_lib
from typing import List
import sys
sys.path.insert(0, "..")

from IPython.display import display as idisplay
import plotly
import plotly.express as px
import plotly.graph_objects as pgo
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
plotly.io.templates.default = "plotly_dark"

from html_tools import *
from lib.cache import FileCache
from har import *
from web import get_web_file
from clusters import *
from image_distance_matrices import filter_histograms, get_distance_matrix

FileCache.verbose = True

In [15]:
#px.line(x=[0], y=[0])

In [2]:
def log(*values, **print_options):
    """Emit a progress message.

    Thin pass-through to the built-in ``print``: all positional and keyword
    arguments are forwarded unchanged.
    """
    print(*values, **print_options)

In [3]:
# hide
with open("image-histograms.json") as histogram_file:
    histograms_raw = json.load(histogram_file)

# JSON object keys are always strings, so the keys of every nested
# histogram dict are cast back to int; non-dict fields are left untouched
for host_entry in histograms_raw.values():
    for field_name, field_value in host_entry.items():
        if not isinstance(field_value, dict):
            continue
        host_entry[field_name] = {
            int(bucket): count for bucket, count in field_value.items()
        }

print(len(histograms_raw), "hosts")

# THIS MUST MATCH with the settings in image_scan.py
cache_suffix = "-5"
histograms = filter_histograms(histograms_raw, min_count=5)
# sorted host names; this order is shared by all per-host lists built later
labels = sorted(histograms)
print(len(histograms), "hosts after filter")


11192 hosts
5760 hosts after filter


In [4]:
# hide
# summary statistics of the per-host "count" (column 0) and "error_count" (column 1)
pd.DataFrame(
    [[entry["count"], entry["error_count"]] for entry in histograms.values()]
).describe()

Unnamed: 0,0,1
count,5760.0,5760.0
mean,51.779514,0.063194
std,225.486029,0.492873
min,5.0,0.0
25%,9.0,0.0
50%,17.0,0.0
75%,42.0,0.0
max,8609.0,12.0


In [5]:
# hide
def bergi_value_histogram_distance(h1: dict, h2: dict) -> float:
    """Heuristic distance between two ``{value: count}`` histograms.

    Both histograms are ranked by descending count. Keys of equal rank are
    paired and contribute their absolute difference, weighted by the sum of
    their counts (each normalised by the largest count of its histogram).
    Keys that occur in only one histogram additionally contribute
    ``key * normalised count``.

    Raises ``ValueError`` when either histogram is empty (``max`` of an
    empty sequence).
    """
    def _ranked(hist: dict) -> list:
        # keys by descending count; the sort is stable, so ties keep dict order
        return sorted(hist, key=lambda value: -hist[value])

    peak1, peak2 = max(h1.values()), max(h2.values())
    ranked1, ranked2 = _ranked(h1), _ranked(h2)

    total = 0.
    # rank-wise pairing; zip stops at the shorter ranking
    for a, b in zip(ranked1, ranked2):
        total += abs(a - b) * (h1[a] / peak1 + h2[b] / peak2)

    # penalty for keys that are present on one side only
    only_in_h1 = set(ranked1) - set(ranked2)
    for a in only_in_h1:
        total += a * h1[a] / peak1
    only_in_h2 = set(ranked2) - set(ranked1)
    for b in only_in_h2:
        total += b * h2[b] / peak2

    return total

In [6]:
# hide
def get_max_mean_color(key: str) -> str:
    """Return a ``#rrggbb`` color for host ``key`` from its most frequent channel means.

    For each of the ``mean_r``/``mean_g``/``mean_b`` histograms in the global
    ``histograms_raw`` the key with the largest count is taken as the channel
    value. Each channel is then remapped linearly from 0..255 to 50..255
    (and clamped to 0..255) before being formatted as hex.
    """
    host_hists = histograms_raw[key]

    channels = []
    for field in ("mean_r", "mean_g", "mean_b"):
        counts = host_hists[field]
        ranked = sorted(counts, key=lambda value: -counts[value])
        channels.append(int(ranked[0]))

    hex_parts = []
    for v in channels:
        scaled = int(50 + v/255*205)
        hex_parts.append("%02x" % max(0, min(255, scaled)))
    return "#" + "".join(hex_parts)

def get_avg_mean_color(key: str) -> str:
    """Return a ``#rrggbb`` color for host ``key`` from its dominant channel means.

    For each of the ``mean_r``/``mean_g``/``mean_b`` histograms in the global
    ``histograms_raw`` the (up to) three keys with the largest counts are
    averaged, weighted by their counts, using integer floor division. Each
    channel is then remapped linearly from 0..255 to 50..255 (and clamped to
    0..255) before being formatted as hex.
    """
    channels = []
    for field in ("mean_r", "mean_g", "mean_b"):
        counts = histograms_raw[key][field]
        top_values = sorted(counts, key=lambda value: -counts[value])[:3]
        weighted_sum = sum(value * counts[value] for value in top_values)
        total_count = sum(counts[value] for value in top_values)
        channels.append(weighted_sum // total_count)

    hex_parts = []
    for v in channels:
        scaled = int(50 + v/255*205)
        hex_parts.append("%02x" % max(0, min(255, scaled)))
    return "#" + "".join(hex_parts)
# get_avg_mean_color("assets.keap.com")

In [7]:
def cached_tsne(
    data,
    read_cache: bool = True,
    write_cache: bool = True,
    cache_name: str = None,
    **kwargs,
):
    """Run ``TSNE(**kwargs).fit_transform(data)`` through ``FileCache``.

    :param data: input passed to ``TSNE.fit_transform``
    :param read_cache: forwarded to ``FileCache.execute``
    :param write_cache: forwarded to ``FileCache.execute``
    :param cache_name: cache file name; when ``None`` it is derived from the
        hashes of ``data`` and ``kwargs`` as ``tsne/<data>-<kwargs>.npy``
    :param kwargs: constructor arguments for ``TSNE``
    :return: whatever ``FileCache.execute`` returns - presumably the embedding,
        either freshly computed or loaded via ``np.load`` (verify against
        ``lib.cache.FileCache``)
    """
    if cache_name is None:
        data_hash = FileCache.to_hash(data)
        params_hash = FileCache.to_hash(kwargs)
        cache_name = f"tsne/{data_hash}-{params_hash}.npy"

    def _compute():
        return TSNE(**kwargs).fit_transform(data)

    def _load(n):
        return np.load(n)

    def _save(n, d):
        return np.save(n, d)

    return FileCache.execute(
        cache_name,
        _compute,
        _load,
        _save,
        read_cache=read_cache,
        write_cache=write_cache,
    )
    

In [8]:
# hide
distance_fields = ["width", "height", "mean_r", "mean_g", "mean_b", "mean_a", "channels"]
# one get_distance_matrix() result per histogram field, keyed by field name
distances = dict(
    (field, get_distance_matrix(histograms, labels, field, cache_suffix=cache_suffix))
    for field in distance_fields
)

loading /home/bergi/prog/python/github/blog/cache/distance-matrix/0f8ef1c923cee3511bb7944605b9b95e-5.npy
loading /home/bergi/prog/python/github/blog/cache/distance-matrix/da012b3f26409cc4153afb88d9d5a7f3-5.npy
loading /home/bergi/prog/python/github/blog/cache/distance-matrix/9ad2b20da7204225ff8328feb1e37ae1-5.npy
loading /home/bergi/prog/python/github/blog/cache/distance-matrix/823c163395a5b2bcfe1711c9598ba31e-5.npy
loading /home/bergi/prog/python/github/blog/cache/distance-matrix/3c3b96171e77d5f013cb177ba981e4c8-5.npy
loading /home/bergi/prog/python/github/blog/cache/distance-matrix/a1b3f4fd44aecb2ba44905a9a9e8b2a9-5.npy
loading /home/bergi/prog/python/github/blog/cache/distance-matrix/4bdb892cec93f93dd68df0c43c312a0d-5.npy


In [9]:
# ------ render data to javascript ---- 

def _histogram_to_label(hist: dict, num=3) -> str:
    text = ", ".join(
        f"{key}: {hist[key]}"
        for key in sorted(hist.keys(), key=lambda k: -hist[k])[:num]
    )
    if len(hist) > num:
        text += ", .."
    return text

# hide-code
# Per-host data made available to the browser as the global `web_image_data`.
# All lists are index-aligned with `labels`.
js_data = {
    "hosts": labels,
    "colors": [get_avg_mean_color(l) for l in labels],
    "paths": [histograms[key]["paths"] for key in labels],
    "labels": {
        field: [
            _histogram_to_label(histograms[l][field])
            for l in labels
        ]
        for field in ["width", "channels"]
    }
}
# The host names and paths are read from image-histograms.json and are not
# under this notebook's control. json.dumps() does not escape "<", so a string
# containing "</script>" would terminate the script element and inject markup.
# "<" can only occur inside JSON strings, where "\u003c" is an equivalent escape.
js_payload = json.dumps(js_data).replace("<", "\\u003c")
html = f"""<script>var web_image_data={js_payload}</script>"""
html_display(html)

In [20]:
%%javascript

// Toggle the image preview inside a path entry: when the contained <img>
// has no source yet, load the URL stored in the entry's "data-path"
// attribute, otherwise drop the source again.
window.load_web_plot_image = function(elem) {
    const img = elem.querySelector("img");
    const is_loaded = img.src && img.src.length;

    if (!is_loaded) {
        img.src = elem.getAttribute("data-path");
        return;
    }
    img.removeAttribute("src");
}

// Click handler for a plot point: lists all image URLs of the clicked host
// inside the element "#post-<elem_id>". Clicking a URL toggles its preview
// via window.load_web_plot_image.
//
// The host and path strings are data, not markup. They are inserted through
// textContent / setAttribute instead of being concatenated into innerHTML,
// so quotes or "<" inside a path cannot inject attributes or elements.
window.on_web_plot_click = function(elem_id, index) {
    const
        host = web_image_data.hosts[index],
        paths = web_image_data.paths[index],
        container = document.querySelector(`#post-${elem_id}`);

    const heading = document.createElement("h3");
    heading.textContent = host;

    const list = document.createElement("div");
    paths.forEach(function(path) {
        const url = `https://${host}${path}`;

        const entry = document.createElement("div");
        entry.setAttribute("data-path", url);
        entry.style.cursor = "pointer";
        // resolved at click time, like the former inline onclick attribute
        entry.addEventListener("click", function() {
            window.load_web_plot_image(entry);
        });
        entry.appendChild(document.createTextNode(url + " "));
        entry.appendChild(document.createElement("img"));

        list.appendChild(entry);
    });

    // replace whatever a previous click rendered
    container.textContent = "";
    container.appendChild(heading);
    container.appendChild(list);
}


// Render a WebGL scatter plot of all hosts into the element with id `elem_id`.
//   x, y   point coordinates, index-aligned with web_image_data.hosts
//   title  plot title passed to the Plotly layout
//   field  name of a label list in web_image_data.labels; when such a list
//          exists, its entry is appended to the host name in the hover text
// A click on a point calls on_web_plot_click with the point's index.
window.render_histogram_cloud = function(elem_id, x, y, title, field) {
    const field_labels = web_image_data.labels[field];
    const hover_texts = web_image_data.hosts.map(function(host, i) {
        return field_labels
            ? `${host} (${field} ${field_labels[i]})`
            : host;
    });

    const trace = {
        type: "scattergl",
        mode: "markers",
        marker: {
            color: web_image_data.colors,
            opacity: .5,
        },
        hoverinfo: "text",
        hovertext: hover_texts,
        x: x,
        y: y,
    };

    const layout = {
        width: 900,
        height: 800,
        title: title,
        hovermode: "closest",
        plot_bgcolor: "#000",
        paper_bgcolor: "#000",
        margin: {l: 0, r: 0, t: 0, b: 0},
    };

    const config = {responsive: true};

    require(["plotly"], function(Plotly) {
        Plotly.newPlot(elem_id, [trace], layout, config);

        const plot_elem = document.getElementById(elem_id);
        plot_elem.on("plotly_click", function(click_data) {
            on_web_plot_click(elem_id, click_data.points[0].pointIndex);
        });
    });
}

<IPython.core.display.Javascript object>

In [21]:
def render_histogram_cloud_js(distances: np.ndarray, title: str = None, field: str = None):
    """Embed ``distances`` in 2-D with t-SNE and display it as an interactive plot.

    Emits a plot container, a "post-plot" container for the click details and
    a script that calls the browser-side ``render_histogram_cloud``.

    :param distances: precomputed distance matrix (passed to TSNE with
        ``metric="precomputed"``)
    :param title: plot title; ``None`` is passed to JavaScript as ``null``
    :param field: name of a label list in ``web_image_data.labels`` that is
        shown in the hover text; ``None`` (passed as ``null``) shows the
        host name only
    """
    def _js_literal(value) -> str:
        # json.dumps() yields a valid JavaScript literal including quotes and
        # backslash escapes (None -> null). html.escape() is the wrong tool
        # here: entities are not decoded inside <script>, and it raises on None.
        # "<" is escaped as well so a value can never close the script element.
        return json.dumps(value).replace("<", "\\u003c")

    log("tsne")
    # NOTE(review): `square_distances` was deprecated and later removed from
    # scikit-learn's TSNE - confirm against the installed version. The kwargs
    # are part of the cache file name, so changing them invalidates the cache.
    positions = cached_tsne(distances, metric="precomputed", square_distances=True)

    log("plot")
    # random id so that several plots can coexist on one page
    ID = secrets.token_hex(8)
    html = """
    <div id="plot-%(ID)s"></div>
    <div id="post-plot-%(ID)s"></div>
    <script>
        render_histogram_cloud("plot-%(ID)s", %(x)s, %(y)s, %(title)s, %(field)s);
    </script>
    """ % {
        "ID": ID,
        "x": json.dumps(positions[:,0].tolist()),
        "y": json.dumps(positions[:,1].tolist()),
        "title": _js_literal(title),
        "field": _js_literal(field),
    }
    html_display(html)

# Cluster hosts by image size: the width and height distance matrices are
# summed; the hover text shows the most frequent widths.
render_histogram_cloud_js(distances["width"] + distances["height"], "image size clusters", "width")

tsne
plot


loading cache /home/bergi/prog/python/github/blog/cache/tsne/d5d1e62adda5ce95a88242738fb27a11-a7c89209b640e7c83d2a8df1615f9488.npy


In [16]:
# Cluster hosts by image color: sum of the distance matrices of the four
# channel-mean histograms. No label field is passed, so the hover text
# shows the host name only.
dist = distances["mean_r"] + distances["mean_g"] + distances["mean_b"] + distances["mean_a"]
render_histogram_cloud_js(dist, "image color clusters")

tsne
plot


loading cache /home/bergi/prog/python/github/blog/cache/tsne/4515a08710a56fdf17a19548c61763d2-a7c89209b640e7c83d2a8df1615f9488.npy


In [22]:
# Cluster hosts by the "channels" histogram; the hover text shows the most
# frequent channel values. Note: `dist` is reused from the previous cell.
dist = distances["channels"]
render_histogram_cloud_js(dist, "image channel clusters", "channels")

tsne
plot


loading cache /home/bergi/prog/python/github/blog/cache/tsne/83711d3d7e4c31fa50e3c8545c4cb2ec-a7c89209b640e7c83d2a8df1615f9488.npy
