In [None]:
import json
import os
import re
import sqlite3

import pandas as pd

In [None]:
# import some analysis utilities from https://github.com/englehardt/crawl_utils
# The helper modules are imported from the local ./crawl_utils/ directory,
# so that directory has to be put on sys.path before the imports below.
import sys
sys.path.append('./crawl_utils/')
import domain_utils as du  # used below for du.get_ps_plus_1
import analysis_utils as au  # used below for au.get_script_urls_from_call_stack_as_set

In [None]:
# Locations of the crawl output. Both files live inside DB_DIR, which is a
# path relative to the notebook's working directory.
# Requires `os` to be imported in the import cell at the top of the notebook.
DB_DIR = 'stateless_sample_crawl'
# SQLite database opened with sqlite3.connect(DB) in the loading cell
DB = os.path.join(DB_DIR, 'stateless_sample.sqlite')
# NOTE(review): LDB is not referenced by any cell visible in this notebook
LDB = os.path.join(DB_DIR, 'content.ldb')

# HTTP Requests

In [None]:
# Load the data
# Open the crawl database. sqlite3.Row lets rows fetched through `cur` be
# addressed by column name as well as by position.
con = sqlite3.connect(DB)
con.row_factory = sqlite3.Row
cur = con.cursor()

# Every cell in this section works on a `reqs` DataFrame, so build it here;
# without it the notebook fails on a fresh kernel with a NameError.
# NOTE(review): the table name `http_requests` is assumed from the OpenWPM
# schema (it must provide url, top_level_url, loading_href and
# req_call_stack) -- confirm against the crawl database.
reqs = pd.read_sql_query("SELECT * FROM http_requests", con)

### Add some additional columns to help with analysis

In [None]:
# Derive the public suffix + 1 (PS+1) for the URL columns of the request table.
# `url` is passed straight to the helper; the other two columns keep None
# where the source value is None.
reqs['url_ps1'] = reqs['url'].apply(du.get_ps_plus_1)
reqs['top_ps1'] = reqs['top_level_url'].apply(
    lambda top_url: None if top_url is None else du.get_ps_plus_1(top_url)
)
reqs['loading_ps1'] = reqs['loading_href'].apply(
    lambda href: None if href is None else du.get_ps_plus_1(href)
)

In [None]:
# Extract call-stack details into their own columns:
#   stack_scripts - au.get_script_urls_from_call_stack_as_set applied to each call stack
#   stack_ps1s    - set of PS+1 values computed from the entries of stack_scripts
reqs['stack_scripts'] = reqs['req_call_stack'].apply(
    au.get_script_urls_from_call_stack_as_set
)
reqs['stack_ps1s'] = reqs['stack_scripts'].apply(
    lambda script_urls: {du.get_ps_plus_1(script_url) for script_url in script_urls}
)

In [None]:
# Number of distinct top-level URLs in the request data; used as the
# denominator for the per-tracker shares computed below.
total_sites = reqs.top_level_url.nunique()

### What fraction of sites load doubleclick.net and google-analytics.com?

In [None]:
# Share of distinct top-level URLs with at least one request to doubleclick.net
(
    reqs.loc[reqs['url_ps1'] == 'doubleclick.net', 'top_level_url'].nunique()
    / float(total_sites)
)

In [None]:
# Share of distinct top-level URLs with at least one request to google-analytics.com
(
    reqs.loc[reqs['url_ps1'] == 'google-analytics.com', 'top_level_url'].nunique()
    / float(total_sites)
)

### What domains does doubleclick.net load other resources from?

In [None]:
# Show the first non-empty call stack as an example of the raw format
reqs.loc[reqs['req_call_stack'] != '', 'req_call_stack'].iloc[0]

In [None]:
# PS+1 sets for every request that has a non-empty call stack
reqs.loc[
    reqs['req_call_stack'].apply(lambda stack: len(stack) > 0),
    'stack_ps1s'
]

In [None]:
# For requests whose call stack includes a doubleclick.net script, count the
# rows (non-null top_level_url) per requested PS+1, most frequent first
(
    reqs.loc[reqs['stack_ps1s'].apply(lambda ps1s: 'doubleclick.net' in ps1s)]
    .groupby('url_ps1')['top_level_url']
    .count()
    .sort_values(ascending=False)
)

# JavaScript Calls

In [None]:
# Load the data
# Read the JavaScript call records from the same SQLite crawl database that
# the HTTP request section uses (connection `con` opened above). The previous
# parquet-based load relied on `pq`, `BUCKET_URI` and `fs`, none of which are
# defined in this notebook, so the cell could not run on a fresh kernel.
# NOTE(review): assumes the `javascript` table provides script_url,
# top_level_url, document_url, symbol, arguments and value -- confirm.
table_name = 'javascript'
js = pd.read_sql_query('SELECT * FROM %s' % table_name, con)

In [None]:
# Derive the public suffix + 1 (PS+1) for the URL columns of the JS call table.
# None values are kept as None instead of being passed to the helper.
js['script_ps1'] = js['script_url'].apply(
    lambda script_url: None if script_url is None else du.get_ps_plus_1(script_url)
)
js['top_ps1'] = js['top_level_url'].apply(
    lambda top_url: None if top_url is None else du.get_ps_plus_1(top_url)
)
js['document_ps1'] = js['document_url'].apply(
    lambda doc_url: None if doc_url is None else du.get_ps_plus_1(doc_url)
)

#### Canvas font fingerprinting

Filter the JavaScript calls and inspect them for signs of canvas font fingerprinting.

In [None]:
# Compiled pattern used below to pull the font family out of the values
# written to CanvasRenderingContext2D.font (looks like a CSS `font`
# shorthand parser). Capture groups, in order of appearance:
#   1: italic | oblique                          (captured inside a lookahead)
#   2: small-caps                                (captured inside a lookahead)
#   3: bold | bolder | lighter | 100..900        (captured inside a lookahead)
#   4: font size (keyword or number followed by a unit)
#   5: optional value after "/" (normal or number followed by a unit)
#   6: remaining text up to the end of the string -- the part read with .group(6)
# No flags are passed, so the keywords only match in lower case.
font_shorthand = re.compile(r"^\s*(?=(?:(?:[-a-z]+\s*){0,2}(italic|oblique))?)(?=(?:(?:[-a-z]+\s*){0,2}(small-caps))?)(?=(?:(?:[-a-z]+\s*){0,2}(bold(?:er)?|lighter|[1-9]00))?)(?:(?:normal|\1|\2|\3)\s*){0,3}((?:xx?-)?(?:small|large)|medium|smaller|larger|[.\d]+(?:\%|in|[cem]m|ex|p[ctx]))(?:\s*\/\s*(normal|[.\d]+(?:\%|in|[cem]m|ex|p[ctx])))?\s*([-_\{\}\(\)\&!\',\*\.\"\sa-zA-Z0-9]+?)\s*$")

In [None]:
# Select every recorded call whose symbol starts with CanvasRenderingContext2D
js.loc[js['symbol'].str.startswith('CanvasRenderingContext2D')]

In [None]:
# measureText calls made by scripts whose PS+1 differs from the top-level
# page's PS+1, counted per script PS+1 (most frequent first)
(
    js.loc[
        (js['symbol'] == 'CanvasRenderingContext2D.measureText')
        & (js['script_ps1'] != js['top_ps1'])
    ]
    .groupby('script_ps1')['top_ps1']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Distinct first arguments (JSON key "0") passed to measureText by
# admicro.vn scripts running on pages with a different PS+1
(
    js.loc[
        (js['symbol'] == 'CanvasRenderingContext2D.measureText')
        & (js['script_ps1'] != js['top_ps1'])
        & (js['script_ps1'] == 'admicro.vn'),
        'arguments',
    ]
    .apply(lambda raw_args: json.loads(raw_args)["0"])
    .unique()
)

In [None]:
def _font_family(font_value):
    """Return capture group 6 of `font_shorthand` for a canvas font value.

    Returns None when the value does not match the pattern or is not a
    string, instead of raising AttributeError on `None.group(6)` and
    aborting the whole cell.
    """
    try:
        parsed = font_shorthand.match(font_value)
    except TypeError:
        # Non-string value (e.g. None or NaN) recorded for the font property
        return None
    return parsed.group(6) if parsed else None


# Distinct font families set on the canvas context by admicro.vn scripts
# running on pages with a different PS+1
js.loc[
    (js['symbol'] == 'CanvasRenderingContext2D.font')
    & (js['script_ps1'] != js['top_ps1'])
    & (js['script_ps1'] == 'admicro.vn'),
    'value',
].apply(_font_family).unique()