In [None]:
import psycopg2
import csv
import utils
import re
import census
from collections import defaultdict
import dill

# Set-up: Connect to the database
Two databases available:
* census_2016_04_spider_4
* census_2016_08_25k_id_detection_1

This cell creates the `small_crawl` object — a `census.Census` instance for the chosen database. All census queries below are made by calling methods on it.

In [None]:
# Name of the crawl database to analyse; pick one of the databases listed above.
census_name = 'census_2016_08_25k_id_detection_1' # Change me!

# Census object for the selected database; the query cells below call methods on it.
small_crawl = census.Census(census_name)

# Available API

## Check to see that a given top_url is present in the dataset
All data in the census is keyed by each 'top_url' visited. Each top_url follows the format:  

`http://example.com`  

There is never a leading '`www.`', nor is the scheme ever '`https://`'. If a site redirects to https, that will be reflected in the crawl's data.

In [None]:
# Probe two sites: the first is expected to be in the census, the second is not.
for top_url in ('http://netflix.com', 'http://notincensus.com'):
    print(small_crawl.check_top_url(top_url))


## Get all third party responses by top_url
`small_crawl.get_third_party_responses_by_domain(top_url)` returns a two-level results dict describing the third-party urls loaded on the given top_url.

The dict's structure is:

`dict[third_party_url]['is_tracker']`, contains True if third_party_url is identified on a blocklist.  
`dict[third_party_url]['is_js']`, contains True if third_party_url is a script.  
`dict[third_party_url]['is_img']`, contains True if third_party_url is an image.  
`dict[third_party_url]['url_ps']`, contains the string for the public-suffix+1 of the url.  



In [None]:
# Site to inspect. Change this value to query a different top_url.
top_url = 'http://espn.go.com'

# Query using top_url (previously the URL was hard-coded a second time here,
# so editing top_url above had no effect on the results).
results = small_crawl.get_third_party_responses_by_domain(top_url)

# Distinct PS+1 domains of the third-party urls flagged as trackers.
third_party_trackers = {results[x]['url_ps'] for x in results if results[x]['is_tracker']}

print("Number of third_party trackers on domain: " + str(len(third_party_trackers)))

# Per-url detail: resource type, tracker flag and PS+1.
for url in results:
    print(url)
    print('\tIs a script? ' + str(results[url]['is_js']))
    print('\tIs a tracker? ' + str(results[url]['is_tracker']))
    print('\tPS+1: ' + results[url]['url_ps'])

## Get all third party responses for a list of sites
A cell to simplify getting all third party responses for a given list of sites.

In [None]:
sites = ['http://cnn.com', 'http://wsj.com'] # Change me!

# Output buckets, each mapping top_url -> set of third-party PS+1 domains,
# split by tracker status (tracker / non_tracker) and resource type (js / img / other).
tracker_js_by_top = defaultdict(set)
non_tracker_js_by_top = defaultdict(set)
tracker_img_by_top = defaultdict(set)
non_tracker_img_by_top = defaultdict(set)
tracker_other_by_top = defaultdict(set)
non_tracker_other_by_top = defaultdict(set)

for site in sites:
    tp_data = small_crawl.get_third_party_responses_by_domain(site)
    for url in tp_data:
        info = tp_data[url]
        url_ps = info['url_ps']
        is_tracker = info['is_tracker']
        # Resource type is checked in the order js, img, other; the tracker
        # flag then selects which of the two matching buckets receives the domain.
        if info['is_js']:
            bucket = tracker_js_by_top if is_tracker else non_tracker_js_by_top
        elif info['is_img']:
            bucket = tracker_img_by_top if is_tracker else non_tracker_img_by_top
        else:
            bucket = tracker_other_by_top if is_tracker else non_tracker_other_by_top
        bucket[site].add(url_ps)


In [None]:
# Save output as .dill
#
# Each result dictionary is written to '<variable name>.dill'. The file name is
# derived from the dictionary's name so that two dictionaries can never target
# the same file (previously non_tracker_js_by_top was written to
# 'tracker_js_by_top.dill', overwriting the tracker data).
dill_outputs = [
    ('tracker_js_by_top', tracker_js_by_top),
    ('non_tracker_js_by_top', non_tracker_js_by_top),
    ('tracker_img_by_top', tracker_img_by_top),
    ('non_tracker_img_by_top', non_tracker_img_by_top),
    ('tracker_other_by_top', tracker_other_by_top),
    ('non_tracker_other_by_top', non_tracker_other_by_top),
]
for output_name, output_data in dill_outputs:
    # Binary mode: dill writes bytes.
    with open(output_name + '.dill', 'wb') as f:
        dill.dump(output_data, f)

In [None]:
# Save output in CSVs
#
# One file per result dictionary; each file has a header row followed by one
# (top_url, tp_domain) row for every third-party domain seen on that top_url.
csv_outputs = [
    ('tracker_js_by_top.csv', tracker_js_by_top),
    ('non_tracker_js_by_top.csv', non_tracker_js_by_top),
    ('tracker_img_by_top.csv', tracker_img_by_top),
    ('non_tracker_img_by_top.csv', non_tracker_img_by_top),
    ('tracker_other_by_top.csv', tracker_other_by_top),
    ('non_tracker_other_by_top.csv', non_tracker_other_by_top),
]
for csv_path, domains_by_top in csv_outputs:
    with open(csv_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['top_url', 'tp_domain'])
        for top, tp_domains in domains_by_top.items():
            for tp in tp_domains:
                writer.writerow([top, tp])

## Get top_urls that load a resource from a given third party domain

`small_crawl.get_top_urls_with_third_party_domain(tp_domain)` returns a dictionary mapping each top_url in the census to a list of the urls loaded on that top_url that belong to the given tp_domain.

In [None]:
# Third-party domain to look for across the census.
tp_domain = 'addthis.com'

# Result is keyed by top_url; only its size is reported below.
tps_by_top = small_crawl.get_top_urls_with_third_party_domain(tp_domain)

print("Number of top_urls with given third party : " + str(len(tps_by_top)))

## Get "cookie sync" events on a given top_url
Note: This does not include logic for isolating "identifying cookies." Any cookies of a sufficient cookie length that are shared with other domains will be identified.

In [None]:
# For a single top_url...

results = small_crawl.get_cookie_syncs_v2('http://microsoft.com', cookie_length=8)

# results is iterated by receiving url; each entry yields (sending_url, value)
# pairs. Both are concatenated to str below, so they are expected to be strings.
for receiving_url in results:
    print("R: " + receiving_url)
    for sending_url, val in results[receiving_url]:
        print("\tS: " + sending_url)
        print("\tV: " + val)

In [None]:
# For a list of top_urls...
# Warning: this method is slow.

sites = ['http://microsoft.com', 'http://cnn.com']  # Change me!

# top_url -> cookie-sync results for that site (same structure as the
# single-site call to get_cookie_syncs_v2).
cookie_sync_data = defaultdict(defaultdict)
# Iterate over the sites directly: the previous `for i, site in sites` tried to
# unpack each URL string into two names and raised ValueError.
for site in sites:
    print(site)  # progress indicator, since each lookup is slow
    cookie_sync_data[site] = small_crawl.get_cookie_syncs_v2(site, cookie_length=8)


In [None]:
# Write output as .dill
# Binary mode is required: dill writes bytes (text mode 'w' fails on Python 3),
# and this matches the mode used for the other .dill outputs in this notebook.
with open('cookie_syncs.dill', 'wb') as f:
    dill.dump(cookie_sync_data, f)

# Write complete output as csv: one row per (site, sender, receiving url, cookie value)
with open('cookie_syncs.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['top_url', 'sending_domain', 'receiving_url', 'cookie_value'])
    for site in cookie_sync_data:
        for receiving_url in cookie_sync_data[site]:
            for sending_url, cookie_value in cookie_sync_data[site][receiving_url]:
                writer.writerow([site, sending_url, receiving_url, cookie_value])

# Write partial output as CSV, only identifying sending domain and receiving domain
# (rather than the full receiving URL)

# site -> receiving domain (via utils.get_host_plus_ps) -> set of sending domains
cooks_just_domains = defaultdict(defaultdict)
for site in cookie_sync_data:
    cooks_just_domains[site] = defaultdict(set)
    for receiving_url in cookie_sync_data[site]:
        # The cookie value itself is not needed for the domain-level summary.
        for sending_domain, _ in cookie_sync_data[site][receiving_url]:
            cooks_just_domains[site][utils.get_host_plus_ps(receiving_url)].add(sending_domain)

# NOTE(review): unlike the other outputs, this file is written to ../wsj/ rather
# than the working directory; open() will fail if that directory does not exist.
with open('../wsj/cookie_syncs_v2_just_domains.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['top_url', 'sending_domain', 'receiving_domain'])
    for site in cooks_just_domains:
        for receiving_domain in cooks_just_domains[site]:
            # Drop the 'NOT_FOUND' placeholder when at least one other sender is
            # known; it is kept only when it is the sole entry.
            if len(cooks_just_domains[site][receiving_domain]) > 1 and 'NOT_FOUND' in cooks_just_domains[site][receiving_domain]:
                cooks_just_domains[site][receiving_domain].discard('NOT_FOUND')
            for sending_domain in cooks_just_domains[site][receiving_domain]:
                writer.writerow([site, sending_domain, receiving_domain])

## Check a given url against a blocklist
Available blocklists:
* easylist.txt
* easyprivacy.txt

In [None]:
# Check one script URL against the 'easylist.txt' blocklist, passing its
# resource-type flags and the first-party site it was loaded on, and print the result.
print(utils.is_tracker('http://tags.bkrtx.com/js/bk-coretag.js', is_js=True, is_img=False, 
                       first_party='http://verizonwireless.com', blocklist='easylist.txt'))


## Get third party scripts on a given top_url that call a particular JavaScript symbol

In [None]:
# Print whatever get_urls_with returns for this top_url and JavaScript symbol name.
print(small_crawl.get_urls_with('http://cnn.com', 'CanvasRenderingContext2D.fillText'))

## Get the PS+1 of a given top_url

In [None]:
# Example input carries a subdomain and a path to show what gets stripped;
# presumably this prints 'example.com' — TODO confirm against utils.get_host_plus_ps.
print(utils.get_host_plus_ps('http://subdomain.example.com/this/will/be/deleted.jpg'))