In [1]:
import pandas as pd
from os.path import join
from datetime import datetime
from glob import glob

In [2]:
from crawl_ids import CrawlFireTVManualV2, CrawlRokuManualV2
from df_utils import load_df
from nb_utils import get_crawl_data_path
from log_analysis import add_domain_column
from ott_leaks import load_reqs_as_df

In [3]:
def get_web_requests(crawl_name):
    """Load the web (OpenWPM) requests of a crawl into a single DataFrame.

    Every ``openwpm-data/*/crawl-data.sqlite`` database found under the
    crawl's data directory is loaded with ``load_reqs_as_df``; the resulting
    frames are concatenated and ``add_domain_column`` is called on the
    combined frame. The number of databases that yielded at least one
    request is printed as the number of channels with web traffic.

    Args:
        crawl_name: crawl identifier, passed to ``get_crawl_data_path``.

    Returns:
        pandas.DataFrame with the requests of all databases. The frame is
        empty when no database is found.
    """
    crawl_data_dir = get_crawl_data_path(crawl_name)
    db_pattern = join(crawl_data_dir, "openwpm-data/*/crawl-data.sqlite")

    # glob() returns paths in arbitrary order; sort them so that the row
    # order of the combined frame is reproducible across runs.
    per_db_dfs = [load_reqs_as_df(db_path) for db_path in sorted(glob(db_pattern))]
    n_ch_with_web_traffic = sum(1 for db_df in per_db_dfs if len(db_df))

    # Concatenate once instead of calling DataFrame.append in the loop:
    # append copies the accumulated frame on every iteration and no longer
    # exists in pandas >= 2.0.
    if per_db_dfs:
        web_req_df = pd.concat(per_db_dfs)
    else:
        # pd.concat raises on an empty list; keep the original empty result.
        web_req_df = pd.DataFrame([])

    add_domain_column(web_req_df)
    print("Num. of channels with web traffic", crawl_name, n_ch_with_web_traffic)
    return web_req_df

In [4]:
def get_channel_domains_map(req_df):
    """Map each channel name to the set of request domains seen on it.

    Args:
        req_df: DataFrame with a ``channel_name`` and a ``req_domain`` column.

    Returns:
        dict whose keys are the distinct ``channel_name`` values and whose
        values are sets of the ``req_domain`` values in that channel's rows.
    """
    domains_by_channel = {}
    for channel, channel_domains in req_df.groupby('channel_name')['req_domain']:
        # set() collapses repeated requests to the same domain
        domains_by_channel[channel] = set(channel_domains)
    return domains_by_channel

In [5]:
# Load the web (OpenWPM) requests of the Fire TV and Roku manual crawls;
# each call also prints how many channels had web traffic.
amazon_web_req_df = get_web_requests(CrawlFireTVManualV2)
roku_web_req_df = get_web_requests(CrawlRokuManualV2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  amazon_df.category[amazon_df.category == ""] = 'Others'


('Num. of channels with web traffic', 'amazon_manual_v2', 11)
('Num. of channels with web traffic', 'roku_manual_v2', 12)


In [6]:
# channel name -> set of domains contacted in the channel's web traffic
amazon_web_channel_domains = get_channel_domains_map(amazon_web_req_df)
roku_web_channel_domains = get_channel_domains_map(roku_web_req_df)

# "http_req" frames of the same crawls; presumably the HTTP requests captured
# from the devices themselves (verify against df_utils.load_df)
amazon_dev_reqs_df = load_df(CrawlFireTVManualV2, "http_req")
roku_dev_reqs_df = load_df(CrawlRokuManualV2, "http_req")

print(len(amazon_web_channel_domains), "amazon channels had web traffic")
print(len(roku_web_channel_domains), "roku channels had web traffic")


# we only consider adblocked domains
# (rows are filtered on the `adblocked` column before building the map)
amazon_device_channel_domains = get_channel_domains_map(amazon_dev_reqs_df[amazon_dev_reqs_df.adblocked])
roku_device_channel_domains = get_channel_domains_map(roku_dev_reqs_df[roku_dev_reqs_df.adblocked])
# NOTE(review): the totals below count distinct channel_id values, whereas the
# maps above are keyed by channel_name; confirm the two are one-to-one.
print(amazon_dev_reqs_df.channel_id.nunique(), "amazon channels had device traffic")
print(len(amazon_device_channel_domains), "amazon channels had adblocked device traffic")
print(roku_dev_reqs_df.channel_id.nunique(), "roku channels had device traffic")
print(len(roku_device_channel_domains), "roku channels had adblocked device traffic")


(11, 'amazon channels had web traffic')
(12, 'roku channels had web traffic')
(20, 'amazon channels had device traffic')
(20, 'amazon channels had adblocked device traffic')
(21, 'roku channels had device traffic')
(14, 'roku channels had adblocked device traffic')


In [7]:
def print_common_domains(web_domains_map, device_domains_map):
    """Print, per channel, the domains shared by its web and device traffic.

    Iterates over the channels of ``web_domains_map``. A channel missing from
    ``device_domains_map`` is reported as having no adblocked device traffic.
    Otherwise the intersection of the two domain sets is computed and either
    "No overlap" or a summary line is printed, the latter containing: channel
    name, number of shared domains, number of device domains, number of web
    domains, and the shared domains themselves. A final line reports how many
    channels had a non-empty intersection.

    Args:
        web_domains_map: dict of channel name -> set of web traffic domains.
        device_domains_map: dict of channel name -> set of device traffic
            domains.
    """
    n_overlapping_channels = 0
    for channel_name in web_domains_map:
        if channel_name not in device_domains_map:
            print("** No adblocked device traffic", channel_name)
            continue

        web_domains = web_domains_map[channel_name]
        device_domains = device_domains_map[channel_name]
        shared_domains = web_domains.intersection(device_domains)
        if not shared_domains:
            print(channel_name, "No overlap")
            continue

        n_overlapping_channels += 1
        print(channel_name, len(shared_domains), len(device_domains),
              len(web_domains), shared_domains)
    print(n_overlapping_channels,
          "channels had overlapping tracking domains on their device and web traffic")


## Fire TV - Overlapping domains

In [8]:
# Per Fire TV channel: domains present in both its web traffic and its adblocked device traffic.
print_common_domains(amazon_web_channel_domains, amazon_device_channel_domains)

('Cartoon Network App - Watch Videos  Clips and Full Episodes of Your Favorite Shows', 1, 9, 5, set([u'adobe.com']))
('** No adblocked device traffic', 'HBO GO')
('Tubi - Watch Free Movies &amp; TV Shows', 5, 22, 13, set([u'facebook.com', u'appboy.com', u'amazon-adsystem.com', u'doubleclick.net', u'google-analytics.com']))
('NFL', 4, 8, 42, set([u'storage.googleapis.com', u'amazon-adsystem.com', u'doubleclick.net', u'nfl.com']))
('NBC Sports', 7, 15, 55, set([u'demdex.net', u'doubleclick.net', u'omtrdc.net', u'nbcsports.com', u'scorecardresearch.com', u'fwmrm.net', u'imrworldwide.com']))
('** No adblocked device traffic', 'HBO NOW: Stream TV &amp; Movies')
('Showtime Anytime', 1, 3, 42, set([u'2o7.net']))
('** No adblocked device traffic', 'NBC')
('Lifetime', 3, 3, 78, set([u'demdex.net', u'scorecardresearch.com', u'aetn.com']))
('** No adblocked device traffic', 'Pluto TV - It&#39;s Free TV')
('A&amp;E', 9, 15, 79, set([u'demdex.net', u'doubleclick.net', u'google.com', u'scorecardrese

## Roku - Overlapping domains

In [9]:
# Per Roku channel: domains present in both its web traffic and its adblocked device traffic.
print_common_domains(roku_web_channel_domains, roku_device_channel_domains)

('** No adblocked device traffic', u'Hotstar')
(u'ABC', 1, 1, 28, set([u'scorecardresearch.com']))
(u'PBS', 2, 2, 19, set([u'scorecardresearch.com', u'doubleclick.net']))
('** No adblocked device traffic', u'Netflix')
(u'Bravo', 5, 5, 45, set([u'demdex.net', u'omtrdc.net', u'scorecardresearch.com', u'imrworldwide.com', u'fwmrm.net']))
('** No adblocked device traffic', u'Prime Video')
('** No adblocked device traffic', u'YouTube')
(u'NBC Sports', 4, 7, 23, set([u'demdex.net', u'nbcsports.com', u'omtrdc.net', u'fwmrm.net']))
('** No adblocked device traffic', u'Spectrum TV')
(u'Cartoon Network', 'No overlap')
('** No adblocked device traffic', u'Watch TNT')
('** No adblocked device traffic', u'STARZ')
(4, 'channels had overlapping tracking domains on their device and web traffic')


In [11]:
# Distinct channel names in the unfiltered Roku device request frame
# (i.e. before restricting to adblocked requests).
roku_dev_reqs_df.channel_name.unique()

array([u'Prime Video', u'NBC News', u'NewsON', u'Watch TNT', u'ABC News',
       u'Newsy', u'fuboTV Watch Live Sports & TV', u'Acorn TV',
       u'Sling TV', u'Fox News Channel', u'Bravo', u'NBC Sports',
       u'DIRECTV NOW', u'Hotstar', u'SHOWTIME', u'PBS',
       u'Cartoon Network', u'STARZ', u'Netflix', u'ABC', u'Hulu'],
      dtype=object)